In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from plotly import express as px

import ast
import pickle
from tqdm import tqdm

import networkx as nx
import random
import json

## Load in Data

### Load in Tweet Data

In [82]:
df = pd.read_csv('/Users/jchang153/Documents/UCLA-CAM/GV KG/Data/Tweets/All/bigbabies/bigbaby_r_t_30.csv')

In [100]:
# evaluating strings of lists as lists
df['hashtags'] = df['hashtags'].apply(ast.literal_eval)
df['mentions'] = df['mentions'].apply(ast.literal_eval)
# df['keywords'] = df['keywords'].apply(ast.literal_eval)

In [109]:
df.columns[:24]

Index(['created_at', 'tweet_id', 'text', 'in_reply_to_tweet_id',
       'in_reply_to_user_id', 'geo', 'quote_count', 'reply_count',
       'retweet_count', 'favorite_count', 'lang', 'quoted_tweet_id',
       'possibly_sensitive', 'user_id', 'screen_name', 'followers_count',
       'friends_count', 'statuses_count', 'verified', 'hashtags', 'mentions',
       'date', '0', '1'],
      dtype='object')

### Load in Crime Data

Load in the hate crime `hatecrimedata.csv` dataframe.

In [116]:
df_crime = pd.read_csv('/Users/jchang153/Documents/UCLA-CAM/GV KG/Data/hatecrimedata.csv')

In [117]:
# limiting to only dates that have associated tweets
df_crime['incident_date'] = pd.to_datetime(df_crime['incident_date'])
start_date = pd.to_datetime('2020-03-11')
end_date = pd.to_datetime('2021-06-17')

df_crime = df_crime[(df_crime['incident_date'] >= start_date) & (df_crime['incident_date'] <= end_date)]
df_crime['incident_date'] = pd.to_datetime(df_crime['incident_date']).dt.strftime('%Y-%m-%d')

In [118]:
df_crime.columns

Index(['incident_id', 'data_year', 'ori', 'pug_agency_name', 'pub_agency_unit',
       'agency_type_name', 'state_abbr', 'state_name', 'division_name',
       'region_name', 'population_group_code', 'population_group_description',
       'incident_date', 'adult_victim_count', 'juvenile_victim_count',
       'total_offender_count', 'adult_offender_count',
       'juvenile_offender_count', 'offender_race', 'offender_ethnicity',
       'victim_count', 'offense_name', 'total_individual_victims',
       'location_name', 'bias_desc', 'victim_types', 'multiple_offense',
       'multiple_bias'],
      dtype='object')

In [119]:
df_crime = df_crime[df_crime['state_name'] == 'California']

In [120]:
len(df_crime)

1192

In [22]:
df_crime['bias_desc'].unique()

array(['Anti-Jewish', 'Anti-Gay (Male)', 'Anti-Black or African American',
       'Anti-Hindu', 'Anti-Transgender', 'Anti-Hispanic or Latino',
       'Anti-Multiple Races, Group', 'Anti-Asian',
       'Anti-Mental Disability', 'Anti-White',
       'Anti-Other Race/Ethnicity/Ancestry', 'Anti-Female',
       'Anti-Other Religion',
       'Anti-Lesbian, Gay, Bisexual, or Transgender (Mixed Group)',
       'Anti-Islamic (Muslim)', 'Anti-Arab', 'Anti-Lesbian (Female)',
       'Anti-Other Christian', 'Anti-Physical Disability',
       'Anti-Multiple Religions, Group', 'Anti-Sikh',
       'Anti-Gender Non-Conforming',
       'Anti-American Indian or Alaska Native', 'Anti-Bisexual',
       'Anti-Protestant', 'Anti-Eastern Orthodox (Russian, Greek, Other)',
       'Anti-Buddhist', 'Anti-Catholic', 'Anti-Church of Jesus Christ',
       'Anti-Black or African American;Anti-Hispanic or Latino',
       'Anti-Gay (Male);Anti-Hispanic or Latino',
       'Anti-Asian;Anti-Black or African American'], d

To connect our tweets with our hate crimes, we define a set of `groups` which are strings attached to a `'g'` tag. These are associated with certain hand-picked tweet topics in the `topic2group` dictionary.

In [56]:
groups = {('black', 'g')}#, ('lesbian', 'g'), ('lgbt', 'g'), ('hispanic', 'g'), ('asian', 'g')}

# topic2group = {\
# '78_blacklivesmatter_justiceforgeorgefloyd_blackouttuesday_blackheartredhandsfilm': [('black', 'g')],
# '166_blackouttuesday_lockdown_locked_theshowmustbepaused': [('black', 'g')],
# '181_black_tear_torn_men': [('black', 'g')],
# '341_africa_afrokixcollection_afropunk_african': [('black', 'g')],
# '560_naacp_african_image_awards': [('black', 'g')],
# '131_pride_gay_lgbtq_lgbt': [('lesbian', 'g'), ('gay', 'g'), ('bisexual', 'g'), ('transgender', 'g')],
# '740_saveartspace_futures_queer_selected': [('lesbian', 'g'),  ('gay', 'g'), ('bisexual', 'g'), ('transgender', 'g')],
# '632_jewcla_chabadlovesyou_chabadoncampus_jewishpride': [('jewish', 'g')],
# '262_floyd_george_lewis_floyds': [('black', 'g')],
# '396_breonna_taylor_taylors_cops': [('black', 'g')]
# }

In [45]:
black_topics = ['2','67','75','121','154','157','205','233','234','240','288',\
                '376','436','463','591','652','747','858','901','960','1052','1148',\
                '1180','1194','1261','1317','1379','1406','1504','1506','1737','161',\
                '565','836','199','410','388']

In [52]:
# filter by tweets that are in these topics
df = df[df['topic'].str.extract('^(\d+)')[0].isin(black_topics)]

In [54]:
len(df['topic'].unique()), len(black_topics)

(37, 37)

In [39]:
# filter by crimes that contain these groups
df_crime = df_crime[df_crime['bias_desc'].str.contains('|'.join(['black', 'african american']), case=False)]

In [43]:
len(df_crime)

418

### Load in Topic Data

Topics are loaded in after running from Hoffman2. 

The files `doc_info.csv` and `topic_info.csv` are the 8000 topics generated by BERTopic on `covid_geo_LA_cleaned.csv`.


The files `doc_info2.csv` and `topic_info2.csv` are the topics generated by BERTopic on `covid_geo_LA_cleaned.csv`, reduced to about 1/10 the original number of topics (resulting in ~800 topics).

The files `doc_info3.csv` and `topic_info3.csv` are the topics generated by BERTopic on `covid_geo_LA_cleaned.csv`, reduced to 100 topics.

The files `doc_info4.csv` and `topic_info4.csv` are the topics generated by BERTopic on `covid_geo_LA_cleaned.csv`, reduced to 400 topics.

In [28]:
doc_info = pd.read_csv('/Users/jchang153/Documents/UCLA-CAM/GV KG/Data/Tweets/All/topics/all_doc_info.csv')
topic_info = pd.read_csv('/Users/jchang153/Documents/UCLA-CAM/GV KG/Data/Tweets/All/topics/all_topic_info.csv')

In [29]:
topic_info['Representation'] = topic_info['Representation'].apply(ast.literal_eval)

In [30]:
topic_info.columns

Index(['Topic', 'Count', 'Name', 'Representation', 'Representative_Docs'], dtype='object')

## Building the KG

In [55]:
# turns the dataframes into lists of dictionaries
tweet_dict = df.to_dict(orient='records')
crime_dict = df_crime.to_dict(orient='records')
topic_dict = topic_info.to_dict(orient='records')

### Getting unique entities, hashtags, topics

In [65]:
entities = set()
# tags = set()
# topics = set()

for tweet in tqdm(tweet_dict):
    entities.add((tweet['tweet_id'], 't'))
    # entities.add(tweet['screen_name']) # this one gives less entities, indicating some repeated screen names
    # entities.add((tweet['user_id'], 'u'))
    entities.add(tweet['date'])
    entities.add(tweet['topic'])
    
    # topics.add(tweet['topic'])

    # for tag in tweet['hashtags']:
    #     entities.add(tag)
    #     tags.add(tag)
        
for crime in crime_dict:
    entities.add((crime['incident_id'], 'c')) 
    entities.add(crime['incident_date']) # in case not all are captured in tweets

entities = entities | groups
# entities.add('negative_sentiment')
# entities.add('positive_sentiment')
# entities.add('neutral_sentiment')

100%|███████████████████████████████████| 6722/6722 [00:00<00:00, 606403.22it/s]


In [68]:
print(len(entities))#, len(tags), len(topics), len(groups))

7638


In [69]:
ent_list = list(entities)
entity2id = {k:v for v,k in enumerate(ent_list)}
id2entity = {v:k for v,k in enumerate(ent_list)}

### Creating relation dictionaries

In [70]:
relations = ['tweeted_on', 'in_topic', 'associated_with', 'occurred_on', 'victimized']
             #'has_hashtag', 'in_topic', 'has_sentiment', 'tweeted', 'associated_with', 'occurred_on', 'mentioned', 'replied_to', 'victimized']

relation2id = {}
id2relation = {}
j = 0 
for r in relations:
    relation2id[r] = j
    id2relation[j] = r
    j += 1

In [80]:
all_relations = []

## Mentions, Quotes, Replies Triples

### Exploring replies, quotes, mentions

Number of users that were replied to

In [24]:
all_replied_users = df.dropna(subset='in_reply_to_user_id')['in_reply_to_user_id'].tolist()

In [25]:
len(all_replied_users), len(set(all_replied_users))

(20543, 15790)

Number of users replied to in our dataset (not counting multiplicity, e.g., if a given user in our dataset was replied to multiple times, this would become multiple triples, but here we are just counting how many unique users that were replied to that are among the unique users in our dataset)

In [26]:
len(set(df['user_id']).intersection(set(all_replied_users)))

674

Number of tweets that were replied to

In [27]:
all_replied_tweets=df.dropna(subset='in_reply_to_tweet_id')['in_reply_to_tweet_id'].tolist()

In [28]:
len(all_replied_tweets), len(set(all_replied_tweets))

(1149, 1134)

Number of tweets replied to in our dataset (not counting multiplicity, e.g., if a given tweet in our dataset was replied to multiple times, this would become multiple triples, but here we are just counting how many unique tweets that were replied to that are among the unique tweets in our dataset)

In [29]:
len(set(df['tweet_id']).intersection(set(all_replied_tweets)))

44

Number of tweets that were quoted

In [30]:
len(df.dropna(subset='quoted_tweet_id'))

13003

Number of tweets quoted in our dataset

In [31]:
all_quoted_tweets=df.dropna(subset='quoted_tweet_id')['quoted_tweet_id'].explode().tolist()

In [32]:
len(set(df['tweet_id']).intersection(set(all_quoted_tweets)))

3

In [33]:
len(df[df['tweet_id'].isin(all_quoted_tweets)])

5

Number of users that were mentioned (counting multiplicity)

In [34]:
all_mentions = df[df["mentions"].apply(lambda x: len(x) > 0)]['mentions'].explode().tolist()

In [35]:
len(all_mentions), len(set(all_mentions))

(117814, 53935)

Number of mentioned users in our dataset

In [36]:
len(set(df['user_id']).intersection(set(all_mentions)))

4563

From the analysis above, we create only triples between tweets that reply to users and tweets that mention users.

### Tweet to User (reply) Triples

In [None]:
reply_relations = []

for tweet in tqdm(tweet_dict):
    replied_user = tweet['in_reply_to_user_id']
    
    if not np.isnan(replied_user):
        if (replied_user, 'u') in entities:

            relation = []
            relation.append(entity2id[(tweet['tweet_id'], 't')])
            relation.append(entity2id[(replied_user, 'u')])
            relation.append(relation2id['replied_to'])

            reply_relations.append(relation)

all_relations = [*all_relations, *reply_relations]

### Tweet to User (mention) Triples

In [None]:
mention_relations = []

for tweet in tqdm(tweet_dict):  
    for men in tweet['mentions']:
        if (men, 'u') in entities:
            
            relation=[]
            relation.append(entity2id[(tweet['tweet_id'], 't')])
            relation.append(entity2id[(men, 'u')])
            relation.append(relation2id['mentioned'])
            
            mention_relations.append(relation)

all_relations = [*all_relations, *mention_relations]

## Tweet-to-? Triples

### Tweet-to-Hashtag Triples

In [None]:
tweet_hashtag_relations = []

for tweet in tqdm(tweet_dict):
    for tag in tweet['hashtags']:
        
        relation = []
        relation.append(entity2id[(tweet['tweet_id'], 't')])
        relation.append(entity2id[tag])
        relation.append(relation2id['has_hashtag'])
        
        tweet_hashtag_relations.append(relation)
        
all_relations = [*all_relations, *tweet_hashtag_relations]

### Tweet-to-Date Triples

In [81]:
tweet_date_relations = []

for tweet in tqdm(tweet_dict):
        
    relation = []
    relation.append(entity2id[(tweet['tweet_id'], 't')])
    relation.append(entity2id[tweet['date']])
    relation.append(relation2id['tweeted_on'])
        
    tweet_date_relations.append(relation)

all_relations = [*all_relations, *tweet_date_relations]

100%|███████████████████████████████████| 6722/6722 [00:00<00:00, 492456.36it/s]


### Tweet-to-Topic Triples

In [82]:
tweet_topic_relations = []

for tweet in tqdm(tweet_dict):
        
    relation = []
    relation.append(entity2id[(tweet['tweet_id'], 't')])
    relation.append(entity2id[tweet['topic']])
    relation.append(relation2id['in_topic'])
        
    tweet_topic_relations.append(relation)

all_relations = [*all_relations, *tweet_topic_relations]

100%|███████████████████████████████████| 6722/6722 [00:00<00:00, 506669.15it/s]


### Tweet-to-Sentiment Triples

In [None]:
tweet_sentiment_relations = []

for tweet in tqdm(tweet_dict):
        
    relation = []
    relation.append(entity2id[(tweet['tweet_id'], 't')])
    sent = tweet['sentiment']
    
    if sent > 0:
        relation.append(entity2id['positive_sentiment'])
    elif sent == 0:
        relation.append(entity2id['neutral_sentiment'])
    else:
        relation.append(entity2id['negative_sentiment'])
    relation.append(relation2id['has_sentiment'])
    
    tweet_sentiment_relations.append(relation)

all_relations = [*all_relations, *tweet_sentiment_relations]

## Other Triples

### User-to-Tweet Triples

In [None]:
user_tweet_relations = []

for tweet in tqdm(tweet_dict):
        
    relation = []
    relation.append(entity2id[(tweet['user_id'], 'u')])
    relation.append(entity2id[(tweet['tweet_id'], 't')])
    relation.append(relation2id['tweeted'])

    user_tweet_relations.append(relation)
        
all_relations = [*all_relations, *user_tweet_relations]

### Topic-to-Group Triples

In [83]:
# topic_group_relations = []

# for topic in tqdm(topic_dict):
#     if topic['Name'] in topic2group.keys():
#         for group in topic2group[topic['Name']]:
            
#             relation = []
#             relation.append(entity2id[topic['Name']])
#             relation.append(entity2id[group])
#             relation.append(relation2id['associated_with']) 

#             topic_group_relations.append(relation)
        
# all_relations = [*all_relations, *topic_group_relations]

topic_group_relations = []
for topic in tqdm(black_topics):
    
    num = ast.literal_eval(topic)
    top = topic_info.iloc[num+1]['Name']
    relation = []
    relation.append(entity2id[top])
    relation.append(entity2id[('black', 'g')])
    relation.append(relation2id['associated_with'])
    
    topic_group_relations.append(relation)

all_relations = [*all_relations, *topic_group_relations]

100%|█████████████████████████████████████████| 37/37 [00:00<00:00, 7135.79it/s]


### Crime-to-Date Triples

In [84]:
crime_date_relations = []

for crime in tqdm(crime_dict):
    
    relation = []
    relation.append(entity2id[(crime['incident_id'], 'c')])
    relation.append(entity2id[crime['incident_date']])
    relation.append(relation2id['occurred_on'])
    
    crime_date_relations.append(relation)

all_relations = [*all_relations, *crime_date_relations]

100%|█████████████████████████████████████| 418/418 [00:00<00:00, 193597.51it/s]


### Crime-to-Group Triples

In [85]:
crime_group_relations = []

for crime in tqdm(crime_dict):
    for group in groups:
        if group[0] in crime['bias_desc'].lower():
            
            relation = []
            relation.append(entity2id[(crime['incident_id'], 'c')])
            relation.append(entity2id[group])
            relation.append(relation2id['victimized'])
            
            crime_group_relations.append(relation)
    
all_relations = [*all_relations, *crime_group_relations]

100%|█████████████████████████████████████| 418/418 [00:00<00:00, 265599.01it/s]


## Analysis

In [86]:
len(all_relations)

14317

In [90]:
all_relations2 = all_relations.copy()

In [91]:
for i in tqdm(all_relations2):
    i[0] = id2entity[i[0]]
    i[1] = id2entity[i[1]]
    i[2] = id2relation[i[2]]

100%|█████████████████████████████████| 14317/14317 [00:00<00:00, 803235.02it/s]


In [92]:
all_relations_df = pd.DataFrame(all_relations2, columns=['head', 'tail', 'relation'])

In [93]:
all_relations_df

Unnamed: 0,head,tail,relation
0,"(1.2376865718364242e+18, t)",2020-03-11,tweeted_on
1,"(1.2377159175359734e+18, t)",2020-03-11,tweeted_on
2,"(1.2377608926341898e+18, t)",2020-03-11,tweeted_on
3,"(1.237764192116953e+18, t)",2020-03-11,tweeted_on
4,"(1.237818232062407e+18, t)",2020-03-11,tweeted_on
...,...,...,...
14312,"(1442281, c)","(black, g)",victimized
14313,"(1442351, c)","(black, g)",victimized
14314,"(1443027, c)","(black, g)",victimized
14315,"(1443067, c)","(black, g)",victimized


In [96]:
all_relations_df.to_csv('./data/all_blacktopics_relation_data.csv', index=False)

### Analyzing Graph

In [100]:
graph = nx.DiGraph()

# Add edges from the DataFrame
for _, row in all_relations_df.iterrows():
    graph.add_edge(row['head'], row['tail'], predicate=row['relation'])

# Draw the graph
# pos = nx.spring_layout(G, seed=42)  # Layout algorithm (you can try other layouts too)
# nx.draw(G, pos, with_labels=True, node_color='lightblue', font_size=12, node_size=2000, arrowsize=15)
# labels = nx.get_edge_attributes(G, 'predicate')
# nx.draw_networkx_edge_labels(G, pos, edge_labels=labels, font_size=10)

# plt.title("Triplet Data Graph")
# plt.show()

In [104]:
print(nx.is_weakly_connected(graph), nx.is_strongly_connected(graph))

True False


In [105]:
num_nodes = graph.number_of_nodes()
num_edges = graph.number_of_edges()

print(num_nodes, num_edges)

7638 14313


In [106]:
diameter = nx.diameter(graph)
print(diameter)

NetworkXError: Found infinite path length because the digraph is not strongly connected

In [107]:
clustering_coefficient = nx.average_clustering(graph)
print(clustering_coefficient)

0.0


In [108]:
degree_centrality = nx.degree_centrality(graph)
betweenness_centrality = nx.betweenness_centrality(graph)
closeness_centrality = nx.closeness_centrality(graph)

KeyboardInterrupt: 

In [111]:
list(nx.strongly_connected_components(graph))

7638

In [113]:
degree_centrality = nx.degree_centrality(graph)

# Get the most central node(s)
most_central_nodes = max(degree_centrality, key=degree_centrality.get)

print("Most central node(s) by degree centrality:", most_central_nodes)

Most central node(s) by degree centrality: 2_blacks_people_word_non


In [114]:
degree_centrality

{(1.2376865718364242e+18, 't'): 0.00026188293832656804,
 '2020-03-11': 0.0006547073458164201,
 (1.2377159175359734e+18, 't'): 0.00026188293832656804,
 (1.2377608926341898e+18, 't'): 0.00026188293832656804,
 (1.237764192116953e+18, 't'): 0.00026188293832656804,
 (1.237818232062407e+18, 't'): 0.00026188293832656804,
 (1.2379137078306245e+18, 't'): 0.00026188293832656804,
 '2020-03-12': 0.0011784732224695562,
 (1.2379166352115466e+18, 't'): 0.00026188293832656804,
 (1.2379447074826772e+18, 't'): 0.00026188293832656804,
 (1.2379788759775887e+18, 't'): 0.00026188293832656804,
 (1.2379802084031035e+18, 't'): 0.00026188293832656804,
 (1.2381052296539914e+18, 't'): 0.00026188293832656804,
 (1.2382028469164278e+18, 't'): 0.00026188293832656804,
 (1.2382547500906086e+18, 't'): 0.00026188293832656804,
 '2020-03-13': 0.0014403561607961243,
 (1.2382705550835098e+18, 't'): 0.00026188293832656804,
 (1.2383056563496755e+18, 't'): 0.00026188293832656804,
 (1.238323544003494e+18, 't'): 0.000261882938326