In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from plotly import express as px

import ast
import pickle
from tqdm import tqdm

import networkx as nx
import random
import json

from collections import Counter

## Load in Data

### Load in Tweet Data

In [3]:
df = pd.read_csv('/Users/jchang153/Documents/UCLA-CAM/GV KG/Data/Tweets/All/bigbabies/bigbaby_r_t_30_s.csv')

In [4]:
# evaluating strings of lists as lists
df['hashtags'] = df['hashtags'].apply(ast.literal_eval)
df['mentions'] = df['mentions'].apply(ast.literal_eval)
# df['keywords'] = df['keywords'].apply(ast.literal_eval)

In [5]:
df.columns[:35]

Index(['created_at', 'tweet_id', 'text', 'in_reply_to_tweet_id',
       'in_reply_to_user_id', 'geo', 'quote_count', 'reply_count',
       'retweet_count', 'favorite_count', 'lang', 'quoted_tweet_id',
       'possibly_sensitive', 'user_id', 'screen_name', 'followers_count',
       'friends_count', 'statuses_count', 'verified', 'hashtags', 'mentions',
       'date', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11',
       '12'],
      dtype='object')

### Load in Crime Data

Load in the hate crime `hatecrimedata.csv` dataframe.

In [6]:
df_crime = pd.read_csv('/Users/jchang153/Documents/UCLA-CAM/GV KG/Data/hatecrimedata.csv')

In [7]:
# limiting to only dates that have associated tweets
df_crime['incident_date'] = pd.to_datetime(df_crime['incident_date'])
start_date = pd.to_datetime('2020-03-11')
end_date = pd.to_datetime('2021-06-17')

df_crime = df_crime[(df_crime['incident_date'] >= start_date) & (df_crime['incident_date'] <= end_date)]
df_crime['incident_date'] = pd.to_datetime(df_crime['incident_date']).dt.strftime('%Y-%m-%d')

In [8]:
df_crime = df_crime[df_crime['state_name'] == 'California']

In [9]:
df_crime.columns

Index(['incident_id', 'data_year', 'ori', 'pug_agency_name', 'pub_agency_unit',
       'agency_type_name', 'state_abbr', 'state_name', 'division_name',
       'region_name', 'population_group_code', 'population_group_description',
       'incident_date', 'adult_victim_count', 'juvenile_victim_count',
       'total_offender_count', 'adult_offender_count',
       'juvenile_offender_count', 'offender_race', 'offender_ethnicity',
       'victim_count', 'offense_name', 'total_individual_victims',
       'location_name', 'bias_desc', 'victim_types', 'multiple_offense',
       'multiple_bias'],
      dtype='object')

### Load in Topic Data

In [6]:
doc_info = pd.read_csv('/Users/jchang153/Documents/UCLA-CAM/GV KG/Data/Tweets/All/topics/bigbaby_r_30_docs.csv')
top_info = pd.read_csv('/Users/jchang153/Documents/UCLA-CAM/GV KG/Data/Tweets/All/topics/bigbaby_r_30_tops.csv')

In [7]:
top_info['Representation'] = top_info['Representation'].apply(ast.literal_eval)

In [8]:
top_info.columns

Index(['Topic', 'Count', 'Name', 'Representation', 'Representative_Docs'], dtype='object')

In [9]:
top2name = top_info.set_index('Topic')['Name'].to_dict()

### Querying by a Given Group

Only black-related topics:

In [10]:
# [32,35,65,135,142]
black_topics = [1,2,3,4,7,8,9,11,13,15,19,20,21,23,25,27,31,34,36,38,40,41,42,44,45,46,47,48,50,
51,52,56,62,63,64,66,68,70,71,73,75,76,78,82,83,84,86,88,92,93,94,101,102,105,106,
107,109,111,114,117,119,120, 121,122,123,124,125,126,131,132,133,137,140,143,144,145,
146,147,149,150,151,153,154,155,157,160,162,163,164,165,166,168,169,171,172,177,178,179,
180,184,186,187,188,191,193,196,200,202,203,206,208,211,212,216,218,220,222,223,224,225,
226,228,229,230,233,235,242,244,245,250,251,253,254,255,256,259,260,261,266,268,269]

lgbt_topics = [28,26,58,175,57,209,22,53,168,180,18,156,182,104,215,247]

jewish_topics = [79,24,72,0,30,37,55,87,91,129,121,129,199,205,264,10,59,5,6,100,262]

asian_topics = [67,69,96,115,135,147,161,16,99,183,195,201,221,243,101]

hispanic_topics = [32,35,65,135,142]

top_list = black_topics + lgbt_topics + asian_topics

In [11]:
len(black_topics), len(lgbt_topics), len(asian_topics)

(141, 16, 15)

In [12]:
len(top_list)

172

In [17]:
len(df[df['topics'].isin(black_topics)]), len(df[df['topics'].isin(lgbt_topics)]), len(df[df['topics'].isin(asian_topics)])

(25467, 3088, 1846)

Only tweets who have at least 0.01 probability of being in one of the above topics:

In [16]:
df = df[df[[str(num) for num in top_list]].ge(0.01).any(axis=1)]
len(df)

51491

Only black crimes:

In [17]:
df_crime['bias_desc'].unique()

array(['Anti-Jewish', 'Anti-Gay (Male)', 'Anti-Black or African American',
       'Anti-Hindu', 'Anti-Transgender', 'Anti-Hispanic or Latino',
       'Anti-Multiple Races, Group', 'Anti-Asian',
       'Anti-Mental Disability', 'Anti-White',
       'Anti-Other Race/Ethnicity/Ancestry', 'Anti-Female',
       'Anti-Other Religion',
       'Anti-Lesbian, Gay, Bisexual, or Transgender (Mixed Group)',
       'Anti-Islamic (Muslim)', 'Anti-Arab', 'Anti-Lesbian (Female)',
       'Anti-Other Christian', 'Anti-Physical Disability',
       'Anti-Multiple Religions, Group', 'Anti-Sikh',
       'Anti-Gender Non-Conforming',
       'Anti-American Indian or Alaska Native', 'Anti-Bisexual',
       'Anti-Protestant', 'Anti-Eastern Orthodox (Russian, Greek, Other)',
       'Anti-Buddhist', 'Anti-Catholic', 'Anti-Church of Jesus Christ',
       'Anti-Black or African American;Anti-Hispanic or Latino',
       'Anti-Gay (Male);Anti-Hispanic or Latino',
       'Anti-Asian;Anti-Black or African American'], d

In [18]:
df_crime = df_crime[df_crime['bias_desc'].str.contains('Black')]

## Building the KG

In [19]:
# turns the dataframes into lists of dictionaries
tweet_dict = df.to_dict(orient='records')
crime_dict = df_crime.to_dict(orient='records')

### Getting unique entities, hashtags, topics

In [None]:
entities = set()
tags = set()
topics = set()

for tweet in tqdm(tweet_dict):
    entities.add((tweet['tweet_id'], 't'))
    # entities.add(tweet['screen_name']) # this one gives less entities, indicating some repeated screen names
    # entities.add((tweet['user_id'], 'u'))
    entities.add(tweet['date'])
    entities.add(tweet['topic'])
    
    # topics.add(tweet['topic'])

    # for tag in tweet['hashtags']:
    #     entities.add(tag)
    #     tags.add(tag)
        
for crime in crime_dict:
    entities.add((crime['incident_id'], 'c')) 
    entities.add(crime['incident_date']) # in case not all are captured in tweets

entities = entities | groups
# entities.add('negative_sentiment')
# entities.add('positive_sentiment')
# entities.add('neutral_sentiment')

In [18]:
entities = set()

for tweet in tqdm(tweet_dict):
    entities.add((tweet['tweet_id'], 't'))
    # consider adding: sentiment, quote_count, reply_count, retweet_count, favorite_count, lang, possibly_sensitive
    entities.add((tweet['user_id'], 'u'))
    # consider adding: followers_count, friends_count, statuses_count, verified
    entities.add(tweet['date'])
        
for crime in crime_dict:
    entities.add((crime['incident_id'], 'c')) 
    entities.add(crime['incident_date']) # in case not all are captured in tweets
    
for top in top_list:
    entities.add(top2name[top])

100%|███████████████████████████████████| 2959/2959 [00:00<00:00, 158278.65it/s]


In [19]:
print(len(entities))

5825


In [20]:
ent_list = list(entities)
entity2id = {k:v for v,k in enumerate(ent_list)}
id2entity = {v:k for v,k in enumerate(ent_list)}

### Creating relation dictionaries

In [21]:
relations = ['tweeted_on', 'tweeted', 'in_topic', 'mentioned', 'replied_to', 'occurred_on']

relation2id = {}
id2relation = {}
j = 0 
for r in relations:
    relation2id[r] = j
    id2relation[j] = r
    j += 1

In [22]:
all_relations = []

## Mentions, Quotes, Replies Triples

### Exploring replies, quotes, mentions

Number of users that were replied to

In [125]:
all_replied_users = df.dropna(subset='in_reply_to_user_id')['in_reply_to_user_id'].tolist()

In [126]:
len(all_replied_users), len(set(all_replied_users))

(12202, 8729)

Number of users replied to in our dataset (not counting multiplicity, e.g., if a given user in our dataset was replied to multiple times, this would become multiple triples, but here we are just counting how many unique users that were replied to that are among the unique users in our dataset)

In [130]:
len(set(df['user_id']).intersection(set(all_replied_users)))

1249

Number of users replied to in our dataset, counting multiplicity (this is the number of relations added to the KG)

In [156]:
# Count the multiplicity of each user in the list
user_counts = Counter(all_replied_users)

# Find the intersection of the list and set of users
intersection_users = set(all_replied_users) & set(df['user_id'])

# Count the multiplicity for each user in the intersection
intersection_counts = {user: user_counts[user] for user in intersection_users}

print(sum(intersection_counts.values()))

1991


Number of tweets that were replied to

In [127]:
all_replied_tweets=df.dropna(subset='in_reply_to_tweet_id')['in_reply_to_tweet_id'].tolist()

In [128]:
len(all_replied_tweets), len(set(all_replied_tweets))

(11226, 10976)

Number of tweets replied to in our dataset (not counting multiplicity, e.g., if a given tweet in our dataset was replied to multiple times, this would become multiple triples, but here we are just counting how many unique tweets that were replied to that are among the unique tweets in our dataset)

In [129]:
len(set(df['tweet_id']).intersection(set(all_replied_tweets)))

212

In [157]:
# Count the multiplicity of each user in the list
user_counts = Counter(all_replied_tweets)

# Find the intersection of the list and set of users
intersection_users = set(all_replied_tweets) & set(df['tweet_id'])

# Count the multiplicity for each user in the intersection
intersection_counts = {user: user_counts[user] for user in intersection_users}

print(sum(intersection_counts.values()))

215


Number of tweets that were quoted

In [132]:
all_quoted_tweets = df.dropna(subset='quoted_tweet_id')['quoted_tweet_id'].tolist()

In [133]:
len(all_quoted_tweets), len(set(all_quoted_tweets))

(8984, 8178)

Number of tweets quoted in our dataset

In [134]:
len(set(df['tweet_id']).intersection(set(all_quoted_tweets)))

47

In [158]:
# Count the multiplicity of each user in the list
user_counts = Counter(all_quoted_tweets)

# Find the intersection of the list and set of users
intersection_users = set(all_quoted_tweets) & set(df['tweet_id'])

# Count the multiplicity for each user in the intersection
intersection_counts = {user: user_counts[user] for user in intersection_users}

print(sum(intersection_counts.values()))

50


Number of users that were mentioned (counting multiplicity)

In [136]:
all_mentions = df[df["mentions"].apply(lambda x: len(x) > 0)]['mentions'].explode().tolist()

In [137]:
len(all_mentions), len(set(all_mentions))

(23195, 13011)

Number of mentioned users in our dataset

In [138]:
len(set(df['user_id']).intersection(set(all_mentions)))

226

In [159]:
# Count the multiplicity of each user in the list
user_counts = Counter(all_mentions)

# Find the intersection of the list and set of users
intersection_users = set(all_mentions) & set(df['user_id'])

# Count the multiplicity for each user in the intersection
intersection_counts = {user: user_counts[user] for user in intersection_users}

print(sum(intersection_counts.values()))

461


**From the analysis above, we will proceed by creating only triples between tweets that reply to users and tweets that mention users.**

### Tweet to User (reply) Triples

In [23]:
reply_relations = []

for tweet in tqdm(tweet_dict):
    replied_user = tweet['in_reply_to_user_id']
    
    if not np.isnan(replied_user):
        if (replied_user, 'u') in entities:

            relation = []
            relation.append(entity2id[(tweet['tweet_id'], 't')])
            relation.append(entity2id[(replied_user, 'u')])
            relation.append(relation2id['replied_to'])
            # relation.append(1.0)

            reply_relations.append(relation)

all_relations = [*all_relations, *reply_relations]

100%|███████████████████████████████████| 2959/2959 [00:00<00:00, 287863.47it/s]


In [24]:
len(all_relations)

87

### Tweet to User (mention) Triples

In [25]:
mention_relations = []

for tweet in tqdm(tweet_dict):  
    for men in tweet['mentions']:
        if (men, 'u') in entities:
            
            relation=[]
            relation.append(entity2id[(tweet['tweet_id'], 't')])
            relation.append(entity2id[(men, 'u')])
            relation.append(relation2id['mentioned'])
            # relation.append(1.0)
            
            mention_relations.append(relation)

all_relations = [*all_relations, *mention_relations]

100%|███████████████████████████████████| 2959/2959 [00:00<00:00, 451487.71it/s]


In [26]:
len(all_relations)

96

## Tweet-to-? Triples

### Tweet-to-Date Triples

In [27]:
tweet_date_relations = []

for tweet in tqdm(tweet_dict):
        
    relation = []
    relation.append(entity2id[(tweet['tweet_id'], 't')])
    relation.append(entity2id[tweet['date']])
    relation.append(relation2id['tweeted_on'])
    # relation.append(1.0)
        
    tweet_date_relations.append(relation)

all_relations = [*all_relations, *tweet_date_relations]

100%|███████████████████████████████████| 2959/2959 [00:00<00:00, 340857.04it/s]


In [28]:
len(all_relations)

3055

### Tweet-to-Topic Triples

In [29]:
tweet_topic_relations = []

for tweet in tqdm(tweet_dict):
    for top in top_list:
        if tweet[f'{top}'] >= 0.01:
            
            relation = []
            relation.append(entity2id[(tweet['tweet_id'], 't')])
            relation.append(entity2id[top2name[top]])
            relation.append(relation2id['in_topic'])
            relation.append(tweet[f'{top}'])
            
            tweet_topic_relations.append(relation)

all_relations = [*all_relations, *tweet_topic_relations]

100%|███████████████████████████████████| 2959/2959 [00:00<00:00, 118218.62it/s]


In [30]:
len(all_relations)

6225

## Other Triples

### User-to-Tweet Triples

In [31]:
user_tweet_relations = []

for tweet in tqdm(tweet_dict):
        
    relation = []
    relation.append(entity2id[(tweet['user_id'], 'u')])
    relation.append(entity2id[(tweet['tweet_id'], 't')])
    relation.append(relation2id['tweeted'])
    # relation.append(1.0)

    user_tweet_relations.append(relation)
        
all_relations = [*all_relations, *user_tweet_relations]

100%|████████████████████████████████████| 2959/2959 [00:00<00:00, 39549.36it/s]


In [32]:
len(all_relations)

9184

### Crime-to-Date Triples

In [241]:
crime_date_relations = []

for crime in tqdm(crime_dict):
    
    relation = []
    relation.append(entity2id[(crime['incident_id'], 'c')])
    relation.append(entity2id[crime['incident_date']])
    relation.append(relation2id['occurred_on']) 
    relation.append(1.0)
    
    crime_date_relations.append(relation)

all_relations = [*all_relations, *crime_date_relations]

100%|█████████████████████████████████████| 418/418 [00:00<00:00, 383888.56it/s]


## Saving

In [33]:
len(all_relations)

9184

In [34]:
all_relations2 = []

In [35]:
for trip in tqdm(all_relations):
    relation = []
    if isinstance(id2entity[trip[0]], tuple):
        relation.append((int(id2entity[trip[0]][0]), id2entity[trip[0]][1]))
    else:    
        relation.append(id2entity[trip[0]])
    
    if isinstance(id2entity[trip[1]], tuple):
        relation.append((int(id2entity[trip[1]][0]), id2entity[trip[1]][1]))
    else:
        relation.append(id2entity[trip[1]])
    relation.append(id2relation[trip[2]])
    try:
        relation.append(trip[3])
    except:
        relation.append(np.nan)
    
    all_relations2.append(relation)

100%|███████████████████████████████████| 9184/9184 [00:00<00:00, 360310.99it/s]


In [36]:
all_relations_df = pd.DataFrame(all_relations2, columns=['head', 'tail', 'relation', 'weight'])

In [37]:
all_relations_df

Unnamed: 0,head,tail,relation,weight
0,"(1239576949934047232, t)","(2808590695, u)",replied_to,
1,"(1240535441847095296, t)","(1060564414116450432, u)",replied_to,
2,"(1243967634057510912, t)","(3255342379, u)",replied_to,
3,"(1245940455558098944, t)","(1021673051409117184, u)",replied_to,
4,"(1253083733269213184, t)","(43906698, u)",replied_to,
...,...,...,...,...
9179,"(28959352, u)","(1405269981244248064, t)",tweeted,
9180,"(43989926, u)","(1405316297261342720, t)",tweeted,
9181,"(22114879, u)","(1405321378530684928, t)",tweeted,
9182,"(1398721996502884352, u)","(1405342254370222080, t)",tweeted,


In [38]:
all_relations_df.to_csv('./data/kg_hispanic.csv', index=False)