In [2]:
import ujson 
import operator
import os
from collections import *
import pandas as pd
import networkx as nx
from datetime import datetime
import glob
from tqdm import tqdm
from tqdm import tnrange, tqdm_notebook

import statsmodels.api as sm
import statsmodels.formula.api as smf
from statsmodels.formula.api import logit, glm
import numpy as np
import pandas

ModuleNotFoundError: No module named 'networkx'

In [2]:
df = pd.read_csv('/shared/0/projects/cross-lingual-exchange/data/dataframes/es_pt.tsv', sep='\t', 
                 names=['user_id','lang', 'bilinguality', 'country', 'betw', 'num_tweets', 
                        'followers', 'following', 'hashtag_count', 'url_count', 'degree'])
df.head()

Unnamed: 0,user_id,lang,bilinguality,country,betw,num_tweets,followers,following,hashtag_count,url_count,degree
0,231824167,es,1.0,Spain,6.06156e-06,3,575,744,0,1,52
1,463643987,es,1.0,Spain,5.090252e-07,6,153,322,1,2,14
2,463643665,es,1.0,Spain,3.128218e-07,12,290,106,2,9,32
3,463642743,es,1.0,Spain,1.031442e-07,6,513,1349,0,3,8
4,1854581233,es,1.0,Spain,6.04668e-08,24,338,171,0,6,10


In [3]:
df.set_index('user_id', inplace=True)
df.head()

Unnamed: 0_level_0,lang,bilinguality,country,betw,num_tweets,followers,following,hashtag_count,url_count,degree
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
231824167,es,1.0,Spain,6.06156e-06,3,575,744,0,1,52
463643987,es,1.0,Spain,5.090252e-07,6,153,322,1,2,14
463643665,es,1.0,Spain,3.128218e-07,12,290,106,2,9,32
463642743,es,1.0,Spain,1.031442e-07,6,513,1349,0,3,8
1854581233,es,1.0,Spain,6.04668e-08,24,338,171,0,6,10


In [None]:
g = nx.Graph()

with open('/shared/0/projects/cross-lingual-exchange/data/network_subsets/ES_PT.tsv') as f:
    for line in f:
        uid1 = line.split(' ')[0].strip()
        uid2 = line.split(' ')[1].strip()
        
        g.add_edge(uid1, uid2)

In [None]:
hashtag_tweets = {}

os.chdir('/shared/0/projects/cross-lingual-exchange/data/hashtags/ES_PT/')

for file in glob.glob('*.txt'):
    es_tweets = []
    pt_tweets = []
    
    es_users = set()
    pt_users = set()
    
    hashtag = file.split('_')[0].strip()
    
    if hashtag not in hashtag_tweets:
        hashtag_tweets[hashtag] = {'es_tweets': [], 'pt_tweets': [], 'es_users': es_users, 'pt_users': pt_users}
        
    with open(file, 'r') as f:
        for line in f:
            tweet = ujson.loads(line)

            if tweet['lang'] == 'es':
                es_users.add(tweet['user']['id'])
                hashtag_tweets[hashtag]['es_tweets'].append(tweet)

            if tweet['lang'] == 'pt':
                pt_users.add(tweet['user']['id'])
                hashtag_tweets[hashtag]['pt_tweets'].append(tweet)

In [None]:
for entity, tweet_sets in tqdm_notebook(hashtag_tweets.items()):
    print("{}: es = {} tweets, pt = {} tweets".format(entity, len(tweet_sets['es_tweets']), len(tweet_sets['pt_tweets'])))

In [None]:
hashtag_to_pairs = []
potential_bridges = set()

df_dict = df.to_dict(orient='index')

for user, user_data in df_dict.items():
    if user_data['lang'] == 'BI':
        potential_bridges.add(user)

for entity, tweet_sets in tqdm_notebook(hashtag_tweets.items()):
    print("{}: es = {} tweets, pt = {} tweets".format(entity, len(tweet_sets['es_tweets']), len(tweet_sets['pt_tweets'])))
    for es_tweet in tweet_sets['es_tweets']:
        source = es_tweet['user']['id']
       
        if source in df_dict:
            for bridge in potential_bridges:
                if (str(bridge) in g[str(source)] and bridge not in es_users 
                        and bridge not in pt_users):
                    hashtag_to_pairs.append({'source': source, 'bridge': bridge, 'entity': entity, 'tweet': es_tweet})
                
    for pt_tweet in tweet_sets['pt_tweets']:
        source = pt_tweet['user']['id']
        
        if source in df_dict:
            for bridge in potential_bridges:
                if (str(bridge) in g[str(source)] and bridge not in es_users 
                        and bridge not in pt_users):
                    hashtag_to_pairs.append({'source': source, 'bridge': bridge, 'entity': entity, 'tweet': pt_tweet})

In [None]:
df['friends_with_bridge'] = 0

for user in df.index.values:
    for friend in g[str(user)]:
        if int(friend) in potential_bridges:
            df.loc[user, 'friends_with_bridge'] = 1

matching_df = df[df.lang=='pt']
matching_df.to_csv('/shared/0/projects/cross-lingual-exchange/data/dataframes/matching_test.tsv', sep='\t')

In [12]:
matched_df = pd.read_csv('/shared/0/projects/cross-lingual-exchange/data/dataframes/matched_df.tsv', sep='\t')
matched_df.head()

Unnamed: 0.1,Unnamed: 0,user_id,lang,bilinguality,country,betw,num_tweets,followers,following,hashtag_count,url_count,degree,friends_with_bridge,pr_score
0,1,1390470768,pt,0.0,Spain,4.23236e-09,2,834,524,0,1,14,0,0.689789
1,2,463175344,pt,0.0,Spain,7.321351e-08,20,218,200,0,20,16,0,0.629314
2,3,462874840,pt,0.0,Spain,1.634605e-06,9,402,277,0,9,56,0,0.924092
3,4,462728785,pt,0.0,Spain,8.880849e-07,7,339,334,0,7,34,0,0.809824
4,5,462708801,pt,0.0,Spain,1.191815e-07,20,425,1027,0,20,28,1,0.739506


In [13]:
matched_df.rename(columns={'Unnamed: 0': 'index'}, inplace=True)
matched_df.set_index('index', inplace=True)
matched_df.head()

Unnamed: 0_level_0,user_id,lang,bilinguality,country,betw,num_tweets,followers,following,hashtag_count,url_count,degree,friends_with_bridge,pr_score
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
1,1390470768,pt,0.0,Spain,4.23236e-09,2,834,524,0,1,14,0,0.689789
2,463175344,pt,0.0,Spain,7.321351e-08,20,218,200,0,20,16,0,0.629314
3,462874840,pt,0.0,Spain,1.634605e-06,9,402,277,0,9,56,0,0.924092
4,462728785,pt,0.0,Spain,8.880849e-07,7,339,334,0,7,34,0,0.809824
5,462708801,pt,0.0,Spain,1.191815e-07,20,425,1027,0,20,28,1,0.739506


In [15]:
entity_table = {'id': [], 'hashtag': [], 'condition': [], 'did_tweet': [], 'entity_type': []}

for hashtag, tweets in hashtag_tweets.items(): 
    for index, row in matched_df.iterrows():
        entity_table['id'].append(row['user_id'])
        entity_table['hashtag'].append(hashtag)
        
        if row['friends_with_bridge'] == 1:
            hashtag_table['condition'].append('target')
        
        if row['user_id'] in tweets['pt_users']:
            entity_table['did_tweet'].append(1)
        else:
            entity_table['did_tweet'].append(0)

México


KeyError: 'pt_users'