In [None]:
! pip install --quiet langdetect
! pip install networkx

In [1]:
from pyspark.sql import SparkSession

# Build (or reuse) the Spark session for this notebook. The configuration
# registers the GCS connector jar, maps the `gs` filesystem implementation to
# GoogleHadoopFileSystem, enables service-account auth with a JSON key file,
# and sets driver/executor memory.
spark = (
    SparkSession.builder
    .appName("Edge Network Tweets")
    .config("spark.jars", "/home/jovyan/work/gcs-connector-hadoop2-latest.jar")
    .config("spark.hadoop.fs.gs.impl", "com.google.cloud.hadoop.fs.gcs.GoogleHadoopFileSystem")
    .config("spark.hadoop.google.cloud.auth.service.account.enable", "true")
    .config("spark.hadoop.google.cloud.auth.service.account.json.keyfile", "/home/jovyan/work/key.json")
    .config("spark.driver.memory", "8g")
    .config("spark.executor.memory", "1g")
    .getOrCreate()
)

In [2]:
# Alternative input kept for reference (apparently a single-day extract):
# tweets = spark.read.parquet('gs://spain-tweets/rehydrated/tweets-2017-10-16')

# Value of the `month` column to keep from the rehydrated lake.
month = '201806'
tweets = (
    spark.read
    .parquet('gs://spain-tweets/rehydrated/lake')
    .where(f'month = {month}')
)

In [None]:
# Register the DataFrame as the SQL view `tweets` so spark.sql queries can
# refer to it by name.
tweets.createOrReplaceTempView('tweets')
# Print the schema for reference.
tweets.printSchema()

In [6]:
from collections import Counter
from itertools import permutations

import networkx as nx
from langdetect import detect_langs
from langdetect.lang_detect_exception import LangDetectException

In [7]:
def confident_lang(text, min_prob=0.75):
    """Return the language code langdetect is confident about, or None.

    Args:
        text: Tweet text to classify. None or empty text yields None.
        min_prob: The top candidate's probability must be strictly greater
            than this value for its code to be returned (default 0.75).

    Returns:
        The `lang` of the most probable candidate from `detect_langs`, or
        None when the text is missing, the detector raises
        LangDetectException, or the top probability is not above `min_prob`.

    NOTE: langdetect is non-deterministic by default; set
    `DetectorFactory.seed` if reproducible counts are required.
    """
    # A null/empty text would otherwise reach the detector, whose only handled
    # failure here is LangDetectException; short-circuit instead.
    if not text:
        return None

    try:
        top = detect_langs(text)[0]
    except LangDetectException:
        return None

    # The former `elif top.lang == 'cat' or top.lang == 'es'` branch returned
    # None exactly like the fall-through, so a single conditional is equivalent.
    return top.lang if top.prob > min_prob else None

In [7]:
# One row per user with more than one retweet: `retweets` collects a
# (text, id_str, user) struct for each retweeted status of that user.
# Adjacent string literals concatenate into the same single-line query.
df = spark.sql(
    'with x as ('
    'with t as ('
    'select rehydrated.id_str as id_str, '
    'rehydrated.user.id_str as user, '
    'struct(rehydrated.retweeted_status.text as text, '
    'rehydrated.retweeted_status.id_str, '
    'rehydrated.retweeted_status.user.id_str as user) as retweeted_status '
    'from tweets '
    'where rehydrated.retweeted_status is not null) '
    'select user, collect_list(retweeted_status) as retweets '
    'from t group by user) '
    'select * from x where size(retweets) > 1'
)

In [8]:
# Number of users whose collected retweet list has more than one entry.
df.count()

25826

In [9]:
# Show the per-user schema: `user` plus an array of retweet structs.
df.printSchema()

root
 |-- user: string (nullable = true)
 |-- retweets: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- text: string (nullable = true)
 |    |    |-- id_str: string (nullable = true)
 |    |    |-- user: string (nullable = true)



In [10]:
def detect_and_filter(retweets):
    """Tag each retweet with its detected language, dropping unclassified ones.

    Args:
        retweets: Iterable of dict-like retweets exposing a 'text' entry.

    Returns:
        A list, in input order, of copies of the retweets for which
        `confident_lang` returned a language, each with an added 'lang' key.
    """
    # Classify everything first, then keep only confident results.
    detected = [confident_lang(tweet['text']) for tweet in retweets]

    kept = []
    for lang, tweet in zip(detected, retweets):
        if lang is not None:
            kept.append({**tweet, 'lang': lang})
    return kept

In [11]:
# Convert each row to a plain (recursively converted) dict, keep only the
# retweets with a confidently detected language, and drop users left with
# fewer than two of them. Cached because several later cells reuse it.
dat = (
    df.rdd
    .map(lambda row: row.asDict(True))
    .map(lambda user_row: {**user_row, 'retweets': detect_and_filter(user_row['retweets'])})
    .filter(lambda user_row: len(user_row['retweets']) > 1)
    .cache()
)

In [12]:
# Earlier result kept for comparison - presumably month 201710 on a 1% sample
# gave 48518 (TODO confirm what the 1% refers to).
# Users left with more than one language-confident retweet.

dat.count()

22812

In [13]:
# Invert the per-user records into per-retweeted-tweet records: key by the
# retweet's id_str, merge the user lists and counts, keep tweets seen more
# than once, and flatten to {<retweet fields>, 'users', 'count'}.
# NOTE(review): this rebinds `tweets`, which earlier held the source DataFrame.
tweets = (
    dat
    .flatMap(lambda user_row: [
        (rt['id_str'], {'retweet': rt, 'users': [user_row['user']], 'count': 1})
        for rt in user_row['retweets']
    ])
    .reduceByKey(lambda left, right: {
        **left,
        'users': left['users'] + right['users'],
        'count': left['count'] + right['count'],
    })
    .map(lambda keyed: keyed[1])
    .filter(lambda rec: rec['count'] > 1)
    .map(lambda rec: {**rec['retweet'], 'users': rec['users'], 'count': rec['count']})
    .cache()
)

In [14]:
# Bring the shared-tweet records to the driver and index their ids.
tweet_nodes = tweets.collect()
tweet_ids = {node['id_str'] for node in tweet_nodes}

# For every user, emit each ordered pair of retweeted ids, keep pairs whose
# two ends are both shared tweets, and count how often each pair occurs.
tweet_edges = (
    dat
    .flatMap(lambda user_row: permutations([rt['id_str'] for rt in user_row['retweets']], 2))
    .filter(lambda pair: pair[0] in tweet_ids and pair[1] in tweet_ids)
    .map(lambda pair: (pair, 1))
    .reduceByKey(lambda x, y: x + y)
    .collect()
)

In [15]:
# Number of distinct ordered (tweet id, tweet id) pairs.
len(tweet_edges)

14895

In [16]:
# Number of retweeted tweets whose merged count is greater than one.
len(tweet_nodes)

5403

In [58]:
def pmf(items):
    """Return the relative frequency of each distinct item.

    Args:
        items: Iterable of hashable values.

    Returns:
        Dict mapping each distinct item (in first-seen order) to its share of
        the total, so the values sum to 1. Empty input yields an empty dict.
    """
    counts = Counter(items)
    total = sum(counts.values())
    # With no items the comprehension is empty, so there is no division by zero.
    return {item: n / total for item, n in counts.items()}

def user_lang(di, priority_lang='ca', priority_threshold=0.10):
    """Pick a single language label from a language -> share mapping.

    Args:
        di: Dict mapping language code to its share (e.g. the output of `pmf`).
        priority_lang: Language that wins outright once its share exceeds
            `priority_threshold` (default 'ca').
        priority_threshold: Share `priority_lang` must strictly exceed to be
            chosen regardless of the other languages (default 0.10).

    Returns:
        `priority_lang` if its share is above the threshold; otherwise the
        language with the largest positive share (first one on ties); None if
        the mapping is empty or has no positive share.
    """
    if di.get(priority_lang, 0) > priority_threshold:
        return priority_lang

    # max() returns the first of tied entries; the (None, 0) default and the
    # `> 0` check give None for an empty or all-non-positive mapping.
    lang, share = max(di.items(), key=lambda kv: kv[1], default=(None, 0))
    return lang if share > 0 else None

# One node per user appearing on a shared tweet: gather the languages of the
# shared tweets that list the user, turn them into a distribution with `pmf`,
# and label the user with `user_lang`.
user_nodes = (
    tweets
    .flatMap(lambda tweet: [(uid, {'langs': [tweet['lang']], 'count': 1}) for uid in tweet['users']])
    .reduceByKey(lambda left, right: {
        'langs': left['langs'] + right['langs'],
        'count': left['count'] + right['count'],
    })
    .map(lambda keyed: {'user': keyed[0], 'langs': keyed[1]['langs'], 'count': keyed[1]['count']})
    .map(lambda node: {**node, 'langs': pmf(node['langs'])})
    .map(lambda node: {**node, 'lang': user_lang(node['langs'])})
    .collect()
)

# Every ordered pair of users listed on the same shared tweet, with the number
# of times that pair occurs across all shared tweets.
user_edges = (
    tweets
    .flatMap(lambda tweet: permutations(tweet['users'], 2))
    .map(lambda pair: (pair, 1))
    .reduceByKey(lambda x, y: x + y)
    .collect()
)

In [59]:
# Number of distinct users listed on at least one shared tweet.
len(user_nodes)

37424

In [60]:
# Number of distinct ordered (user, user) pairs.
len(user_edges)

1275139

In [63]:
nodes = user_nodes
edges = user_edges

# Create the graph in this cell: no visible cell defines `G`, so on a fresh
# kernel the loops below would raise NameError. Rebuilding it here also makes
# the cell safe to re-run.
# NOTE(review): an undirected Graph is assumed. The edge list comes from
# permutations, so (a, b) and (b, a) carry the same weight and nothing is lost;
# switch to nx.DiGraph() if both directions must be kept as separate edges.
G = nx.Graph()

for n in nodes:
    G.add_node(n['user'], lang=n['lang'], count=n['count'])

for edge, weight in edges:
    G.add_edge(*edge, weight=weight)

In [66]:
# Write the user graph as GraphML to a path relative to the working directory.
nx.write_graphml(G, './usergraph.graphml')

In [64]:
# Size of the graph as reported by len(); the recorded value equals len(user_nodes).
len(G)

37424

In [None]:
# NOTE(review): `pairs` is not defined in any visible cell, so unless it is
# created elsewhere this raises NameError on a fresh kernel - define it or
# remove this scratch cell.
pairs.take(10)

In [52]:


# Sanity check: ordered id_str pairs generated from one user's retweets.
# `dat` is an RDD and cannot be indexed, so fetch a record with .first().
list(permutations([r['id_str'] for r in dat.first()['retweets']], 2))

[('914936500504158208', '914845655507591168'),
 ('914936500504158208', '922204986792177664'),
 ('914845655507591168', '914936500504158208'),
 ('914845655507591168', '922204986792177664'),
 ('922204986792177664', '914936500504158208'),
 ('922204986792177664', '914845655507591168')]

In [45]:
# Total number of retweets kept across all users. `dat` is an RDD and cannot
# be iterated on the driver, so sum the per-user list lengths in Spark.
dat.map(lambda u: len(u['retweets'])).sum()

162205