# Correlation experiment


In [8]:
from pyspark.sql import SparkSession
import json

In [2]:
# Build (or reuse) the SparkSession for this experiment on the standalone master.
#
# Each setting is given exactly once. Earlier revisions set
# spark.dynamicAllocation.enabled and spark.shuffle.service.enabled twice with
# contradictory values (enabled first, "false" last); a later .config() call for
# the same key replaces the earlier one, so only the final "false" values ever
# took effect. Those effective values are what is configured below.
spark_session = (
    SparkSession.builder
    .master("spark://sp-master:7077")
    .appName("correlation_experiment")
    # Dynamic allocation is off; the two related settings below are kept so
    # that switching it back on is a one-line change.
    .config("spark.dynamicAllocation.enabled", "false")
    .config("spark.dynamicAllocation.shuffleTracking.enabled", True)
    .config("spark.dynamicAllocation.executorIdleTimeout", "300s")
    .config("spark.shuffle.service.enabled", "false")
    # Executor sizing.
    .config("spark.executor.memory", "2g")
    .config("spark.executor.cores", 2)
    # Fixed driver-side ports.
    .config("spark.driver.port", 9998)
    .config("spark.blockManager.port", 10005)
    .getOrCreate()
)

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


23/03/07 12:44:42 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


# Load data from HDFS

In [38]:
# Read the Reddit comment file from HDFS as an RDD with one string per input line.
reddit_comments_path = "hdfs://sp-master:9000/reddit_comment_data/RC_2011-01_small.json"
json_lines = spark_session.sparkContext.textFile(reddit_comments_path)

                                                                                

In [50]:
# Peek at the first raw line to check the input format (one JSON document per line).
json_lines.take(1)

['{"archived":true,"downs":0,"link_id":"t3_etyqc","score_hidden":false,"id":"c1b06fp","author_flair_css_class":null,"body":"They should add that to the instructions on the box :p","ups":1,"distinguished":null,"gilded":0,"edited":false,"retrieved_on":1426664469,"parent_id":"t1_c1azvxa","created_utc":"1293840000","subreddit":"sex","controversiality":0,"author_flair_text":null,"score":1,"name":"t1_c1b06fp","author":"SandRider","subreddit_id":"t5_2qh3p"}']

# Convert each line to JSON/Dictionary objects

In [51]:
# Decode every raw line into a Python dict; json.loads itself serves as the mapper.
json_objs = json_lines.map(json.loads)

In [52]:
# Inspect the first parsed record to see which fields are available.
json_objs.take(1)

[{'archived': True,
  'downs': 0,
  'link_id': 't3_etyqc',
  'score_hidden': False,
  'id': 'c1b06fp',
  'author_flair_css_class': None,
  'body': 'They should add that to the instructions on the box :p',
  'ups': 1,
  'distinguished': None,
  'gilded': 0,
  'edited': False,
  'retrieved_on': 1426664469,
  'parent_id': 't1_c1azvxa',
  'created_utc': '1293840000',
  'subreddit': 'sex',
  'controversiality': 0,
  'author_flair_text': None,
  'score': 1,
  'name': 't1_c1b06fp',
  'author': 'SandRider',
  'subreddit_id': 't5_2qh3p'}]

# Extract subreddit and author

In [162]:
# Reduce each comment to a (subreddit, author) pair, then drop the pairs whose
# author is the "[deleted]" placeholder.
subreddit_and_author = (
    json_objs
    .map(lambda comment: (comment["subreddit"], comment["author"]))
    .filter(lambda pair: pair[1] != "[deleted]")
)

In [163]:
# Show the first 20 (subreddit, author) pairs as a sanity check.
subreddit_and_author.take(20)

[('sex', 'SandRider'),
 ('relationship_advice', 'throwaway-o'),
 ('DebateAChristian', 'Basilides'),
 ('scifi', 'zachm'),
 ('Seattle', 'BarbieDreamHearse'),
 ('google', 'eroq'),
 ('gaming', 'ramp_tram'),
 ('gaming', 'RevLoki'),
 ('lists', 'xsvfan'),
 ('atheism', 'Helen_A_Handbasket'),
 ('funny', 'lanedek'),
 ('politics', 'mothereffingteresa'),
 ('netsec', 'grutz'),
 ('gaming', 'MainlandX'),
 ('Art', 'fricken'),
 ('techsupport', 'megadert'),
 ('beer', 'DamnJester'),
 ('funny', 'cole1114'),
 ('funny', 'broken189'),
 ('WTF', 'pi_over_3')]

# Group authors by subreddit

In [164]:
# Collect, for every subreddit, the list of authors who commented in it.
#
# The input already consists of (subreddit, author) pairs, so groupByKey()
# groups the authors directly. The former groupBy(lambda sa: sa[0]) sent each
# whole pair through the shuffle and needed two extra map() passes to strip the
# subreddit back out of the grouped values.
#
# An author appears once per comment, so the lists may contain repeats.
grouped = subreddit_and_author.groupByKey().mapValues(list)

In [166]:
# Inspect one (subreddit, [authors]) entry; repeated authors are still present here.
grouped.take(1)

[('relationship_advice', ['throwaway-o', 'throwaway-o', 'throwaway-o'])]

# Remove duplicate authors

In [167]:
# Deduplicate each subreddit's author list. dict.fromkeys keeps the first
# occurrence of every author, so the original ordering is preserved.
grouped_authors = grouped.mapValues(lambda authors: list(dict.fromkeys(authors)))

In [172]:
grouped_authors.take(1)

[('relationship_advice', ['throwaway-o'])]

# Compare subreddit users to all other subreddits and find out which subreddits have the most users in common

Data format: (subreddit1, subreddit2, number_of_shared_authors), e.g. (subreddit1, subreddit2, 1000)

In [206]:
# Order subreddits by their number of distinct authors, largest first.
sorted_by_popularity = grouped_authors.sortBy(
    keyfunc=lambda entry: len(entry[1]),
    ascending=False,
)
# NOTE: this binds a second name to the very same RDD object; no data is copied.
sorted_by_popularity_copy = sorted_by_popularity

In [212]:
# Fetch the first two entries of the sorted RDD and display the second one
# (index 1), i.e. the subreddit ranked second by number of distinct authors.
sorted_by_popularity.take(2)[1]

('AskReddit',
 ['Peritract',
  'Pooh_Bear',
  'nannerpus',
  'swaggerstagger',
  'armchairnixon',
  'heyfella',
  'rockychunk',
  'oniTony',
  'monsterburg',
  'Pudd1nPants',
  'lostelf2207',
  'peno_asslace',
  'handsolo',
  'midgeness',
  'Pyorrhea',
  'mobeat-rice',
  'lucasritting',
  'CarolinaDO',
  'insomniafox',
  'osama_bin_awesome',
  'Ares__',
  'alettuce',
  'elshizzo',
  'coffeeisblack',
  'spaghettios',
  'MyNameIsOhm',
  'barefoot_yank',
  'zerbey',
  'garugaga',
  'KibblesnBitts',
  'MCA2142',
  'rirj',
  'Kancho_Ninja',
  'talonverdugo',
  'subtonix',
  'bigtallsob',
  'Marowak',
  'dreamleaking',
  'stargaze',
  'uhadmeathello',
  'DustyDGAF',
  'rhambling',
  'gibson85',
  'marshmallowlolita',
  'hereboy',
  'Helen_A_Handbasket',
  'kylepm',
  'fm909',
  'DomesticatedCreeper',
  'halright',
  'drinktobones',
  'Malabo',
  'batsignal_to_mars',
  'tjfcu'])

In [None]:
# TODO: for every subreddit, pair it with each other subreddit and count the authors they share, e.g.:
# sorted_by_popularity.map(lambda sl: (sl[0], other_subreddit, count_common_users(list1, list2)))