In [43]:
import numpy as np
import seaborn as sns; sns.set()
%matplotlib notebook
import matplotlib.pyplot as plt
import pandas as pd
from os.path import join
import ast

%load_ext autoreload
%autoreload 2


path_root = '/mnt/TERA/Data/reddit_topics'
path_data = join(path_root, 'safe_links_all')



The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# Data mangling:

## Load, filter and save as CSV:

In [44]:
image_types = ['.jpg', '.png']
df = []
with open(path_data, 'r') as f:
    for n, line in enumerate(f):
        #print(f'\r{n}', end='')
        try:
            lst = ast.literal_eval(line)  # [subreddit, submission title, submitted link, comments link, short name]
        except ValueError:
            #print('ValueError')
            continue
        if any([x in lst[2] for x in image_types]): 
            df.append(lst)
        if len(df) > 10000:  # testing with a lot smaller dataset first
            break
            
df = pd.DataFrame(df)
df.columns = ['subreddit', 'submission_title', 'submission_link', 'comments_link', 'short_name']
df.head()

# Save as CSV:
df.to_csv(join(path_root, 'img_reddits.csv'), index=False)

In [45]:
df.head()

Unnamed: 0,subreddit,submission_title,submission_link,comments_link,short_name
0,funny,ITT: Things you hate that everyone else likes....,http://i.imgur.com/xvCP4.jpg,/r/funny/comments/eut3m/itt_things_you_hate_th...,t3_eut3m
1,WTF,This is the picture they're using to recruit p...,http://i.imgur.com/QDmzn.jpg,/r/WTF/comments/eut3k/this_is_the_picture_they...,t3_eut3k
2,funny,The Businessman Game - search Google Images fo...,http://www.customs.govt.nz/NR/rdonlyres/8F5ECF...,/r/funny/comments/eut35/the_businessman_game_s...,t3_eut35
3,fffffffuuuuuuuuuuuu,"two people, one shitter...",http://i.imgur.com/50VPz.png,/r/fffffffuuuuuuuuuuuu/comments/eut2n/two_peop...,t3_eut2n
4,wow,Wife is in bed early....CRAPFUCKSHIT...AAAAAAA...,http://i.imgur.com/94ZXF.jpg,/r/wow/comments/eut2j/wife_is_in_bed_earlycrap...,t3_eut2j


## Lemmatize:

In [69]:
df = pd.read_csv(join(path_root, 'img_reddits.csv'))

df = df[['subreddit', 'submission_title']]
df.head()

Unnamed: 0,subreddit,submission_title
0,funny,ITT: Things you hate that everyone else likes....
1,WTF,This is the picture they're using to recruit p...
2,funny,The Businessman Game - search Google Images fo...
3,fffffffuuuuuuuuuuuu,"two people, one shitter..."
4,wow,Wife is in bed early....CRAPFUCKSHIT...AAAAAAA...


In [None]:
import spacy
nlp = spacy.load('en')

def lemmatizer(string):
    lst = []
    doc = nlp(string)
    for token in doc:
        if not token.is_stop and token.is_alpha and token.lemma_ != '-PRON-':  # TODO: fix, dirty!
            lst.append(token.lemma_)
            
    return lst

submission_titles = df['submission_title'].apply(lemmatizer)  # 1 min for 10k sentences!!
df['submission_title'] = submission_titles
df.head()

In [88]:
# only subreddits with > min_posts posts:

min_posts = 100

top_subreddits = df['subreddit'].loc[(df['subreddit'].value_counts() > min_posts).values].unique()
print(top_subreddits)

df_top = df.loc[df.subreddit.isin(top_subreddits)]
df_top.head()

['funny' 'WTF' 'fffffffuuuuuuuuuuuu' 'wow' 'AdviceAnimals' 'reddit.com'
 'pics']


Unnamed: 0,subreddit,submission_title
0,funny,"[itt, thing, hate, like, start]"
1,WTF,"[this, picture, recruit, police, brutality, ab..."
2,funny,"[the, businessman, game, search, google, image..."
3,fffffffuuuuuuuuuuuu,"[people, shitter]"
4,wow,"[wife, bed, early, crapfuckshit, aaaaaaa, sfw]"


In [162]:
# top N most common keywords per subreddit:
n_keywords = 10

top_kws = df_top.groupby('subreddit').sum()

def count_words(lst_of_strs, top_n=10):
    #print(lst_of_strs)
    word_counts = dict()
    for word in lst_of_strs:
        if word not in word_counts:
            word_counts[word] = 1
        else:
            word_counts[word] += 1
            
    # Sort:
    word_counts = {word: word_counts[word] for word in sorted(word_counts, key=word_counts.get, reverse=True)}
    
    # top_n:
    word_counts = {k: word_counts[k] for k in list(word_counts)[:top_n]}
            
    return word_counts

# test:
cnts = count_words(top_kws.loc['AdviceAnimals'])
cnts

TypeError: unhashable type: 'list'

In [155]:
cnts = count_words(top_kws.loc['funny'].values)
cnts

{'the': 33,
 'like': 29,
 'pic': 23,
 'know': 22,
 'this': 21,
 'dad': 20,
 'a': 19,
 'think': 19,
 'get': 19,
 'new': 18}

In [164]:
df_top['submission_title'].apply(count_words).loc[0]

{'itt': 1, 'thing': 1, 'hate': 1, 'like': 1, 'start': 1}

In [130]:
cnts

{'hood': 17,
 'dad': 17,
 'sap': 16,
 'the': 12,
 'oblivious': 11,
 'time': 10,
 'fbf': 9,
 'high': 9,
 'new': 9,
 'this': 8}

In [114]:
sorted(x.items(), key=lambda kv: kv[1])

['hood',
 'dad',
 'sap',
 'the',
 'oblivious',
 'time',
 'fbf',
 'high',
 'new',
 'this',
 'paranoid',
 'pp',
 'bad',
 'parrot',
 'know',
 'mom',
 'son',
 'yo',
 'frog',
 'philosoraptor',
 'think',
 'way',
 'soccer',
 'clean',
 'bachelor',
 'grader',
 'lonely',
 'pug',
 'say',
 'wolf',
 'penguin',
 'happen',
 'let',
 'bro',
 'all',
 'successful',
 'asian',
 'father',
 'foul',
 'year',
 'good',
 'hot',
 'hygiene',
 'musically',
 'awkward',
 'girl',
 'get',
 'friend',
 'school',
 'see',
 'socially',
 'do',
 'talk',
 'influence',
 'make',
 'take',
 'go',
 'what',
 'every',
 'morning',
 'day',
 'workaholic',
 'chick',
 'love',
 'no',
 'one',
 'store',
 'at',
 'like',
 'to',
 'pay',
 'nihilism',
 'narwhal',
 'need',
 'why',
 'courage',
 'not',
 'phone',
 'man',
 'create',
 'gaming',
 'gopher',
 'trouble',
 'fuck',
 'obvious',
 'use',
 'image',
 'niger',
 'just',
 'case',
 'musician',
 'window',
 'bitch',
 'finish',
 'dinner',
 'happy',
 'expectations',
 'fix',
 'mind',
 'attempt',
 'imma',


In [123]:
top_kws

Unnamed: 0_level_0,submission_title
subreddit,Unnamed: 1_level_1
AdviceAnimals,"[gaming, gopher, trouble, horror, games, fuck,..."
WTF,"[this, picture, recruit, police, brutality, ab..."
fffffffuuuuuuuuuuuu,"[people, shitter, how, lose, weight, butter, e..."
funny,"[itt, thing, hate, like, start, the, businessm..."
pics,"[if, die, today, feel, satisfied, drunk, sovie..."
reddit.com,"[wife, bed, early, crapfuckshit, aaaaaaa, sfw,..."
wow,"[wife, bed, early, crapfuckshit, aaaaaaa, sfw,..."


In [15]:
import spacy
nlp = spacy.load('en')

doc = nlp(u"Apples and oranges are similar. Boots and hippos aren't.")

for token in doc:
    print(token, token.lemma, token.lemma_, token.is_stop)

Apples 8566208034543834098 apple False
and 2283656566040971221 and True
oranges 2208928596161743350 orange False
are 10382539506755952630 be True
similar 18166476740537071113 similar False
. 12646065887601541794 . False
Boots 9918665227421442029 boot False
and 2283656566040971221 and True
hippos 6542994350242320795 hippo False
are 10382539506755952630 be True
n't 447765159362469301 not False
. 12646065887601541794 . False


In [13]:
import en_core_web_sm
nlp = en_core_web_sm.load()

ModuleNotFoundError: No module named 'en_core_web_sm'