In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib_inline
import re
from io import StringIO
from html.parser import HTMLParser
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity



## Pandas Result Config

In [75]:
pd.set_option('display.max_colwidth', None)
# pd.reset_option('all')

## Read in Reddit Data

In [12]:
df = pd.read_csv("C:/Users/Tymon/Documents/UC Berkely School Work/Scratch/data/RS_2018-02.csv")

In [13]:
df.head()

Unnamed: 0.1,Unnamed: 0,author,subreddit,selftext,title
0,0,CalligraBot,Calligraphy,# Word of the day: Astringent\n\nSynonyms: Sul...,"Word of the Day - January 31, 2018 - Astringent"
1,1,unethicaldecisions,gradadmissions,Looking at debt and risk/benefit of my current...,Applying to MA in Econ
2,2,00Noir,mysticmessenger,Hey there! Thought it would be fun to see what...,Favourite CGs?
3,3,Svart_Drage,LearningPhotoshop,"I did this for my own Youtube channel, sizes a...",My first try at Photoshop.
4,4,storky0613,sewing,"Imgur wonâ€™t let me post a picture right now, s...",Easiest way to sew a monogram to a blanket?


In [14]:
print(len(df))

3451272


## Data Cleaning and Preparation

#### Initial filters

In [15]:
# Filter out posts that have been removed
df = df.query('selftext != "[removed]"')
print(len(df))

2580069


In [16]:
# Filter out posts that have null content
df = df[df.selftext.notnull()]
print(len(df))

2580065


In [18]:
# try and filter out bots
df = df[~df.selftext.str.contains("bot")]
print(len(df))

2365932


#### Strip title and content of html and formatting characters

In [20]:
# strip content of html tags, html entities, and formatting characters

class MLStripper(HTMLParser):
    def __init__(self):
        super().__init__()
        self.reset()
        self.strict = False
        self.convert_charrefs= True
        self.text = StringIO()
    def handle_data(self, d):
        self.text.write(d)
    def get_data(self):
        return self.text.getvalue()

def strip_tags(html):
    s = MLStripper()
    s.feed(html)
    return s.get_data()

def strip(body):
    cleaned = ''.join(s for s in strip_tags(body) if ord(s)>31 and ord(s)<126).replace('&nbsp;', ' ')
    return cleaned

df['selftext_cleaned'] = df.apply(lambda row : strip(row['selftext']), axis = 1)

In [23]:
# strip title of html tags, html entities, and formatting characters
df['title_cleaned'] = df.apply(lambda row : strip(row['title']), axis = 1)

In [30]:
df.query('author == "unethicaldecisions" & subreddit == "gradadmissions"')

Unnamed: 0.1,Unnamed: 0,author,subreddit,selftext,title,selftext_cleaned,title_cleaned
1,1,unethicaldecisions,gradadmissions,"Looking at debt and risk/benefit of my current degree, I'm thinking of taking my Biochemistry degree to my hometown state school for Econ. I'm good at math-better than memorizing or cramming. I have a 2.8 GPA, but A's in some hard chemistry courses and a B in Calculus. Looking at taking Cal 2 in summer, macroeconomics and Cal 3 in fall, and then going in the spring time since I'm getting into my late twenties. No business job experience. Math tutor experience. I think fast in math. I'm looking toward an analytical/consulting job with potential to climb up the latter immediately after graduation. \n\nHas anyone got in with a sub 3.0 GPA?",Applying to MA in Econ,"Looking at debt and risk/benefit of my current degree, I'm thinking of taking my Biochemistry degree to my hometown state school for Econ. I'm good at math-better than memorizing or cramming. I have a 2.8 GPA, but A's in some hard chemistry courses and a B in Calculus. Looking at taking Cal 2 in summer, macroeconomics and Cal 3 in fall, and then going in the spring time since I'm getting into my late twenties. No business job experience. Math tutor experience. I think fast in math. I'm looking toward an analytical/consulting job with potential to climb up the latter immediately after graduation. Has anyone got in with a sub 3.0 GPA?",Applying to MA in Econ


#### Filter out irrelevant examples based on content and title word counts

In [34]:
# add title_cleaned word count

title_length = []
for i, title in enumerate(df['title_cleaned']):
    if type(title) != type(str(title)):
        title_length.append(0)
    else:
        title_split = title.split()
        title_length.append(len(title_split))
        
df['title_length'] = title_length

In [36]:
# add selftext_cleaned word count

content_length = []
for i, content in enumerate(df['selftext_cleaned']):
    if type(content) != type(str(content)):
        content_length.append(0)
    else:
        content_split = content.split()
        content_length.append(len(content_split))
        
df['content_length'] = content_length

In [46]:
# filter out 0 counts for title lenth or content length
df = df.query('title_length != 0 & content_length != 0')
print(len(df))

2361857


In [56]:
# filter out content that is less than 9 words in length
df = df.query('content_length > 10')
print(len(df))

1977660


In [58]:
df['author'].value_counts()

removalbot             25278
AutoModerator          23976
[deleted]              15400
censorship_notifier    11890
fiplefip               11657
                       ...  
Throwawaypanic132          1
trillion-8                 1
Dmadtitan                  1
itisjustimpossible         1
QualityLeaf                1
Name: author, Length: 983580, dtype: int64

#### Add cosine similarty scores

In [53]:
def create_dataframe(matrix, tokens):

    doc_names = [f'doc_{i+1}' for i, _ in enumerate(matrix)]
    df = pd.DataFrame(data=matrix, index=doc_names, columns=tokens)
    return(df)

def cosine_sim(x):
    ''' Returns cosine similarity for row in training data'''
    
    data = [x.title, x.selftext]
    
    try:
        vector_matrix = TfidfVectorizer().fit_transform(data)
    
    except:
        return 'check'
    
    return cosine_similarity(vector_matrix)[0,1]

In [54]:
data = [df.iloc[0].title_cleaned, df.iloc[0].selftext_cleaned]

Tfidf_vect = TfidfVectorizer()
vector_matrix = Tfidf_vect.fit_transform(data)

tokens = Tfidf_vect.get_feature_names()
create_dataframe(vector_matrix.toarray(),tokens)

cosine_similarity_matrix = cosine_similarity(vector_matrix)
create_dataframe(cosine_similarity_matrix, ['title_cleaned', 'selftext_cleaned'])

df['cosine_sim'] = df.apply(cosine_sim, axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['cosine_sim'] = df.apply(cosine_sim, axis=1)


In [62]:
# remove check values in cosine sim column and convert to float datatype

df = df.query('cosine_sim != "check"')
df["cosine_sim"] = df["cosine_sim"].astype(float)
print(len(df))

1977636


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["cosine_sim"] = df["cosine_sim"].astype(float)


In [69]:
df = df.rename(columns={'Unnamed: 0': 'index'})

Unnamed: 0,index,author,subreddit,selftext,title,selftext_cleaned,title_cleaned,title_length,content_length,cosine_sim
1,1,unethicaldecisions,gradadmissions,Looking at debt and risk/benefit of my current...,Applying to MA in Econ,Looking at debt and risk/benefit of my current...,Applying to MA in Econ,5,112,0.202659
2,2,00Noir,mysticmessenger,Hey there! Thought it would be fun to see what...,Favourite CGs?,Hey there! Thought it would be fun to see what...,Favourite CGs?,2,38,0.014471
3,3,Svart_Drage,LearningPhotoshop,"I did this for my own Youtube channel, sizes a...",My first try at Photoshop.,"I did this for my own Youtube channel, sizes a...",My first try at Photoshop.,5,40,0.032302
4,4,storky0613,sewing,"Imgur wonâ€™t let me post a picture right now, s...",Easiest way to sew a monogram to a blanket?,"Imgur wont let me post a picture right now, so...",Easiest way to sew a monogram to a blanket?,9,167,0.220658
6,6,ranalog,analog,This thread is for you to promote your blog / ...,Monthly 'Self Promotion' - February,This thread is for you to promote your blog / ...,Monthly 'Self Promotion' - February,5,54,0.026446
...,...,...,...,...,...,...,...,...,...,...
3451264,3451264,GeoffreyYeung,paydaytheheist,It's hard to find the mask you just got and wa...,"Why can't we search in mask selection, when th...",It's hard to find the mask you just got and wa...,"Why can't we search in mask selection, when th...",15,17,0.056210
3451266,3451266,KhmerBoiMinji,talesoftherays,"Havenâ€™t played since last September, is it wor...",Should I get back into this,"Havent played since last September, is it wort...",Should I get back into this,6,20,0.000000
3451267,3451267,JackDalgren,smallbusiness,There was a thread here that was talking about...,There was a discussion about a book recently.....,There was a thread here that was talking about...,There was a discussion about a book recently.....,14,43,0.322088
3451268,3451268,senpizzle,buildapc,"Hey, all! With FFXV (93 GB) coming out soon, I...",Storage Upgrade- RAID or New Letter?,"Hey, all! With FFXV (93 GB) coming out soon, I...",Storage Upgrade- RAID or New Letter?,6,64,0.168159


In [95]:
df.query('cosine_sim < 0.1 & title_length > 2 & index > 50')

Unnamed: 0,index,author,subreddit,selftext,title,selftext_cleaned,title_cleaned,title_length,content_length,cosine_sim
54,54,Irulantk,HPfanfiction,"I usually try to write them up just to get them out of my head, but never share them. Though I really don't want to write anything but focus on my story at the moment. Currently I'm imagining Fawkes being owned but some semi-friendly wizard hermit, then poofing to Ollivander to give him two more feathers and Ollivander saying 'oh god not again.'",What do you do with your useless plot bunnies?,"I usually try to write them up just to get them out of my head, but never share them. Though I really don't want to write anything but focus on my story at the moment. Currently I'm imagining Fawkes being owned but some semi-friendly wizard hermit, then poofing to Ollivander to give him two more feathers and Ollivander saying 'oh god not again.'",What do you do with your useless plot bunnies?,9,63,0.000000
55,55,joeignition,FocusST,"Anyone else bend one of these before? Caught a nasty pothole (thanks, Baltimore) and ended up with an undamaged tire, but a bent rim.\n\nAre they delicate snowflakes or was it just that brutal? In my 32 years of driving, haven't had this happen before.",Snowflake rims fragile?,"Anyone else bend one of these before? Caught a nasty pothole (thanks, Baltimore) and ended up with an undamaged tire, but a bent rim.Are they delicate snowflakes or was it just that brutal? In my 32 years of driving, haven't had this happen before.",Snowflake rims fragile?,3,44,0.000000
59,59,MassieJako,EA_NHL,"Edit* NHL, not football ðŸ˜„\n\nI need someone to co-op with me to get Carey Price. Can someone write their gamertag for me to add? \n\nXbox One. \n\nItâ€™ll only take a few minutes of your time to help out a Danish hockey fan :)\n\nGreatly appreciated",A helper for NFL Threes?,"Edit* NHL, not football I need someone to co-op with me to get Carey Price. Can someone write their gamertag for me to add? Xbox One. Itll only take a few minutes of your time to help out a Danish hockey fan :)Greatly appreciated",A helper for NFL Threes?,5,44,0.035346
64,64,durgadurgadurg,skiutah,"Hey, my friends and I are planning to spend a week in Utah for the second week of March. We usually rent a house and stay a week in Eden but the snow report from Powmow looks really thin. Would staying near Little/Big Cottonwood be a better bet for snow? Thanks!!","Coming to Utah mid-March, Alta/Snowbird or PowMow?","Hey, my friends and I are planning to spend a week in Utah for the second week of March. We usually rent a house and stay a week in Eden but the snow report from Powmow looks really thin. Would staying near Little/Big Cottonwood be a better bet for snow? Thanks!!","Coming to Utah mid-March, Alta/Snowbird or PowMow?",7,51,0.097801
65,65,stormshieldonebot,FortNiteBR,**Weekly Cosmetics**\n\n* [Funk Ops](https://stormshield.one/images/items/cid_038_athena_commando_m_disco.png) @ 1500v\n\n* [Disco Brawl](https://stormshield.one/images/items/pickaxe_id_016_disco.png) @ 1500v\n\n**Daily Cosmetics**\n\n* [Snow Squall](https://stormshield.one/images/items/glider_id_006_wintercamo.png) @ 500v\n\n* [Survival Specialist](https://stormshield.one/images/items/cid_027_athena_commando_f.png) @ 1200v\n\n* [Fresh](https://stormshield.one/images/items/eid_fresh.png) @ 800v\n\n* [Ice Breaker](https://stormshield.one/images/items/pickaxe_id_014_wintercamo.png) @ 500v\n\n* [Infiltrator](https://stormshield.one/images/items/cid_019_athena_commando_m.png) @ 1200v\n\n* [Roadtrip](https://stormshield.one/images/items/glider_roadtrip.png) @ 500v\n\nSee all of today's appearances on [Storm Shield One](https://stormshield.one/pvp/sales),Daily Cosmetic Sales (01 Feb),**Weekly Cosmetics*** [Funk Ops](https://stormshield.one/images/items/cid_038_athena_commando_m_disco.png) @ 1500v* [Disco Brawl](https://stormshield.one/images/items/pickaxe_id_016_disco.png) @ 1500v**Daily Cosmetics*** [Snow Squall](https://stormshield.one/images/items/glider_id_006_wintercamo.png) @ 500v* [Survival Specialist](https://stormshield.one/images/items/cid_027_athena_commando_f.png) @ 1200v* [Fresh](https://stormshield.one/images/items/eid_fresh.png) @ 800v* [Ice Breaker](https://stormshield.one/images/items/pickaxe_id_014_wintercamo.png) @ 500v* [Infiltrator](https://stormshield.one/images/items/cid_019_athena_commando_m.png) @ 1200v* [Roadtrip](https://stormshield.one/images/items/glider_roadtrip.png) @ 500vSee all of today's appearances on [Storm Shield One](https://stormshield.one/pvp/sales),Daily Cosmetic Sales (01 Feb),5,40,0.022426
...,...,...,...,...,...,...,...,...,...,...
3451252,3451252,stoogesfan1,CountryOfReddit,"Any HoR member may submit a bill and/or amendment for voting on a simple majority decision if the VP is absent for an unnecessarily long period of time more than or equal to a 24 hour period.\n\n**The vote will now commence. Say ""Aye"" to indicate affirmative or ""Nay"" to indicate negative. You must express your reason for either in detail accompanying your vote. If you wish to abstain, you need not respond, but must state your reason for abstention if you do state your abstention.**",[Proposed Amendment] Government Efficiency Amendment,"Any HoR member may submit a bill and/or amendment for voting on a simple majority decision if the VP is absent for an unnecessarily long period of time more than or equal to a 24 hour period.**The vote will now commence. Say ""Aye"" to indicate affirmative or ""Nay"" to indicate negative. You must express your reason for either in detail accompanying your vote. If you wish to abstain, you need not respond, but must state your reason for abstention if you do state your abstention.**",[Proposed Amendment] Government Efficiency Amendment,5,85,0.035763
3451258,3451258,thankyoudaletech,Cumtown,"A gay man who is very strong and fast, knows several martial arts, expert marksman, infected with HIV",The most dangerous gay,"A gay man who is very strong and fast, knows several martial arts, expert marksman, infected with HIV",The most dangerous gay,4,18,0.066545
3451264,3451264,GeoffreyYeung,paydaytheheist,It's hard to find the mask you just got and want to check out. Overkill plz fix.,"Why can't we search in mask selection, when the function's already there in achievement selection?",It's hard to find the mask you just got and want to check out. Overkill plz fix.,"Why can't we search in mask selection, when the function's already there in achievement selection?",15,17,0.056210
3451266,3451266,KhmerBoiMinji,talesoftherays,"Havenâ€™t played since last September, is it worth trying to catch up with everything and after all the missed events?\n",Should I get back into this,"Havent played since last September, is it worth trying to catch up with everything and after all the missed events?",Should I get back into this,6,20,0.000000


In [48]:
# filter out content that is smaller than 150 words in length
# df = df.query('content_length >= 150')
# print(len(df))

Unnamed: 0.1,Unnamed: 0,author,subreddit,selftext,title,selftext_cleaned,title_cleaned,title_length,content_length
4,4,storky0613,sewing,"Imgur wonâ€™t let me post a picture right now, s...",Easiest way to sew a monogram to a blanket?,"Imgur wont let me post a picture right now, so...",Easiest way to sew a monogram to a blanket?,9,167
20,20,[deleted],dirtypenpals,Hey so I've been really enjoying the different...,F4M - MILF Celeb Slut,Hey so I've been really enjoying the different...,F4M - MILF Celeb Slut,5,238
26,26,autobuzzfeedbot,buzzfeedbot,1. Place a wooden spoon across the pot when co...,17 Cooking Tips Every One Should Know,1. Place a wooden spoon across the pot when co...,17 Cooking Tips Every One Should Know,7,216
28,28,_Horus_Lupercal_,Animesuggest,Sorry if that's not enough but basically I wat...,Watched it at least 12+ years ago had a scene ...,Sorry if that's not enough but basically I wat...,Watched it at least 12+ years ago had a scene ...,23,225
61,61,ThatHDNyman,whatsthatbook,There was a book I read while in school at som...,Fantasy book that had a cloned character with ...,There was a book I read while in school at som...,Fantasy book that had a cloned character with ...,18,212
...,...,...,...,...,...,...,...,...,...
3451233,3451233,masteroftehninja,RocketLeague,"As the title says, I've been experiencing some...",Rocket League Game Problems (Freezing and Majo...,"As the title says, I've been experiencing some...",Rocket League Game Problems (Freezing and Majo...,9,212
3451236,3451236,Shandog,AusLegal,Hello. I'm a renter and on Sunday a stranger ...,$200 noise complaint strata law - is this legal?,Hello. I'm a renter and on Sunday a stranger ...,$200 noise complaint strata law - is this legal?,9,218
3451237,3451237,skywalkinondeezhatrz,starwarsspeculation,I'm sure you're all sick of the Plagueis/Snoke...,Hux may have saved Snoke in TLJ...,I'm sure you're all sick of the Plagueis/Snoke...,Hux may have saved Snoke in TLJ...,7,293
3451238,3451238,dpbmac,malefashionadvice,"Hi /r/malefashionadvice,\n\n&amp;nbsp;\n\nI ju...",Cheap Briefcases,"Hi /r/malefashionadvice, I just got a position...",Cheap Briefcases,2,164
