In [1]:
subreddit_filename = 'archive/subreddit_info.csv'
main_dataset_filename = 'archive/rspct.tsv'

In [2]:
import re
import numpy as np
import pandas as pd
import scipy.stats
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/rmritik/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [3]:
sub_df = pd.read_csv(subreddit_filename)

In [4]:
sub_df.tail()

Unnamed: 0,subreddit,category_1,category_2,category_3,in_data,reason_for_exclusion
3389,HFY,writing/stories,sci-fi,,True,
3390,TalesFromYourServer,writing/stories,tech support,,False,fewer posts than r/talesfromtechsupport which ...
3391,talesfromtechsupport,writing/stories,tech support,,True,
3392,WayfarersPub,writing/stories,wayfarers pub,,True,
3393,Glitch_in_the_Matrix,writing/stories,weird,,False,too_broad


In [5]:
in_data_rows = sub_df[sub_df['in_data'].isin([True])]
print('Number of subreddits in main dataset:', in_data_rows['subreddit'].count())

Number of subreddits in main dataset: 1013


In [6]:
print('Number of distinct subreddit genres/categories in main dataset:', in_data_rows['category_1'].nunique())

Number of distinct subreddit genres/categories in main dataset: 39


In [7]:
import csv
main_df = pd.read_csv(main_dataset_filename, sep='\t', quoting=csv.QUOTE_NONE)
main_df.shape

(1013000, 4)

In [8]:
main_df.head()
main_df.tail()

Unnamed: 0,id,subreddit,title,selftext
1012995,5r9k4h,MSLGame,Is this months rebirth and dungeon astro's wor...,I looking on what to evo3 farm next and was ex...
1012996,6529fp,CrohnsDisease,I might need a Medical leave from grad school,Has anyone here ever needed a medical leave fr...
1012997,7tiyzx,HongKong,Police harassing ethnic minorities in Hong Kong,I thought I'd make this post so that more peop...
1012998,664ha3,yorku,SU EECS 2030 and EECS 2021 - need advice,"Hi, I just finished 1st year EECS courses and ..."
1012999,6ump0y,wine,What is the worse wine you ever had?,My worst wine was at a dinner party. My friend...


In [9]:
import re
from collections import defaultdict

genres = list(in_data_rows['category_1'].unique())
genre_to_sub = dict()
for genre in genres:
  rows = in_data_rows.loc[in_data_rows['category_1'] == genre]
  genre_to_sub[genre] = list(rows['subreddit'])

# Get frequencies of each subreddit using efficient pandas filtering.
frequencies = defaultdict(int)
for genre in genres:
  frequencies[genre] = len(main_df.loc[main_df['subreddit'].isin(genre_to_sub[genre])])

frequencies_df = pd.DataFrame([frequencies])
# frequencies_df.transpose().hist()
frequencies_df = frequencies_df.T

print(scipy.stats.describe(list(frequencies.values())))
print(frequencies_df)

DescribeResult(nobs=39, minmax=(5000, 100000), mean=25974.358974358973, variance=390446693.65722007, skewness=1.8081815563285653, kurtosis=3.5418945416204455)
                            0
advice/question         18000
animals                 17000
anime/manga             26000
appearance              11000
arts                    21000
autos                   20000
board_game               9000
books                   12000
card_game               15000
company/website         28000
crypto                  29000
drugs                   23000
education               17000
electronics             51000
finance/money           10000
food/drink              15000
geo                     29000
hardware/tools          14000
health                  58000
hobby                   30000
meta                     9000
movies                   7000
music                   43000
other                   27000
parenting               13000
politics/viewpoint      16000
profession              56000
p

In [10]:
main_df['title'] = main_df['title'].apply(lambda title: title.lower())
main_df['selftext'] = main_df['selftext'].apply(lambda selftext: selftext.lower())

In [11]:
def clean_punctuation(sr):
  for i, text in sr.items():
    text = re.sub(r",", "", text)
    text = re.sub(r"\.", "", text)
    text = re.sub(r"\?", " ? ", text)
    text = re.sub(r"!", " ! ", text)
    text = re.sub(r"<.*>", "", text)
    sr.at[i] = text

clean_punctuation(main_df['title'])
clean_punctuation(main_df['selftext'])

In [12]:
print(main_df.head())
print(main_df.tail())

       id             subreddit  \
0  6d8knd  talesfromtechsupport   
1  58mbft               teenmom   
2  8f73s7                Harley   
3  6ti6re          ringdoorbell   
4  77sxto                 intel   

                                               title  \
0                remember your command line switches   
1              "so what was matt ""addicted"" to ? "   
2                                     no club colors   
3          not door bell but floodlight mount height   
4  worried about my 8700k small fft/data stress r...   

                                            selftext  
0                                       "hi there  "  
1  did he ever say what his addiction was or is h...  
2  funny story i went to college in las vegas thi...  
3  i know this is a sub for the 'ring doorbell' b...  
4  "prime95 (regardless of version) and occt both...  
             id      subreddit  \
1012995  5r9k4h        MSLGame   
1012996  6529fp  CrohnsDisease   
1012997  7tiyzx     

In [13]:
from tqdm import tqdm_notebook as tqdm
main_df['genre'] = np.nan
for genre in tqdm(genres):
  rows_for_genre = main_df.loc[main_df['subreddit'].isin(genre_to_sub[genre]), 'genre'] = genre

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for genre in tqdm(genres):


  0%|          | 0/39 [00:00<?, ?it/s]

In [14]:
print(main_df.head())
print(main_df.tail())

       id             subreddit  \
0  6d8knd  talesfromtechsupport   
1  58mbft               teenmom   
2  8f73s7                Harley   
3  6ti6re          ringdoorbell   
4  77sxto                 intel   

                                               title  \
0                remember your command line switches   
1              "so what was matt ""addicted"" to ? "   
2                                     no club colors   
3          not door bell but floodlight mount height   
4  worried about my 8700k small fft/data stress r...   

                                            selftext            genre  
0                                       "hi there  "  writing/stories  
1  did he ever say what his addiction was or is h...          tv_show  
2  funny story i went to college in las vegas thi...            autos  
3  i know this is a sub for the 'ring doorbell' b...   hardware/tools  
4  "prime95 (regardless of version) and occt both...      electronics  
             id     

In [15]:
main_df

Unnamed: 0,id,subreddit,title,selftext,genre
0,6d8knd,talesfromtechsupport,remember your command line switches,"""hi there """,writing/stories
1,58mbft,teenmom,"""so what was matt """"addicted"""" to ? """,did he ever say what his addiction was or is h...,tv_show
2,8f73s7,Harley,no club colors,funny story i went to college in las vegas thi...,autos
3,6ti6re,ringdoorbell,not door bell but floodlight mount height,i know this is a sub for the 'ring doorbell' b...,hardware/tools
4,77sxto,intel,worried about my 8700k small fft/data stress r...,"""prime95 (regardless of version) and occt both...",electronics
...,...,...,...,...,...
1012995,5r9k4h,MSLGame,is this months rebirth and dungeon astro's wor...,i looking on what to evo3 farm next and was ex...,video_game
1012996,6529fp,CrohnsDisease,i might need a medical leave from grad school,has anyone here ever needed a medical leave fr...,health
1012997,7tiyzx,HongKong,police harassing ethnic minorities in hong kong,i thought i'd make this post so that more peop...,geo
1012998,664ha3,yorku,su eecs 2030 and eecs 2021 - need advice,hi i just finished 1st year eecs courses and h...,education


In [16]:
# Load into a local colab file.
main_df_pickle_filename = 'main_df.pickle'
main_df.to_pickle(main_df_pickle_filename)