In [1]:
import os
import pandas as pd

csv_list = os.listdir('reddit-csv-files')
os.chdir('reddit-csv-files')

In [2]:
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS

def tokenize(text):
    """Tokenizes text"""
    return [token for token in simple_preprocess(text) if token not in STOPWORDS]

In [3]:
# Append all CSV files to one DataFrame

data = pd.DataFrame()
name = []
titles = []

for x in csv_list:
    if x[-3:] == 'csv':
        df = pd.read_csv(x)
        name.append(x[:-4])
        temp = ''
        for title in df['title']:
            temp = temp + title
        titles.append(temp)
            
data['name'] = name
data['post_title'] = titles

In [4]:
# Tokenize text in title column for each sub reddit
data.post_title = data.post_title.apply(tokenize)

In [5]:
# Show data frame
data.head()

Unnamed: 0,name,post_title
0,DeepIntoYouTube,"[took, english, subtitles, pirated, chinese, c..."
1,Baking,"[boyfriend, forgot, th, birthday, spent, day, ..."
2,mealtimevideos,"[fox, news, anchor, tucker, carlson, loses, un..."
3,vinyl,"[humble, setup, like, records, freedom, help, ..."
4,graphic_design,"[guy, deserves, exposure, business, cardcreati..."


In [6]:
data = data.reset_index()

In [7]:
data.rename(columns={('index'):('target')}, inplace=True)

In [8]:
data.head()

Unnamed: 0,target,name,post_title
0,0,DeepIntoYouTube,"[took, english, subtitles, pirated, chinese, c..."
1,1,Baking,"[boyfriend, forgot, th, birthday, spent, day, ..."
2,2,mealtimevideos,"[fox, news, anchor, tucker, carlson, loses, un..."
3,3,vinyl,"[humble, setup, like, records, freedom, help, ..."
4,4,graphic_design,"[guy, deserves, exposure, business, cardcreati..."


In [9]:
# Add target column so each subreddit can be classified
data['target'] = data.index + 1

In [10]:
data.head()

Unnamed: 0,target,name,post_title
0,1,DeepIntoYouTube,"[took, english, subtitles, pirated, chinese, c..."
1,2,Baking,"[boyfriend, forgot, th, birthday, spent, day, ..."
2,3,mealtimevideos,"[fox, news, anchor, tucker, carlson, loses, un..."
3,4,vinyl,"[humble, setup, like, records, freedom, help, ..."
4,5,graphic_design,"[guy, deserves, exposure, business, cardcreati..."


In [11]:
# Analyzing tokens
from collections import Counter

In [12]:
# The object `Counter` takes an iterable, but you can instaniate an empty one and update it. 
word_counts = Counter()

In [None]:
# Update it based on a split of each of our documents
data['post_title'].apply(lambda x: word_counts.update(x))

In [14]:
# Print out the 10 most common words
word_counts.most_common(10)

[('like', 26699),
 ('new', 20525),
 ('time', 17902),
 ('got', 14167),
 ('people', 13875),
 ('day', 13868),
 ('post', 13522),
 ('years', 12523),
 ('year', 12447),
 ('oc', 11890)]

In [15]:
def count(docs):

        word_counts = Counter()
        appears_in = Counter()
        
        total_docs = len(docs)

        for doc in docs:
            word_counts.update(doc)
            appears_in.update(set(doc))

        temp = zip(word_counts.keys(), word_counts.values())
        
        wc = pd.DataFrame(temp, columns = ['word', 'count'])

        wc['rank'] = wc['count'].rank(method='first', ascending=False)
        total = wc['count'].sum()

        wc['pct_total'] = wc['count'].apply(lambda x: x / total)
        
        wc = wc.sort_values(by='rank')
        wc['cul_pct_total'] = wc['pct_total'].cumsum()

        t2 = zip(appears_in.keys(), appears_in.values())
        ac = pd.DataFrame(t2, columns=['word', 'appears_in'])
        wc = ac.merge(wc, on='word')

        wc['appears_in_pct'] = wc['appears_in'].apply(lambda x: x / total_docs)
        
        return wc.sort_values(by='rank')

In [16]:
wc = count(data['post_title'])

In [17]:
wc.head()

Unnamed: 0,word,appears_in,count,rank,pct_total,cul_pct_total,appears_in_pct
800,like,975,26699,1.0,0.005701,0.005701,0.981873
1909,new,967,20525,2.0,0.004382,0.010083,0.973817
3014,time,970,17902,3.0,0.003822,0.013906,0.976838
44,got,957,14167,4.0,0.003025,0.01693,0.963746
2446,people,884,13875,5.0,0.002963,0.019893,0.890232


In [18]:
data.head()

Unnamed: 0,target,name,post_title
0,1,DeepIntoYouTube,"[took, english, subtitles, pirated, chinese, c..."
1,2,Baking,"[boyfriend, forgot, th, birthday, spent, day, ..."
2,3,mealtimevideos,"[fox, news, anchor, tucker, carlson, loses, un..."
3,4,vinyl,"[humble, setup, like, records, freedom, help, ..."
4,5,graphic_design,"[guy, deserves, exposure, business, cardcreati..."


In [19]:
X = data['post_title'].values
y = data['name'].values

In [21]:
from sklearn.model_selection import train_test_split

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X,y,stratify=y,random_state=42)