In [120]:
import pandas as pd

# Load one DataFrame per subreddit named in the index file.
# Each line of the index is a subreddit name; its posts live in
# 'archive/<name>.csv'. The 'utf-8-sig' codec tolerates a leading BOM.
subreddits = {}
with open('archive/50_subreddits_list.csv', mode='r', encoding='utf-8-sig') as subreddit_file:
  for line in subreddit_file:
    # strip() also removes '\r' from CRLF files and stray surrounding spaces,
    # either of which would otherwise end up inside the file path below.
    subreddit = line.strip().lower()
    if not subreddit:
      continue  # ignore blank lines (e.g. a trailing newline at end of file)
    subreddits[subreddit] = pd.read_csv(f'archive/{subreddit}.csv')

### Get Title and Text from DataFrames
Our classifier uses only each post's title and body text, so we combine the two into a single string per post.

In [22]:
# Build one text Series per subreddit: "<title> <body>" for every post.
subreddit_bodies = {}
for subreddit, df in subreddits.items():
  # Missing titles/bodies become empty strings so the concatenation below
  # never yields NaN. This writes back into the loaded frame.
  for column in ('title', 'body'):
    df[column] = df[column].fillna('')
  subreddit_bodies[subreddit] = df['title'] + " " + df['body']

subreddit_bodies['anime']

0      'Dragon Ball' Creator Akira Toryiyama Has Pass...
1        Kaguya-sama: Love Is War - Season 3 announced! 
2                         Aqua in yoga pants | Konosuba 
3                     This is not a Cigarette [Gintama] 
4         The Devil is a Part-Timer Season 2 Announced! 
                             ...                        
991    Mob Psycho 100 Season 2 - Episode 5 discussion...
992    Never thought in a million years I’d come acro...
993    I seriously hate Sundays [Engaged to the Unide...
994                "Spy Classroom" New Character Visual 
995    Hayasaka Ai in Spy Suit from "Kaguya: Love is ...
Length: 996, dtype: object

### Remove Links
Our dataset includes Markdown links of the form "\[link text](link)". Since we want to analyze the text content of the subreddits, we strip out the link target and keep only the "link text".

In [14]:
import re # Regex Library

# Matches a Markdown inline link "[link text](target)"; group 1 is the link text.
# Raw strings keep the backslashes literal: in a normal string, "\[" and "\g"
# are invalid escape sequences and trigger warnings on recent Python versions.
MARKDOWN_LINK = re.compile(r"\[([^\]]*)\]\(([^\)]*)\)")

# Replace every link with its text, padded by spaces so the text does not
# fuse with neighbouring words.
no_links = {
  subreddit: bodies.map(lambda txt: MARKDOWN_LINK.sub(r" \g<1> ", txt))
  for subreddit, bodies in subreddit_bodies.items()
}

no_links['travel']

0      I visited North Korea recently, these are some...
1      Taken with a phone out of my hotel window in V...
2      Taking a ride on the Bernina Express through t...
3      Wife and I hate big social events and love tra...
4      The exact moment I took a step too close to th...
                             ...                        
992    Sisteron- France. Beautiful place we had a cof...
993    Croatia, probably the most beautiful country i...
994    Michelangelo's David is great, but pieta is on...
995    If you don't mind a little dust and grit and y...
996    I’d never realized how beautiful Montenegro wa...
Length: 997, dtype: object

### Tokenizing

Since Reddit posts contain long sections of prose that are unique to each user's writing, we felt that finding attributes of the text as a whole would be difficult. Instead, we tokenize each title and body in the hope that individual words will vary between subreddits. We also convert each word to lowercase to remove capitalization differences between posts, since capitalization is unlikely to affect the semantic meaning of a word in a post.

Additionally, we found the special unicode character ’ (as opposed to ') in several entries across datasets likely due to differences in keyboards among different languages. We replaced the former with the latter to correctly match words with apostrophes (e.g. I’d => I'd)

In [20]:
import nltk

# Word-level tokenizer: a token is any run of word characters and apostrophes.
tokenizer = nltk.RegexpTokenizer(
  pattern=r"[\w']+",  # coarse "word" definition: \w plus apostrophes
  gaps=False,         # the pattern describes tokens, not the separators
  discard_empty=True  # drop empty tokens caused by markdown content
)


def _tokenize_post(txt):
  """Normalise curly apostrophes, lowercase the text, then split it into tokens."""
  return tokenizer.tokenize(txt.replace("’", "'").lower())


# One Series of token lists per subreddit.
tokenized_subreddits = {}
for subreddit, data in no_links.items():
  tokenized_subreddits[subreddit] = data.map(_tokenize_post)

tokenized_subreddits['history']

0      [new, discovery, mode, turns, video, game, ass...
1      [we, are, not, here, to, help, you, with, your...
2      [a, 1776, excerpt, from, john, adam's, diary, ...
3      [famous, viking, warrior, burial, revealed, to...
4      [3, 000, year, old, underwater, castle, discov...
                             ...                        
987    [dna, study, has, now, provided, support, for,...
988    [stonehenge, megalith, came, from, scotland, n...
989    [french, resistance, man, breaks, silence, ove...
990    [holy, grail, of, shipwrecks', to, be, raised,...
991    [emily, wilson's, new, translation, of, the, i...
Length: 992, dtype: object

In [34]:
# bring tokens back together as data
# Each subreddit gets a 1-based integer class id, assigned in iteration order.
subreddit_classes = {
  subreddit: class_id
  for class_id, subreddit in enumerate(tokenized_subreddits, start=1)
}

# Build one labelled frame per subreddit and concatenate ONCE. Growing a frame
# with pd.concat inside the loop copies all previous rows on every iteration,
# and starting from an empty frame leaves the label column as object dtype.
labelled_frames = [
  pd.DataFrame({
    'text': tokenized_subreddits[subreddit].map(' '.join),  # tokens -> one string
    'subreddit': class_id,                                  # scalar, broadcast per row
  })
  for subreddit, class_id in subreddit_classes.items()
]
subreddit_df = (
  pd.concat(labelled_frames, ignore_index=True)
  if labelled_frames
  else pd.DataFrame(columns=['text', 'subreddit'])  # pd.concat rejects an empty list
)

subreddit_df

Unnamed: 0,text,subreddit
0,my cab driver tonight was so excited to share ...,1
1,guardians of the front page,1
2,gas station worker takes precautionary measure...,1
3,the conversation my son and i will have on chr...,1
4,the denver broncos have the entire town of sou...,1
...,...,...
49261,how come no one has invented a foot pedal for ...,50
49262,why are trans people talked about so much desp...,50
49263,did your penis ever fall asleep like your legs...,50
49264,someone stole my bike i tracked it to its loca...,50


In [58]:
from sklearn.model_selection import train_test_split

# Features are the cleaned post text; targets are the integer class ids.
X = subreddit_df['text']
y = subreddit_df['subreddit']

# A fixed random_state makes the shuffle, and therefore every number reported
# below, reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(
  X, y, shuffle=True, random_state=42
)
# Make sure the labels are plain integers regardless of how the column was built.
y_train = y_train.astype('int')
y_test = y_test.astype('int')

In [59]:
# Count training examples per class id to see how evenly the split spread the classes.
y_train.value_counts()

subreddit
36    769
1     769
35    766
3     765
6     759
14    759
41    758
39    756
26    754
34    754
8     752
43    750
21    750
5     750
12    750
44    749
16    749
22    748
2     748
25    746
17    746
47    744
30    743
48    741
20    740
33    739
32    737
19    737
24    736
38    736
18    736
15    736
46    735
4     735
7     735
9     733
50    733
13    733
11    733
49    731
31    728
42    727
27    726
37    718
23    717
10    711
28    708
29    706
40    699
45    669
Name: count, dtype: int64

In [None]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.linear_model import SGDClassifier
from sklearn.pipeline import Pipeline

# Text-classification pipeline: raw strings -> token counts -> TF-IDF weights
# -> linear classifier trained with stochastic gradient descent.
reddit_classifer = Pipeline([
  ('cv', CountVectorizer()),
  ('tfidf', TfidfTransformer()),
  # SGD shuffles the training data; a fixed seed makes the fit reproducible.
  ('sgd', SGDClassifier(random_state=42)),
])

reddit_classifer.fit(X_train, y_train)

In [65]:
import numpy as np

# Accuracy on the held-out split: the fraction of test posts whose predicted
# class id equals the true one.
y_pred = reddit_classifer.predict(X_test)
(y_pred == y_test).mean()

0.6011204026954615

In [None]:
def predictPrompt(prompt):
  """Return the name of the subreddit the classifier predicts for `prompt`.

  The pipeline predicts an integer class id, which is mapped back to a
  subreddit name through `subreddit_classes` (name -> class id). If no name
  carries the predicted id, a message is printed and None is returned.
  """
  # predict() takes a batch, so wrap the prompt in a list and unwrap the result;
  # .item() converts the numpy scalar to a plain Python int.
  label = reddit_classifer.predict([prompt])[0].item()
  match = next(
    (name for name, class_id in subreddit_classes.items() if class_id == label),
    None,
  )
  if match is None:
    print("No applicable subreddit (should be unreachable)")
  return match

# Try the classifier on a free-form prompt; the returned subreddit name is the cell's output.
predictPrompt("My gf just broke up with me. What should I do?")