### Imports

In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm
tqdm.pandas(desc="progress-bar")
from gensim.models import Doc2Vec
from sklearn import utils
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
import gensim
import gensim.models.doc2vec as doc2vec
from gensim.models.doc2vec import TaggedDocument
import re

### Load data & clean it up

In [2]:
df = pd.read_csv('/Users/ksmith/Documents/Code/DS1/Unit4_Project/build-post-here-DS/plz_work.csv')
df = df.fillna('')
df['all_text'] = df['title'] + ' ' + df['text'] # Combining text & titles to one field

df.head(10)

Unnamed: 0.1,Unnamed: 0,submission_id,title,text,subreddit_id,name,all_text
0,0,bcy7ky,15 minutes wait to play more?,"So I dodged a game, all my mains banned or pic...",2rfxx,leagueoflegends,15 minutes wait to play more? So I dodged a ga...
1,1,bcy6mf,League has to investigate possible fake game b...,Obvious inting in game 5 right before TSM win ...,2rfxx,leagueoflegends,League has to investigate possible fake game b...
2,2,bcz8v5,"Xmithie...the Goat, I told you so",Here's what I said about Xmithie years ago...\...,2rfxx,leagueoflegends,"Xmithie...the Goat, I told you so Here's what ..."
3,3,bcz9r2,Whatever happened to that Teacher in Korea who...,Anyone remember this ? It was way back like 20...,2rfxx,leagueoflegends,Whatever happened to that Teacher in Korea who...
4,4,bcz9wj,Just got to diamond playing annie only :),Just wanted to share my excitement with you gu...,2rfxx,leagueoflegends,Just got to diamond playing annie only :) Just...
5,5,bcza0d,Hot Take: TSM vs TL is the Best Finals in LCS ...,"As a TSM fan it sucks to lose, but those games...",2rfxx,leagueoflegends,Hot Take: TSM vs TL is the Best Finals in LCS ...
6,6,bczb3x,Doublelift never meet Faker before in an offic...,"As the title, I just realized somehow Doubleli...",2rfxx,leagueoflegends,Doublelift never meet Faker before in an offic...
7,7,bczbqf,Griffin went balls to the walls to make LS loo...,after the whole LS Vs Reddit vs other casters ...,2rfxx,leagueoflegends,Griffin went balls to the walls to make LS loo...
8,8,bczbu3,Dodged ranked game due to client bug :),&#x200B;\r\r\n\r\r\nhttps://i.redd.it/fj8xph83...,2rfxx,leagueoflegends,Dodged ranked game due to client bug :) &#x200...
9,9,bczdhn,Decided to re-roll the skin shards I had saved...,https://imgur.com/Ezbocp6\r\r\n\r\r\nhttps://i...,2rfxx,leagueoflegends,Decided to re-roll the skin shards I had saved...


In [5]:
top_20 = ['AskReddit', 'dankmemes', 'memes', 'teenagers', 'aww', 'RocketLeagueExchange', 'Showerthoughts',
          'funny', 'me_irl', 'freefolk', 'gameofthrones', 'pics', 'NoStupidQuestions', 'AskOuija',
          'unpopularopinion', 'gaming', 'videos', 'politics', 'AmItheAsshole', 'Jokes']

data = df[df['name'].isin(top_20)]
data = data.drop(data.columns[0], axis=1) # Drop old row index
data.head()

Unnamed: 0,submission_id,title,text,subreddit_id,name,all_text
551,bcy7k8,[Spoilers] What is that one conspiracy theory ...,Arya is already dead and is actually Jaqen H'g...,2rjz2,gameofthrones,[Spoilers] What is that one conspiracy theory ...
552,bcz9cw,[SPOILERS] The only reunion that I want to see...,,2rjz2,gameofthrones,[SPOILERS] The only reunion that I want to see...
553,bczaow,[No Spoilers] How does rising the dead work?,Some stuff I wonder about.\r\r\n\r\r\nHow fres...,2rjz2,gameofthrones,[No Spoilers] How does rising the dead work? S...
554,bczgts,[No Spoilers] House Targaryen sigil cross stitch,,2rjz2,gameofthrones,[No Spoilers] House Targaryen sigil cross stitch
555,bczhln,[No Spoilers] Need help!!,My exams(very important) are going on and will...,2rjz2,gameofthrones,[No Spoilers] Need help!! My exams(very import...


In [6]:
import re
from urllib.parse import urlparse

def url_replace(string):
    return re.sub('http\S+|www.\S+', lambda match: urlparse(match.group()).hostname, string)

data['all_text'] = data['all_text'].apply(url_replace)

In [7]:
import string

table = str.maketrans(string.punctuation, ' '*len(string.punctuation))

data['all_text'] = data['all_text'].str.lower()  #Text is lowercase
data['all_text'] = data['all_text'].str.replace('\r','')
data['all_text'] = data['all_text'].str.replace('\n','')
data['all_text'] = data['all_text'].str.replace('/',' ')
data['all_text'] = data['all_text'].str.replace('  ',' ')
data['all_text'] = data['all_text'].str.replace('www','')
data['all_text'] = data['all_text'].str.replace('com',' ')
data['all_text'] = data['all_text'].str.translate(table) #Remove punctuation

In [8]:
data['all_text']

551        spoilers  what is that one conspiracy theory ...
552        spoilers  the only reunion that i want to see...
553        no spoilers  how does rising the dead work  s...
554        no spoilers  house targaryen sigil cross stitch 
555        no spoilers  need help   my exams very import...
556        no spoilers  bookmaker in australia is offeri...
557        spoilers  game of thrones season 8    first d...
558        spoilers  is the nk similarly vulnerable   i ...
559        no spoilers  game of thrones season 8 will gi...
560        no spoilers  are you going to watch the premi...
561        no spoilers  any links to share on where to w...
562        spoilers  el paso meteorologist includes west...
563        no spoilers  got  hide that spoiler app   fru...
564        no spoilers  question about the book series h...
565        no spoilers  live stream of a huge invite onl...
566        no spoilers  got baby shark mashup everything...
567        spoilers  house frakenfurter 

### Doc2Vec & Logistic Regression

In [9]:
def label_sentences(corpus, label_type):
    """
    Gensim's Doc2Vec implementation requires each document/paragraph to have a label associated with it.
    We do this by using the TaggedDocument method. The format will be "TRAIN_i" or "TEST_i" where "i" is
    a dummy index of the post.
    """
    labeled = []
    for i, v in enumerate(corpus):
        label = label_type + '_' + str(i)
        labeled.append(doc2vec.TaggedDocument(v.split(), [label]))
    return labeled

In [10]:
# y_encode = {'AskReddit':1, 'dankmemes':2, 'memes':3, 'teenagers':4, 'aww':5, 'RocketLeagueExchange':6, 
#             'Showerthoughts':7,'funny':8, 'me_irl':9, 'freefolk':10, 'gameofthrones':11, 'pics':12, 
#             'NoStupidQuestions':13, 'AskOuija':14, 'unpopularopinion':15, 'gaming':16, 'videos':17, 
#             'politics':18, 'AmItheAsshole':19, 'Jokes':20}

X = data['all_text']
y = data['name']

In [11]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0, test_size=0.3)
X_train = label_sentences(X_train, 'Train')
X_test = label_sentences(X_test, 'Test')
all_data = X_train + X_test

In [12]:
all_data[:10]

[TaggedDocument(words=['what', 's', 'the', 'most', 'embarrassing', 'thing', 'you', 've', 'done', 'in', 'front', 'of', 'a', 'crush'], tags=['Train_0']),
 TaggedDocument(words=['no', 'spoilers', 'what', 'will', 'you', 'do', 'for', 'the', 'throne', 'you', 'can', 'see', 'my', 'illustration', 'of', 'daenerys', 'there', 'for', 'a', 'split', 'second', '0', '30', 'and', 'other', 'amazing', 'creations'], tags=['Train_1']),
 TaggedDocument(words=['have', '10x', 'bigger', 'balls'], tags=['Train_2']),
 TaggedDocument(words=['is', 'it', 'better', 'to', 'overeat', 'healthy', 'food', 'or', 'undereat', 'unhealthy', 'food', 'i', 'looked', 'it', 'up', 'but', 'didn’t', 'get', 'much', 'of', 'a', 'definitive', 'answer', 'i', 'know', 'overeating', 'isn’t', 'really', 'good', 'for', 'you', 'and', 'neither', 'is', 'undereating', 'so', 'i', 'really', 'don’t', 'know'], tags=['Train_3']),
 TaggedDocument(words=['daario', 'post', 'malone', 'naharis'], tags=['Train_4']),
 TaggedDocument(words=['what’s', 'the', 'bes

In [13]:
model_dbow = Doc2Vec(dm=0, vector_size=300, negative=2, min_count=1, alpha=0.065, min_alpha=0.065)
model_dbow.build_vocab([x for x in tqdm(all_data)])

for epoch in range(30):
    model_dbow.train(utils.shuffle([x for x in tqdm(all_data)]), total_examples=len(all_data), epochs=1)
    model_dbow.alpha -= 0.002
    model_dbow.min_alpha = model_dbow.alpha

100%|██████████| 87990/87990 [00:00<00:00, 3136451.25it/s]
100%|██████████| 87990/87990 [00:00<00:00, 3681878.86it/s]
100%|██████████| 87990/87990 [00:00<00:00, 3240182.69it/s]
100%|██████████| 87990/87990 [00:00<00:00, 3352654.08it/s]
100%|██████████| 87990/87990 [00:00<00:00, 3509378.86it/s]
100%|██████████| 87990/87990 [00:00<00:00, 3345117.78it/s]
100%|██████████| 87990/87990 [00:00<00:00, 3464573.93it/s]
100%|██████████| 87990/87990 [00:00<00:00, 3671147.72it/s]
100%|██████████| 87990/87990 [00:00<00:00, 3352532.26it/s]
100%|██████████| 87990/87990 [00:00<00:00, 3235382.17it/s]
100%|██████████| 87990/87990 [00:00<00:00, 3497306.91it/s]
100%|██████████| 87990/87990 [00:00<00:00, 3522542.80it/s]
100%|██████████| 87990/87990 [00:00<00:00, 3211507.51it/s]
100%|██████████| 87990/87990 [00:00<00:00, 3351618.87it/s]
100%|██████████| 87990/87990 [00:00<00:00, 3263967.53it/s]
100%|██████████| 87990/87990 [00:00<00:00, 3365986.05it/s]
100%|██████████| 87990/87990 [00:00<00:00, 2967243.21it/

In [14]:
def get_vectors(model, corpus_size, vectors_size, vectors_type):
    """
    Get vectors from trained doc2vec model
    :param doc2vec_model: Trained Doc2Vec model
    :param corpus_size: Size of the data
    :param vectors_size: Size of the embedding vectors
    :param vectors_type: Training or Testing vectors
    :return: list of vectors
    """
    vectors = np.zeros((corpus_size, vectors_size))
    for i in range(0, corpus_size):
        prefix = vectors_type + '_' + str(i)
        vectors[i] = model.docvecs[prefix]
    return vectors
    
train_vectors_dbow = get_vectors(model_dbow, len(X_train), 300, 'Train')
test_vectors_dbow = get_vectors(model_dbow, len(X_test), 300, 'Test')

In [15]:
subreddits = data.name.unique()

In [16]:
%%time

logreg = LogisticRegression(n_jobs=10, solver='lbfgs', multi_class='multinomial')
logreg = logreg.fit(train_vectors_dbow, y_train)

CPU times: user 444 ms, sys: 118 ms, total: 562 ms
Wall time: 23.5 s


In [17]:
y_pred = logreg.predict(test_vectors_dbow)
print('accuracy %s' % accuracy_score(y_pred, y_test))
print(classification_report(y_test, y_pred,target_names=subreddits))

accuracy 0.6345796870856537
                      precision    recall  f1-score   support

       gameofthrones       0.89      0.92      0.90       733
           teenagers       0.35      0.16      0.22       755
           AskReddit       0.80      0.92      0.86      5579
                pics       0.57      0.51      0.54      1494
           dankmemes       0.60      0.59      0.60      1679
RocketLeagueExchange       0.99      0.99      0.99       999
                 aww       0.49      0.42      0.45       889
              me_irl       0.50      0.61      0.55      1214
               funny       0.26      0.11      0.15       771
      Showerthoughts       0.49      0.49      0.49       958
               memes       0.25      0.34      0.29      1448
    unpopularopinion       0.73      0.76      0.74       910
       AmItheAsshole       0.51      0.52      0.51      1253
              videos       0.81      0.95      0.87      1031
            freefolk       0.24      0.05

### User input test prediction

In [18]:
input_title = "no spoilers  how does rising the dead work"
input_text = "some stuff i wonder about how fresh do the dead need to be  can the nk rise people ho died weeks months years ago and how close"
all_input = [input_title + ' ' + input_text]
user_input = model_dbow.infer_vector(all_input, steps=30)
user_input = user_input.reshape(1, -1)
prediction = logreg.predict(user_input)
print(prediction)

['me_irl']


In [None]:
# ivd = {v: k for k, v in y_encode.items()}
# [ivd[x] for x in prediction]

In [19]:
# output_df = 
pd.DataFrame(logreg.predict_proba(user_input), columns=logreg.classes_).T.nlargest
# top_5 = output_df.nlargest(5, [0])
# top_5.reset_index().values

<bound method DataFrame.nlargest of                                  0
AmItheAsshole         9.852817e-06
AskOuija              2.234648e-03
AskReddit             7.556596e-06
Jokes                 1.777611e-06
NoStupidQuestions     4.047150e-08
RocketLeagueExchange  2.227789e-07
Showerthoughts        7.535379e-06
aww                   1.945973e-02
dankmemes             2.664732e-02
freefolk              4.265949e-04
funny                 2.276930e-02
gameofthrones         7.659893e-07
gaming                7.855380e-04
me_irl                9.007786e-01
memes                 2.259229e-02
pics                  3.709937e-03
politics              1.645872e-05
teenagers             5.415296e-06
unpopularopinion      7.220588e-12
videos                5.464293e-04>

In [None]:
output = pd.DataFrame(logreg.predict_proba(user_input), columns=logreg.classes_).T.nlargest(5, [0]).reset_index().values
print(output)