### Imports

In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm
tqdm.pandas(desc="progress-bar")
from gensim.models import Doc2Vec
from sklearn import utils
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
import gensim
import gensim.models.doc2vec as doc2vec
from gensim.models.doc2vec import TaggedDocument
import re

### Load data & clean it up

In [2]:
df = pd.read_csv('/Users/ksmith/Documents/Code/DS1/Unit4_Project/build-post-here-DS/final_test_w_name.csv')
df = df.fillna('')
df['all_text'] = df['title'] + ' ' + df['text'] # Combining text & titles to one field

df.head(10)

Unnamed: 0.1,Unnamed: 0,title,text,subreddit_id,name,all_text
0,0,15 minutes wait to play more?,"So I dodged a game, all my mains banned or pic...",2rfxx,leagueoflegends,15 minutes wait to play more? So I dodged a ga...
1,1,League has to investigate possible fake game b...,Obvious inting in game 5 right before TSM win ...,2rfxx,leagueoflegends,League has to investigate possible fake game b...
2,2,"Xmithie...the Goat, I told you so",Here's what I said about Xmithie years ago...\...,2rfxx,leagueoflegends,"Xmithie...the Goat, I told you so Here's what ..."
3,3,Whatever happened to that Teacher in Korea who...,Anyone remember this ? It was way back like 20...,2rfxx,leagueoflegends,Whatever happened to that Teacher in Korea who...
4,4,Just got to diamond playing annie only :),Just wanted to share my excitement with you gu...,2rfxx,leagueoflegends,Just got to diamond playing annie only :) Just...
5,5,Hot Take: TSM vs TL is the Best Finals in LCS ...,"As a TSM fan it sucks to lose, but those games...",2rfxx,leagueoflegends,Hot Take: TSM vs TL is the Best Finals in LCS ...
6,6,Doublelift never meet Faker before in an offic...,"As the title, I just realized somehow Doubleli...",2rfxx,leagueoflegends,Doublelift never meet Faker before in an offic...
7,7,Griffin went balls to the walls to make LS loo...,after the whole LS Vs Reddit vs other casters ...,2rfxx,leagueoflegends,Griffin went balls to the walls to make LS loo...
8,8,Dodged ranked game due to client bug :),&#x200B;\r\n\r\nhttps://i.redd.it/fj8xph83t5s2...,2rfxx,leagueoflegends,Dodged ranked game due to client bug :) &#x200...
9,9,Decided to re-roll the skin shards I had saved...,https://imgur.com/Ezbocp6\r\n\r\nhttps://imgur...,2rfxx,leagueoflegends,Decided to re-roll the skin shards I had saved...


In [3]:
top_20 = ['AskReddit', 'dankmemes', 'memes', 'teenagers', 'aww', 'RocketLeagueExchange', 'Showerthoughts',
          'funny', 'me_irl', 'freefolk', 'gameofthrones', 'pics', 'NoStupidQuestions', 'AskOuija',
          'unpopularopinion', 'gaming', 'videos', 'politics', 'AmItheAsshole', 'Jokes']

data = df[df['name'].isin(top_20)]
data = data.drop(data.columns[0], axis=1) # Drop old row index
data.head()

Unnamed: 0,title,text,subreddit_id,name,all_text
543,[Spoilers] What is that one conspiracy theory ...,Arya is already dead and is actually Jaqen H'g...,2rjz2,gameofthrones,[Spoilers] What is that one conspiracy theory ...
544,[SPOILERS] The only reunion that I want to see...,,2rjz2,gameofthrones,[SPOILERS] The only reunion that I want to see...
545,[No Spoilers] How does rising the dead work?,Some stuff I wonder about.\r\n\r\nHow fresh do...,2rjz2,gameofthrones,[No Spoilers] How does rising the dead work? S...
546,[No Spoilers] House Targaryen sigil cross stitch,,2rjz2,gameofthrones,[No Spoilers] House Targaryen sigil cross stitch
547,[No Spoilers] Need help!!,My exams(very important) are going on and will...,2rjz2,gameofthrones,[No Spoilers] Need help!! My exams(very import...


In [4]:
df['all_text'] = df['all_text'].str.replace('http\S+|www.\S+', 'link', case=False)

### Doc2Vec & Logistic Regression

In [5]:
def label_sentences(corpus, label_type):
    """
    Gensim's Doc2Vec implementation requires each document/paragraph to have a label associated with it.
    We do this by using the TaggedDocument method. The format will be "TRAIN_i" or "TEST_i" where "i" is
    a dummy index of the post.
    """
    labeled = []
    for i, v in enumerate(corpus):
        label = label_type + '_' + str(i)
        labeled.append(doc2vec.TaggedDocument(v.split(), [label]))
    return labeled

In [6]:
X_train, X_test, y_train, y_test = train_test_split(data.all_text, data.name, random_state=0, test_size=0.3)
X_train = label_sentences(X_train, 'Train')
X_test = label_sentences(X_test, 'Test')
all_data = X_train + X_test

In [7]:
all_data[:2]

[TaggedDocument(words=['Why', 'do', 'the', 'Brits', 'drive', 'on', 'the', 'opposite', 'side', 'of', 'the', 'car', 'and', 'the', 'road?'], tags=['Train_0']),
 TaggedDocument(words=['My', 'dog', 'is', 'miner', '#cz'], tags=['Train_1'])]

In [8]:
model_dbow = Doc2Vec(dm=0, vector_size=300, negative=5, min_count=1, alpha=0.065, min_alpha=0.065)
model_dbow.build_vocab([x for x in tqdm(all_data)])

for epoch in range(30):
    model_dbow.train(utils.shuffle([x for x in tqdm(all_data)]), total_examples=len(all_data), epochs=1)
    model_dbow.alpha -= 0.002
    model_dbow.min_alpha = model_dbow.alpha

100%|██████████| 22695/22695 [00:00<00:00, 2854350.33it/s]
100%|██████████| 22695/22695 [00:00<00:00, 3548280.81it/s]
100%|██████████| 22695/22695 [00:00<00:00, 2709796.44it/s]
100%|██████████| 22695/22695 [00:00<00:00, 2764651.89it/s]
100%|██████████| 22695/22695 [00:00<00:00, 2733294.93it/s]
100%|██████████| 22695/22695 [00:00<00:00, 3867614.55it/s]
100%|██████████| 22695/22695 [00:00<00:00, 3034515.90it/s]
100%|██████████| 22695/22695 [00:00<00:00, 2770042.17it/s]
100%|██████████| 22695/22695 [00:00<00:00, 2727421.26it/s]
100%|██████████| 22695/22695 [00:00<00:00, 2710722.44it/s]
100%|██████████| 22695/22695 [00:00<00:00, 2827977.70it/s]
100%|██████████| 22695/22695 [00:00<00:00, 2773431.89it/s]
100%|██████████| 22695/22695 [00:00<00:00, 2791160.25it/s]
100%|██████████| 22695/22695 [00:00<00:00, 2950064.44it/s]
100%|██████████| 22695/22695 [00:00<00:00, 2862590.72it/s]
100%|██████████| 22695/22695 [00:00<00:00, 2760642.94it/s]
100%|██████████| 22695/22695 [00:00<00:00, 2981200.42it/

In [9]:
def get_vectors(model, corpus_size, vectors_size, vectors_type):
    """
    Get vectors from trained doc2vec model
    :param doc2vec_model: Trained Doc2Vec model
    :param corpus_size: Size of the data
    :param vectors_size: Size of the embedding vectors
    :param vectors_type: Training or Testing vectors
    :return: list of vectors
    """
    vectors = np.zeros((corpus_size, vectors_size))
    for i in range(0, corpus_size):
        prefix = vectors_type + '_' + str(i)
        vectors[i] = model.docvecs[prefix]
    return vectors
    
train_vectors_dbow = get_vectors(model_dbow, len(X_train), 300, 'Train')
test_vectors_dbow = get_vectors(model_dbow, len(X_test), 300, 'Test')

In [10]:
subreddits = data.name.unique()

In [11]:
%%time

logreg = LogisticRegression(n_jobs=1, C=1e5)
logreg = logreg.fit(train_vectors_dbow, y_train)

CPU times: user 2min 23s, sys: 160 ms, total: 2min 23s
Wall time: 2min 23s


In [12]:
y_pred = logreg.predict(test_vectors_dbow)
print('accuracy %s' % accuracy_score(y_pred, y_test))
print(classification_report(y_test, y_pred,target_names=subreddits))

accuracy 0.5846673520340725
                      precision    recall  f1-score   support

       gameofthrones       0.86      0.67      0.75       103
           teenagers       0.65      0.54      0.59       200
           AskReddit       0.82      0.95      0.88      1635
                pics       0.26      0.19      0.22        74
           dankmemes       0.54      0.40      0.46       189
RocketLeagueExchange       0.94      0.96      0.95       337
                 aww       0.52      0.54      0.53       304
              me_irl       0.42      0.51      0.46       362
               funny       0.31      0.50      0.38       652
      Showerthoughts       0.39      0.29      0.34       259
               memes       0.13      0.03      0.05       294
    unpopularopinion       0.72      0.71      0.71       225
       AmItheAsshole       0.33      0.21      0.26       126
              videos       0.77      0.89      0.83       294
            freefolk       0.30      0.23

In [16]:
y_pred_proba = logreg.predict_proba(test_vectors_dbow)[:,1]
print('Validation ROC AUC:', roc_auc_score(y_test, y_pred_proba))

ValueError: multiclass format is not supported

### User input test prediction

In [17]:
input_title = ["Python tutorial"]
input_text = ["I'm looking for a good python tutorial for free online. Any suggestions?"]
all_input = input_title + input_text
user_input = model_dbow.infer_vector(all_input, steps=20)
user_input = user_input.reshape(1, -1)
# # user_vectors_dbow = get_vectors(model_dbow, len(input_test), 300, 'User')
print(logreg.predict(user_input))

['me_irl']
