In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm
tqdm.pandas(desc="progress-bar")
from gensim.models import Doc2Vec
from sklearn import utils
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
import gensim
import gensim.models.doc2vec as doc2vec
from gensim.models.doc2vec import TaggedDocument
import re

In [90]:
df = pd.read_csv('final_test_w_name.csv')

In [91]:
top_20 = ['AskReddit', 'dankmemes', 'memes', 'teenagers', 'aww', 'RocketLeagueExchange',
         'Showerthoughts', 'funny', 'me_irl', 'freefolk', 'gameofthrones', 'pics',
         'NoStupidQuestions', 'AskOuija', 'unpopularopinion', 'gaming', 'videos', 'politics',
         'AmItheAsshole', 'Jokes']


### Cleaning 

In [195]:
# get only the subreddits that we want
data = df[df['name'].isin(top_20)].copy()

# mark which posts are missing body 
data['text'] = data['title'].fillna('NaNtext')

# combine text from title and body 
data['all_text'] = data['title']+ ' ' + data['text']

# remove links to make text processor happier
data['all_text'] = data['all_text'].str.replace('http\S+|www.\S+', 'link', case=False)

In [94]:
def label_sentences(corpus, label_type):
    """
    Gensim's Doc2Vec implementation requires each document/paragraph to have a label associated with it.
    We do this by using the TaggedDocument method. The format will be "TRAIN_i" or "TEST_i" where "i" is
    a dummy index of the post.
    """
    labeled = []
    for i, v in enumerate(corpus):
        label = label_type + '_' + str(i)
        labeled.append(doc2vec.TaggedDocument(v.split(), [label]))
    return labeled


In [97]:
X_train, X_test, y_train, y_test = train_test_split(data.all_text, data.name, random_state=0, test_size=0.3)
X_train = label_sentences(X_train, 'Train')
X_test = label_sentences(X_test, 'Test')
all_data = X_train + X_test

In [296]:
X_test[0]

TaggedDocument(words=['Soldiers', 'of', 'Reddit,', 'what', 'were', 'the', 'most', 'unfortunate', 'name/rank', 'combinations', "you've", 'seen', 'and', 'how', 'did', 'it', 'work', 'out', 'for', 'them?', 'Soldiers', 'of', 'Reddit,', 'what', 'were', 'the', 'most', 'unfortunate', 'name/rank', 'combinations', "you've", 'seen', 'and', 'how', 'did', 'it', 'work', 'out', 'for', 'them?'], tags=['Test_0'])

In [99]:
model_dbow = Doc2Vec(dm=0, vector_size=300, negative=5, min_count=1, alpha=0.065, min_alpha=0.065)
model_dbow.build_vocab([x for x in tqdm(all_data)])

for epoch in range(30):
    model_dbow.train(utils.shuffle([x for x in tqdm(all_data)]), total_examples=len(all_data), epochs=1)
    model_dbow.alpha -= 0.002
    model_dbow.min_alpha = model_dbow.alpha

100%|██████████| 22695/22695 [00:00<00:00, 1074473.19it/s]
100%|██████████| 22695/22695 [00:00<00:00, 1797049.83it/s]
100%|██████████| 22695/22695 [00:00<00:00, 1931805.77it/s]
100%|██████████| 22695/22695 [00:00<00:00, 2170753.90it/s]
100%|██████████| 22695/22695 [00:00<00:00, 1297516.86it/s]
100%|██████████| 22695/22695 [00:00<00:00, 2260287.06it/s]
100%|██████████| 22695/22695 [00:00<00:00, 2235077.82it/s]
100%|██████████| 22695/22695 [00:00<00:00, 1718660.48it/s]
100%|██████████| 22695/22695 [00:00<00:00, 2302383.16it/s]
100%|██████████| 22695/22695 [00:00<00:00, 2366549.72it/s]
100%|██████████| 22695/22695 [00:00<00:00, 2468741.36it/s]
100%|██████████| 22695/22695 [00:00<00:00, 2287554.77it/s]
100%|██████████| 22695/22695 [00:00<00:00, 3013572.97it/s]
100%|██████████| 22695/22695 [00:00<00:00, 2591959.95it/s]
100%|██████████| 22695/22695 [00:00<00:00, 1939243.97it/s]
100%|██████████| 22695/22695 [00:00<00:00, 1996722.03it/s]
100%|██████████| 22695/22695 [00:00<00:00, 2121645.10it/

In [100]:
def get_vectors(model, corpus_size, vectors_size, vectors_type):
    """
    Get vectors from trained doc2vec model
    :param doc2vec_model: Trained Doc2Vec model
    :param corpus_size: Size of the data
    :param vectors_size: Size of the embedding vectors
    :param vectors_type: Training or Testing vectors
    :return: list of vectors
    """
    vectors = np.zeros((corpus_size, vectors_size))
    for i in range(0, corpus_size):
        prefix = vectors_type + '_' + str(i)
        vectors[i] = model.docvecs[prefix]
    return vectors
    
train_vectors_dbow = get_vectors(model_dbow, len(X_train), 300, 'Train')
test_vectors_dbow = get_vectors(model_dbow, len(X_test), 300, 'Test')

(300,)

In [101]:
subreddits = data.name.unique()

In [102]:
%%time

logreg = LogisticRegression(n_jobs=1, C=1e5)
logreg = logreg.fit(train_vectors_dbow, y_train)



CPU times: user 3min 8s, sys: 1.94 s, total: 3min 10s
Wall time: 3min 33s


In [None]:
y_pred = logreg.predict(test_vectors_dbow)
print('accuracy %s' % accuracy_score(y_pred, y_test))
print(classification_report(y_test, y_pred,target_names=subreddits))



In [308]:
logreg.predict_proba([test_vectors_dbow[0]]).max()

0.7191816029206933

In [None]:
output = pd.DataFrame(logreg.predict_proba(test_vectors_dbow[0]), columns=logreg.classes_).T.nlargest(5, [0]).reset_index().values
print(output)

In [280]:
output = pd.DataFrame(logreg.predict_proba(test_vectors_dbow), 
                      columns=logreg.classes_).T.sort_values(0, ascending=False).head(5)

In [None]:
keys = list(output.keys())
values = list(output.values)

In [281]:
output

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,6799,6800,6801,6802,6803,6804,6805,6806,6807,6808
AskReddit,0.719182,0.000166,0.001668,0.000233,0.06551,0.867567,0.039552,0.1133415,0.000211,0.000425,...,0.843565,0.01564,0.001175,0.000289,0.239206,0.723519,0.000632,0.001538,0.051981,0.003911
NoStupidQuestions,0.198292,0.000325,0.000565,0.00381,0.202043,0.0254,0.019585,9.497666e-08,0.006216,0.000861,...,0.001133,0.002168,0.109431,0.002457,0.008616,0.01481,0.004958,0.001651,0.00575,0.018593
aww,0.032162,0.053302,0.001234,0.057288,0.060131,0.00177,0.003011,0.1812237,0.041717,0.116237,...,5.8e-05,0.317034,0.013864,0.005272,0.021695,0.001646,0.077025,0.338731,4e-05,0.015457
videos,0.019397,0.002936,0.031,0.009492,0.007201,4.3e-05,0.002974,5.028156e-05,0.008597,0.052924,...,0.000627,4.9e-05,0.001204,0.011836,0.037536,0.060387,0.012377,0.007218,0.001456,0.001053
dankmemes,0.007865,0.10153,0.163477,0.23852,0.152549,0.060502,0.136373,0.001442626,0.137343,0.168862,...,0.005698,0.023708,0.351215,0.01892,0.084532,0.019951,0.20498,0.03919,0.020937,0.467817


In [218]:
output.T.nlargest(5, [0])[0]

AskReddit            0.719182
NoStupidQuestions    0.198292
aww                  0.032162
videos               0.019397
dankmemes            0.007865
Name: 0, dtype: float64

In [299]:
output.shape

(5, 6809)

In [224]:
keys = list(output.T.nlargest(5, [0])[0].keys())

In [225]:
values = list(output.T.nlargest(5, [0])[0].values)

In [231]:
reddits = {}
for k,v in zip(keys, values):
    reddits[k] = v


In [232]:
reddits

{'AskReddit': 0.7191816029206933,
 'NoStupidQuestions': 0.1982921569224143,
 'aww': 0.03216232376356752,
 'videos': 0.019397442005865632,
 'dankmemes': 0.007864788112708283}

In [241]:
print(list(reddits.keys())[0])
print(reddits[list(reddits.keys())[0]])
print(list(reddits.keys())[1])
print(reddits[list(reddits.keys())[1]])

AskReddit
0.7191816029206933
NoStupidQuestions
0.1982921569224143


In [111]:
print(y_pred[:3])

['AskReddit' 'freefolk' 'freefolk']


In [105]:
import pickle

# save doc2vec model
model_dbow.save("doc2vec_model.pkl")

# save pickle model
pickle.dump(logreg, open("logreg.pkl", 'wb'))

In [106]:
http://posthere.us-east-2.elasticbeanstalk.com/

In [34]:
import requests, json
url = "http://posthere.us-east-2.elasticbeanstalk.com/api"
data = {'title': 'We have liftoff', 'body': 'Mac successfully deployed the API', 'image': 'no image bruh'}

r = requests.post(url, data=json.dumps(data))

In [35]:
r.json()

{'top_five': [['dankmemes', 0.3352851840055816],
  ['memes', 0.26466100597315584],
  ['teenagers', 0.2511384937251988],
  ['funny', 0.04544956162290755],
  ['freefolk', 0.02041078781064245]]}

In [30]:
import requests, json
url = "http://buildweekredditpredict-env.zfm3nfznwp.us-east-1.elasticbeanstalk.com"
data = {'title': 'We have liftoff', 'body': 'Mac successfully deployed the API', 'image': 'no image bruh'}

r = requests.post(url, data=json.dumps(data))

In [31]:
r.json()

{'top_five': [['dankmemes', 0.33886038800121293],
  ['teenagers', 0.25876136585078297],
  ['memes', 0.24985797538726495],
  ['funny', 0.04479723964769656],
  ['gaming', 0.020337102222111735]]}

In [52]:
# http://buildweekredditpredict-env.zfm3nfznwp.us-east-1.elasticbeanstalk.com
# #http://deploy-env.pq57gi7hm8.us-east-1.elasticbeanstalk.com/
    
url = "http://deploy-env.pq57gi7hm8.us-east-1.elasticbeanstalk.com/"
data = {'int1': 1333, 'int2': 2222}

r = requests.post(url, data=json.dumps(data))

In [53]:
r.json()

JSONDecodeError: Expecting value: line 1 column 1 (char 0)

In [11]:
r.json()['top_five']

[['dankmemes', 0.3869077753350462],
 ['teenagers', 0.23941194829905826],
 ['memes', 0.22666111663762145],
 ['funny', 0.037268128860909806],
 ['aww', 0.03133052814250931]]

In [4]:
r.json()['top_five']

[['dankmemes', 0.2688623753960543],
 ['memes', 0.21896358089555118],
 ['me_irl', 0.1388451406380714],
 ['funny', 0.09959079999149276],
 ['teenagers', 0.09541713668949187]]

In [9]:
r.json()['top_five']

[['dankmemes', 0.3676956062084892],
 ['memes', 0.24361949634908672],
 ['teenagers', 0.16162106985196262],
 ['funny', 0.07844333309922076],
 ['freefolk', 0.03391817188333424]]

In [10]:
import requests, json
url = "http://127.0.0.1:5000/"
data = {'int1': 144, 'int2': 444}

r = requests.post(url, data=json.dumps(data))

In [11]:
r.json()

[['RESULT', 500], ['report', 600], ['show_inp', 700]]