In [1]:
import os
import pandas as pd
import numpy as np
from tqdm import tqdm
tqdm.pandas(desc="progress-bar")
from sklearn import utils
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from gensim.models import Doc2Vec
from gensim.models.doc2vec import TaggedDocument
from gensim.test.test_doc2vec import ConcatenatedDoc2Vec
import nltk
from nltk.corpus import stopwords
import multiprocessing
import re

In [2]:
# Directory that holds the raw tab-separated input files.
BASE_DIR = "input/"
# Column names applied to every input file: the review text and its label.
columns = ['Text', 'Class']
# Empty placeholder frame; `base` is rebound to the concatenated data in the
# loading cell.
base = pd.DataFrame(columns=columns)

In [3]:
# Read every tab-separated file under BASE_DIR into one DataFrame.
# The directory listing is sorted because os.listdir() returns entries in
# arbitrary order; a stable row order keeps the seeded train/test split
# reproducible across machines and runs.
dfs = []
for file in sorted(os.listdir(BASE_DIR)):
    path = os.path.join(BASE_DIR, file)
    # Skip sub-directories (e.g. notebook checkpoint folders); read_csv would
    # fail on them.
    if not os.path.isfile(path):
        continue
    # Files have no header row, so the column names are supplied explicitly.
    df = pd.read_csv(path, sep="\t", header=None, names=columns)
    dfs.append(df)
# ignore_index=True renumbers the rows 0..n-1 across all files.
base = pd.concat(dfs, ignore_index=True)

In [4]:
def clean_text(text):
    """Return the lowercase words of ``text`` made only of ASCII letters.

    Every character outside ``a-z`` / ``A-Z`` is treated as a separator, so
    digits and punctuation never appear in the returned list.
    """
    non_letters = re.compile("[^a-zA-Z]")
    # Blank out everything that is not an ASCII letter, then split on the
    # resulting whitespace.
    spaced = non_letters.sub(" ", text)
    lowered = spaced.lower()
    return lowered.split()

In [5]:
# Hold out 20% of the rows as the test set; random_state=0 fixes the shuffle
# for a given row order of `base`.
train, test = train_test_split(base, test_size=0.2, random_state=0)

In [6]:
def tokenize_text(text):
    """Tokenize ``text`` into lowercase NLTK word tokens.

    The text is cut into sentences with ``nltk.sent_tokenize`` and each
    sentence is passed to ``nltk.word_tokenize``. Tokens shorter than two
    characters are dropped; the remaining tokens are lowercased.

    Returns a flat list of tokens in document order.
    """
    return [
        word.lower()
        for sent in nltk.sent_tokenize(text)
        for word in nltk.word_tokenize(sent)
        if len(word) >= 2
    ]

In [7]:
def _row_to_tagged(row):
    """Wrap one DataFrame row as a TaggedDocument tagged with its class label."""
    return TaggedDocument(words=tokenize_text(row['Text']), tags=[row.Class])


# axis=1 applies the helper row by row, giving one TaggedDocument per review.
train_tagged = train.apply(_row_to_tagged, axis=1)
test_tagged = test.apply(_row_to_tagged, axis=1)

In [8]:
# CPU count reported by the OS; passed as `workers` when building Doc2Vec.
cores = multiprocessing.cpu_count()

In [9]:
# Distributed Bag of Words model (dm=0) producing 100-dimensional vectors.
# `vector_size` replaces the deprecated `size` keyword and matches the
# spelling already used when the DM model is built later in the notebook.
# alpha and min_alpha start equal because the training cell lowers the
# learning rate by hand after every pass.
model_dbow = Doc2Vec(dm=0, vector_size=100, negative=5, min_count=2, workers=cores, alpha=0.065, min_alpha=0.065)
# Build the vocabulary from the training documents only (tqdm shows progress).
model_dbow.build_vocab([x for x in tqdm(train_tagged.values)])

100%|██████████| 2198/2198 [00:00<00:00, 1072110.73it/s]


In [10]:
%%time
# 30 manual passes over the training documents, reshuffled before each pass.
# The learning rate is stepped down by hand: 0.002 after every pass, with
# min_alpha reset to the same value (0.065 at the start, about 0.005 after the
# last pass).
n_train_docs = len(train_tagged.values)
for epoch in range(30):
    shuffled_docs = utils.shuffle(list(tqdm(train_tagged.values)))
    model_dbow.train(shuffled_docs, total_examples=n_train_docs, epochs=1)
    model_dbow.alpha -= 0.002
    model_dbow.min_alpha = model_dbow.alpha

100%|██████████| 2198/2198 [00:00<00:00, 451694.28it/s]
100%|██████████| 2198/2198 [00:00<00:00, 1177705.70it/s]
100%|██████████| 2198/2198 [00:00<00:00, 1374341.11it/s]
100%|██████████| 2198/2198 [00:00<00:00, 921171.08it/s]
100%|██████████| 2198/2198 [00:00<00:00, 1355149.23it/s]
100%|██████████| 2198/2198 [00:00<00:00, 1392820.70it/s]
100%|██████████| 2198/2198 [00:00<00:00, 1460564.04it/s]
100%|██████████| 2198/2198 [00:00<00:00, 1277057.79it/s]
100%|██████████| 2198/2198 [00:00<00:00, 1549946.23it/s]
100%|██████████| 2198/2198 [00:00<00:00, 1480025.72it/s]
100%|██████████| 2198/2198 [00:00<00:00, 1269670.87it/s]
100%|██████████| 2198/2198 [00:00<00:00, 1394717.12it/s]
100%|██████████| 2198/2198 [00:00<00:00, 1284531.17it/s]
100%|██████████| 2198/2198 [00:00<00:00, 1093085.15it/s]
100%|██████████| 2198/2198 [00:00<00:00, 1313072.24it/s]
100%|██████████| 2198/2198 [00:00<00:00, 1451823.65it/s]
100%|██████████| 2198/2198 [00:00<00:00, 1191249.54it/s]
100%|██████████| 2198/2198 [00:00

CPU times: user 3.92 s, sys: 658 ms, total: 4.58 s
Wall time: 3.61 s


In [11]:
# Spot-check the DBOW model's word vectors: nearest neighbours of 'love'.
# NOTE(review): this model is built with dm=0 and without the dbow_words flag;
# in that mode gensim presumably leaves word vectors untrained, which would
# explain the low, unrelated similarity scores shown below — confirm.
model_dbow.wv.most_similar('love')

  if np.issubdtype(vec.dtype, np.int):


[('opening', 0.33626219630241394),
 ('big', 0.2923218905925751),
 ('earpiece', 0.29195964336395264),
 ('sashimi', 0.28428804874420166),
 ('nyc', 0.2626972794532776),
 ('ipod', 0.26215213537216187),
 ('focus', 0.26033657789230347),
 ('as', 0.2594040036201477),
 ('dessert', 0.25527170300483704),
 ('choice', 0.2537538707256317)]

In [12]:
# Same spot-check for 'hate' on the DBOW model's word vectors.
# NOTE(review): likely subject to the same caveat as any word-vector probe on
# a dm=0 model built without dbow_words — confirm.
model_dbow.wv.most_similar('hate')

[('possibly', 0.379730224609375),
 ('wait', 0.29852789640426636),
 ('charged', 0.2856979966163635),
 ('problem', 0.2828962802886963),
 ('40', 0.28117772936820984),
 ('including', 0.2790653705596924),
 ('james', 0.2694108486175537),
 ('children', 0.2665039598941803),
 ('seeing', 0.2647099494934082),
 ('sorry', 0.2608293294906616)]

In [13]:
def vec_for_learning(model, tagged_docs):
    """Turn tagged documents into classifier labels and feature vectors.

    Parameters
    ----------
    model
        Object exposing ``infer_vector(words, steps=...)``; called once per
        document with ``steps=20``.
    tagged_docs
        Object whose ``.values`` attribute iterates over documents that have
        ``words`` and ``tags`` attributes (e.g. a pandas Series of
        TaggedDocument).

    Returns
    -------
    (targets, regressors)
        Two tuples of equal length: the first tag of every document and the
        vector inferred for it, in input order. Both tuples are empty when
        there are no documents (the previous ``zip(*[...])`` form raised
        ``ValueError`` in that case).
    """
    targets = []
    regressors = []
    for doc in tagged_docs.values:
        # Only the first tag is used as the class label.
        targets.append(doc.tags[0])
        regressors.append(model.infer_vector(doc.words, steps=20))
    return tuple(targets), tuple(regressors)

In [14]:
# Labels (first tag of each document) and inferred DBOW vectors for both splits.
y_train, X_train = vec_for_learning(model_dbow, train_tagged)
y_test, X_test = vec_for_learning(model_dbow, test_tagged)

In [None]:
# 5-fold cross-validated search over candidate values of LogisticRegression's
# C parameter, fitted on the current training features.
param_grid = {'C': [0.01, 0.1, 1, 10, 100]}
grid = GridSearchCV(estimator=LogisticRegression(), param_grid=param_grid, cv=5)
grid.fit(X_train, y_train)

In [16]:
# Show the estimator selected by the grid search on the DBOW features.
grid.best_estimator_

LogisticRegression(C=100, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)

In [17]:
# Predict test labels with the fitted grid-search object.
y_pred = grid.predict(X_test)

In [18]:
# Test-set accuracy for the DBOW features.
accuracy_score(y_test, y_pred)

0.78

In [19]:
# Distributed Memory model (dm=1) with dm_mean=1, 300-dimensional vectors and
# a context window of 10.
# workers=cores replaces the hard-coded 5 so the thread count follows the
# machine, consistent with how the DBOW model is configured.
# alpha and min_alpha start equal because the training cell lowers the
# learning rate by hand after every pass.
model_dmm = Doc2Vec(dm=1, dm_mean=1, vector_size=300, window=10, negative=5, min_count=1, workers=cores, alpha=0.065, min_alpha=0.065)
# Build the vocabulary from the training documents only (tqdm shows progress).
model_dmm.build_vocab([x for x in tqdm(train_tagged.values)])

100%|██████████| 2198/2198 [00:00<00:00, 746605.13it/s]


In [20]:
%%time
for epoch in range(30):
    model_dmm.train(utils.shuffle([x for x in tqdm(train_tagged.values)]), total_examples=len(train_tagged.values), epochs=1)
    model_dmm.alpha -= 0.002
    model_dmm.min_alpha = model_dmm.alpha

100%|██████████| 2198/2198 [00:00<00:00, 570487.64it/s]
100%|██████████| 2198/2198 [00:00<00:00, 656723.19it/s]
100%|██████████| 2198/2198 [00:00<00:00, 1234973.90it/s]
100%|██████████| 2198/2198 [00:00<00:00, 906141.16it/s]
100%|██████████| 2198/2198 [00:00<00:00, 1192019.68it/s]
100%|██████████| 2198/2198 [00:00<00:00, 698415.17it/s]
100%|██████████| 2198/2198 [00:00<00:00, 1530391.80it/s]
100%|██████████| 2198/2198 [00:00<00:00, 1079012.19it/s]
100%|██████████| 2198/2198 [00:00<00:00, 950322.67it/s]
100%|██████████| 2198/2198 [00:00<00:00, 1102892.71it/s]
100%|██████████| 2198/2198 [00:00<00:00, 1378657.12it/s]
100%|██████████| 2198/2198 [00:00<00:00, 1492243.48it/s]
100%|██████████| 2198/2198 [00:00<00:00, 1179815.74it/s]
100%|██████████| 2198/2198 [00:00<00:00, 1353358.81it/s]
100%|██████████| 2198/2198 [00:00<00:00, 1340177.38it/s]
100%|██████████| 2198/2198 [00:00<00:00, 1476233.82it/s]
100%|██████████| 2198/2198 [00:00<00:00, 1414620.25it/s]
100%|██████████| 2198/2198 [00:00<00

CPU times: user 7.71 s, sys: 1.52 s, total: 9.23 s
Wall time: 6 s


In [21]:
# Spot-check the DM model's word vectors: nearest neighbours of 'love'.
model_dmm.wv.most_similar('love')

  if np.issubdtype(vec.dtype, np.int):


[('rocks', 0.7492744326591492),
 ('personally', 0.746706485748291),
 ('falafels', 0.6939298510551453),
 ('baklava', 0.6934903860092163),
 ('comforting', 0.6809792518615723),
 ('drooling', 0.6801717877388),
 ('general', 0.6790015697479248),
 ('baba', 0.674885630607605),
 ('aye', 0.673746645450592),
 ('detailing', 0.6674149036407471)]

In [22]:
# Same spot-check for 'hate' on the DM model's word vectors.
model_dmm.wv.most_similar('hate')

[('those', 0.8215619921684265),
 ('movies', 0.8087788820266724),
 ('austen', 0.8035611510276794),
 ('scream', 0.8005440831184387),
 ('zombie', 0.7980297207832336),
 ('supernatural', 0.7947716116905212),
 ('extant', 0.7735164165496826),
 ('visual', 0.7508927583694458),
 ('skip', 0.7433120608329773),
 ('wonderfully', 0.7410240173339844)]

In [23]:
# Rebuild labels and features with the DM model; this overwrites the DBOW
# features held in y_train / X_train / y_test / X_test.
y_train, X_train = vec_for_learning(model_dmm, train_tagged)
y_test, X_test = vec_for_learning(model_dmm, test_tagged)

In [None]:
# Fit the existing `grid` object (created in the first grid-search cell) again,
# this time on the DM features.
grid.fit(X_train, y_train)

In [25]:
# Show the estimator selected by the grid search on the DM features.
grid.best_estimator_

LogisticRegression(C=100, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)

In [26]:
# Predict test labels from the DM features.
y_pred = grid.predict(X_test)

In [27]:
# Test-set accuracy for the DM features.
accuracy_score(y_test, y_pred)

0.7272727272727273

In [28]:
# Release training-only state from both models before combining them.
# Per the keyword names, doc-tag vectors and whatever infer_vector needs are
# kept — TODO confirm the exact effect against the installed gensim version.
model_dbow.delete_temporary_training_data(keep_doctags_vectors=True, keep_inference=True)
model_dmm.delete_temporary_training_data(keep_doctags_vectors=True, keep_inference=True)

In [29]:
# Wrap the DBOW and DM models in a single object for combined inference.
# NOTE(review): ConcatenatedDoc2Vec is imported from gensim's test package
# (gensim.test.test_doc2vec); presumably its infer_vector concatenates the
# two models' vectors — confirm it exists in the installed gensim version.
new_model = ConcatenatedDoc2Vec([model_dbow, model_dmm])

In [30]:
def get_vectors(model, tagged_docs):
    """Return ``(targets, regressors)`` for the documents in ``tagged_docs``.

    ``targets`` is a tuple holding the first tag of every document and
    ``regressors`` a tuple holding the vector ``model.infer_vector`` produces
    for its words with ``steps=20``, both in input order. Documents are read
    from ``tagged_docs.values``.
    """
    pairs = []
    for doc in tagged_docs.values:
        label = doc.tags[0]
        vector = model.infer_vector(doc.words, steps=20)
        pairs.append((label, vector))
    # Transpose the (label, vector) pairs into two parallel tuples.
    targets, regressors = zip(*pairs)
    return targets, regressors

In [31]:
# Rebuild labels and features with the combined model; this overwrites the DM
# features held in y_train / X_train / y_test / X_test.
y_train, X_train = get_vectors(new_model, train_tagged)
y_test, X_test = get_vectors(new_model, test_tagged)

In [None]:
# Fit the existing `grid` object (created in the first grid-search cell) again,
# this time on the combined-model features.
grid.fit(X_train, y_train)

In [33]:
# Show the estimator selected by the grid search on the combined features.
grid.best_estimator_

LogisticRegression(C=100, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)

In [34]:
# Predict test labels from the combined-model features.
y_pred = grid.predict(X_test)

In [35]:
# Test-set accuracy for the combined-model features.
accuracy_score(y_test, y_pred)

0.7709090909090909