In [1]:
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from gensim.models import Doc2Vec
import nltk
from nltk.tokenize import word_tokenize, RegexpTokenizer
from nltk.corpus import stopwords
from sklearn import naive_bayes
from datasets import load_dataset, Dataset
import string
import numpy as np

In [2]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import log_loss, confusion_matrix, accuracy_score,precision_score,recall_score,f1_score

In [3]:
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\kapsu\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [4]:
imdb = load_dataset('imdb',split='train')
imdb_test = load_dataset('imdb',split='test')
imdb_test = imdb_test.shard(5,3)

Downloading builder script:   0%|          | 0.00/4.31k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/2.17k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/7.59k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/84.1M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating unsupervised split:   0%|          | 0/50000 [00:00<?, ? examples/s]

In [5]:
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [6]:
docs = []

In [7]:
def remove_stop_words_and_puncts(words):
    f1 = [w.lower() for w in words if w.lower() not in stop_words]
    f2 = [w for w in f1 if w not in string.punctuation]
    return f2

In [8]:
def process_text(text):
    words = word_tokenize(text['text'])
    clean_words = remove_stop_words_and_puncts(words)
    return clean_words

In [9]:
for example in imdb:
    processed_text = process_text(example)
    td = TaggedDocument(processed_text,[example['label']])
    docs.append(td)

In [10]:
print(len(docs))

25000


In [11]:
print(docs[5])

TaggedDocument<['would', 'put', 'top', 'list', 'films', 'category', 'unwatchable', 'trash', 'films', 'bad', 'worst', 'kind', 'ones', 'unwatchable', 'suppose', 'like', 'supposed', 'good', 'sex', 'sequences', 'shocking', 'day', 'could', "n't", 'even', 'arouse', 'rabbit', 'called', 'controversial', 'politics', 'strictly', 'high', 'school', 'sophomore', 'amateur', 'night', 'marxism', 'film', 'self-consciously', 'arty', 'worst', 'sense', 'term', 'photography', 'harsh', 'grainy', 'black', 'white', 'scenes', 'focus', 'taken', 'wrong', 'angle', 'even', 'sound', 'bad', 'people', 'call', 'art', 'br', 'br'], [0]>


In [12]:
model = Doc2Vec(vector_size=100, min_count=2, epochs=50)

In [13]:
model.build_vocab(docs)

In [14]:
model.corpus_count

25000

In [15]:
model.epochs

50

In [13]:
model.train(docs, total_examples=model.corpus_count, epochs=model.epochs)

In [14]:
model.save("Models/d2v")

In [15]:
model.infer_vector(docs[7].words)

array([-0.67601943,  0.93965364,  0.97789127,  0.8170497 , -0.10962925,
       -0.99066156, -1.3751618 , -0.12664947,  0.46178648, -0.33643317,
        0.54485947,  2.0527635 , -2.1078134 ,  0.4092491 , -1.0821798 ,
       -0.70922816,  0.23238902,  0.46199992, -1.4246432 ,  0.5670182 ,
       -3.694571  , -0.6030176 ,  1.6715662 ,  1.3273951 ,  3.4087079 ,
       -2.1526668 , -1.4068826 , -2.026342  , -1.3571855 , -0.17524837,
        0.20602168, -0.06980582,  0.4116473 , -0.93217504, -1.3114256 ,
        1.177796  , -0.917384  , -0.72466713, -0.31843653,  0.74867594,
       -0.9680626 ,  1.6507361 , -0.17520931, -2.4065385 , -0.6273512 ,
       -0.62019545, -0.50292546, -0.5915153 ,  1.3275881 , -0.82715756,
       -0.8632329 ,  0.3381379 , -0.02008809, -1.189614  , -1.697198  ,
        2.5297508 ,  0.25109625, -1.3971705 , -1.9840455 , -0.32655463,
       -0.9176103 ,  2.5087218 ,  1.4125646 , -1.0020009 , -1.3469801 ,
       -0.9717302 ,  1.1538336 ,  0.13108045, -0.44742957, -0.76

In [16]:
training_X = []
training_y = []

In [17]:
for doc in docs:
    training_X.append(doc.words)
    training_y.append(doc.tags[0])

In [18]:
x_arr = np.zeros((len(docs),100))

In [19]:
for i in range(len(training_X)):
    x_arr[i] = model.infer_vector(training_X[i])

In [23]:
lgr = LogisticRegression()
lgr.fit(x_arr,training_y)

In [26]:
lgr.predict(x_arr[10].reshape(1,-1))

array([0])

In [27]:
x_test = np.zeros((imdb_test.shape[0],100))
y_test = np.zeros(imdb_test.shape[0])

In [31]:
idx = 0
for t in imdb_test:
    processed_text = process_text(t)
    vct = model.infer_vector(processed_text)
    x_test[idx] = vct
    y_test[idx] = t['label']
    idx += 1

In [33]:
print(x_test[:3])

[[-0.79555535  1.58359873  1.25062001  0.43698427  0.53575784 -0.9378283
   0.37233049 -0.41609702  1.19482076 -1.35438955  1.35854256  0.40980822
  -0.89469182  0.70664155  0.88739145 -0.22871792  0.48716381 -0.41121709
  -2.36034632  2.64456868 -0.26536545 -1.0380981   0.11961165  3.44253945
   3.01575994 -1.03065574 -0.95411897 -0.00673487  0.28041366 -0.63625789
   1.14203644  0.55029058 -0.66773772 -0.45196268 -0.15259083 -0.74103934
   0.61703199 -1.15230823 -1.12608504  2.04952574 -0.55084699 -0.47903171
  -1.29100502 -2.28025889 -1.20470333  1.50976396  0.65362912  0.86997867
   0.44186091  0.09131063  1.12809777 -1.16487277  2.37690163 -0.24818978
  -0.25535709  0.68692905 -0.03259113 -0.28290048 -0.90286255 -0.03442962
   2.55789709 -0.20829272 -0.34281975 -0.77076715 -1.72009063 -1.87578499
   0.46475857  1.22199512 -0.30730927  0.37041128  0.41026881  3.50078988
  -1.71883607  0.50733852 -0.88497061  1.87057912 -1.87126911 -1.37723303
  -2.46489358  1.69312561  0.8017717  -

In [40]:
predicted_probs = lgr.predict_proba(x_test)

In [41]:
ll = log_loss(y_test,predicted_probs)

In [42]:
ll

0.6587455265331927

In [43]:
predicted = lgr.predict(x_test)

In [47]:
confusion_matrix(y_test,predicted)

array([[2113,  387],
       [ 520, 1980]], dtype=int64)

In [50]:
ps = precision_score(y_test,predicted)

In [51]:
ps

0.8365019011406845

In [52]:
rc = recall_score(y_test,predicted)

In [53]:
rc

0.792

In [54]:
ac = accuracy_score(y_test,predicted)

In [55]:
ac

0.8186

In [58]:
f1 = f1_score(y_test,predicted)

In [59]:
f1

0.8136429011711528