In [55]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
import torch
import warnings
import multiprocessing
from gensim.models import Word2Vec
from sklearn.metrics import accuracy_score, f1_score
from tqdm import tqdm
import pickle

warnings.filterwarnings('ignore')

In [11]:
data = pd.read_csv("preprocessed_data.csv")

In [12]:
data.head()

Unnamed: 0,title,text,class
0,interpreting syntactic social element tweet re...,research social medium analysis experiencing r...,0
1,living together mind machine intelligence,paper consider nature machine intelligence cre...,0
2,stochastic local search pattern set mining,local search method quickly find good quality ...,0
3,sparse inverse covariance matrix estimation us...,l regularized gaussian maximum likelihood esti...,0
4,visual translation embedding network visual re...,visual relation person ride bike bike next car...,0


In [15]:
X = data.text
y = data['class']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [29]:
cores = multiprocessing.cpu_count()

model = Word2Vec(min_count=20,
                      window=5,
                      vector_size=300,
                      sample=6e-5,
                      alpha=0.03,
                      min_alpha=0.0007,
                      negative=20,
                      workers=cores-1)

sent = [row.split() for row in X_train]
model.build_vocab(sent, progress_per=10000)
with tqdm(total=model.corpus_count, desc="Training Word2Vec") as pbar:
    model.train(sent, total_examples=model.corpus_count, epochs=30, report_delay=1)
    pbar.update(model.corpus_count)

Training Word2Vec: 100%|████████████████████████████████████████████████████████| 32800/32800 [03:13<00:00, 169.09it/s]


In [49]:
def get_mean_w2v_vector(sentence):
    Sum = 0
    Count = 0

    try:
        words = str(sentence).split()
    except TypeError:
        words = []

    for w in words:
        if w in model.wv:
            Sum += model.wv[w]
            Count += 1

    if Count == 0:
        return np.zeros(model.vector_size)

    return Sum / Count


def prepare_data(X, y, vectors_dim = 300):
    HIDDEN = vectors_dim
    X_vectors = X.map(get_mean_w2v_vector)
    X = pd.DataFrame(X_vectors.tolist(), index=X.index)
    return X, y

In [50]:
X_train_vec, y_train_vec = prepare_data(X_train, y_train)
X_test_vec, y_test_vec = prepare_data(X_test, y_test)  

In [51]:
logreg = LogisticRegression()
logreg.fit(X_train_vec, y_train_vec)

In [52]:
predictions = logreg.predict(X_test_vec)

In [53]:
accuracy = accuracy_score(y_test_vec, predictions)
f1 = f1_score(y_test_vec, predictions, average='weighted')

print("Accuracy: ", accuracy)
print("F1 Score: ", f1)

Accuracy:  0.853780487804878
F1 Score:  0.8290199167209806


In [57]:
pickle.dump(model, open('./models/word2vecmodel.pkl', 'wb'))

In [58]:
pickle.dump(logreg, open('./models/logreg.pkl', 'wb'))