In [1]:
import spacy
# Load the large English spaCy pipeline; later cells read its per-token vectors to embed each review.
nlp = spacy.load("en_core_web_lg")
import pandas as pd
import numpy as np
from tqdm import tqdm

In [2]:
from sklearn.model_selection import train_test_split,KFold
from sklearn.metrics import confusion_matrix
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score

In [3]:
# Read the lemmatized reviews and keep only rows that carry actual review text.
train = pd.read_csv("latest_cleaned_data/train_lemmatized_withstop.csv")
# A row is usable when its review is neither missing nor a single blank space.
has_text = train['user_review'].notna() & (train['user_review'] != ' ')
train = train[has_text]
# Note: the shape reported here is the shape *after* the filtering above.
print("ORIGINAL SHAPE", train.shape)

ORIGINAL SHAPE (17471, 5)


In [4]:
# Build one embedding row per review by summing the vectors of its tokens.
user_review = train.user_review.values
# 300 columns: assumes the loaded spaCy model yields 300-dimensional token vectors -- TODO confirm.
corpus_vec = np.zeros((len(user_review), 300))
# Only token vectors are needed, so the heavier pipeline components are disabled.
docs = nlp.pipe(user_review, disable=['ner', 'tagger', 'parser'])
# Advance the bar once per document so it finishes exactly at the number of
# reviews (the previous 1000-step updates fired at count == 0 and overshot).
with tqdm(total=len(user_review)) as pbar:
    for count, doc in enumerate(docs):
        sent_vec = [token.vector for token in doc if token.has_vector]
        # Reviews with no vectorized token keep their all-zero row.
        if sent_vec:
            corpus_vec[count] = np.sum(sent_vec, axis=0)
        pbar.update(1)

18000it [02:16, 132.19it/s]                           


## Logistic Regression, SVM and XGBoost with Embeddings

In [5]:
# Sanity check: find any review whose embedding row contains a NaN.
nan_mask = np.isnan(corpus_vec)
# D holds one (row, column) pair per NaN entry.
D = np.argwhere(nan_mask)
# Reduce to the distinct row positions and show the corresponding reviews.
idx = list(set(D[:, 0]))
train.iloc[idx, :]

Unnamed: 0,review_id,title,year,user_review,user_suggestion


In [None]:
# 5-fold cross-validation of a logistic-regression baseline on the summed embeddings.
# A fixed random_state makes the shuffled folds reproducible across runs.
kf = KFold(n_splits=5, shuffle=True, random_state=42)
# Labels are loop-invariant, so extract them once.
y_all = train.user_suggestion.values
for train_idx, test_idx in kf.split(corpus_vec):
    X_train, y_train = corpus_vec[train_idx], y_all[train_idx]
    X_test, y_test = corpus_vec[test_idx], y_all[test_idx]
    lr = LogisticRegression(max_iter=1000)
    lr.fit(X_train, y_train)
    y_pred = lr.predict(X_test)
    # One F1 score per held-out fold.
    print(f1_score(y_test, y_pred))

In [None]:
from sklearn.svm import SVC
# 5-fold cross-validation of a linear-kernel SVM on the summed embeddings.
# A fixed random_state makes the shuffled folds reproducible across runs.
kf = KFold(n_splits=5, shuffle=True, random_state=42)
# Labels are loop-invariant, so extract them once.
y_all = train.user_suggestion.values
for train_idx, test_idx in kf.split(corpus_vec):
    X_train, y_train = corpus_vec[train_idx], y_all[train_idx]
    X_test, y_test = corpus_vec[test_idx], y_all[test_idx]
    # `lr` is the generic model variable name this notebook reuses in every model cell.
    lr = SVC(kernel='linear')
    lr.fit(X_train, y_train)
    y_pred = lr.predict(X_test)
    # One F1 score per held-out fold.
    print(f1_score(y_test, y_pred))

In [None]:
from xgboost import XGBClassifier
# 5-fold cross-validation of an XGBoost classifier on the summed embeddings.
# Build the splitter here instead of relying on a `kf` left over from an
# earlier cell; the fixed random_state makes the shuffled folds reproducible.
kf = KFold(n_splits=5, shuffle=True, random_state=42)
# Labels are loop-invariant, so extract them once.
y_all = train.user_suggestion.values
for train_idx, test_idx in kf.split(corpus_vec):
    X_train, y_train = corpus_vec[train_idx], y_all[train_idx]
    X_test, y_test = corpus_vec[test_idx], y_all[test_idx]
    # `lr` is the generic model variable name this notebook reuses in every model cell.
    lr = XGBClassifier()
    lr.fit(X_train, y_train)
    y_pred = lr.predict(X_test)
    # One F1 score per held-out fold.
    print(f1_score(y_test, y_pred))

## NN with Embeddings

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader, TensorDataset
from sklearn.preprocessing import StandardScaler

In [None]:
# Stratified 70/30 hold-out split of the embeddings and their labels.
y_all = train.user_suggestion.values
X_train, X_test, y_train, y_test = train_test_split(
    corpus_vec, y_all, stratify=y_all, test_size=0.3
)
# Fit the scaler on the training split only and reuse it for the test split:
# fitting a second scaler on the test data would scale the two splits with
# different statistics and leak test-set information into the evaluation.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [None]:
# Convert the NumPy splits to tensors: float features, int labels.
X_train = torch.from_numpy(X_train).type(torch.FloatTensor)
X_test = torch.from_numpy(X_test).type(torch.FloatTensor)
y_train = torch.from_numpy(y_train).type(torch.IntTensor)
y_test = torch.from_numpy(y_test).type(torch.IntTensor)
# Shape sanity check, placed last because a notebook cell only displays its
# final expression (on the first line the tuple was silently discarded).
X_train.shape, y_train.shape, X_test.shape, y_test.shape

In [None]:
# Training hyperparameters for the neural-network section.
EPOCHS = 50            # number of passes over the training data
BATCH_SIZE = 64        # samples per mini-batch
LEARNING_RATE = 1e-3   # learning rate handed to the optimizer

In [None]:
# Wrap the tensor splits in datasets and batch them with the configured BATCH_SIZE
# (previously a hard-coded 64 that ignored the hyperparameter cell).
train_dataset = TensorDataset(X_train, y_train)
# Reshuffle the training samples every epoch so mini-batches differ between epochs.
train_dataloader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
test_dataset = TensorDataset(X_test, y_test)
# Evaluation order does not matter, so the test loader stays unshuffled.
test_dataloader = DataLoader(test_dataset, batch_size=BATCH_SIZE)

In [None]:
class BuyorNot(nn.Module):
    """Feed-forward binary classifier over 300-dimensional review embeddings.

    Four Linear -> BatchNorm -> ReLU -> Dropout stages (300 -> 600 -> 600 ->
    300 -> 64) followed by a single-unit output layer squashed by a sigmoid.
    """

    def __init__(self):
        super(BuyorNot,self).__init__()
        
        self.fc1 = nn.Linear(300,600)
        self.bn1 = nn.BatchNorm1d(num_features=600)
        self.fc2 = nn.Linear(600,600)
        self.bn2 = nn.BatchNorm1d(num_features=600)
        self.fc3 = nn.Linear(600,300)
        self.bn3 = nn.BatchNorm1d(num_features=300)
        self.fc4 = nn.Linear(300,64)
        self.bn4 = nn.BatchNorm1d(num_features=64)
        self.out = nn.Linear(64,1)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(p=0.2)
        # Parameter-free, so adding it leaves the state_dict layout unchanged.
        self.sigmoid = nn.Sigmoid()
        
    def forward(self, inputs):
        """Return P(label == 1) for each row of `inputs`.

        Args:
            inputs: float tensor of shape (batch, 300).

        Returns:
            Tensor of shape (batch, 1) with values in [0, 1].
        """
        x = self.relu(self.bn1(self.fc1(inputs)))
        x = self.dropout(x)
        x = self.relu(self.bn2(self.fc2(x)))
        x = self.dropout(x)
        x = self.relu(self.bn3(self.fc3(x)))
        x = self.dropout(x)
        x = self.relu(self.bn4(self.fc4(x)))
        x = self.dropout(x)
        # nn.BCELoss and a 0.5 decision threshold both require probabilities,
        # so squash the raw output-layer logits into [0, 1].
        x = self.sigmoid(self.out(x))
        
        return x

In [None]:
# Prefer the first CUDA GPU when one is visible; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")
print(device)

In [None]:
# Instantiate the network on the selected device (Module.to moves it in place and returns it).
model = BuyorNot().to(device)
print(model)
# Adam over all model parameters, paired with a binary cross-entropy criterion.
optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE)
criterion = nn.BCELoss()

In [None]:
# Pull the first mini-batch from the training loader for ad-hoc inspection.
first_batch = next(iter(train_dataloader))
testing_x, testing_label = first_batch

In [None]:
from sklearn.metrics import f1_score
def f1_score_torch(y_true,y_pred):
    """Compute the F1 score between two label tensors.

    Each tensor is detached, moved to the CPU, converted to a NumPy array and
    cast to int before being handed to `f1_score`.
    """
    def _to_int_array(tensor):
        return tensor.detach().cpu().numpy().astype(int)

    return f1_score(_to_int_array(y_true), _to_int_array(y_pred))

In [None]:
# Train for EPOCHS passes, reporting the mean per-batch loss and F1 after each one.
for e in range(EPOCHS):
    model.train()
    epoch_loss = 0
    f1_score_ = 0
    # Batch variables get their own names so the full X_train / y_train
    # tensors are not overwritten by the last mini-batch.
    for X_batch, y_batch in train_dataloader:
        X_batch, y_batch = X_batch.to(device), y_batch.to(device)
        optimizer.zero_grad()
        y_pred = model(X_batch)
        loss = criterion(y_pred, y_batch.unsqueeze(1).float())
        loss.backward()
        # Apply the gradients; without this call the weights never change.
        optimizer.step()
        # The 0.5 threshold assumes y_pred holds probabilities, which is what
        # the BCELoss criterion expects as input.
        y_pred_labels = (y_pred > 0.5).float()
        f1_score_ += f1_score_torch(y_batch, y_pred_labels.squeeze(1))
        epoch_loss += loss.item()
    # Both metrics are averaged over the number of mini-batches.
    n_batches = len(train_dataloader)
    print("EPOCH {} LOSS {:.5f} F1 {:.5f}".format(e,epoch_loss/n_batches,f1_score_/n_batches))