In [167]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import gensim
import torch
import torch.nn as nn

from nltk.tokenize import RegexpTokenizer
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader, Dataset


In [189]:
df = pd.read_csv('train.csv', names=['label', 'review'])
df = df.iloc[:2000, :]
df['label'] -= 1


In [190]:
tokenizer = RegexpTokenizer(r'[a-z]+')

stopwords = ['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've",
             "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's",
             'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs',
             'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am',
             'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does',
             'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of',
             'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before',
             'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under',
             'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any',
             'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'only', 'own', 'same', 'so', 'than',
             'too', 'very', 's', 't', 'can', 'will', 'just', 'should', "should've", 'now', 'd', 'll', 'm', 'o',
             're', 've', 'y', 'ma' 'st', 'nd', 'rd', 'th', "you'll", 'dr', 'mr', 'mrs']

def tokenize(sent):
    txt = sent.lower()
    tokens = tokenizer.tokenize(txt)
    tokens = [word for word in tokens if not word in stopwords]
    return tokens


In [191]:
df['tokens'] = df['review'].apply(tokenize)

In [192]:
df.head()

Unnamed: 0,label,review,tokens
0,0,"Unfortunately, the frustration of being Dr. Go...","[unfortunately, frustration, goldberg, patient..."
1,1,Been going to Dr. Goldberg for over 10 years. ...,"[going, goldberg, years, think, one, st, patie..."
2,0,I don't know what Dr. Goldberg was like before...,"[don, know, goldberg, like, moving, arizona, l..."
3,0,I'm writing this review to give you a heads up...,"[writing, review, give, heads, see, doctor, of..."
4,1,All the food is great here. But the best thing...,"[food, great, best, thing, wings, wings, simpl..."


In [193]:
vec_size = 100
model = gensim.models.Word2Vec(
    vector_size=vec_size, window=5, min_count=3, workers=8)
model.build_vocab(df['tokens'])
model.train(df['tokens'], total_examples=model.corpus_count, epochs=10)


(12612601, 13774210)

In [194]:
# average w2v for a sentence

def sent_vec(sent):
    wv_vec = np.zeros(vec_size)
    count = 0

    for word in sent:
        if word in model.wv:
            count += 1
            wv_vec += model.wv[word]
    
    if count > 0:
        wv_vec /= count
    return wv_vec

In [195]:
df['vector'] = df['tokens'].apply(sent_vec)

In [196]:
df.head()

Unnamed: 0,label,review,tokens,vector
0,0,"Unfortunately, the frustration of being Dr. Go...","[unfortunately, frustration, goldberg, patient...","[0.46274383323123824, 0.4110868980105107, 0.34..."
1,1,Been going to Dr. Goldberg for over 10 years. ...,"[going, goldberg, years, think, one, st, patie...","[0.11548505241380018, 0.25598243892411976, -0...."
2,0,I don't know what Dr. Goldberg was like before...,"[don, know, goldberg, like, moving, arizona, l...","[0.335316833092685, 0.08530903221381937, 0.457..."
3,0,I'm writing this review to give you a heads up...,"[writing, review, give, heads, see, doctor, of...","[0.6147377874141742, 0.29813957111621947, 0.33..."
4,1,All the food is great here. But the best thing...,"[food, great, best, thing, wings, wings, simpl...","[-0.2751818628431382, -0.16247301599518818, -0..."


In [197]:
class SentimentDataseet(Dataset):
    def __init__(self, vector, label):
        self.vector = vector
        self.label = label

    def __len__(self):
        return len(self.vector)
    
    def __getitem__(self, idx):
        return self.vector[idx], self.label[idx]

In [198]:
class SimpleModel(nn.Module):
    def __init__(self, hidden_units):
        super(SimpleModel, self).__init__()
        self.hidden = nn.Linear(vec_size, hidden_units)
        self.out = nn.Linear(hidden_units, 1)
        self.relu = nn.ReLU()
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        x = self.hidden(x)
        x = self.relu(x)
        x = self.out(x)
        x = self.sigmoid(x)
        return x

In [199]:
x = torch.tensor(df['vector'], dtype=torch.float32)
y = torch.tensor(df['label'], dtype=torch.float32).unsqueeze(1)

In [200]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)
x_val, x_test, y_val, y_test = train_test_split(x_test, y_test, test_size=0.5, random_state=42)

In [201]:
train_data = SentimentDataseet(x_train, y_train)
val_data = SentimentDataseet(x_val, y_val)
test_data = SentimentDataseet(x_test, y_test)

In [202]:
train_loader = DataLoader(train_data, batch_size=16, shuffle=True)
val_loader = DataLoader(val_data, batch_size=16, shuffle=False)
test_loader = DataLoader(test_data, batch_size=16, shuffle=False)


In [203]:
def binary_acc(y_pred, y_test):
    y_h = torch.round(y_pred)
    crct_results = (y_h == y_test).sum()

    # print(crct_results)

    return crct_results / y_test.size(0)


In [204]:
model = SimpleModel(64)
criterion = nn.BCELoss()
optimizer = torch.optim.Adam(params=model.parameters(), lr=1e-3)

In [None]:
EPOCHS = 100

for epoch in range(EPOCHS):
    print(f'Epoch: {epoch+1}')
    # train
    model.train()
    train_loss = 0
    train_acc = 0

    for vecs, labels in train_loader:
        # forward
        outputs = model(vecs)
        loss = criterion(outputs, labels)
        acc = binary_acc(outputs, labels)

        # backward
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        train_loss += loss.item()
        train_acc += acc.item()

    train_loss /= len(train_data)
    train_acc /= len(train_data)

    # valid
    model.eval()
    val_loss = 0
    val_acc = 0

    with torch.no_grad():
        for vecs, labels in val_loader:
            outputs = model(vecs)
            loss = criterion(outputs, labels)
            acc = binary_acc(outputs, labels)

            val_loss += loss.item()
            val_acc += acc.item()

    val_loss /= len(val_data)
    val_acc /= len(train_data)

    print(
        f'Train loss: {train_loss}, Train acc: {train_acc}, Val loss: {val_loss}, Val acc: {val_acc}')


In [None]:
####################### 

In [206]:
from sklearn.linear_model import LogisticRegression
clf = LogisticRegression()

In [207]:
clf.fit(x_train, y_train)

  y = column_or_1d(y, warn=True)


LogisticRegression()

In [208]:
from sklearn import metrics
predicted = clf.predict(x_test)
print("Logistic Regression Accuracy:",metrics.accuracy_score(y_test, predicted))

Logistic Regression Accuracy: 0.8725
