In [3]:
import pandas as pd
import numpy as np
import gensim.downloader as api
from gensim.models import Word2Vec
from nltk.tokenize import word_tokenize
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.calibration import LinearSVC
from sklearn.linear_model import Perceptron
from sklearn.feature_extraction.text import TfidfVectorizer
import torch
import torch.nn as nn
import torch.optim as optim

In [4]:
DATA_PATH = './data.tsv'
RANDOM_SEED = 42

In [5]:
google_w2v_model = api.load('word2vec-google-news-300')

In [6]:
result = google_w2v_model.most_similar(positive=['king', 'woman'], negative=['man'], topn=1)
print("King - Man + Woman =", result[0])

similarity = google_w2v_model.similarity('excellent', 'outstanding')
print("Similarity between 'excellent' and 'outstanding':", similarity)

result = google_w2v_model.most_similar(positive=['fastest', 'slow'], negative=['fast'], topn=1)
print("Fastest - Fast + Slow =", result[0])

King - Man + Woman = ('queen', 0.7118193507194519)
Similarity between 'excellent' and 'outstanding': 0.5567486
Fastest - Fast + Slow = ('slowest', 0.7025300860404968)


In [7]:
review_data = pd.read_table(DATA_PATH, on_bad_lines='skip', usecols=['star_rating', 'review_body'])
review_data['star_rating'] = pd.to_numeric(review_data['star_rating'], errors='coerce')
review_data.dropna(inplace=True)
review_data['target'] = review_data['star_rating'].apply(lambda x: 1 if x >= 4 else 0)

  review_data = pd.read_table(DATA_PATH, on_bad_lines='skip', usecols=['star_rating', 'review_body'])


In [8]:
positive_reviews = review_data[review_data['target'] == 1].sample(n=50000, random_state=RANDOM_SEED)
negative_reviews = review_data[review_data['target'] == 0].sample(n=50000, random_state=RANDOM_SEED)
final_data = pd.concat([positive_reviews, negative_reviews], axis=0)

In [9]:
final_data['review_body'] = final_data['review_body'].str.lower()
final_data['tokens'] = final_data['review_body'].apply(word_tokenize)

In [10]:
embedding_size = 300
window_size = 13
min_word_count = 9

amazon_w2v_model = Word2Vec(sentences=final_data['tokens'], vector_size=embedding_size, window=window_size, min_count=min_word_count)
amazon_w2v_model.save('amazon_w2v_model.model')

In [27]:
result = amazon_w2v_model.wv.most_similar(positive=['king', 'woman'], negative=['man'], topn=1)
print("King - Man + Woman =", result[0])

similarity = amazon_w2v_model.wv.similarity('excellent', 'outstanding')
print("Similarity between 'excellent' and 'outstanding':", similarity)

result = amazon_w2v_model.wv.most_similar(positive=['fastest', 'slow'], negative=['fast'], topn=1)
print("Fastest - Fast + Slow =", result[0])

King - Man + Woman = ('author', 0.5631979703903198)
Similarity between 'excellent' and 'outstanding': 0.87626386
Fastest - Fast + Slow = ('warm-up', 0.4784909784793854)


### What do you conclude from comparing vectors generated by yourself and the pretrained model?  
The vectors generated by the pretrained model are more accurate and have a higher cosine similarity than the vectors generated by myself. This is because the pretrained model has been trained on a much larger dataset than the dataset I used to train my model.  
### Which of the Word2Vec models seems to encode semantic similarities between words better?  
The google news model performs better in general

In [14]:
def get_average_word2vec(tokens, model):
    vectors = [model[word] for word in tokens if word in model]
    if vectors:
        return np.mean(vectors, axis=0)
    else:
        return np.zeros(model.vector_size)

In [15]:
final_data['average_word2vec'] = final_data['tokens'].apply(lambda x: get_average_word2vec(x, google_w2v_model))

# Split the dataset into a training and testing set
X_w2v = np.vstack(final_data['average_word2vec'].values)
y_w2v = final_data['target'].values
X_train_w2v, X_test_w2v, y_train_w2v, y_test_w2v = train_test_split(X_w2v, y_w2v, test_size=0.2, random_state=RANDOM_SEED)

In [73]:
X_tfidf = final_data['review_body'].values
y_tfidf = final_data['target'].values
X_train_tfidf, X_test_tfidf, y_train_tfidf, y_test_tfidf = train_test_split(X_tfidf, y_tfidf, test_size=0.2, random_state=RANDOM_SEED)
tfidf_vectorizer = TfidfVectorizer()
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train_tfidf)
X_test_tfidf = tfidf_vectorizer.transform(X_test_tfidf)

In [55]:
# Train a Perceptron model
perceptron_model = Perceptron()
perceptron_model.fit(X_train_w2v, y_train_w2v)
y_pred_perceptron = perceptron_model.predict(X_test_w2v)

accuracy_perceptron_word2vec = accuracy_score(y_test_w2v, y_pred_perceptron)

print("Accuracy for Perceptron (Word2Vec):", accuracy_perceptron_word2vec)

perceptron_model = Perceptron()
perceptron_model.fit(X_train_tfidf, y_train_tfidf)
y_pred_perceptron = perceptron_model.predict(X_test_tfidf)

accuracy_perceptron_tfidf = accuracy_score(y_test_tfidf, y_pred_perceptron)

print("Accuracy for Perceptron (TF-IDF):", accuracy_perceptron_tfidf)

Accuracy for Perceptron (Word2Vec): 0.79855
Accuracy for Perceptron (TF-IDF): 0.81835


In [57]:
svm_model = LinearSVC()
svm_model.fit(X_train_w2v, y_train_w2v)
y_pred_svm = svm_model.predict(X_test_w2v)

accuracy_svm_word2vec = accuracy_score(y_test_w2v, y_pred_svm)

print("Accuracy for SVM (Word2Vec):", accuracy_svm_word2vec)

svm_model = LinearSVC()
svm_model.fit(X_train_tfidf, y_train_tfidf)
y_pred_svm = svm_model.predict(X_test_tfidf)

accuracy_svm_tfidf = accuracy_score(y_test_tfidf, y_pred_svm)

print("Accuracy for SVM (TF-IDF):", accuracy_svm_tfidf)



Accuracy for SVM (Word2Vec): 0.8184




Accuracy for SVM (TF-IDF): 0.86485


### What do you conclude from comparing performances for the models trained using the two different feature types (TF-IDF and your trained 2 Word2Vec features)?  


In [None]:
X_train_w2v_tensor = torch.tensor(X_train_w2v, dtype=torch.float32)
y_train_w2v_tensor = torch.tensor(y_train_w2v, dtype=torch.long)

X_test_w2v_tensor = torch.tensor(X_test_w2v, dtype=torch.float32)
y_test_w2v_tensor = torch.tensor(y_test_w2v, dtype=torch.long)

In [175]:
class MLPModel(nn.Module):
    def __init__(self, input_layer_size):
        super(MLPModel, self).__init__()
        self.relu = nn.ReLU()
        self.softmax = nn.Softmax(dim=1)
        self.fc1 = nn.Linear(input_layer_size, 50)
        self.fc2 = nn.Linear(50, 5)
        self.fc3 = nn.Linear(5, 2) 

    def forward(self, x):
        x = self.fc1(x)
        x = self.relu(x)
        x = self.fc2(x)
        x = self.relu(x)
        x = self.fc3(x)
        x = self.softmax(x)
        return x

model = MLPModel(300)
criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(model.parameters(), lr=0.1)

num_epochs = 1000
for epoch in range(num_epochs):
    optimizer.zero_grad()
    outputs = model(X_train_w2v_tensor)
    loss = criterion(outputs, y_train_w2v_tensor)
    loss.backward()
    optimizer.step()

with torch.no_grad():
    model.eval()
    outputs = model(X_test_w2v_tensor)
    _, predicted = torch.max(outputs, 1)
    accuracy = accuracy_score(y_test_w2v_tensor, predicted)
    print(f"Accuracy using all features in a Feedforward network: {accuracy}")

Accuracy using all features in a Feedforward network: 0.742


In [12]:
def get_first_ten_word2vec(tokens, model):
    vectors = []
    for i in range(10):
        if i < len(tokens) and tokens[i] in model:
            vectors.append(model[tokens[i]])
        else:
            vectors.append(np.zeros(model.vector_size))
    return np.array(vectors)

def get_first_ten_flattened_word2vec(tokens, model):
    return get_first_ten_word2vec(tokens, model).flatten()

In [20]:
final_data['first_ten_words_word2vec'] = final_data['tokens'].apply(lambda x: get_first_ten_flattened_word2vec(x, google_w2v_model))
X_w2v_3000 = np.vstack(final_data['first_ten_words_word2vec'].values)
X_train_w2v_3000, X_test_w2v_3000, y_train_w2v_3000, y_test_w2v_3000 = train_test_split(X_w2v_3000, y_w2v, test_size=0.2, random_state=RANDOM_SEED)

X_train_w2v_3000_tensor = torch.tensor(X_train_w2v_3000, dtype=torch.float32)
y_train_w2v_3000_tensor = torch.tensor(y_train_w2v_3000, dtype=torch.long)

X_test_w2v_3000_tensor = torch.tensor(X_test_w2v_3000, dtype=torch.float32)
y_test_w2v_3000_tensor = torch.tensor(y_test_w2v_3000, dtype=torch.long)    

In [141]:
model = MLPModel(3000)
criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(model.parameters(), lr=0.1)
num_epochs = 100
for epoch in range(num_epochs):
    optimizer.zero_grad()
    outputs = model(X_train_w2v_3000_tensor)
    loss = criterion(outputs, y_train_w2v_3000_tensor)
    loss.backward()
    optimizer.step()

with torch.no_grad():
    model.eval()
    outputs = model(X_test_w2v_3000_tensor)
    _, predicted = torch.max(outputs, 1)
    accuracy = accuracy_score(y_test_w2v_3000_tensor, predicted)
    print(f"Accuracy using all features in a Feedforward network: {accuracy}")

Accuracy using all features in a Feedforward network: 0.6453


### What do you conclude by comparing accuracy values you obtain with those obtained in the “’Simple Models” section

In [31]:
X_train_w2v_rnn_tensor = X_train_w2v_3000_tensor.reshape(80000, 10, 300)
X_test_w2v_rnn_tensor = X_test_w2v_3000_tensor.reshape(20000, 10, 300)

In [43]:
class RNNModel(nn.Module):
    def __init__(self, input_size, hidden_size, num_classes, rnn_type="rnn"):
        super(RNNModel, self).__init__()
        self.hidden_size = hidden_size

        if rnn_type == "rnn":
            self.rnn = nn.RNN(input_size, hidden_size, batch_first=True)
        elif rnn_type == "lstm":
            self.rnn = nn.LSTM(input_size, hidden_size, batch_first=True)
        else:
            self.rnn = nn.GRU(input_size, hidden_size, batch_first=True)
        
        self.fc = nn.Linear(hidden_size, num_classes)

    def forward(self, x):
        h0 = torch.zeros(1, x.size(0), self.hidden_size)
        
        if isinstance(self.rnn, nn.LSTM):
            c0 = torch.zeros(1, x.size(0), self.hidden_size)
            out, _ = self.rnn(x, (h0, c0))
        else:
            out, _ = self.rnn(x, h0)
        
        out = self.fc(out[:, -1, :])
        return out

Accuracy: 0.6403


In [46]:
input_size = 300
hidden_size = 10
num_classes = 2
rnn = RNNModel(input_size, hidden_size, num_classes, rnn_type="rnn")

criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(rnn.parameters(), lr=0.001)

num_epochs = 200
for epoch in range(num_epochs):
    optimizer.zero_grad()
    outputs = rnn(X_train_w2v_rnn_tensor)
    loss = criterion(outputs, y_train_w2v_3000_tensor)
    loss.backward()
    optimizer.step()

with torch.no_grad():
    rnn.eval()
    outputs = rnn(X_test_w2v_rnn_tensor)
    _, predicted = torch.max(outputs, 1)
    accuracy = accuracy_score(y_test_w2v_3000_tensor, predicted)
    print(f"Accuracy: {accuracy}")

Accuracy: 0.7577


In [47]:
input_size = 300
hidden_size = 10
num_classes = 2
rnn = RNNModel(input_size, hidden_size, num_classes, rnn_type="gru")

criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(rnn.parameters(), lr=0.001)

for epoch in range(num_epochs):
    optimizer.zero_grad()
    outputs = rnn(X_train_w2v_rnn_tensor)
    loss = criterion(outputs, y_train_w2v_3000_tensor)
    loss.backward()
    optimizer.step()

with torch.no_grad():
    rnn.eval()
    outputs = rnn(X_test_w2v_rnn_tensor)
    _, predicted = torch.max(outputs, 1)
    accuracy = accuracy_score(y_test_w2v_3000_tensor, predicted)
    print(f"Accuracy: {accuracy}")


Accuracy: 0.77025


In [48]:
input_size = 300
hidden_size = 10
num_classes = 2
rnn = RNNModel(input_size, hidden_size, num_classes, rnn_type="lstm")

criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(rnn.parameters(), lr=0.001)

for epoch in range(num_epochs):
    print("Epoch:", epoch)
    optimizer.zero_grad()
    outputs = rnn(X_train_w2v_rnn_tensor)
    loss = criterion(outputs, y_train_w2v_3000_tensor)
    loss.backward()
    optimizer.step()

with torch.no_grad():
    rnn.eval()
    outputs = rnn(X_test_w2v_rnn_tensor)
    _, predicted = torch.max(outputs, 1)
    accuracy = accuracy_score(y_test_w2v_3000_tensor, predicted)
    print(f"Accuracy: {accuracy}")


Epoch: 0
Epoch: 1
Epoch: 2
Epoch: 3
Epoch: 4
Epoch: 5
Epoch: 6
Epoch: 7
Epoch: 8
Epoch: 9
Epoch: 10
Epoch: 11
Epoch: 12
Epoch: 13
Epoch: 14
Epoch: 15
Epoch: 16
Epoch: 17
Epoch: 18
Epoch: 19
Epoch: 20
Epoch: 21
Epoch: 22
Epoch: 23
Epoch: 24
Epoch: 25
Epoch: 26
Epoch: 27
Epoch: 28
Epoch: 29
Epoch: 30
Epoch: 31
Epoch: 32
Epoch: 33
Epoch: 34
Epoch: 35
Epoch: 36
Epoch: 37
Epoch: 38
Epoch: 39
Epoch: 40
Epoch: 41
Epoch: 42
Epoch: 43
Epoch: 44
Epoch: 45
Epoch: 46
Epoch: 47
Epoch: 48
Epoch: 49
Epoch: 50
Epoch: 51
Epoch: 52
Epoch: 53
Epoch: 54
Epoch: 55
Epoch: 56
Epoch: 57
Epoch: 58
Epoch: 59
Epoch: 60
Epoch: 61
Epoch: 62
Epoch: 63
Epoch: 64
Epoch: 65
Epoch: 66
Epoch: 67
Epoch: 68
Epoch: 69
Epoch: 70
Epoch: 71
Epoch: 72
Epoch: 73
Epoch: 74
Epoch: 75
Epoch: 76
Epoch: 77
Epoch: 78
Epoch: 79
Epoch: 80
Epoch: 81
Epoch: 82
Epoch: 83
Epoch: 84
Epoch: 85
Epoch: 86
Epoch: 87
Epoch: 88
Epoch: 89
Epoch: 90
Epoch: 91
Epoch: 92
Epoch: 93
Epoch: 94
Epoch: 95
Epoch: 96
Epoch: 97
Epoch: 98
Epoch: 99
Epoch: 100