In [66]:
import pandas as pd
import numpy as np
from gensim.models import Word2Vec
import seaborn as sns

# This notebook is what we used to compare the Skip-gram and CBOW models

We first start by loading the cleaned data that we previously saved.

In [67]:
directory = './data/'
# Load the previously saved, cleaned training tweets.
cleaned_data = pd.read_csv(directory + 'train_cleaned.txt')
# Shuffle the rows. A fixed random_state keeps the row order (and therefore the
# order in which samples reach the per-sample training loop) identical across
# kernel restarts.
cleaned_data = cleaned_data.sample(frac=1, random_state=42).reset_index(drop=True)

We extract our features and our labels from the DataFrame.

In [None]:
# Tokenise every tweet on whitespace: each sample becomes a list of word strings.
X = [text.split() for text in cleaned_data['text']]
# Label column as an array, one entry per tweet.
y = cleaned_data['label'].to_numpy()

Since we will use Cross Entropy Loss we need our labels to be either 0 or 1.

In [70]:
# Binarise the labels into class indices for CrossEntropyLoss: positive -> 1,
# everything else (negative or zero) -> 0. Building a fresh array instead of
# assigning through boolean masks avoids writing into the buffer returned by
# `.values`, which can alias the DataFrame column (silently rewriting
# `cleaned_data['label']`) or be read-only under pandas Copy-on-Write.
y = (y > 0).astype(int)

We then learn the embeddings for our features using Word2Vec. Setting sg=1 selects Skip-gram, while sg=0 selects CBOW.

In [74]:
# Dimensionality of the learned word vectors.
embeddingLength = 300
# Fit Word2Vec on the tokenised tweets. sg=1 selects Skip-gram (sg=0 would be CBOW).
w2v = Word2Vec(
    sentences=X,
    vector_size=embeddingLength,
    min_count=4,
    sg=1,
)

We create a function that creates the embedding for each tweet by averaging the embeddings of the words in the tweet.

In [76]:
def vectorizeTweets(tweets, w2v):
    """Embed each tweet as the mean of the vectors of its known words.

    Args:
        tweets: iterable of tweets, each an iterable of word tokens.
        w2v: model exposing `wv`, which supports `word in wv`, `wv[word]`
            and `wv.vectors` (a 2-D array whose second axis is the
            embedding size).

    Returns:
        A NumPy array with one row per tweet. Words missing from `w2v.wv`
        are skipped; a tweet with no known word maps to the zero vector.
    """
    def _mean_vector(tokens):
        # Keep only the tokens the model has a vector for.
        known = [w2v.wv[token] for token in tokens if token in w2v.wv]
        if not known:
            # Nothing to average: fall back to a single zero vector.
            known = [np.zeros(w2v.wv.vectors.shape[1])]
        return np.mean(known, axis=0)

    return np.array([_mean_vector(tokens) for tokens in tweets])

In [77]:
# Replace the token lists with one averaged embedding vector per tweet.
X = vectorizeTweets(tweets=X, w2v=w2v)

We now define the model that we will use: a simple 3-layer fully connected neural network with 200 units in each hidden layer. We use Cross Entropy Loss as the loss function and AdamW as the optimizer.

In [96]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

class Net(nn.Module):
    """Three-layer fully connected binary classifier.

    Two hidden layers of 200 ReLU units followed by a linear output layer
    that produces one raw score (logit) per class.

    Args:
        embeddingLength: size of the input feature vector.
    """

    def __init__(self, embeddingLength):
        super().__init__()
        self.fc1 = nn.Linear(embeddingLength, 200)
        self.fc2 = nn.Linear(200, 200)
        self.fc3 = nn.Linear(200, 2)

    def forward(self, x):
        """Return the two class logits for `x` (last dimension = embeddingLength)."""
        x = torch.relu(self.fc1(x))
        x = torch.relu(self.fc2(x))
        # No activation on the output layer: nn.CrossEntropyLoss expects raw
        # logits. A ReLU here would clamp negative scores to zero, and a
        # sample whose two scores are both negative would get zero gradient.
        return self.fc3(x)

# Instantiate the classifier for the configured embedding size.
net = Net(embeddingLength)

# Training objective and optimiser: cross-entropy over the two classes,
# with AdamW updating all network parameters.
criterion = nn.CrossEntropyLoss()
optimizer = optim.AdamW(params=net.parameters(), lr=9e-7)

We convert our data to tensors and then train the model.

In [97]:
# Wrap the training features and labels as float32 tensors
# (each label is cast to long inside the training loop).
X_train = torch.tensor(X, dtype=torch.float32)
y_train = torch.tensor(y, dtype=torch.float32)

In [98]:
def train(X, y):
    """Fit the module-level `net` with one optimiser step per sample.

    Makes 3 passes over the data, using the module-level `criterion` and
    `optimizer`.

    Args:
        X: indexable collection of input tensors, one per sample.
        y: indexable collection of label tensors aligned with `X`; each
            label is cast to long before the loss is computed.
    """
    num_epochs = 3
    num_samples = len(X)
    for _ in range(num_epochs):
        for idx in range(num_samples):
            # Clear gradients left over from the previous step.
            optimizer.zero_grad()

            # Forward pass and loss for this single sample.
            sample_loss = criterion(net(X[idx]), y[idx].long())

            # Backward pass and parameter update.
            sample_loss.backward()
            optimizer.step()

    print('Finished Training')


In [None]:
# Run the training loop on the full training tensors.
train(X_train, y_train)

Finally we generate the submission file 

In [100]:
test_df = pd.read_csv("data/test_cleaned.txt")

# Tokenise and embed the test tweets the same way as the training tweets.
# Dedicated names keep the training `X` / `y` untouched; labels are not
# needed for inference.
test_tokens = [tweet.split() for tweet in test_df['text']]
X_test = torch.tensor(vectorizeTweets(test_tokens, w2v), dtype=torch.float)
# Force a (num_tweets, num_features) layout so an empty test set still yields
# a valid (0, num_features) batch.
X_test = X_test.reshape(-1, net.fc1.in_features)

# Predict the class index (0 or 1) for every test tweet in a single batched,
# gradient-free forward pass; `.tolist()` yields plain Python ints.
with torch.no_grad():
    prediction = net(X_test).argmax(dim=1).tolist()

In [101]:
# Collect the per-tweet class indices into a NumPy array.
prediction = np.asarray(prediction)

In [102]:
# Build the submission table: map class indices {0, 1} back to labels {-1, 1},
# add 1-based ids, and order the columns as Id, Prediction.
prediction = (
    pd.DataFrame(prediction, columns=["Prediction"])
    .assign(
        Prediction=lambda frame: frame["Prediction"] * 2 - 1,
        Id=lambda frame: frame.index + 1,
    )
    .reindex(columns=["Id", "Prediction"])
)
prediction

Unnamed: 0,Id,Prediction
0,1,-1
1,2,-1
2,3,-1
3,4,1
4,5,-1
...,...,...
9995,9996,1
9996,9997,-1
9997,9998,-1
9998,9999,1


In [103]:
# Write the submission file without the DataFrame index column.
submission_name = "submission-{}.csv".format("nn-sg")
prediction.to_csv(submission_name, index=False)