# THIRD ASSIGNMENT

In [1]:
import nltk
import numpy as np 
import pandas as pd 
from nltk.corpus import stopwords
from keras.preprocessing.text import Tokenizer
import re
from nltk.stem import WordNetLemmatizer
from sklearn import preprocessing

**IMPORT THE DATA**

In [2]:
# Load the labelled reviews. Later cells read the 'review' (text) and
# 'sentiment' (label) columns of this frame.
df = pd.read_csv('sentiment.csv')

In [3]:
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


**DATA CLEANING**

In [4]:
stop_words = stopwords.words('english')

# Built once at definition time: clean_data is applied to every review, so
# re-reading the stop-word corpus and re-creating the lemmatizer on each call
# is wasted work.
_STOP_WORDS = frozenset(stop_words)
_LEMMATIZER = WordNetLemmatizer()


def clean_data(text):
    """Normalise one raw review into a space-separated string of tokens.

    Steps, in order:
      1. replace the literal HTML line break ``<br />`` with a space;
      2. delete every character that is not a space, ASCII letter or digit;
      3. lower-case and split on whitespace;
      4. drop NLTK English stop words;
      5. lemmatize each remaining token as a verb (``pos='v'``).

    Parameters
    ----------
    text : str
        Raw review text.

    Returns
    -------
    str
        Cleaned tokens joined by single spaces ('' if nothing survives).
    """
    text = re.sub(r'<br />', ' ', text)  # Removes Html tag
    text = re.sub(r'[^\ a-zA-Z0-9]+', '', text)  # Removes non alphanumeric
    # split() already collapses runs of spaces and trims both ends, so no
    # separate whitespace-normalising substitution is needed.
    tokens = text.lower().split()
    # NOTE(review): punctuation is removed before the stop-word check, so
    # contractions such as "wasn't" become "wasnt" and no longer match the
    # NLTK list (the cleaned output contains "wasnt", "youll", "im").
    # Confirm whether that is intended.
    cleaned_text = [
        _LEMMATIZER.lemmatize(word, pos='v')  # playing, played -> play
        for word in tokens
        if word not in _STOP_WORDS
    ]
    return ' '.join(cleaned_text)

# Clean every review in place; clean_data is passed directly as the callable.
df['review'] = df['review'].apply(clean_data)
df.head()

Unnamed: 0,review,sentiment
0,one reviewers mention watch 1 oz episode youll...,positive
1,wonderful little production film technique una...,positive
2,think wonderful way spend time hot summer week...,positive
3,basically theres family little boy jake think ...,negative
4,petter matteis love time money visually stun f...,positive


In [5]:
df['review']

0        one reviewers mention watch 1 oz episode youll...
1        wonderful little production film technique una...
2        think wonderful way spend time hot summer week...
3        basically theres family little boy jake think ...
4        petter matteis love time money visually stun f...
                               ...                        
49995    think movie right good job wasnt creative orig...
49996    bad plot bad dialogue bad act idiotic direct a...
49997    catholic teach parochial elementary school nun...
49998    im go disagree previous comment side maltin on...
49999    one expect star trek movies high art fan expec...
Name: review, Length: 50000, dtype: object

**CONVERTING CATEGORICAL FEATURES TO LABELS**

In [6]:
# Encode the string sentiment labels as integer class ids.
# Despite its name, Y_train_label holds the labels for ALL rows; the
# train/test split happens in a later cell.
le = preprocessing.LabelEncoder()
Y_train_label = le.fit_transform(df['sentiment'])

# 1. ONE-LAYER MLP

In [7]:
from tensorflow.keras.preprocessing.sequence import pad_sequences
import torch
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import train_test_split
import gensim.downloader as api
import torch.nn as nn

**TOKENIZING THE DATA**

In [8]:
# Build the word -> integer index from the cleaned reviews.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(df['review'])

**CONVERTING TEXT TO SEQUENCES**

In [9]:
# Replace each review by the list of integer ids of its words.
encoded_docs = tokenizer.texts_to_sequences(df['review'])

In [10]:
# +1 leaves room for index 0, which the padding step below fills in and
# which is not assigned to any word in word_index.
vocab_size = len(tokenizer.word_index) + 1
vocab_size

153603

**APPLY PADDING**

In [11]:
# Pad every review to the length of the longest *encoded* sequence.
# Measuring the sequences that are actually padded (rather than re-splitting
# the raw text) guarantees pad_sequences never has to truncate a review.
max_length = max(len(seq) for seq in encoded_docs)
# padding='post' appends the zeros after the real tokens.
X_new = pad_sequences(encoded_docs, maxlen=max_length, padding='post')

In [12]:
max_length

1437

In [13]:
X_new.shape

(50000, 1437)

In [14]:
# Hold out 20% of the padded sequences for testing; random_state fixes the shuffle.
X_train, X_test, y_train, y_test = train_test_split(X_new, Y_train_label, test_size=0.2, random_state=42)

In [15]:
X_test.shape

(10000, 1437)

**CONVERT ARRAYS TO TENSORS**

In [16]:
#create Tensor Dataset
# torch.Tensor(...) stores both the token ids and the labels as float32;
# the training/evaluation functions cast the ids back with .long(), and
# BCELoss needs float targets.
train_data=TensorDataset(torch.Tensor(X_train), torch.Tensor(y_train))
test_data=TensorDataset(torch.Tensor(X_test), torch.Tensor(y_test))

#dataloader
batch_size=64
# Shuffle only the training batches.
train_loader=DataLoader(train_data, batch_size=batch_size, shuffle=True)

# Evaluation does not benefit from shuffling, and because the metrics are
# averaged per batch (with a smaller final batch) a fixed order keeps the
# reported test numbers reproducible.
test_loader=DataLoader(test_data, batch_size=batch_size, shuffle=False)

**LOAD PRE-TRAINED WORD2VEC**

In [17]:
## Load the pretrained 'word2vec-google-news-300' vectors through gensim's downloader API.
## The embedding matrix built in the next cell uses 300 columns to match these vectors.

wv = api.load('word2vec-google-news-300')

In [18]:
# Embedding matrix: row i holds the word2vec vector of the word whose
# tokenizer index is i. Rows for words missing from word2vec (and row 0,
# which no word maps to) stay all-zero.
pretrained_values = np.zeros((vocab_size, 300))
for word, idx in tokenizer.word_index.items():
    if word not in wv:
        continue
    pretrained_values[idx, :] = wv[word]

**ONE-LAYER MLP WITH AN EMBEDDING LAYER**

In [21]:

class MLP_ONE(nn.Module):
    """Bag-of-embeddings classifier: embedding -> sum over tokens -> dropout -> linear -> sigmoid.

    Parameters
    ----------
    pretrained_values : torch.Tensor
        Initial embedding weights of shape (input_dim, embedding_dim).
    input_dim : int
        Number of rows of the embedding table (vocabulary size).
    embedding_dim : int
        Width of each embedding vector.
    hidden_dim : int
        Accepted for interface compatibility; this single-layer model does
        not use it.
    output_dim : int
        Number of output units.
    """

    def __init__(self, pretrained_values,input_dim, embedding_dim, hidden_dim, output_dim):
        super().__init__()
        self.embedding = nn.Embedding(input_dim, embedding_dim)
        # Start from the pretrained vectors; the weights remain trainable.
        self.embedding.load_state_dict({'weight': pretrained_values})
        # Dropout and the classifier are separate attributes. Assigning both
        # to `self.fc` would replace the Linear layer with Dropout and leave
        # the model without any output projection.
        self.dropout = nn.Dropout(0.5)
        self.fc = nn.Linear(embedding_dim, output_dim)

    def forward(self, text):
        """Map a (batch, seq_len) tensor of token ids to (batch, output_dim) probabilities."""
        embedded = self.embedding(text)      # (batch, seq_len, embedding_dim)
        embeds = embedded.sum(dim=1)         # (batch, embedding_dim)
        out = self.fc(self.dropout(embeds))  # (batch, output_dim)
        out = torch.sigmoid(out)
        return out


In [22]:
# Convert the NumPy embedding matrix to a torch tensor so it can be loaded
# into nn.Embedding. Note that this rebinds the same name.
pretrained_values=torch.Tensor(pretrained_values)

**DIMENSIONS**

In [23]:
n_vocab = vocab_size  # rows of the embedding table
n_embed = 300  # must equal the number of columns of pretrained_values
n_hidden = 512  # passed as hidden_dim; MLP_ONE.__init__ accepts it but never reads it
n_output = 1   # 1 ("positive") or 0 ("negative")
net = MLP_ONE(pretrained_values,n_vocab, n_embed, n_hidden, n_output)

**OPTIMIZER, LOSS FUNCTION AND LEARNING RATE**

In [24]:
# BCELoss expects probabilities in [0, 1]; the model's forward ends with a sigmoid.
criterion = nn.BCELoss()
# Adam over all parameters of net, embedding weights included.
optimizer = torch.optim.Adam(net.parameters(), lr=0.001)

**ACCURACY**

In [25]:
def binary_accuracy(preds, y):
    """
    Fraction of predictions in a batch that match the labels
    (8 correct out of 10 gives 0.8, not 8).

    `preds` are probabilities and are rounded to the nearest integer before
    being compared with `y`. Returns a 0-dim tensor.
    """
    hits = (preds.round() == y).float()  # 1.0 where correct, 0.0 elsewhere
    return hits.sum() / len(hits)

**TRAINING**

In [26]:
def train1(model, iterator, optimizer, criterion):
    """Run one training epoch of a token-id model.

    Parameters
    ----------
    model : nn.Module
        Called as ``model(batchX.long())``; its output is squeezed on dim 1.
    iterator : sized iterable of (batchX, batchy)
        Typically a DataLoader; must support ``len()``.
    optimizer : torch.optim.Optimizer
        Optimizer over ``model``'s parameters.
    criterion : callable
        ``criterion(predictions, batchy)`` returning a scalar loss tensor.

    Returns
    -------
    (float, float)
        Mean of the per-batch losses and mean of the per-batch accuracies.
    """
    epoch_loss = 0
    epoch_acc = 0
    model.train() #Train mode is on
    for batchX,batchy in iterator:
        # The inputs are token ids, not parameters. No gradient is needed for
        # them, and asking for one fails for integer-typed batches.
        optimizer.zero_grad() #Reset the gradients
        predictions= model(batchX.long()).squeeze(1) ## forward propagation
        loss = criterion(predictions, batchy)
        loss.backward() ## backward propagation / calculate gradients
        optimizer.step() ## update parameters
        acc = binary_accuracy(predictions, batchy)
        epoch_loss += loss.item()
        epoch_acc += acc.item()
    return epoch_loss / len(iterator), epoch_acc / len(iterator)


#train1(net, train_loader, optimizer, criterion)

**EVALUATION**

In [27]:
def evaluate1(model, iterator, criterion):
    """Score `model` on every batch of `iterator` without updating weights.

    The model is put in eval mode and run under ``torch.no_grad()``. Inputs
    are cast with ``.long()`` and outputs squeezed on dim 1 before being
    passed to `criterion` and `binary_accuracy`.

    Returns (mean per-batch loss, mean per-batch accuracy) as floats.
    """
    model.eval() #Evaluation mode is on
    total_loss, total_acc = 0, 0
    with torch.no_grad():
        for inputs, targets in iterator:
            outputs = model(inputs.long()).squeeze(1)
            total_loss += criterion(outputs, targets).item()
            total_acc += binary_accuracy(outputs, targets).item()

    n_batches = len(iterator)
    return total_loss / n_batches, total_acc / n_batches


#evaluate1(net, test_loader, criterion)

**TRAINING THE MODEL FOR 5 EPOCHS**

In [28]:
# Train `net` for N_EPOCHS passes over train_loader, scoring on test_loader after each pass.
N_EPOCHS = 5
import time
# NOTE(review): best_valid_loss is assigned here but never read in this cell.
best_valid_loss = float('inf')
for epoch in range(N_EPOCHS):
    start_time = time.time()
    train_loss, train_acc = train1(net, train_loader, optimizer, criterion)
    test_loss, test_acc = evaluate1(net, test_loader, criterion)
    
    
    end_time = time.time()

    
    # Wall-clock seconds for one train + evaluate pass, then the epoch metrics.
    print(f'\tEpoch: {epoch+1:02} | Epoch Time: {end_time-start_time:.2f}')
    print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')
    print(f'\tTest Loss: {test_loss:.3f} |  Test. Acc: {test_acc*100:.2f}%')
    print('\n')

	Epoch: 01 | Epoch Time: 516.22
	Train Loss: 0.402 | Train Acc: 84.27%
	Test Loss: 0.255 |  Test. Acc: 90.15%


	Epoch: 02 | Epoch Time: 582.42
	Train Loss: 0.184 | Train Acc: 92.81%
	Test Loss: 0.283 |  Test. Acc: 88.88%


	Epoch: 03 | Epoch Time: 581.82
	Train Loss: 0.126 | Train Acc: 95.09%
	Test Loss: 0.314 |  Test. Acc: 89.62%


	Epoch: 04 | Epoch Time: 616.86
	Train Loss: 0.075 | Train Acc: 97.21%
	Test Loss: 0.449 |  Test. Acc: 87.18%


	Epoch: 05 | Epoch Time: 654.10
	Train Loss: 0.068 | Train Acc: 97.41%
	Test Loss: 0.550 |  Test. Acc: 87.59%




# 2. TWO-LAYER MLP USING TF-IDF

**CONVERTING WORDS TO VECTORS**

In [29]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(min_df=100) #Create a tf-idf vectorizer that ignores terms found in fewer than 100 reviews
# NOTE(review): the vectorizer is fitted on ALL reviews, before the train/test
# split below, so the vocabulary and IDF weights also see the test reviews.
# Consider fitting on the training split only.
vectorizer.fit(df['review']) #Learn the vocabulary (word -> column index) and the idf weights.
#print(vectorizer.vocabulary_)
#print(vectorizer.idf_)

X = vectorizer.transform(df['review']) #Build the tf-idf feature matrix for every review (one row per review).
#print(type(vector))
#print(vector.toarray())

**CONVERTING CATEGORICAL FEATURES TO LABELS**

In [30]:
# Encode the sentiment strings as integer class ids for the TF-IDF models.
le2 = preprocessing.LabelEncoder()
# Fit the encoder created on the line above (le2), so it is actually fitted
# and this cell does not silently depend on the section-1 encoder `le`.
Y_train_label2 = le2.fit_transform(df['sentiment'])
df.head()

Unnamed: 0,review,sentiment
0,one reviewers mention watch 1 oz episode youll...,positive
1,wonderful little production film technique una...,positive
2,think wonderful way spend time hot summer week...,positive
3,basically theres family little boy jake think ...,negative
4,petter matteis love time money visually stun f...,positive


**SPLIT DATA RANDOMLY**

In [31]:
from sklearn.model_selection import train_test_split
# Split the TF-IDF matrix 80/20. This rebinds X_train/X_test/y_train/y_test,
# replacing the padded-sequence split used in section 1.
X_train, X_test, y_train, y_test = train_test_split(X, Y_train_label2, test_size=0.2, random_state=42)

In [32]:
from collections import Counter
Counter(y_train)

Counter({0: 20039, 1: 19961})

In [33]:
X_train.toarray()

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

**CONVERT DATA TO TENSORS**

In [34]:
import torch
from torch.utils.data import DataLoader, TensorDataset

#create Tensor Dataset
# The sparse TF-IDF matrices are densified with .toarray() before being
# wrapped as float tensors; labels are float because BCELoss needs float targets.
train_data=TensorDataset(torch.FloatTensor(X_train.toarray()), torch.FloatTensor(y_train))
test_data=TensorDataset(torch.FloatTensor(X_test.toarray()), torch.FloatTensor(y_test))

#dataloader
batch_size=64
# Shuffle only the training batches.
train_loader=DataLoader(train_data, batch_size=batch_size, shuffle=True)
# Fixed order for evaluation: metrics are averaged per batch (the last batch
# is smaller), so shuffling would make the reported test numbers vary run to run.
test_loader=DataLoader(test_data, batch_size=batch_size, shuffle=False)

In [35]:
# Pull one (features, labels) batch from the loader as a quick sanity check.
# iter() creates a fresh iterator over the DataLoader and the built-in next()
# returns its first batch. The iterator object has no .next() method on
# current PyTorch versions, so the built-in is used instead.
a, b = next(iter(train_loader))

'\nIt calls the __iter__() method on the iterable, and then calls __next__() on the returned iterator until it reaches \nthe end of the iterator\n'

**TWO-LAYER MLP**

In [36]:
class MultilayerPerceptron2(nn.Module):
    """Two-layer perceptron: Linear -> ReLU -> Dropout(0.5) -> Linear -> Sigmoid.

    Takes `input_size` features per sample and returns one value in (0, 1)
    per sample.
    """

    def __init__(self, input_size,hidden_size):
        super().__init__()
        self.input_size = input_size
        self.hidden_size = hidden_size
        # Layers run in list order each time the Sequential container is called.
        layers = [
            nn.Linear(input_size, hidden_size),
            nn.ReLU(),
            nn.Dropout(0.5),
            nn.Linear(hidden_size, 1),
            nn.Sigmoid(),
        ]
        self.model = nn.Sequential(*layers)

    def forward(self, x):
        """Forward pass through the sequential stack."""
        return self.model(x)

In [37]:
# One input unit per TF-IDF feature column.
input_size=X_train.shape[1]
hidden_size=100  ## width of the single hidden layer
net2 = MultilayerPerceptron2(input_size,hidden_size)
print(net2)

MultilayerPerceptron2(
  (model): Sequential(
    (0): Linear(in_features=5283, out_features=100, bias=True)
    (1): ReLU()
    (2): Dropout(p=0.5, inplace=False)
    (3): Linear(in_features=100, out_features=1, bias=True)
    (4): Sigmoid()
  )
)


**OPTIMIZER, LOSS FUNCTION, LEARNING RATE**

In [38]:
# Alternatives left disabled:
#optimizer = optim.SGD(net.parameters(), lr=1e-3)
#criterion = nn.BCEWithLogitsLoss()

# BCELoss expects probabilities; MultilayerPerceptron2 ends with a Sigmoid.
criterion2 = nn.BCELoss()
optimizer2 = torch.optim.Adam(net2.parameters(), lr=0.001)

**ACCURACY**

In [39]:
def binary_accuracy2(preds, y):
    """
    Per-batch accuracy as a fraction: 8 correct out of 10 yields 0.8, not 8.

    Probabilities in `preds` are rounded to the nearest integer and compared
    element-wise with `y`; the result is a 0-dim tensor.
    """
    matches = torch.eq(torch.round(preds), y).float()
    n_correct = matches.sum()
    return n_correct / len(matches)

**TRAINING**

In [40]:
def train2(model, iterator, optimizer, criterion):
    """Run one training epoch of a dense-feature model.

    Parameters
    ----------
    model : nn.Module
        Called as ``model(batchX)``; its output is squeezed on dim 1.
    iterator : sized iterable of (batchX, batchy)
        Typically a DataLoader; must support ``len()``.
    optimizer : torch.optim.Optimizer
        Optimizer over ``model``'s parameters.
    criterion : callable
        ``criterion(predictions, batchy)`` returning a scalar loss tensor.

    Returns
    -------
    (float, float)
        Mean of the per-batch losses and mean of the per-batch accuracies.
    """
    epoch_loss = 0
    epoch_acc = 0
    model.train() #Train mode is on
    for batchX,batchy in iterator:
        # The input features are data, not parameters, so no gradient is
        # requested for them.
        optimizer.zero_grad() #Reset the gradients
        predictions= model(batchX).squeeze(1) ## forward propagation
        # Use the loss passed in by the caller, not a notebook-level global.
        loss = criterion(predictions, batchy)
        loss.backward() ## backward propagation / calculate gradients
        optimizer.step() ## update parameters

        acc = binary_accuracy2(predictions, batchy)
        epoch_loss += loss.item()
        epoch_acc += acc.item()

    return epoch_loss / len(iterator), epoch_acc / len(iterator)


# NOTE(review): this call already runs one full training epoch on net2, so the
# epoch loop further down starts from a model that has been trained once.
train2(net2, train_loader, optimizer2, criterion2)

(0.36765016741752626, 0.8633)

**EVALUATION**

In [41]:
def evaluate2(model, iterator, criterion):
    """Score `model` on every batch of `iterator` without updating weights.

    Parameters
    ----------
    model : nn.Module
        Called as ``model(batchX)``; its output is squeezed on dim 1.
    iterator : sized iterable of (batchX, batchy)
        Typically a DataLoader; must support ``len()``.
    criterion : callable
        ``criterion(predictions, batchy)`` returning a scalar loss tensor.

    Returns
    -------
    (float, float)
        Mean of the per-batch losses and mean of the per-batch accuracies.
    """
    epoch_loss = 0
    epoch_acc = 0

    model.eval() #Evaluation mode is on
    with torch.no_grad():
        for batchX,batchy in iterator:
            predictions = model(batchX).squeeze(1)
            # Use the loss passed in by the caller, not a notebook-level global.
            loss = criterion(predictions, batchy)
            acc = binary_accuracy2(predictions, batchy)
            epoch_loss += loss.item()
            epoch_acc += acc.item()

    return epoch_loss / len(iterator), epoch_acc / len(iterator)


# One evaluation pass over test_loader; evaluate2 runs in eval mode under no_grad.
evaluate2(net2, test_loader, criterion2)

(0.2600648891014658, 0.8908240445859873)

In [42]:
import time

**TRAIN THE MODEL FOR 5 EPOCHS**

In [43]:
# Train net2 for N_EPOCHS passes over train_loader, scoring on test_loader after each pass.
N_EPOCHS = 5

# NOTE(review): best_valid_loss is only referenced by the disabled
# checkpointing code below.
best_valid_loss = float('inf')
for epoch in range(N_EPOCHS):
    start_time = time.time()
    train_loss, train_acc = train2(net2, train_loader, optimizer2, criterion2)
    test_loss, test_acc = evaluate2(net2, test_loader, criterion2)
    
    end_time = time.time()

    
    # Disabled checkpointing; `valid_loss` and `model` are not defined in this loop.
    ##if valid_loss < best_valid_loss:
    #    best_valid_loss = valid_loss
    #    torch.save(model.state_dict(), 'tut1-model.pt')  ##
    
    # Wall-clock seconds for one train + evaluate pass, then the epoch metrics.
    print(f'\tEpoch: {epoch+1:02} | Epoch Time: {end_time-start_time:.2f}')
    print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')
    print(f'\tTest Loss: {test_loss:.3f} |  Test. Acc: {test_acc*100:.2f}%')
    print('\n')

	Epoch: 01 | Epoch Time: 7.86
	Train Loss: 0.236 | Train Acc: 90.70%
	Test Loss: 0.256 |  Test. Acc: 89.02%


	Epoch: 02 | Epoch Time: 6.91
	Train Loss: 0.210 | Train Acc: 91.76%
	Test Loss: 0.262 |  Test. Acc: 88.95%


	Epoch: 03 | Epoch Time: 6.92
	Train Loss: 0.193 | Train Acc: 92.47%
	Test Loss: 0.271 |  Test. Acc: 88.62%


	Epoch: 04 | Epoch Time: 6.39
	Train Loss: 0.177 | Train Acc: 93.19%
	Test Loss: 0.282 |  Test. Acc: 88.33%


	Epoch: 05 | Epoch Time: 6.46
	Train Loss: 0.161 | Train Acc: 93.77%
	Test Loss: 0.292 |  Test. Acc: 88.33%




# 3. RANDOM FOREST CLASSIFIER

In [44]:
from sklearn.ensemble import RandomForestClassifier

In [45]:
# Fit a random forest with default hyper-parameters on the TF-IDF training split
# (X_train / y_train were last assigned by the TF-IDF train_test_split cell).
clf = RandomForestClassifier(random_state=0).fit(X_train, y_train)

In [46]:
print('Train Accuracy: ')
clf.score(X_train,y_train)

Train Accuracy: 


1.0

In [47]:
print('Test Accuracy: ')
clf.score(X_test,y_test)

Test Accuracy: 


0.8543

# 4. LOGISTIC REGRESSION CLASSIFIER

In [48]:
from sklearn.linear_model import LogisticRegression

In [49]:
# Fit logistic regression on the same TF-IDF training split.
# This rebinds `clf`, so the random forest fitted above is no longer reachable by name.
clf = LogisticRegression(random_state=0).fit(X_train, y_train)

In [50]:
print('Train Accuracy: ')
clf.score(X_train,y_train)

Train Accuracy: 


0.913825

In [51]:
print('Test Accuracy: ')
clf.score(X_test,y_test)

Test Accuracy: 


0.891