# Step 1: Data Preprocessing and Feature Selection

## 1.a Importing essential libraries and viewing the datasets

In [1]:
import pandas as pd
import numpy as np
import re
import nltk
import string
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import CountVectorizer, HashingVectorizer, TfidfVectorizer
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import statistics
from statistics import mode
from sklearn.model_selection import train_test_split
from sklearn.metrics.pairwise import cosine_similarity


# Locations of the raw training and test files (relative to the notebook).
train_path = "1598639150_466036_train_file.txt"
test_path = "1598639150_4984343_test_file.txt"

# Training file is tab-separated; its first row provides the column names
# (`sentiment` and `review` in the printed frame).
train_data = pd.read_csv(train_path, sep="\t")
print(train_data)

# Test file has no header row, so the single column is named explicitly.
test_data = pd.read_csv(test_path, names=["review"], header=None)
print(test_data)

       sentiment                                             review
0              1  One of the other reviewers has mentioned that ...
1              1  A wonderful little production. <br /><br />The...
2              1  I thought this was a wonderful way to spend ti...
3             -1  Basically there's a family where a little boy ...
4              1  Petter Mattei's "Love in the Time of Money" is...
...          ...                                                ...
14994          1  *** out of ****<br /><br />Yep! Dressed To Kil...
14995         -1  Bobcat Goldthwait should be commended for atte...
14996          1  And it's not because since her days on "Claris...
14997         -1  A traveling couple (Horton and Hamilton)stumbl...
14998         -1  This film is deeply disappointing. Not only th...

[14999 rows x 2 columns]
                                                  review
0      This film should have never been made. Honestl...
1      This movie was bad from the start. Th

## 1.b Ensuring the test dataset has the expected number of entries (15,000)

In [2]:
# The `read_csv` load of the test file produced fewer entries than the expected
# 15000 (see the "Before" figure printed below), so the file is re-read here
# as plain text with one review per non-empty line.
print("Before preprocessing the test set: size = ", test_data.size)
# Context manager guarantees the file handle is closed, even if reading fails.
with open("1598639150_4984343_test_file.txt", 'r', encoding="utf8") as f:
    txt = f.read().split('\n')
# Drop empty strings (e.g. the one produced by a trailing newline).
txt = list(filter(None, txt))
test_data = pd.DataFrame(txt, columns=['review'])
print("After preprocessing the test set: size = ", test_data.size)

Before preprocessing the test set: size =  14992
After preprocessing the test set: size =  15000


## 1.c Check for null values in dataset

In [3]:
# Per-column count of missing values in the training frame.
train_null_counts = train_data.isnull().sum()
print("Number of null values in train set:\n", train_null_counts)

# Per-column count of missing values in the test frame.
test_null_counts = test_data.isnull().sum()
print("\n\nNumber of null values in test set:\n", test_null_counts)

Number of null values in train set:
 sentiment    0
review       0
dtype: int64


Number of null values in test set:
 review    0
dtype: int64


## 1.d Dataset cleaning

In [4]:
# Fetch the NLTK resources used by the cleaning cells below: the stop-word
# corpus and the WordNet data (presumably what WordNetLemmatizer relies on).
# The value of the last call is what the cell displays as its output.
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/haramrit09k/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/haramrit09k/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [5]:
# Build the English stop-word list. The corpus reader is reached through the
# `nltk` package rather than through the imported name `stopwords`, because
# this cell rebinds `stopwords` to a plain list: calling `stopwords.words(...)`
# on a second run of the cell would raise AttributeError on that list.
# The name `stopwords` is kept (as a list) because the cleaning cells use it.
stopwords = nltk.corpus.stopwords.words('english')
# 'br' is what remains of the HTML <br /> tags once punctuation is stripped.
stopwords.append('br')

In [6]:
# Single shared lemmatizer instance; the cleaning cells call
# `lemmatizer.lemmatize(word)` on every retained token.
lemmatizer = WordNetLemmatizer()

In [7]:
# Clean the training reviews in three steps:
#   1. lower-case the text;
#   2. replace every character that is not a letter or a space with a space;
#   3. drop stop words and one-character tokens, lemmatize what is left and
#      re-join the tokens with single spaces.
# Membership tests against a set are constant-time, unlike the list.
stopword_set = set(stopwords)

train_data['review'] = (
    train_data['review']
    .str.lower()
    # regex=True is required: without it, pandas >= 2.0 treats the pattern as
    # a literal string and no characters would be removed.
    .str.replace('[^a-zA-Z ]', ' ', regex=True)
    .apply(lambda text: ' '.join(
        lemmatizer.lemmatize(word)
        for word in text.split()
        if word not in stopword_set and len(word) > 1
    ))
)

In [8]:
# Clean the test reviews with the same three steps used for the training set:
#   1. lower-case the text;
#   2. replace every character that is not a letter or a space with a space;
#   3. drop stop words and one-character tokens, lemmatize what is left and
#      re-join the tokens with single spaces.
# Membership tests against a set are constant-time, unlike the list.
stopword_set = set(stopwords)

test_data['review'] = (
    test_data['review']
    .str.lower()
    # regex=True is required: without it, pandas >= 2.0 treats the pattern as
    # a literal string and no characters would be removed.
    .str.replace('[^a-zA-Z ]', ' ', regex=True)
    .apply(lambda text: ' '.join(
        lemmatizer.lemmatize(word)
        for word in text.split()
        if word not in stopword_set and len(word) > 1
    ))
)

In [9]:
# Sanity check of the cleaning step: first 10 cleaned training reviews...
print(train_data['review'].head(10))
# ...and first 10 cleaned test reviews.
print(test_data['review'].head(10))

0    one reviewer mentioned watching oz episode hoo...
1    wonderful little production filming technique ...
2    thought wonderful way spend time hot summer we...
3    basically family little boy jake think zombie ...
4    petter mattei love time money visually stunnin...
5    probably time favorite movie story selflessnes...
6    sure would like see resurrection dated seahunt...
7    show amazing fresh innovative idea first aired...
8    encouraged positive comment film looking forwa...
9    like original gut wrenching laughter like movi...
Name: review, dtype: object
0    film never made honestly must admit saw seriou...
1    movie bad start purpose movie angela wanted ge...
2    god never felt insulted whole life crap many w...
3    fan coen brother george clooney anyone see ske...
4    movie andaz apna apna book top intelligent com...
5    say really looking forward watching film findi...
6    film powerfully demonstrates struggle two woma...
7    first minute movie make fun sequ

## 1.e Perform TF-IDF vectorization of the datasets

In [10]:
# TF-IDF over unigrams to trigrams; terms appearing in fewer than 2 documents
# are ignored and the vocabulary is capped at 75000 features.
vectorizer = TfidfVectorizer(min_df=2, ngram_range=(1,3), max_features = 75000)
# Learn the vocabulary / IDF weights from the training reviews and vectorize
# them in a single pass (fit_transform avoids analysing the corpus twice).
train_vector = vectorizer.fit_transform(train_data['review'])
# Vectorize the test reviews with the vocabulary learned from the training
# set; the vectorizer is NOT re-fitted on the test data.
test_vector = vectorizer.transform(test_data['review'])

In [11]:
# Feature matrices for the two sets, plus the training labels.
X_train, X_test = train_vector, test_vector
y_train = train_data['sentiment']

# Shapes of the training features, test features and training labels.
print(X_train.shape, X_test.shape, y_train.shape)

(14999, 75000) (15000, 75000) (14999,)


In [12]:
# Hold the training labels as a NumPy array (the original author notes this
# avoided an error met while working with the cosine-similarity results).
# np.asarray accepts both a pandas Series and an ndarray, so re-running this
# cell is harmless; `y_train.to_numpy()` would fail on the second run because
# an ndarray has no `to_numpy` method.
y_train = np.asarray(y_train)
print(y_train.shape)
print(type(y_train))

(14999,)
<class 'numpy.ndarray'>


# Step 2: Implementing the KNN algorithm

## 2.a Calculating cosine similarity between X_train and X_test

In [13]:
# Similarity of every training document to every test document, transposed so
# that each row corresponds to one test item (easier to iterate over).
cos_sim = cosine_similarity(X_train, X_test).T
print(type(cos_sim))
print(cos_sim.shape)
print(cos_sim)

<class 'numpy.ndarray'>
(15000, 14999)
[[0.01886262 0.01033139 0.02266331 ... 0.00926203 0.02210489 0.02098045]
 [0.02660679 0.01075998 0.0016965  ... 0.0114348  0.00502515 0.01075864]
 [0.0281239  0.03783957 0.02371897 ... 0.00909485 0.02239779 0.02775007]
 ...
 [0.01195106 0.06832874 0.01023805 ... 0.00485011 0.03405178 0.01403564]
 [0.03929119 0.02227215 0.01832944 ... 0.03110629 0.00766168 0.03120383]
 [0.00903945 0.02842507 0.03901048 ... 0.01157718 0.00698245 0.02970456]]


## 2.b Find the k nearest neighbours from the cosine similarity matrix for each test item

In [29]:
k = 299 # Best value of k chosen after experimenting
# For every test row, argsort orders training indices by ascending similarity;
# the last k entries are therefore the k most similar training documents.
indices = [np.argsort(similarities)[-k:] for similarities in cos_sim]

In [30]:
# Verify that every test item received exactly k neighbours (fewer would mean
# k exceeds the number of training documents), then print the length of the
# first entry as a visual confirmation.
assert all(len(neighbours) == k for neighbours in indices), "some test items have fewer than k neighbours"
print(len(indices[0]))

299


## 2.c Find the predicted value of each test item using majority voting

In [31]:
# Majority vote: the predicted label of a test item is the most common
# training label among its k nearest neighbours.
pred_val = [mode(y_train[neighbour_ids]) for neighbour_ids in indices]

In [32]:
# Verify that one prediction was produced per test review (15000 expected),
# then print the count as a visual confirmation.
assert len(pred_val) == len(test_data), "number of predictions differs from number of test reviews"
print(len(pred_val))

15000


## 2.d Write predicted values to a text file in the requested format

In [33]:
# Write one predicted label per line, in the same order as the test reviews.
with open('pred_values.txt', 'w') as out_file:
    out_file.writelines('%s\n' % label for label in pred_val)
    print("Wrote",len(pred_val),"items successfully to pred_values.txt")

Wrote 15000 items successfully to pred_values.txt
