In [18]:
import numpy as np
import pandas as pd
# Locations of the labelled training reviews and the unlabelled test reviews.
# NOTE(review): these are absolute paths on one machine; the notebook cannot be
# re-run elsewhere without editing them.
train_file = "/Users/krishna/workspace/python/CS584/HW1_(584)_Movie_Review_Classification/train.dat"
test_file = "/Users/krishna/workspace/python/CS584/HW1_(584)_Movie_Review_Classification/test.dat"

# Read the first two tab-separated fields of every training line and name them
# "Sentiment" and "Review". dtype=None asks genfromtxt to infer each column's type.
train_array = np.genfromtxt(train_file, usecols=(0, 1), dtype=None, encoding="UTF-8", delimiter='\t')
train_data = pd.DataFrame(train_array)
train_data.columns = ["Sentiment", "Review"]


# Read the test file as a single "Review" column, using the "#EOL" marker as the delimiter.
# NOTE(review): genfromtxt's default comment character is '#', so text after the first
# '#' on a line (including inside a review) may be dropped — confirm against the raw file.
test_array = np.genfromtxt(test_file, usecols=(0), encoding="UTF-8", dtype=None, delimiter="#EOL")
test_data = pd.DataFrame(test_array)
test_data.columns = ["Review"]


In [3]:
# Preview the first rows of the training frame to check the Sentiment/Review columns.
train_data.head()

Unnamed: 0,Sentiment,Review
0,1,One of my all-time favorite so-laughably-lousy...
1,-1,"I had high hopes for this film, because I thou..."
2,-1,"When this was released, I thought this was one..."
3,-1,I just watched this movie on Starz. Let me go ...
4,1,I loved it so much that I bought the DVD and t...


In [4]:
# Preview the first rows of the test frame to check the Review column.
test_data.head()

Unnamed: 0,Review
0,"This is a very low budget film, set in one loc..."
1,One minute into THE UNTOLD and it`s already ri...
2,I recently purchased this on DVD as I hadn't h...
3,Some people have the ability to use only 3 neu...
4,"As I've said in the title of this review, It p..."


<h2>Now we'll clean and then pre-process the text data.</h2>

In [9]:
import re
import nltk
import unicodedata
from bs4 import BeautifulSoup
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import ToktokTokenizer
nltk.download('stopwords')
nltk.download('wordnet')

# Patterns are compiled once, when the cell runs, instead of on every review.
_EMAIL_RE = re.compile(r'([\w\.-]+@[\w\.-]+\.\w+)')

# URL pattern assembled from adjacent raw-string pieces. The previous version
# continued a single raw string with backslash-newline, which kept the backslash,
# the newline and the indentation as literal pattern text, so two of the
# alternatives could only match after that exact whitespace.
_URL_RE = re.compile(
    r'(?i)\b((?:[a-z][\w-]+:(?:/{1,3}|[a-z0-9%])|www\d{0,3}[.]|'
    r'[a-z0-9.\-]+[.][a-z]{2,4}/|[a-z0-9.\-]+[.][a-z])(?:[^\s()<>]+|\(([^\s()<>]+|'
    r'(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:\'".,<>?«»“”‘’]))'
)

# NLTK helpers are built once; reloading the stop-word list for every review
# is pure overhead. A (frozen) set gives fast membership tests.
_TOKENIZER = ToktokTokenizer()
_STOP_WORDS = frozenset(stopwords.words("english"))
_LEMMATIZER = WordNetLemmatizer()


def pre_process(review):
    """Clean one raw review and return it as a space-separated string of lemmas.

    Steps, in order:
      1. Strip HTML markup with BeautifulSoup.
      2. Drop accents / non-ASCII characters (NFKD normalisation, ASCII encode).
      3. Remove e-mail addresses and URLs. Numbers are NOT removed.
      4. Tokenize with ToktokTokenizer.
      5. Discard tokens of three characters or fewer and English stop words
         (compared case-insensitively, so "This" is dropped like "this").
      6. Lemmatize the remaining tokens with WordNetLemmatizer.

    Parameters
    ----------
    review : str
        Raw review text, possibly containing HTML.

    Returns
    -------
    str
        Lemmatized tokens joined by single spaces (empty string if nothing
        survives the filters). Token casing is left untouched.
    """
    # 1. Remove HTML tags
    text_only = BeautifulSoup(review, features="html.parser").get_text()

    # 2. Remove accented / non-ASCII characters
    ascii_text = unicodedata.normalize('NFKD', text_only).encode('ascii', 'ignore').decode('UTF-8', 'ignore')

    # 3. Remove e-mail addresses, then URLs
    no_email = _EMAIL_RE.sub('', ascii_text)
    no_url = _URL_RE.sub('', no_email)

    # 4. Tokenize
    tokens = _TOKENIZER.tokenize(no_url)

    # 5 + 6. Filter short tokens and stop words, then lemmatize what is left.
    # The stop-word list is lowercase, so compare on the lowercased token.
    lemmas = [
        str(_LEMMATIZER.lemmatize(token))
        for token in tokens
        if len(token) > 3 and token.lower() not in _STOP_WORDS
    ]

    return " ".join(lemmas)
    

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/krishna/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/krishna/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


<h2>Creating a TF-IDF weighted term matrix (unigrams and bigrams)</h2>

In [10]:
def create_vector(train_data, test_data):
    """Turn the training and test reviews into TF-IDF feature matrices.

    The vectorizer is fitted on the training reviews only and then applied to
    the test reviews, so both matrices share the same vocabulary and columns.

    Parameters
    ----------
    train_data : iterable of str
        Pre-processed training reviews.
    test_data : iterable of str
        Pre-processed test reviews.

    Returns
    -------
    tuple
        ``(train_vector, test_vector)`` as returned by
        ``TfidfVectorizer.fit_transform`` and ``TfidfVectorizer.transform``.
    """
    from sklearn.feature_extraction.text import TfidfVectorizer

    tfidf_vectorizer = TfidfVectorizer(
        norm='l2',
        lowercase=True,
        # An integer min_df must be >= 1; 0 is outside the documented range and
        # is rejected by newer scikit-learn releases. 1 keeps every term, which
        # is what 0 was meant to do.
        min_df=1,
        use_idf=True,
        smooth_idf=False,
        sublinear_tf=True,
        ngram_range=(1, 2),  # unigrams and bigrams
    )
    train_vector = tfidf_vectorizer.fit_transform(train_data)
    test_vector = tfidf_vectorizer.transform(test_data)

    return train_vector, test_vector

<h2>Computing cosine similarity between test and training reviews</h2>

In [11]:
def find_similarity(train_vector, test_vector):
    """Return the cosine similarity of every test vector to every training vector.

    The result has one row per row of ``test_vector`` and one column per row
    of ``train_vector``.
    """
    from sklearn.metrics.pairwise import cosine_similarity

    return cosine_similarity(test_vector, train_vector)

<h2>Finding the nearest neighbours and predicting the sentiment (k-NN)</h2>

In [12]:
def predict(similarities, labels, k):
    """Predict a sentiment for each test row by majority vote of its k nearest neighbours.

    Parameters
    ----------
    similarities : array-like, shape (n_test, n_train)
        Similarity of each test item to each training item; larger means closer.
    labels : array-like, length n_train
        Training sentiments, aligned by position with the columns of
        ``similarities``. A value equal to 1 (after ``int`` conversion) counts
        as positive; anything else counts as negative.
    k : int
        Number of neighbours that vote. Must be at least 1.

    Returns
    -------
    list of int
        One prediction per test row: 1 when positive neighbours are a strict
        majority, otherwise -1 (ties go to -1).

    Raises
    ------
    ValueError
        If ``k`` is smaller than 1.
    """
    if k < 1:
        raise ValueError("k must be a positive integer")

    similarities = np.asarray(similarities)

    # Convert once to a plain array so neighbours are looked up by POSITION.
    # Indexing a pandas Series with an integer is a label lookup, which breaks
    # (or silently picks the wrong row) when the index is not 0..n-1.
    is_positive = np.asarray(labels).astype(int) == 1

    predictions = []
    for similarity in similarities:
        # Indices of the k most similar training items (descending similarity).
        nearest_neighbours = np.argsort(-similarity)[:k]

        positive_count = int(np.count_nonzero(is_positive[nearest_neighbours]))
        negative_count = len(nearest_neighbours) - positive_count

        # Majority vote; a tie is resolved as negative.
        predictions.append(1 if positive_count > negative_count else -1)

    return predictions

In [13]:
# def iter_row(df, col):
#     for i, row in df.iterrows():
#         val = row[col]
#         df.at[i, col] = pre_process(val)
# import time
# start_time = time.clock()
# iter_row(train_data, "Review")
# end_time = time.clock()

<h2>Use this when predicting on unseen test data</h2>

In [16]:
# Use this when predicting an unseen test file

#### Preprocessing
%timeit train_data["Review"] = train_data["Review"].apply(pre_process)
%timeit test_data["Review"] = test_data["Review"].apply(pre_process)

train_vector, test_vector = create_vector(train_data["Review"], test_data["Review"])
similarities = find_similarity(train_vector, test_vector)
predictions = predict(similarities, train_data["Sentiment"], k=250)

19.6 s ± 61.2 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
19.4 s ± 454 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


<h2>Use this to Measure the accuracy for known test data</h2>

In [None]:
# K Fold Cross Validation (Used for measuring)

# import copy 
# accuracy_list = []
# k_fold = 5
# data_set = copy.deepcopy(train_array)
# for _ in range(k_fold):
#     fold_size = int((len(data_set) / k_fold))
    
    
#     new_train_array = data_set[fold_size:]
#     new_train_data = pd.DataFrame(new_train_array)
#     new_train_data.columns = ["Sentiment", "Review"]
    
#     new_test_array = data_set[:fold_size]
#     new_test_data = pd.DataFrame(new_test_array)
#     new_test_data.columns = ["Sentiment", "Review"]

#### Preprocessing
# #     new_train_data["Review"] = new_train_data["Review"].apply(pre_process)
# #     new_test_data["Review"] = new_test_data["Review"].apply(pre_process)


#     train_vector, test_vector = create_vector(new_train_data["Review"], new_test_data["Review"])


#     similarities = find_similarity(train_vector, test_vector)

#     predictions = predict(similarities, new_train_data["Sentiment"], k=250)

#     from sklearn import metrics
#     #     print(metrics.confusion_matrix(new_test_data["Sentiment"], predictions))
#     accuracy = metrics.accuracy_score(new_test_data["Sentiment"], predictions)
#     print("Accuracy for %s is %s" %(_, accuracy))
#     accuracy_list.append(accuracy)
    
#     data_set = np.delete(data_set, slice(0, fold_size), axis=0)
#     data_set = np.concatenate([new_train_array, new_test_array])
    
# print(accuracy_list)
# np.mean(accuracy_list)

In [None]:
# Write the predictions to the output file, one label per line.
with open('/Users/krishna/Downloads/Fall_21/CS_584_DM/format.dat', 'w') as log:
    log.writelines(str(label) + '\n' for label in predictions)
