In [14]:
######################################################
#Karuna Gujar
#CSCI 6350-001 Project #4
#Due: 02/23/20

#THis program uses the polarity and intensity of words to assign one of five ratings to 
#product reviews via a multinomial logistic regression classifier.
######################################################

import numpy as np
import random
import nltk
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from nltk.tokenize import sent_tokenize
from nltk.tokenize import word_tokenize
from nltk import pos_tag
from nltk.corpus import opinion_lexicon
import pandas as pd
import string
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem import LancasterStemmer
import pandas as pd 
from sklearn.model_selection import train_test_split
from nltk.stem import WordNetLemmatizer
from sklearn.preprocessing import StandardScaler
from sklearn.utils import resample
from sklearn import preprocessing

file_name = 'reviews.txt'
train_file = 'reviews-train.txt'
test_file = 'reviews-test.txt'

test_train_div = 0.2
stop_words = set(stopwords.words('english'))

#lemmatizing
lemmatizer = WordNetLemmatizer()

### THis function reads the input file and  divides it into training and testing files for processing. 
### The test-train ratio is 20:80 
def read_input_file(filename):
    with open(filename, 'r', encoding='utf8') as file:
        lines = []
        category = []
        for line in file:

            if line != '\n':
                vals = line.split()
                category.append(vals[-1])
                line = ''
                for i in range(len(vals)-1):
                    line = line + ' ' + vals [i]
                lines.append(line)
    df_review_file = pd.DataFrame({'label':category, 'review':lines})
    
    
    Y = df_review_file.loc[:,'label'].values
    X = df_review_file.loc[:,'review'].values

    train_x, test_x, train_y, test_y = train_test_split(X,Y , test_size=test_train_div, random_state=0, stratify=Y)
    
    
    before_upsample = pd.DataFrame({'label':train_y, 'review':train_x})
    test_df = pd.DataFrame({'label':test_y, 'review':test_x})
       
    test_df.to_csv(test_file,sep="|")
    before_upsample.to_csv(train_file,sep="|")
    
    return before_upsample, test_df


train_df, test_df = read_input_file(file_name)


In [15]:

### THis function tokenizes and cleans the tokens by removing punctuation, 
### lemmatizig and converting into lower case.
def preprocess_and_tokenize( comment):
    
    tokens = word_tokenize(comment)
    # remove puntuation from each word
    table = str.maketrans('', '', string.punctuation)
    
    token_list = []
    
    for token in tokens:
        token = token.translate(table)

        #remove punctuations from non-aplhabetic words
        if(token.isalpha()):
            token = lemmatizer.lemmatize(token)
            token_list.append(token.lower())

    return token_list

### adding respective tokens to train and test dataframes
train_df["tokens"]=train_df.review.apply(lambda x:preprocess_and_tokenize(x))
test_df["tokens"]=test_df.review.apply(lambda x:preprocess_and_tokenize(x))

In [16]:
### Using frequency distribution to find 2000 most commonly seen unigrams in the reviews
all_train_words = [ x for item in train_df["tokens"].values.tolist() for x in item ]
all_train_words = nltk.FreqDist(all_train_words)
all_train_words = all_train_words.most_common(2000)
all_train_words = [x[0] for x in all_train_words]

### one-hot encoding for the unigrams in the reviews depending on the most common 2000 unigrams
def get_ohe( x, all_words):
    return [1 if word in x else 0 for word in all_words]

### Adding one-hot encoded unigrams to the test and train dataframes
train_df["unigrams_vec"] = train_df.tokens.apply(lambda x:get_ohe(x,all_train_words))
test_df["unigrams_vec"] = test_df.tokens.apply(lambda x:get_ohe(x,all_train_words))

In [17]:
### extracting bigrams
def get_bigrams( tokens):
    bigrm = nltk.bigrams(tokens)
    return [*map(' '.join, bigrm)]


train_df["bi_tokens"]=train_df.tokens.apply(lambda x:get_bigrams(x))
test_df["bi_tokens"]=test_df.tokens.apply(lambda x:get_bigrams(x))

### Using frequency distribution to find 1000 most commonly seen bigrams in the reviews
all_train_bigrams = [ x for item in train_df["bi_tokens"].values.tolist() for x in item ]
all_train_bigrams = nltk.FreqDist(all_train_bigrams)
all_train_bigrams = all_train_bigrams.most_common(1000)
all_train_bigrams = [x[0] for x in all_train_bigrams]

### one-hot encoding for the bigrams in the reviews depending on the most common 1000 bigrams
train_df["bigram_vec"] = train_df.bi_tokens.apply(lambda x:get_ohe(x,all_train_bigrams))
test_df["bigram_vec"] = test_df.bi_tokens.apply(lambda x:get_ohe(x,all_train_bigrams))

In [18]:
### combining unigrams and bigrams vectors
train_df["combined"] = train_df.apply(lambda x:x.unigrams_vec + x.bigram_vec,axis=1)
test_df["combined"] = test_df.apply(lambda x:x.unigrams_vec + x.bigram_vec,axis=1)

Y = train_df.label.values
X = train_df.combined.values.tolist()

ytest = test_df.label.values
xtest = test_df.combined.values.tolist()

In [20]:
### "Penalty" parameter is used for regularization. If "penalty" is not mentioned explicitly, 
### no regularization is applied. "l2" is a type of regularization. 
### The ‘newton-cg’, ‘sag’ and ‘lbfgs’ solvers support only l2 penalties

### The "C" parameter in the LogisticRegression function is used to decide the strength of regularization. 
### It is inverse of regularization strength and must be a positive float. 
### Smaller values of C specify stronger regularization.

### "Class weight" pamater is used to decide how much weight to be given to each class.
### The “balanced” mode uses the values of y to automatically adjust weights inversely proportional to 
### class frequencies in the input data. Since the reviews data we had, was very unbiased, 
### I decided to use this parameter.

for mclass in ('multinomial', 'ovr'):
    lr = LogisticRegression(solver='lbfgs', max_iter=100, C=0.01,
                            random_state=0, multi_class=mclass, \
                            dual=False,penalty="l2",class_weight="balanced").fit(X, Y)
    yhat = lr.predict(xtest)

    print(mclass,'Accuracy: ',accuracy_score(ytest, yhat))
    print(mclass,'Classification Report: \n',classification_report(ytest, yhat))
    print(mclass,'Classification Report: \n',confusion_matrix(ytest, yhat))

multinomial Accuracy:  0.576
multinomial Classification Report: 
               precision    recall  f1-score   support

           1       0.27      0.40      0.32        10
           2       0.14      0.27      0.18        15
           3       0.13      0.26      0.18        34
           4       0.35      0.36      0.36        99
           5       0.82      0.69      0.75       342

    accuracy                           0.58       500
   macro avg       0.34      0.40      0.36       500
weighted avg       0.65      0.58      0.61       500

multinomial Classification Report: 
 [[  4   1   4   0   1]
 [  1   4   5   3   2]
 [  1   5   9  10   9]
 [  1   7  15  36  40]
 [  8  12  34  53 235]]
ovr Accuracy:  0.606
ovr Classification Report: 
               precision    recall  f1-score   support

           1       0.29      0.20      0.24        10
           2       0.12      0.13      0.12        15
           3       0.18      0.29      0.22        34
           4       0.36  

1.	Selection of features: I am using 2000 most commonly used unigrams and 1000 most commonly used bigrams as the features. 
I started with 
    a.	total length of the review
    b.	presence of number of positive words and negative words in the reviews
    c.	c. most common top 2000 words
and observed that the length of the review was not very helpful. Removing that feature did not cause any loss in the accuracy.
Another observation was that positive and negative words did not help because the reference in which these (positive or negative) words were used was equally important, merely getting the counts wasn’t sufficient. Funny thing, the review with rating 5 sometimes had a greater number of negative words than positive. Hence this feature wasn’t helpful. 
So I decided to stick to only the commonly used words feature. To make it more precise I analyzed using both unigrams and bigrams. Another interesting thing I noticed was, not removing the stop words actually bumped up the accuracy. 
Also, I observed that the data given was biased or unsymmetrical, that is, the number of data points for rating 5 outsized the number of data points for rating 1 or 2 or 3. So I looked into data imbalance and found that the solution to fix it is upsclaing the inferiror classes. I worked on it for a while and realized that did not improve the performance much if i considered threshhold as 10% of supperior class (required threshold to be 60%). Later found about the "class_weight" parameter in logistic regression fumction which is used to assign weights to the classes depending on the balance. "Class weight" pamater is used to decide how much weight to be given to each class. The “balanced” mode uses the values of y to automatically adjust weights inversely proportional to class frequencies in the input data. This dropped the accuracy but gave better precision, recall and F1 values.

2.	I got better accuracy and precision with ovr than multinomial. However, recall and F1 score is better with multinomial. 

3.	Accuracy is ratio of correctly predicted observation to the total observations. Accuracy is a great measure  to gauge the performance but only with symmetric datasets where values of false positive and false negatives are almost same. I got 58% accuracy with Multinomial classification and 60% with ovr. However, the reviews data given was highly unsymmetrical. Thus looking at other metrics is important to gauge the performance. Precision is the ratio of correctly predicted positive observations to the total predicted positive observations. Precision for class 5 is better than others, in both multinomial and ovr. The reason being, there were lot of data samples for class 5 resulting in the model trained better for review 5 than others.
Recall is the ratio of correctly predicted positive observations to the all observations in actual class - yes. Can also be termed as sensitivity. F1 Score is the weighted average of Precision and Recall. Therefore, this score takes both false positives and false negatives into account. F1 is usually more useful than accuracy, especially in case of uneven class distribution. I got F1 0.75 for multinomial and 0.77 for ovr for class 5.
A confusion matrix shows the combination of the actual and predicted classes. Each row of the matrix represents the instances in a predicted class, while each column represents the instances in an actual class. It is a good measure of whether models can account for the overlap in class properties and understand which classes are most easily confused.



REGULARIZATION

SkLearn uses parameter “Penalty” in the logisticRegression function to indicate regularization. "Penalty" parameter is used for regularization. If "penalty" is not mentioned explicitly, no regularization is applied. "l2" is a type of regularization. 
The ‘newton-cg’, ‘sag’ and ‘lbfgs’ solvers support only l2 penalties. The "C" parameter in the LogisticRegression function is used to decide the strength of regularization. It is inverse of regularization strength and must be a positive float. 
Smaller values of C specify stronger regularization. So I selected C=0.01.

Regularization can be used to avoid overfitting. Regularization can be used to train models that generalize better on unseen data, by preventing the algorithm from overfitting the training dataset. 

Accuracy increased from 58% to 59% in multinomial without regularizing, where as it dropped in ovr from 60% to 59%. More importantly, precision and F1 score dropped for both ovr and multinomial without regularizing.

Following is the output without regularization: 


multinomial Accuracy:  0.596
multinomial Classification Report: 
               precision    recall  f1-score   support

           1       0.50      0.20      0.29        10
           2       0.08      0.07      0.07        15
           3       0.17      0.21      0.18        34
           4       0.30      0.32      0.31        99
           5       0.77      0.75      0.76       342

    accuracy                           0.60       500
   macro avg       0.36      0.31      0.32       500
weighted avg       0.61      0.60      0.60       500

multinomial Classification Report: 
 [[  2   1   4   1   2]
 [  0   1   5   4   5]
 [  0   3   7  12  12]
 [  0   1   7  32  59]
 [  2   7  19  58 256]]
ovr Accuracy:  0.594
ovr Classification Report: 
               precision    recall  f1-score   support

           1       0.50      0.10      0.17        10
           2       0.12      0.07      0.09        15
           3       0.14      0.15      0.14        34
           4       0.29      0.32      0.30        99
           5       0.76      0.75      0.76       342

    accuracy                           0.59       500
   macro avg       0.36      0.28      0.29       500
weighted avg       0.60      0.59      0.59       500

ovr Classification Report: 
 [[  1   1   3   3   2]
 [  0   1   5   3   6]
 [  0   3   5  11  15]
 [  0   0   7  32  60]
 [  1   3  17  63 258]]



FOllowing is the code that is used for regularizing:

lr = LogisticRegression(solver='lbfgs', max_iter=100, C=0.01,
                            random_state=0, multi_class=mclass, \
                            dual=False,penalty="l2",class_weight="balanced").fit(X, Y)

