### Multinomial logistic regression, 3000 TF-IDF features, pipeline

In [1]:
# Show the value of every top-level expression in a cell, not only the last one,
# so cells such as `data.shape` followed by `data.head(3)` display both results.
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity='all'

In [2]:
# Importing Basic libraries

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import norm
import seaborn as sns

from collections import Counter
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, plot_confusion_matrix, f1_score

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression

from sklearn.pipeline import Pipeline
import pickle

In [3]:
# Libraries for text preprocessing
import re
import nltk

#nltk.download('stopwords')
from nltk.corpus import stopwords # import stopwords 

#nltk.download('wordnet') 
from nltk.stem.wordnet import WordNetLemmatizer # to lemmatize the words

from nltk.tokenize import RegexpTokenizer, word_tokenize # to tokenize sentences into words

### Loading Train dataset

In [4]:
# Read the training reviews; the path is relative to the notebook's working directory
data = pd.read_csv("train.csv")

In [5]:
data.shape  # (14343, 3)
data.head(3)  # first three rows

(14343, 3)

Unnamed: 0,ID,Review,Rating
0,0,exceptional service nice all-around daughter s...,5
1,1,beautiful relaxing jw marriott desert ridge re...,5
2,2,great location great location 5 mins subway ta...,5


## Preprocessing of train data

In [6]:
# Lower-case every review and expand negative contractions ("n't" / "'nt" -> " not")
# so the negation survives the later punctuation stripping.
# The stem left behind is kept as-is here (e.g. "can't" -> "ca not", "won't" -> "wo not").
# Vectorised .str operations replace the previous row-by-row iterrows() loop;
# the patterns and the resulting 'Review_2' column are unchanged.
data['Review_2'] = (
    data['Review']
    .str.lower()                                # convert to lowercase
    .str.replace(r"n\'t", " not", regex=True)   # replace n't with not
    .str.replace(r"\'nt", " not", regex=True)   # replace 'nt with not
)
data[0:3]

Unnamed: 0,ID,Review,Rating,Review_2
0,0,exceptional service nice all-around daughter s...,5,exceptional service nice all-around daughter s...
1,1,beautiful relaxing jw marriott desert ridge re...,5,beautiful relaxing jw marriott desert ridge re...
2,2,great location great location 5 mins subway ta...,5,great location great location 5 mins subway ta...


In [7]:
# TEXT NORMALIZATION / PRE-PROCESSING

# English stop words, keeping "not" so that negation is preserved
stop_words = set(stopwords.words("english")) - {"not"}
# Custom stop words: domain terms plus short fragments (e.g. "ca", "wo", "n")
new_words = ['n',"th","hotel",'hotels','room','rooms','restaurant','restaurants','e','ca','nd','wo','el','etc']
stop_words = stop_words.union(new_words)

# A single lemmatizer is shared by all reviews (it used to be re-created for every row)
lemmatizer = WordNetLemmatizer()


def clean_review(review):
    """Return `review` reduced to space-joined, lemmatized, non-stop-word tokens.

    Every character that is not an ASCII letter is turned into a space, the text
    is split on whitespace, tokens found in `stop_words` are dropped (checked
    before lemmatization), and the remaining tokens are lemmatized.

    The earlier tag-removal and digit/special-character regex passes were
    dropped: they ran after the non-letter replacement, when only letters and
    spaces were left, so they could not change the resulting tokens.
    """
    tokens = re.sub('[^a-zA-Z]', ' ', review).split()
    return " ".join(lemmatizer.lemmatize(token) for token in tokens if token not in stop_words)


data['Clean_review'] = data['Review_2'].apply(clean_review)
data.head(3)

Unnamed: 0,ID,Review,Rating,Review_2,Clean_review
0,0,exceptional service nice all-around daughter s...,5,exceptional service nice all-around daughter s...,exceptional service nice around daughter staye...
1,1,beautiful relaxing jw marriott desert ridge re...,5,beautiful relaxing jw marriott desert ridge re...,beautiful relaxing jw marriott desert ridge re...
2,2,great location great location 5 mins subway ta...,5,great location great location 5 mins subway ta...,great location great location min subway take ...


## Training using 3 Rating levels 

In [8]:
# Collapse the star rating into three levels:
#   1 -> rating in [1, 3)
#   2 -> rating == 3
#   3 -> rating in (3, 5]
# Ratings outside 1..5 stay None, exactly as with the previous row-by-row loop.
# Boolean masks replace iterrows(); the column is still created with object dtype.
rating = data['Rating']
data['Rating_3levels'] = None
data.loc[(rating >= 1) & (rating < 3), 'Rating_3levels'] = 1
data.loc[rating == 3, 'Rating_3levels'] = 2
data.loc[(rating > 3) & (rating <= 5), 'Rating_3levels'] = 3
data.head()

Unnamed: 0,ID,Review,Rating,Review_2,Clean_review,Rating_3levels
0,0,exceptional service nice all-around daughter s...,5,exceptional service nice all-around daughter s...,exceptional service nice around daughter staye...,3
1,1,beautiful relaxing jw marriott desert ridge re...,5,beautiful relaxing jw marriott desert ridge re...,beautiful relaxing jw marriott desert ridge re...,3
2,2,great location great location 5 mins subway ta...,5,great location great location 5 mins subway ta...,great location great location min subway take ...,3
3,3,"pleased nice safe hotel, flower market hotel v...",3,"pleased nice safe hotel, flower market hotel v...",pleased nice safe flower market vast array mai...,2
4,4,excellent hotel service great hotel excellent ...,4,excellent hotel service great hotel excellent ...,excellent service great excellent location cou...,3


#### Target variable

In [9]:
# Column dtypes and non-null counts after the preprocessing steps
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14343 entries, 0 to 14342
Data columns (total 6 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   ID              14343 non-null  int64 
 1   Review          14343 non-null  object
 2   Rating          14343 non-null  int64 
 3   Review_2        14343 non-null  object
 4   Clean_review    14343 non-null  object
 5   Rating_3levels  14343 non-null  object
dtypes: int64(2), object(4)
memory usage: 672.5+ KB


In [10]:
# The 3-level rating was built as an object column; store it as integers instead
data['Rating_3levels'] = data['Rating_3levels'].astype(int)


In [11]:
# Target vector: the 3-level rating, followed by its shape and first five values
y = data['Rating_3levels']
y.shape
y.head(5)

(14343,)

0    3
1    3
2    3
3    2
4    3
Name: Rating_3levels, dtype: int32

In [12]:
# Splitting dataset in 80:20 ratio, stratified on the 3-level rating
# (from previous models, this ratio was found to give good results)
X_train1,X_test1,y_train1,y_test1 = train_test_split(data.Clean_review,y,test_size=0.20,random_state=123,stratify=y)
X_train1.shape # (11474,) - a Series of cleaned review text, not a feature matrix
X_test1.shape # (2869,)
y.shape  # 14343 reviews

(11474,)

(2869,)

(14343,)

### Tfidf and Multinomial log reg using pipeline (3000 features)

In [13]:
# Two-step pipeline: TF-IDF vectorisation capped at 3000 features, then a
# multinomial logistic regression (newton-cg solver, fixed random_state).
# Pipeline is already imported in the imports cell at the top of the notebook,
# so the duplicate import that used to live in this cell has been dropped.
pipeline = Pipeline(steps=[
    ('tfidf', TfidfVectorizer(max_features=3000)),
    ('model', LogisticRegression(random_state=123, multi_class='multinomial', solver='newton-cg')),
])

### Fitting pipeline for full train data and predicting for same

In [14]:
# Fit the vectoriser and the classifier on all cleaned reviews (no hold-out)
pipeline.fit(data.Clean_review, y)

Pipeline(steps=[('tfidf', TfidfVectorizer(max_features=3000)),
                ('model',
                 LogisticRegression(multi_class='multinomial', random_state=123,
                                    solver='newton-cg'))])

In [15]:
# Predict on the same full dataset passed to pipeline.predict's fitted pipeline,
# then report accuracy, macro F1, the per-class report and the confusion matrix.
y_pred = pipeline.predict(data.Clean_review)

full_accuracy = accuracy_score(y, y_pred)
full_f1_macro = f1_score(y, y_pred, average='macro')

print("Accuracy score on train data:", full_accuracy)
print("F1 macro score on train data:", full_f1_macro)
print(classification_report(y, y_pred))
print(confusion_matrix(y, y_pred))

Accuracy score on train data: 0.9018336470752283
F1 macro score on train data: 0.7679944913674174
              precision    recall  f1-score   support

           1       0.87      0.86      0.87      2225
           2       0.81      0.35      0.49      1510
           3       0.91      0.99      0.95     10608

    accuracy                           0.90     14343
   macro avg       0.87      0.73      0.77     14343
weighted avg       0.90      0.90      0.89     14343

[[ 1924    59   242]
 [  225   525   760]
 [   60    62 10486]]


### Fitting pipeline for train data and checking on validation data

In [16]:
# Refit the same pipeline object, this time on the training split only
pipeline.fit(X_train1,y_train1)

Pipeline(steps=[('tfidf', TfidfVectorizer(max_features=3000)),
                ('model',
                 LogisticRegression(multi_class='multinomial', random_state=123,
                                    solver='newton-cg'))])

In [17]:
pipeline.score(X_train1,y_train1)  # score on the training split
pipeline.score(X_test1,y_test1)  # score on the held-out split

0.9044796932194527

0.8574416172882537

In [18]:

def print_split_metrics(split_name, y_true, y_hat):
    """Print accuracy, macro F1, classification report and confusion matrix.

    split_name -- label inserted into the printed headings ("train" or "test")
    y_true     -- true class labels for the split
    y_hat      -- labels predicted by the pipeline for the same rows

    Replaces two copy-pasted metric blocks; the printed text is unchanged.
    """
    print(f"Accuracy score on {split_name} data:", accuracy_score(y_true, y_hat))
    print(f"F1 macro score on {split_name} data:", f1_score(y_true, y_hat, average='macro'))
    print(classification_report(y_true, y_hat))
    print(confusion_matrix(y_true, y_hat))


# for training data
y_pred_train = pipeline.predict(X_train1)
print_split_metrics("train", y_train1, y_pred_train)

# for test data
y_pred_test = pipeline.predict(X_test1)
print_split_metrics("test", y_test1, y_pred_test)



Accuracy score on train data: 0.9044796932194527
F1 macro score on train data: 0.772295981310684
              precision    recall  f1-score   support

           1       0.88      0.87      0.88      1780
           2       0.84      0.35      0.49      1208
           3       0.91      0.99      0.95      8486

    accuracy                           0.90     11474
   macro avg       0.88      0.74      0.77     11474
weighted avg       0.90      0.90      0.89     11474

[[1551   42  187]
 [ 171  419  618]
 [  42   36 8408]]
Accuracy score on test data: 0.8574416172882537
F1 macro score on test data: 0.6596630291635069
              precision    recall  f1-score   support

           1       0.79      0.73      0.76       445
           2       0.55      0.20      0.29       302
           3       0.88      0.98      0.93      2122

    accuracy                           0.86      2869
   macro avg       0.74      0.64      0.66      2869
weighted avg       0.83      0.86      0.83  

In [19]:
# Saving the pickle file

#pickle.dump(pipeline, open('Hotel_Rating_MLogReg.pkl','wb'))

In [20]:
# Loading the pickle file

#pipe = pickle.load(open("Hotel_Rating_MLogReg.pkl",'rb'))

#### Checking if  pickle file is fully loaded

In [21]:
# Predict on the test data with the pipeline loaded from the pickle file, to check that it reproduces the accuracy above
#y_pred_test_pipe = pipe.predict(X_test1)

#print("Accuracy score on test data:" ,accuracy_score(y_test1,y_pred_test_pipe))
#print("F1 macro score on test data:",f1_score(y_test1,y_pred_test_pipe, average='macro'))
#print(classification_report(y_test1,y_pred_test_pipe))
#print(confusion_matrix(y_test1,y_pred_test_pipe))