# Review Recommendation Prediction Using Natural Language Processing Methods

### Importing the libraries and the data

In [1]:
import pandas as pd
import numpy as np
import string
import re
import nltk
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.model_selection import train_test_split

fulldata = pd.read_csv('data.csv')
data = fulldata[['Title', 'Review Text', 'Recommended IND']]

stopwords = nltk.corpus.stopwords.words('english')
wn = nltk.WordNetLemmatizer()

In [2]:
data.head(10)

Unnamed: 0,Title,Review Text,Recommended IND
0,,Absolutely wonderful - silky and sexy and comf...,1
1,,Love this dress! it's sooo pretty. i happene...,1
2,Some major design flaws,I had such high hopes for this dress and reall...,0
3,My favorite buy!,"I love, love, love this jumpsuit. it's fun, fl...",1
4,Flattering shirt,This shirt is very flattering to all due to th...,1
5,Not for the very petite,"I love tracy reese dresses, but this one is no...",0
6,Cagrcoal shimmer fun,I aded this in my basket at hte last mintue to...,1
7,"Shimmer, surprisingly goes with lots","I ordered this in carbon for store pick up, an...",1
8,Flattering,I love this dress. i usually get an xs but it ...,1
9,Such a fun dress!,"I'm 5""5' and 125 lbs. i ordered the s petite t...",1


In [3]:
data.shape

(23486, 3)

### Filling the null data

In [4]:
data.isnull().sum()

Title              3810
Review Text         845
Recommended IND       0
dtype: int64

In [5]:
(data['Title'].isnull() & data['Review Text'].isnull()).sum()

844

In [6]:
data['Title'] = np.where(data['Title'].isnull(), '', data['Title'])
data['Review Text'] = np.where(data['Review Text'].isnull(), '', data['Review Text'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


### Combining the title and the review body

In [7]:
data['fullreview'] = data['Title'].map(str) +'. ' + data['Review Text'].map(str)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [8]:
data.head()

Unnamed: 0,Title,Review Text,Recommended IND,fullreview
0,,Absolutely wonderful - silky and sexy and comf...,1,. Absolutely wonderful - silky and sexy and co...
1,,Love this dress! it's sooo pretty. i happene...,1,. Love this dress! it's sooo pretty. i happe...
2,Some major design flaws,I had such high hopes for this dress and reall...,0,Some major design flaws. I had such high hopes...
3,My favorite buy!,"I love, love, love this jumpsuit. it's fun, fl...",1,"My favorite buy!. I love, love, love this jump..."
4,Flattering shirt,This shirt is very flattering to all due to th...,1,Flattering shirt. This shirt is very flatterin...


In [9]:
print('Recommended: {}'.format(len(data[data['Recommended IND'] == 1])))
print('Not Recommended: {}'.format(len(data[data['Recommended IND'] == 0])))

Recommended: 19314
Not Recommended: 4172


### Removing punctiation and stopwords, then lemmatizing

In [10]:
def clean_text(text):
    text = "".join([word.lower() for word in text if word not in string.punctuation])
    tokens = re.split('\W+', text)
    text = [wn.lemmatize(word) for word in tokens if word not in stopwords]
    return text

### Splitting the data to training and test data

In [11]:
X_train, X_test, y_train, y_test = train_test_split(data['fullreview'], data['Recommended IND'], test_size=0.2)

### Vectorizing

In [12]:
#Count vectorizer
count_vect = CountVectorizer(analyzer=clean_text)
count_vect_fit = count_vect.fit(X_train)

count_train = count_vect_fit.transform(X_train)
count_test = count_vect_fit.transform(X_test)

X_train_vect1 = pd.concat([pd.DataFrame(count_train.toarray())], axis=1)
X_test_vect1 = pd.concat([pd.DataFrame(count_test.toarray())], axis=1)

#TF-IDF Vectorizer
tfidf_vect = TfidfVectorizer(analyzer=clean_text)
tfidf_vect_fit = tfidf_vect.fit(X_train)

tfidf_train = tfidf_vect_fit.transform(X_train)
tfidf_test = tfidf_vect_fit.transform(X_test)

X_train_vect2 = pd.concat([pd.DataFrame(tfidf_train.toarray())], axis=1)
X_test_vect2 = pd.concat([pd.DataFrame(tfidf_test.toarray())], axis=1)

X_train_vect2.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,16443,16444,16445,16446,16447,16448,16449,16450,16451,16452
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.117758,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.176838,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Model Building, predicting, comparison and the final results

### Model with Count Vectorizing

In [13]:
from sklearn.metrics import precision_recall_fscore_support as score
import time

In [14]:
rf = RandomForestClassifier(n_estimators=150, max_depth=None, n_jobs=-1)

start = time.time()
rf_model = rf.fit(X_train_vect1, y_train)
end = time.time()
fit_time = (end - start)

start = time.time()
y_pred = rf_model.predict(X_test_vect1)
end = time.time()
pred_time = (end - start)

precision, recall, fscore, train_support = score(y_test, y_pred, average='binary')
print('Fit time: {} / Predict time: {} ---- Precision: {} / Recall: {} / Accuracy: {}'.format(
    round(fit_time, 3), round(pred_time, 3), round(precision, 3), round(recall, 3), round((y_pred==y_test).sum()/len(y_pred), 3)))

Fit time: 28.744 / Predict time: 0.317 ---- Precision: 0.858 / Recall: 0.995 / Accuracy: 0.861


### Model with TF-IDF Vectorizing

In [15]:
rf = RandomForestClassifier(n_estimators=150, max_depth=None, n_jobs=-1)

start = time.time()
rf_model = rf.fit(X_train_vect2, y_train)
end = time.time()
fit_time = (end - start)

start = time.time()
y_pred = rf_model.predict(X_test_vect2)
end = time.time()
pred_time = (end - start)

precision, recall, fscore, train_support = score(y_test, y_pred, average='binary')
print('Fit time: {} / Predict time: {} ---- Precision: {} / Recall: {} / Accuracy: {}'.format(
    round(fit_time, 3), round(pred_time, 3), round(precision, 3), round(recall, 3), round((y_pred==y_test).sum()/len(y_pred), 3)))

Fit time: 30.039 / Predict time: 0.321 ---- Precision: 0.854 / Recall: 0.994 / Accuracy: 0.856
