In [1]:
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import re
import string
from nltk import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from collections import Counter
from pylab import *
import nltk
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Mount Google Drive into the Colab filesystem; the review dataset loaded
# further down is read from a path under this mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


/content/drive/MyDrive

In [4]:
# Location of the reviews file on the mounted Drive. The file is read with
# lines=True, i.e. as one JSON record per line.
REVIEWS_PATH = '/content/drive/MyDrive/Colab Notebooks/reviews_Musical_Instruments_5.json'
review_data = pd.read_json(REVIEWS_PATH, lines=True)

In [5]:
# Peek at the first rows of the two columns the rest of the notebook relies on:
# the free-text review and its star rating.
preview_columns = ['reviewText', 'overall']
review_data[preview_columns].head()

Unnamed: 0,reviewText,overall
0,"Not much to write about here, but it does exac...",5
1,The product does exactly as it should and is q...,5
2,The primary job of this device is to block the...,5
3,Nice windscreen protects my MXL mic and preven...,5
4,This pop filter is great. It looks and perform...,5


We'll use a lambda function to tokenize each 'reviewText' in this DataFrame, lowercase and lemmatize the tokens, and then use the join function to concatenate the resulting list of words back into a single string. Before tokenizing, we use a regular expression (re) to replace every run of characters other than letters, digits, and whitespace (underscores are replaced too) with a single space.

In [8]:
import nltk

# Fetch the NLTK resources used below: 'punkt' for word_tokenize and
# 'wordnet' for WordNetLemmatizer.
nltk.download('punkt')
nltk.download('wordnet')
lemmatizer = WordNetLemmatizer()

# For every review: coerce to str, replace each run of punctuation/underscores
# with a single space, tokenize, lowercase and lemmatize each token, then glue
# the tokens back together with single spaces.
# NOTE: lemmatize() is called without a part-of-speech tag, which is why the
# preview in the next cell shows artefacts such as "does" -> "doe" and "as" -> "a".
review_data['cleaned_review_text'] = review_data['reviewText'].apply(
    lambda review: ' '.join(
        lemmatizer.lemmatize(token.lower())
        for token in word_tokenize(re.sub(r'([^\s\w]|_)+', ' ', str(review)))
    )
)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...


In [9]:
# Show the cleaned text next to the original review and its rating to sanity-check the preprocessing
# (this cell only displays columns; the TF-IDF matrix is built in the next cell)
review_data[['cleaned_review_text', 'reviewText','overall']].head()

Unnamed: 0,cleaned_review_text,reviewText,overall
0,not much to write about here but it doe exactl...,"Not much to write about here, but it does exac...",5
1,the product doe exactly a it should and is qui...,The product does exactly as it should and is q...,5
2,the primary job of this device is to block the...,The primary job of this device is to block the...,5
3,nice windscreen protects my mxl mic and preven...,Nice windscreen protects my MXL mic and preven...,5
4,this pop filter is great it look and performs ...,This pop filter is great. It looks and perform...,5


In [10]:
# Build a TF-IDF matrix from the cleaned reviews and wrap it in a DataFrame
# with one row per review and one column per vocabulary term.
# max_features=500 caps the vocabulary at 500 terms.
tfidf_model = TfidfVectorizer(max_features=500)
tfidf_matrix = tfidf_model.fit_transform(review_data['cleaned_review_text'])

# vocabulary_ maps each term to its column index in the matrix. Ordering the
# terms by that index guarantees every label sits on the right column, rather
# than relying on the indices happening to follow alphabetical order.
feature_names = sorted(tfidf_model.vocabulary_, key=tfidf_model.vocabulary_.get)

# toarray() yields a plain ndarray; todense() would hand pandas the
# discouraged numpy.matrix type instead.
tfidf_df = pd.DataFrame(tfidf_matrix.toarray(), columns=feature_names)
tfidf_df.head()

Unnamed: 0,10,100,12,20,34,able,about,accurate,acoustic,actually,...,won,work,worked,worth,would,wrong,year,yet,you,your
0,0.0,0.0,0.0,0.0,0.0,0.0,0.159684,0.0,0.0,0.0,...,0.0,0.134327,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.085436,0.0,0.0,0.0,0.0,0.0,0.0,0.067074,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.115312,0.0,0.0,0.0,0.07988,0.111989
3,0.0,0.0,0.0,0.0,0.0,0.339573,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.303608,0.0


In [11]:
# Create the binary 'target' column: 0 when 'overall' is 4 or lower, 1 otherwise
# (so only ratings above 4 are labelled 1). Note the threshold is inclusive (<= 4).
is_rated_four_or_lower = review_data['overall'].le(4)
# Negating the "<= 4" mask (instead of testing "> 4") keeps any value that
# fails the comparison on the 1 side, exactly like an if/else on `x <= 4`.
review_data['target'] = (~is_rated_four_or_lower).astype(int)
review_data['target'].value_counts()

1    6938
0    3323
Name: target, dtype: int64

In [12]:
# Fit scikit-learn's LogisticRegression on the TF-IDF representation of the cleaned reviews
from sklearn.linear_model import LogisticRegression
logreg = LogisticRegression()
logreg.fit(tfidf_df,review_data['target'])
# NOTE: predictions are made on the same rows the model was fitted on, so they
# describe in-sample fit rather than performance on unseen reviews.
predicted_labels = logreg.predict(tfidf_df)
# Second column of predict_proba = predicted probability of class 1; displayed as the cell output.
logreg.predict_proba(tfidf_df)[:,1]

array([0.57146961, 0.68579907, 0.56068939, ..., 0.65979968, 0.5495679 ,
       0.21186011])

1. `logreg = LogisticRegression()` creates an instance of the `LogisticRegression` class and stores it in the variable `logreg`.

2. `logreg.fit(tfidf_df,review_data['target'])`: the `fit` method trains the logistic regression model on our data. It takes two arguments: the feature data (`tfidf_df`, the TF-IDF (Term Frequency-Inverse Document Frequency) representation of the cleaned review text built earlier) and the actual labels (`review_data['target']`).

3. `predicted_labels = logreg.predict(tfidf_df)`: once trained, we use the model to predict the labels for our feature data (`tfidf_df`) and store these predictions in `predicted_labels`. Note that these are predictions on the same data the model was trained on.

4. `logreg.predict_proba(tfidf_df)[:,1]`: Finally, the `predict_proba` method provides the probabilities for the target in the dataset, as predicted by the logistic regression model. It gives the probability that the data belongs to a particular class. The output is a 2D array where the second column (indexed by `[:,1]`) indicates the probability that the sample belongs to class 1. In the context of a review, assuming that class 1 is a positive review and class 0 is a negative review, this is telling us how likely the review is to be positive, according to the logistic regression model.

In [13]:
# Store the logistic-regression predictions on the DataFrame, then cross-tabulate
# them against the actual classes (rows: actual target, columns: predicted label).
review_data['predicted_labels'] = predicted_labels
pd.crosstab(index=review_data['target'], columns=review_data['predicted_labels'])

predicted_labels,0,1
target,Unnamed: 1_level_1,Unnamed: 2_level_1
0,1543,1780
1,626,6312


In [14]:
from sklearn.naive_bayes import GaussianNB

# Gaussian naive Bayes on the same TF-IDF features and labels; fit() returns
# the estimator itself, so construction and training are chained.
nb = GaussianNB().fit(tfidf_df, review_data['target'])
# This rebinds predicted_labels (previously the logistic-regression output);
# the following cell stores it as 'predicted_labels_nb'.
predicted_labels = nb.predict(tfidf_df)
# Second column of predict_proba = predicted probability of class 1.
nb.predict_proba(tfidf_df)[:, 1]

array([9.97730158e-01, 3.63599675e-09, 9.45692105e-07, ...,
       2.46001047e-02, 3.43660991e-08, 1.72767906e-27])

In [15]:
# Store the naive Bayes predictions on the DataFrame, then use pandas' crosstab
# to compare them with the actual classes (rows: actual target, columns: predicted label).
review_data['predicted_labels_nb'] = predicted_labels
pd.crosstab(index=review_data['target'], columns=review_data['predicted_labels_nb'])

predicted_labels_nb,0,1
target,Unnamed: 1_level_1,Unnamed: 2_level_1
0,2333,990
1,2380,4558


We will use sklearn's KNeighborsClassifier() function to fit a 3-Nearest Neighbour model on the TF-IDF representation of these reviews after cleaning. We'll further use the crosstab function of pandas to compare the results of our classification model with the actual classes.

In [16]:
from sklearn.neighbors import KNeighborsClassifier
# n_neighbors=3: each prediction is decided by the 3 closest training rows.
knn = KNeighborsClassifier(n_neighbors=3)
# Fit on the full TF-IDF matrix. NOTE(review): the next cell predicts on these
# same rows, so each review is presumably among its own nearest neighbours and
# the resulting confusion matrix is likely optimistic.
knn.fit(tfidf_df,review_data['target'])

In [17]:
# Predict with the fitted 3-nearest-neighbour model, keep the labels on the
# DataFrame, and cross-tabulate them against the actual classes.
knn_predictions = knn.predict(tfidf_df)
review_data['predicted_labels_knn'] = knn_predictions
pd.crosstab(index=review_data['target'], columns=review_data['predicted_labels_knn'])

predicted_labels_knn,0,1
target,Unnamed: 1_level_1,Unnamed: 2_level_1
0,2681,642
1,333,6605


2681 instances with the target label 0 are correctly classified, and 642 such instances are wrongly classified. Furthermore, 6605 instances with the target label 1 are correctly classified, whereas 333 such instances are wrongly classified.