# Importing Necessary libraries & Packages

In [33]:
import nltk
#nltk.download('stopwords')
#nltk.download('punkt')
nltk.download('wordnet')
import re
import numpy as np # linear algebra
import pandas as pd # data processing
import random
import matplotlib.pyplot as plt

from bs4 import BeautifulSoup
from nltk.corpus import stopwords
from string import punctuation
from nltk.stem import SnowballStemmer
stemmer=SnowballStemmer('english')

from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_auc_score
from sklearn.metrics import f1_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer #The CountVectorizer provides a simple way to both tokenize a collection of text documents and build a vocabulary of known words, but also to encode new documents using that vocabulary.
from sklearn.feature_extraction.text import TfidfVectorizer #Transforms text to feature vectors that can be used as input to estimator. vocabulary_ Is a dictionary that converts each token (word) to feature index in the matrix, each unique token gets a feature index.

from nltk.tokenize import word_tokenize #We use the method word_tokenize() to split a sentence into words. The output of word tokenization can be converted to Data Frame for better text understanding in machine learning applications


#import pandas_profiling

%matplotlib inline

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\user\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\wordnet.zip.


### Loading The Dataset:

In [34]:
# Load the review data; the CSV is read from a path relative to the working directory.
raw_dataset=pd.read_csv("Amazon_Unlocked_Mobile.csv")
# Preview the first rows to check the columns were parsed as expected.
raw_dataset.head()

Unnamed: 0,Product Name,Brand Name,Price,Rating,Reviews,Review Votes
0,"""CLEAR CLEAN ESN"" Sprint EPIC 4G Galaxy SPH-D7...",Samsung,199.99,5,I feel so LUCKY to have found this used (phone...,1.0
1,"""CLEAR CLEAN ESN"" Sprint EPIC 4G Galaxy SPH-D7...",Samsung,199.99,4,"nice phone, nice up grade from my pantach revu...",0.0
2,"""CLEAR CLEAN ESN"" Sprint EPIC 4G Galaxy SPH-D7...",Samsung,199.99,5,Very pleased,0.0
3,"""CLEAR CLEAN ESN"" Sprint EPIC 4G Galaxy SPH-D7...",Samsung,199.99,4,It works good but it goes slow sometimes but i...,0.0
4,"""CLEAR CLEAN ESN"" Sprint EPIC 4G Galaxy SPH-D7...",Samsung,199.99,4,Great phone to replace my lost phone. The only...,0.0


### Basic Visualization of Dataset:

In [35]:
# df is a second name for raw_dataset at this point (plain assignment, no copy is made).
df=raw_dataset
# Column dtypes and non-null counts; used to spot columns with missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 413840 entries, 0 to 413839
Data columns (total 6 columns):
Product Name    413840 non-null object
Brand Name      348669 non-null object
Price           407907 non-null float64
Rating          413840 non-null int64
Reviews         413778 non-null object
Review Votes    401544 non-null float64
dtypes: float64(2), int64(1), object(3)
memory usage: 18.9+ MB


In [36]:
# Summary statistics (count, mean, std, quartiles) for the numeric columns.
df.describe()

Unnamed: 0,Price,Rating,Review Votes
count,407907.0,413840.0,401544.0
mean,226.867155,3.819578,1.507237
std,273.006259,1.548216,9.163853
min,1.73,1.0,0.0
25%,79.99,3.0,0.0
50%,144.71,5.0,0.0
75%,269.99,5.0,1.0
max,2598.0,5.0,645.0


We only need the Reviews and Rating columns for this analysis, so we create a new DataFrame that contains just those two columns.

In [37]:
# Keep only the review text and its star rating.
df=df[['Reviews','Rating']]


In [38]:
# Preview the reduced two-column frame.
df.head()

Unnamed: 0,Reviews,Rating
0,I feel so LUCKY to have found this used (phone...,5
1,"nice phone, nice up grade from my pantach revu...",4
2,Very pleased,5
3,It works good but it goes slow sometimes but i...,4
4,Great phone to replace my lost phone. The only...,4


In [39]:
# Re-check non-null counts after the column selection to see how many reviews are missing.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 413840 entries, 0 to 413839
Data columns (total 2 columns):
Reviews    413778 non-null object
Rating     413840 non-null int64
dtypes: int64(1), object(1)
memory usage: 6.3+ MB


Removing rows with missing values

In [40]:
# Discard every row that has a missing value in any of its columns,
# then confirm the non-null counts now agree across columns.
df = df.dropna(axis=0, how='any')
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 413778 entries, 0 to 413839
Data columns (total 2 columns):
Reviews    413778 non-null object
Rating     413778 non-null int64
dtypes: int64(1), object(1)
memory usage: 9.5+ MB


Removing rows of rating 3 which is neutral sentiment.

In [41]:
# Keep only the rows whose rating differs from 3 (3 stars is treated as neutral),
# then report the remaining row count.
df = df.loc[df['Rating'] != 3]
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 382015 entries, 0 to 413839
Data columns (total 2 columns):
Reviews    382015 non-null object
Rating     382015 non-null int64
dtypes: int64(1), object(1)
memory usage: 8.7+ MB


Now we reset the index so that the row labels are consecutive again (0 to n-1); otherwise the gaps left by the removed rows could cause trouble in later steps.

In [42]:
# Renumber the rows from 0; drop=True discards the old index instead of keeping it as a column.
df=df.reset_index(drop=True)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 382015 entries, 0 to 382014
Data columns (total 2 columns):
Reviews    382015 non-null object
Rating     382015 non-null int64
dtypes: int64(1), object(1)
memory usage: 5.8+ MB


Now we create a new column, `sentiment`, that holds the label for each review (1 = positive, 0 = negative).

In [43]:
# Binary target: 1 when the rating is above 3 stars, otherwise 0.
df['sentiment'] = np.where(df['Rating'].gt(3), 1, 0)
df.head()

Unnamed: 0,Reviews,Rating,sentiment
0,I feel so LUCKY to have found this used (phone...,5,1
1,"nice phone, nice up grade from my pantach revu...",4,1
2,Very pleased,5,1
3,It works good but it goes slow sometimes but i...,4,1
4,Great phone to replace my lost phone. The only...,4,1


In [44]:
# Inspect the last rows as well, to see both label values next to their ratings.
df.tail()

Unnamed: 0,Reviews,Rating,sentiment
382010,good rugged phone that has a long-lasting batt...,4,1
382011,used hard,1,0
382012,another great deal great price,5,1
382013,Passes every drop test onto porcelain tile!,5,1
382014,Only downside is that apparently Verizon no lo...,4,1


# B.Data Cleaning & Text Preprocessing

In [45]:
# English stop words from NLTK combined with the characters of string.punctuation.
# NOTE(review): no active code visible in this notebook reads this set - confirm whether
# stop-word removal was meant to be applied.
Cstopwords=set(stopwords.words('english') +list(punctuation))
# Lemmatization maps a word to a dictionary base form, whereas stemming only strips
# characters from the end of the word and can produce non-words.
from nltk.stem import WordNetLemmatizer
# Shared lemmatizer instance used by clean_review.
lemma=WordNetLemmatizer()

def clean_review(review_column):
    """Turn raw review texts into lower-case, letters-only, lemmatised strings.

    Each review is converted to ``str``, every character that is not an ASCII
    letter is replaced by a space, the text is lower-cased, tokenised with
    ``word_tokenize`` and each token is passed through ``lemma.lemmatize``.
    The tokens are then joined back together with single spaces. Stop words
    are not removed.

    Parameters
    ----------
    review_column : iterable
        The raw reviews, e.g. a pandas Series or a list. Values are read in
        iteration order, so a Series may carry any index.

    Returns
    -------
    list of str
        One cleaned string per input element, in input order. Missing values
        (``None`` / ``NaN``) produce an empty string.

    Notes
    -----
    ``lemma.lemmatize`` is called without a part-of-speech argument; the
    output recorded in this notebook shows side effects of that, such as
    "was" -> "wa" and "as" -> "a".
    """
    # Compile once instead of handing the pattern to re.sub on every review.
    non_letters = re.compile('[^a-zA-Z]')
    review_corpus = []
    # Iterate over the values rather than using review_column[i]: on a Series
    # that is a label lookup and fails unless the index is exactly 0..n-1.
    for review in review_column:
        if pd.isna(review):
            # Keep the output aligned with the input instead of raising.
            review_corpus.append('')
            continue
        # Convert to str before the regex so non-string values do not raise.
        text = non_letters.sub(' ', str(review)).lower()
        tokens = word_tokenize(text)
        review_corpus.append(' '.join(lemma.lemmatize(token) for token in tokens))
    return review_corpus


In [46]:
# Run the cleaning function over every raw review; the result is a list of cleaned strings.
review_column=df['Reviews']
review_corpus=clean_review(review_column)

Now we add a new column that holds the cleaned reviews.

In [47]:
# Store the cleaned text next to the raw text so the two can be compared row by row.
df['clean_reviews'] = review_corpus
df.tail(20)

Unnamed: 0,Reviews,Rating,sentiment,clean_reviews
381995,"This phone is simple, very good , and it works...",5,1,this phone is simple very good and it work exc...
381996,Good sturdy phone for a pre-teen to have avail...,4,1,good sturdy phone for a pre teen to have avail...
381997,This is the second junk Convoy I have owned. T...,1,0,this is the second junk convoy i have owned th...
381998,I BOUGHT THIS PHONE FOR MY HUSBAND AND HE LOVE...,5,1,i bought this phone for my husband and he love...
381999,They said phone was normal wear but it was a l...,1,0,they said phone wa normal wear but it wa a lie...
382000,"You could shoot this out of a potato gun, and ...",5,1,you could shoot this out of a potato gun and p...
382001,Bought this for my mother and she loves it. Gr...,5,1,bought this for my mother and she love it grea...
382002,"Excellent phone, as advertised. Love the push-...",5,1,excellent phone a advertised love the push to ...
382003,works great and picks up signal in place my ot...,4,1,work great and pick up signal in place my othe...
382004,"Great phone. Large keys, best flip phone I hav...",5,1,great phone large key best flip phone i have o...


# C.Creating Features

### 1. CountVectorizer

In [48]:
from sklearn.feature_extraction.text import CountVectorizer #this is used to convert words into a set of vectors.

In [49]:
# ngram_range=(1,2): use single words and pairs of adjacent words as features,
#   e.g. "phone is great" yields "phone", "is", "great", "phone is" and "is great".
# min_df=5: ignore every n-gram that occurs in fewer than 5 reviews. This is a document-frequency
#   threshold for removing very rare terms; it is unrelated to the number of rating levels.
# max_features=20000: of the remaining n-grams, keep only the 20000 with the highest total count
#   across the corpus and drop everything else.
cv=CountVectorizer(max_features=20000,min_df=5,ngram_range=(1,2))

In [51]:
# Learn the vocabulary from the clean_reviews column and encode each review as n-gram counts.
# NOTE(review): the vocabulary is fitted on all reviews before the train/test split further down,
# so the rows that end up in the test set also influence which n-grams are kept.
X1=cv.fit_transform(df['clean_reviews'])
# (number of reviews, number of n-gram features)
X1.shape

(382015, 20000)

### 2. Tfidf

Tf-idf stands for term frequency-inverse document frequency, and the tf-idf weight is a weight often used in information retrieval and text mining. This weight is a statistical measure used to evaluate how important a word is to a document in a collection or corpus. The importance increases proportionally to the number of times a word appears in the document but is offset by the frequency of the word in the corpus. Variations of the tf-idf weighting scheme are often used by search engines as a central tool in scoring and ranking a document's relevance given a user query.

In [52]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [53]:
# min_df=5: ignore n-grams that occur in fewer than 5 reviews.
# max_df=0.95: ignore n-grams that occur in more than 95% of the reviews.
# max_features / ngram_range: same settings as the CountVectorizer configured earlier.
# sublinear_tf=True: scale the term frequency as 1 + log(tf) instead of using the raw count.
tfidf=TfidfVectorizer(min_df=5, max_df=0.95, max_features = 20000, ngram_range = ( 1, 2 ),sublinear_tf = True)

In [54]:
# Learn the vocabulary and idf weights from the clean_reviews column; this step only fits,
# the reviews are encoded by the separate transform call.
tfidf=tfidf.fit(df['clean_reviews'])

In [55]:
# Encode the cleaned reviews as tf-idf vectors with the fitted vectorizer.
X2=tfidf.transform(df['clean_reviews'])
# (number of reviews, number of n-gram features)
X2.shape

(382015, 20000)

In [58]:
# Target vector: the 0/1 sentiment labels as a NumPy array, one entry per review.
y=df['sentiment'].values
y.shape

(382015,)

# D.Machine Learning

Splitting the data into train and test sets.

In [59]:
# Choose the feature matrix used from here on: X1 = bag-of-words counts, X2 = tf-idf weights.
X=X1

In [60]:
# train test split
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)
print(X_train.shape, y_train.shape)
print(X_test.shape, y_test.shape)

(305612, 20000) (305612,)
(76403, 20000) (76403,)


In [61]:
# average positive reviews in train and test
print('mean positive review in train : {0:.3f}'.format(np.mean(y_train)))
print('mean positive review in test : {0:.3f}'.format(np.mean(y_test)))

mean positive review in train : 0.746
mean positive review in test : 0.745


### Naive Bayes Classifier algorithm:

In [62]:
from sklearn.naive_bayes import MultinomialNB

# Train multinomial Naive Bayes on the training features (fit() returns the fitted estimator),
# then predict the held-out reviews.
model_nb = MultinomialNB().fit(X_train, y_train)
y_pred_nb = model_nb.predict(X_test)

# In scikit-learn's confusion matrix, rows are the true labels and columns the predicted labels.
print('accuracy for Naive Bayes Classifier :', accuracy_score(y_test, y_pred_nb))
print('confusion matrix for Naive Bayes Classifier:\n', confusion_matrix(y_test, y_pred_nb))

accuracy for Naive Bayes Classifier : 0.93337957933589
confusion matrix for Naive Bayes Classifier:
 [[17047  2418]
 [ 2672 54266]]


# Conclusion:

Our model is now trained and classifies reviews as positive (1) or negative (0).

In [74]:
# Run the demo sentences through clean_review before vectorising, so they get the same
# preprocessing as the text the vectorizer was fitted on.
print(model_nb.predict(cv.transform(clean_review([
    'not an issue, phone is working',    #positive review
    'an issue, phone is not working',    #negative review
]))))

[1 0]


In [76]:
# Clean the sentence with clean_review first so it matches the text the vectorizer was fitted on
# (the recorded cleaned reviews show, for example, "was" stored as "wa").
print(model_nb.predict(cv.transform(clean_review([' phone was not working']))))         #negative review

[0]


In [77]:
# Clean the sentence with clean_review first so it matches the text the vectorizer was fitted on
# (the recorded cleaned reviews show, for example, "was" stored as "wa").
print(model_nb.predict(cv.transform(clean_review(["phone was great is more suitable for elders"]))))   #positive review

[1]
