In [1]:
import nltk
import numpy as np
import pandas as pd
from textblob import TextBlob



In [2]:
import pandas as pd
data = pd.read_csv('amazon_final.csv')
data

Unnamed: 0.1,Unnamed: 0,productCode,star,review
0,0,B09G95MCDT,4,\nGood to have monthly payments but so quickly...
1,1,B09G95MCDT,5,"\nBefore we start, apologies for the long revi..."
2,2,B09G95MCDT,5,\nNice looking phone there seems to be a lot o...
3,3,B09G95MCDT,5,\nNew product and prompt delivery!\n
4,4,B09G95MCDT,4,\nPhone is as expected from an iPhone 13. Not ...
...,...,...,...,...
13222,13222,B08L5PVZ9Y,5,\nSo happy with this product and delivery was ...
13223,13223,B08L5PVZ9Y,5,\nGood item\n
13224,13224,B08L5PVZ9Y,5,\nI bought this phone for my other half after ...
13225,13225,B08L5PVZ9Y,5,\nLovely phone dose what I need it to do\n


In [3]:
data.dropna(axis=0,inplace=True)
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 13120 entries, 0 to 13226
Data columns (total 4 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   Unnamed: 0   13120 non-null  int64 
 1   productCode  13120 non-null  object
 2   star         13120 non-null  int64 
 3   review       13120 non-null  object
dtypes: int64(2), object(2)
memory usage: 512.5+ KB


In [4]:
data.review

0        \nGood to have monthly payments but so quickly...
1        \nBefore we start, apologies for the long revi...
2        \nNice looking phone there seems to be a lot o...
3                     \nNew product and prompt delivery!\n
4        \nPhone is as expected from an iPhone 13. Not ...
                               ...                        
13222    \nSo happy with this product and delivery was ...
13223                                        \nGood item\n
13224    \nI bought this phone for my other half after ...
13225           \nLovely phone dose what I need it to do\n
13226                                    \nPerfect phone\n
Name: review, Length: 13120, dtype: object

In [5]:
data.star.value_counts(normalize=True)

5    0.661128
1    0.131860
4    0.105335
3    0.058155
2    0.043521
Name: star, dtype: float64

In [6]:
# Removing the reviews given 3 stars because they indicate neither positive nor negative mood.
data = data[data.star!=3]

# 4/5 Star --> Positive, 1/2 Star --> Negative
data['sentiment'] = data['star'].apply(lambda x: (x>=4 and 'Positive') or 'Negative')
data = data[['sentiment','star','review']]
data.head(10)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['sentiment'] = data['star'].apply(lambda x: (x>=4 and 'Positive') or 'Negative')


Unnamed: 0,sentiment,star,review
0,Positive,4,\nGood to have monthly payments but so quickly...
1,Positive,5,"\nBefore we start, apologies for the long revi..."
2,Positive,5,\nNice looking phone there seems to be a lot o...
3,Positive,5,\nNew product and prompt delivery!\n
4,Positive,4,\nPhone is as expected from an iPhone 13. Not ...
5,Positive,5,\nSon's present from Santa\n
6,Positive,4,\niPhones have a quality feel to them. This on...
7,Positive,5,\nMy previous phone was an iPhone X and when i...
8,Positive,5,"\niPhone, bought as present so can’t really wr..."
9,Positive,5,\nSecond one l have bought for my wife good pr...


In [7]:
# Checking the row number after deducting the reviews with 3 stars
len(data)

12357

In [8]:
# A large part of our dataset consists of comments showing positive mood
data.sentiment.value_counts(normalize=True)

Positive    0.81379
Negative    0.18621
Name: sentiment, dtype: float64

In [9]:
import re
import string

# Removing the \n expressions from the reviews
data['review'] = data['review'].str.replace("\n","")
# Removing the numbers from the reviews
alphanumeric = lambda x: re.sub('\w*\d\w*', ' ', x)
# Removing the punctations from the reviews ve converting all letters to lowercase
punc_lower = lambda x: re.sub('[%s]' % re.escape(string.punctuation), ' ', x.lower()) 

data['review'] = data.review.map(alphanumeric).map(punc_lower)
data

Unnamed: 0,sentiment,star,review
0,Positive,4,good to have monthly payments but so quickly o...
1,Positive,5,before we start apologies for the long review...
2,Positive,5,nice looking phone there seems to be a lot of ...
3,Positive,5,new product and prompt delivery
4,Positive,4,phone is as expected from an iphone not muc...
...,...,...,...
13222,Positive,5,so happy with this product and delivery was sp...
13223,Positive,5,good item
13224,Positive,5,i bought this phone for my other half after th...
13225,Positive,5,lovely phone dose what i need it to do


In [10]:
#Other stemmer methods can be tried
from nltk.stem import SnowballStemmer
from nltk.stem import LancasterStemmer
from nltk.stem import ISRIStemmer
from nltk.stem import PorterStemmer
from nltk.stem import RegexpStemmer
from nltk.stem import RSLPStemmer

sbs = SnowballStemmer(language='english')

In [11]:
def stemmer(text):
    text = [sbs.stem(word) for word in text.split(' ')]
    text = " ".join(text)
    return text

data.review = data.review.apply(stemmer)
data.review

0        good to have month payment but so quick out of...
1        befor we start  apolog for the long review  bu...
2        nice look phone there seem to be a lot of tin ...
3                         new product and prompt deliveri 
4        phone is as expect from an iphon    not much e...
                               ...                        
13222    so happi with this product and deliveri was sp...
13223                                            good item
13224    i bought this phone for my other half after th...
13225                 love phone dose what i need it to do
13226                                        perfect phone
Name: review, Length: 12357, dtype: object

In [None]:
from textblob import TextBlob

def correct(text):
    text = [str(TextBlob(word).correct()) for word in text.split(' ')]
    text = " ".join(text)
    return text

data.review = data.review.apply(correct)
data.review

In [None]:
X = data.reviews
y = data.sentiment

In [None]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)