In [146]:
import kagglehub

# Download latest version
path = kagglehub.dataset_download("sid321axn/amazon-alexa-reviews")

print("Path to dataset files:", path)

Path to dataset files: /kaggle/input/amazon-alexa-reviews


In [147]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.base import TransformerMixin
from sklearn.pipeline import Pipeline

In [148]:
data=pd.read_csv('/content/amazon_alexa.tsv',sep='\t')
data.head()

Unnamed: 0,rating,date,variation,verified_reviews,feedback
0,5,31-Jul-18,Charcoal Fabric,Love my Echo!,1
1,5,31-Jul-18,Charcoal Fabric,Loved it!,1
2,4,31-Jul-18,Walnut Finish,"Sometimes while playing a game, you can answer a question correctly but Alexa says you got it wrong and answers the same as you. I like being able to turn lights on and off while away from home.",1
3,5,31-Jul-18,Charcoal Fabric,"I have had a lot of fun with this thing. My 4 yr old learns about dinosaurs, i control the lights and play games like categories. Has nice sound when playing music as well.",1
4,5,31-Jul-18,Charcoal Fabric,Music,1


In [149]:
data=data.drop(['rating','date'],axis=1)
data.head()

Unnamed: 0,variation,verified_reviews,feedback
0,Charcoal Fabric,Love my Echo!,1
1,Charcoal Fabric,Loved it!,1
2,Walnut Finish,"Sometimes while playing a game, you can answer a question correctly but Alexa says you got it wrong and answers the same as you. I like being able to turn lights on and off while away from home.",1
3,Charcoal Fabric,"I have had a lot of fun with this thing. My 4 yr old learns about dinosaurs, i control the lights and play games like categories. Has nice sound when playing music as well.",1
4,Charcoal Fabric,Music,1


#Data Exploring and cleaning

In [150]:
data.shape

(3150, 3)

In [151]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3150 entries, 0 to 3149
Data columns (total 3 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   variation         3150 non-null   object
 1   verified_reviews  3149 non-null   object
 2   feedback          3150 non-null   int64 
dtypes: int64(1), object(2)
memory usage: 74.0+ KB


In [152]:
data.feedback.value_counts()

Unnamed: 0_level_0,count
feedback,Unnamed: 1_level_1
1,2893
0,257


There are Imbalanced Data. we will handle it using SMOTE


In [153]:
# We have 16 types of Amazon Alexa
print(data.variation.value_counts())

variation
Black  Dot                      516
Charcoal Fabric                 430
Configuration: Fire TV Stick    350
Black  Plus                     270
Black  Show                     265
Black                           261
Black  Spot                     241
White  Dot                      184
Heather Gray Fabric             157
White  Spot                     109
White                            91
Sandstone Fabric                 90
White  Show                      85
White  Plus                      78
Oak Finish                       14
Walnut Finish                     9
Name: count, dtype: int64


In [154]:
# Are there null values or not?
data.isnull().sum()

Unnamed: 0,0
variation,0
verified_reviews,1
feedback,0


In [155]:
# Get the null value row
data[data['verified_reviews'].isna() == True]

Unnamed: 0,variation,verified_reviews,feedback
473,White,,0


In [156]:
# Delete null value
data.dropna(inplace=True)

In [157]:
# Check again
data.isnull().sum()

Unnamed: 0,0
variation,0
verified_reviews,0
feedback,0


In [158]:
# Is there a duplicate value  or not?
print(data.duplicated().sum())

761


In [159]:
data=data.drop_duplicates()

In [160]:
print(data.duplicated().sum())

0


In [161]:
data.shape

(2388, 3)

In [162]:
# verified_reviews
print(data.verified_reviews[[20]])

20    Love the Echo and how good the music sounds playing off it. Alexa understands most commands but it is difficult at times for her to find specific playlists or songs on Spotify. She is good with Amazon Music but is lacking in other major programs.
Name: verified_reviews, dtype: object


In [163]:
print(str(data.verified_reviews[20]))


Love the Echo and how good the music sounds playing off it. Alexa understands most commands but it is difficult at times for her to find specific playlists or songs on Spotify. She is good with Amazon Music but is lacking in other major programs.


In [164]:
import pandas as pd
pd.set_option('display.max_colwidth', None)
print(data.verified_reviews[20])


Love the Echo and how good the music sounds playing off it. Alexa understands most commands but it is difficult at times for her to find specific playlists or songs on Spotify. She is good with Amazon Music but is lacking in other major programs.


In [165]:
negative_reviews = data[data['feedback'] == 0]['verified_reviews']
negative_reviews


Unnamed: 0,verified_reviews
46,"It's like Siri, in fact, Siri answers more accurately then Alexa. I don't see a real need for it in my household, though it was a good bargain on prime day deals."
111,Sound is terrible if u want good music too get a bose
141,Not much features.
162,"Stopped working after 2 weeks ,didn't follow commands!? Really fun when it was working?"
176,Sad joke. Worthless.
...,...
2696,Echo Dot responds to us when we aren't even talking to it. I've unplugged it. It feels like it's &#34;spying&#34; on us.
2697,NOT CONNECTED TO MY PHONE PLAYLIST :(
2716,The only negative we have on this product is the terrible sound quality. A massive difference from the Alexa. Which to us was a big reason we wanted to purchase this.Won’t be buying another until the speaker and sound quality can improve.
2740,I didn’t order it


# Text preprocessing

In [166]:
import pandas as pd

reviews_with_numbers = data[data['verified_reviews'].str.contains(r'\d')]['verified_reviews']

reviews_with_numbers


Unnamed: 0,verified_reviews
3,"I have had a lot of fun with this thing. My 4 yr old learns about dinosaurs, i control the lights and play games like categories. Has nice sound when playing music as well."
7,I think this is the 5th one I've purchased. I'm working on getting one in every room of my house. I really like what features they offer specifily playing music on all Echos and controlling the lights throughout my house.
10,"I sent it to my 85 year old Dad, and he talks to it constantly."
18,We love the size of the 2nd generation echo. Still needs a little improvement on sound
24,"I got a second unit for the bedroom, I was expecting the sounds to be improved but I didnt really see a difference at all. Overall, not a big improvement over the 1st generation."
...,...
2742,Use as my 3rd dot. Bought so my husband would be able to contact someone if he falls and doesn't have access to phone.
2756,Easy to connect and the skills created for our Echo 2nd Gen worked fine with the Dot.
2761,This is my second Echo to purchase -- the price was so cheap -- I'm seriously thinking of buying 2 more -- I use &#34;Alexa&#34; for everything -- it's the next best thing to the internet :-*
2773,"Love my Alexa! Actually have 3 throughout the house. Favorite function is lists. Great for time management and if course, music."


In [167]:
#Removing Punctuation & Special Characters using Reg
import re

def clean_text_keep_numbers(text):
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)
    text = text.lower()
    return text

data['verified_reviews'] = data['verified_reviews'].apply(clean_text_keep_numbers)


In [168]:
print (str(data.verified_reviews[2778]))

we have six of these throughout our home and they are great  there are times when music or talking happens randomly that may be from sensitive listening  there should be more options to change the name of the engagement command 34alexa34 or 34computer34 are a little annoying  more colors then black and white would also be a good thing


In [169]:
#now we  split the text into token by word_tokenize  and remove stop word  using nltk (Natural Language Toolkit)
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import nltk
nltk.download('stopwords')
nltk.download('punkt') # For tokenization.It uses statistical models to identify sentence or word boundaries


stop_words = set(stopwords.words('english'))

def remove_stopwords_keep_numbers(text):
    words = word_tokenize(text)
    filtered = [w for w in words if w not in stop_words]
    return  filtered

data['verified_reviews'] = data['verified_reviews'].apply(remove_stopwords_keep_numbers)

from nltk.stem import WordNetLemmatizer
nltk.download('wordnet')

lemmatizer = WordNetLemmatizer()

def lemmatize_tokens(tokens):
    lemmatized = [lemmatizer.lemmatize(w) for w in tokens]
    return lemmatized

data['verified_reviews'] = data['verified_reviews'].apply(lemmatize_tokens)


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [170]:
data.verified_reviews[0]

['love', 'echo']

#Spliting data and vectorization

In [174]:
X = data['verified_reviews'].apply(lambda x: ' '.join(x))

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

tfidf_vectorizer = TfidfVectorizer(max_features=5000)
X_train_tfidf = tfidf_vectorizer.fit_transform(x_train)
X_test_tfidf = tfidf_vectorizer.transform(x_test)


#SMOTE Upsampling


In [175]:
from imblearn.over_sampling import SMOTE
smote = SMOTE(random_state=42)
X_train_tfidf_balanced, y_train_balanced = smote.fit_resample(X_train_tfidf, y_train)

In [177]:
from collections import Counter
Counter(y_train_balanced)



Counter({1: 1742, 0: 1742})

#Model

DecisionTreeClassifier

In [198]:
from sklearn.tree import DecisionTreeClassifier
classifier = DecisionTreeClassifier(
        criterion='entropy',
        splitter='best')
classifier.fit(X_train_tfidf_balanced, y_train_balanced)


In [200]:
y_pred = classifier.predict(X_test_tfidf)


In [201]:
from sklearn.metrics import classification_report, accuracy_score

print("Accuracy:", accuracy_score(y_test, y_pred))


Accuracy: 0.8765690376569037


Naive Bayess Classifier Moodel

In [203]:
from sklearn.naive_bayes import MultinomialNB

NB_classifier = MultinomialNB()
NB_classifier.fit(X_train_tfidf_balanced, y_train_balanced)


In [205]:
y_predict_train = NB_classifier.predict(X_test_tfidf)
print("Accuracy:", accuracy_score(y_test, y_predict_train))


Accuracy: 0.9037656903765691
