In [88]:
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
from sklearn.utils import resample
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report,accuracy_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\jaini\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\jaini\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\jaini\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [6]:
# Load the raw Alexa review export; the path is relative to the notebook's working directory.
df = pd.read_csv(filepath_or_buffer='Alexa Reviews.csv')
# Preview the first 100 rows as the cell's rich output.
df.head(n=100)

Unnamed: 0,rating,date,variation,verified_reviews,feedback
0,5,31-Jul-18,Charcoal Fabric,Love my Echo!,1
1,5,31-Jul-18,Charcoal Fabric,Loved it!,1
2,4,31-Jul-18,Walnut Finish,"Sometimes while playing a game, you can answer...",1
3,5,31-Jul-18,Charcoal Fabric,I have had a lot of fun with this thing. My 4 ...,1
4,5,31-Jul-18,Charcoal Fabric,Music,1
...,...,...,...,...,...
95,5,30-Jul-18,Charcoal Fabric,"We like it all so far, but have a lot to try o...",1
96,5,30-Jul-18,Heather Gray Fabric,Love it - am using it strictly for music now a...,1
97,5,30-Jul-18,Sandstone Fabric,"Love the Echo !!! I love the size, material an...",1
98,4,30-Jul-18,Charcoal Fabric,Love it!,1


In [8]:
# Print column dtypes and non-null counts; used here to spot missing values
# (the saved output shows verified_reviews has 3149 of 3150 non-null).
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3150 entries, 0 to 3149
Data columns (total 5 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0    rating           3150 non-null   int64 
 1   date              3150 non-null   object
 2   variation         3150 non-null   object
 3   verified_reviews  3149 non-null   object
 4   feedback          3150 non-null   int64 
dtypes: int64(2), object(3)
memory usage: 123.2+ KB


In [9]:
# Absolute row count per feedback label, to check class balance
# (the saved output shows 2893 rows with label 1 vs 257 with label 0).
df['feedback'].value_counts()

feedback
1    2893
0     257
Name: count, dtype: int64

In [10]:
# Summary statistics for the numeric columns (rating, feedback).
df.describe()

Unnamed: 0,rating,feedback
count,3150.0,3150.0
mean,4.463175,0.918413
std,1.068506,0.273778
min,1.0,0.0
25%,4.0,1.0
50%,5.0,1.0
75%,5.0,1.0
max,5.0,1.0


In [50]:
# Partition the reviews by feedback label so the under-represented class can be
# resampled on its own. Label 1 is the majority class and label 0 the minority
# (2893 vs 257 rows in the value_counts output above).
df_majority = df[df['feedback'] == 1]
df_minority = df[df['feedback'] == 0]

In [25]:
# English stop-word list from the NLTK corpus; clean_text drops any token found in it.
stop_words = stopwords.words('english')
# Single WordNet lemmatizer instance, reused by clean_text for every token.
lemmatizer = WordNetLemmatizer()

In [74]:
# Oversample the minority class (sampling with replacement) until it has as
# many rows as the majority class; the seed keeps the draw reproducible.
# NOTE(review): this runs before train_test_split, so copies of the same
# minority review can end up in both the training and the test partition,
# which makes the reported test metrics optimistic.
df_minority_upsampled = resample(
    df_minority,
    n_samples=df_majority.shape[0],
    random_state=42,
    replace=True,
)

In [75]:
# Stack both classes, shuffle every row with a fixed seed, and renumber the index.
df_balanced = (
    pd.concat([df_majority, df_minority_upsampled])
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)
# Show the class proportions of the balanced frame as the cell output.
df_balanced['feedback'].value_counts(normalize=True)

feedback
0    0.5
1    0.5
Name: proportion, dtype: float64

In [None]:
def clean_text(text):
    """Normalize a raw review into a space-joined string of lemmatized tokens.

    Steps: lowercase, delete every character that is not a-z or whitespace,
    tokenize with NLTK, drop English stop words, lemmatize the remaining tokens.

    Parameters
    ----------
    text : str or scalar
        Raw review text. Missing scalars (None, NaN, pd.NA) are treated as an
        empty review; any other non-string value is converted with ``str``.

    Returns
    -------
    str
        The surviving tokens joined by single spaces ('' if none survive).
    """
    if not isinstance(text, str):
        # The dataset contains a null review; without this guard .lower()
        # raises AttributeError on a missing value.
        is_missing = pd.api.types.is_scalar(text) and pd.isna(text)
        text = '' if is_missing else str(text)

    text = text.lower()
    # Disallowed characters are deleted, not replaced by a space, so
    # contractions collapse into one token ("doesn't" -> "doesnt").
    text = re.sub(r'[^a-z\s]', '', text)
    words = nltk.word_tokenize(text)

    words = [lemmatizer.lemmatize(word) for word in words if word not in stop_words]
    return " ".join(words)

In [77]:
# Replace the missing review with '' and force every value to str, because
# clean_text calls .lower() on each entry. This only updates df itself.
df['verified_reviews'] = df['verified_reviews'].fillna('').astype(str)

In [78]:
# Run the text-normalization helper over every review, element by element.
df['cleaned_reviews'] = df['verified_reviews'].map(clean_text)

In [79]:
# Side-by-side look at raw vs. cleaned text for the first ten reviews.
df.loc[:, ['verified_reviews', 'cleaned_reviews']].head(n=10)

Unnamed: 0,verified_reviews,cleaned_reviews
0,Love my Echo!,love echo
1,Loved it!,loved
2,"Sometimes while playing a game, you can answer...",sometimes playing game answer question correct...
3,I have had a lot of fun with this thing. My 4 ...,lot fun thing yr old learns dinosaur control l...
4,Music,music
5,I received the echo as a gift. I needed anothe...,received echo gift needed another bluetooth so...
6,"Without having a cellphone, I cannot use many ...",without cellphone use many feature ipad see us...
7,I think this is the 5th one I've purchased. I'...,think th one ive purchased im working getting ...
8,looks great,look great
9,Love it! I’ve listened to songs I haven’t hear...,love ive listened song havent heard since chil...


In [80]:
# Unigram + bigram TF-IDF features, capped at the 5000 highest-ranked terms.
tfidf = TfidfVectorizer(
    max_features=5000,
    ngram_range=(1, 2),
    stop_words='english'
)

# df_balanced is assembled from slices of df taken before 'cleaned_reviews' is
# added to df, so on a fresh kernel (Restart & Run All) the column is missing
# here and the lookup would raise KeyError. Derive the cleaned text from the
# balanced frame when the column is absent.
if 'cleaned_reviews' in df_balanced.columns:
    balanced_text = df_balanced['cleaned_reviews']
else:
    balanced_text = (
        df_balanced['verified_reviews']
        .fillna('')      # the raw data contains one null review
        .astype(str)
        .apply(clean_text)
    )

# NOTE(review): the vectorizer is fitted on all rows before the train/test
# split, so its vocabulary and IDF weights also see the test reviews.
X = tfidf.fit_transform(balanced_text)
y = df_balanced['feedback']

In [81]:
# Sanity check: (number of reviews, number of TF-IDF features).
print("TF-IDF Matrix shape:", X.shape)

TF-IDF Matrix shape: (5786, 5000)


In [82]:
# Hold out 20% of the rows for evaluation, keeping the class ratio identical
# in both partitions; the fixed seed makes the split reproducible.
# NOTE(review): X/y come from the already-upsampled frame, so duplicated
# minority reviews can appear on both sides of this split.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    test_size=0.2,
    random_state=42,
)

In [83]:
# Report the (rows, features) shape of each partition.
for label, matrix in (("Training set size:", X_train), ("Testing set size:", X_test)):
    print(label, matrix.shape)

Training set size: (4628, 5000)
Testing set size: (1158, 5000)


In [84]:
# Fit a logistic-regression classifier on the training features; max_iter is
# raised to 1000 to give the solver more iterations to converge.
model = LogisticRegression(max_iter=1000).fit(X_train, y_train)
# Display the fitted estimator as the cell output.
model

In [85]:
# Predicted feedback labels for the held-out test features.
y_pred = model.predict(X_test)

In [86]:
# Compute both evaluation summaries first, then print them.
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print("Accuracy:", accuracy)
print("\nClassification Report:\n", report)

Accuracy: 0.9550949913644214

Classification Report:
               precision    recall  f1-score   support

           0       0.92      1.00      0.96       579
           1       1.00      0.91      0.95       579

    accuracy                           0.96      1158
   macro avg       0.96      0.96      0.96      1158
weighted avg       0.96      0.96      0.96      1158



In [87]:
def predict_sentiment(review_text):
    """Print the predicted sentiment and its confidence for a single review.

    The review is converted to ``str``, normalized with ``clean_text``,
    vectorized with the fitted ``tfidf`` vectorizer and scored by ``model``.

    Parameters
    ----------
    review_text : object
        Raw review; coerced with ``str`` before cleaning.

    Returns
    -------
    None
        The original text, cleaned text, label and confidence are printed.
    """
    cleaned = clean_text(str(review_text))
    vectorized = tfidf.transform([cleaned])

    prediction = model.predict(vectorized)[0]
    probability = model.predict_proba(vectorized)[0]
    # predict_proba columns are ordered like model.classes_, so find the column
    # for the predicted label instead of assuming the label equals its position.
    class_index = list(model.classes_).index(prediction)

    sentiment = "Positive 😀" if prediction == 1 else "Negative 😞"
    print(f"\nOriginal Review: {review_text}")
    print(f"Cleaned Review: {cleaned}")
    print(f"Predicted Sentiment: {sentiment}")
    print(f"Confidence: {round(probability[class_index] * 100, 2)}%")
# Smoke-test the classifier on a negative, a positive and a mixed review.
for sample_review in (
    "This Alexa is terrible and doesn’t understand anything I say.",
    "Absolutely love this device! It’s my new best friend.",
    "It works fine, but the setup was a bit annoying.",
):
    predict_sentiment(sample_review)


Original Review: This Alexa is terrible and doesn’t understand anything I say.
Cleaned Review: alexa terrible doesnt understand anything say
Predicted Sentiment: Negative 😞
Confidence: 79.18%

Original Review: Absolutely love this device! It’s my new best friend.
Cleaned Review: absolutely love device new best friend
Predicted Sentiment: Positive 😀
Confidence: 94.2%

Original Review: It works fine, but the setup was a bit annoying.
Cleaned Review: work fine setup bit annoying
Predicted Sentiment: Positive 😀
Confidence: 55.97%


In [None]:
import joblib

# Persist the fitted classifier and the fitted vectorizer next to the notebook;
# both are required to score new text later.
joblib.dump(value=model, filename='sentiment_model.pkl')
joblib.dump(value=tfidf, filename='tfidf_vectorizer.pkl')

['tfidf_vectorizer.pkl']

In [None]:
# Reload the persisted classifier and vectorizer (model first, then vectorizer).
# joblib files are pickle-based: only load artifacts written by a trusted source.
loaded_model, loaded_vectorizer = (
    joblib.load(path) for path in ('sentiment_model.pkl', 'tfidf_vectorizer.pkl')
)

def predict_sentiment(text):
    """Return the sentiment label for ``text`` using the reloaded artifacts.

    NOTE: this definition reuses the name of the printing helper defined in an
    earlier cell and replaces it once this cell runs; unlike that helper it
    prints nothing and returns the label.

    Parameters
    ----------
    text : object
        Raw review; coerced with ``str`` (as the earlier helper does) so that
        non-string input does not fail inside ``clean_text``.

    Returns
    -------
    str
        "Positive 😀" when the predicted label is 1, otherwise "Negative 😞".
    """
    cleaned = clean_text(str(text))
    vectorized = loaded_vectorizer.transform([cleaned])
    pred = loaded_model.predict(vectorized)[0]
    return "Positive 😀" if pred == 1 else "Negative 😞"


'Positive 😀'