In [1]:
import pandas as pd

In [2]:
df = pd.read_csv("C:/Users/himas/Downloads/Reviewsamazon.csv") 

In [3]:
df = df[df['Score'] != 3]
df['Sentiment'] = df['Score'].apply(lambda x: 'Positive' if x > 3 else 'Negative')
print(df['Sentiment'].value_counts())

Sentiment
Positive    443777
Negative     82037
Name: count, dtype: int64


In [4]:
df = df[['Text', 'Sentiment']]
df = df.dropna()
df.reset_index(drop=True, inplace=True)

In [5]:
df = df[['Text', 'Sentiment']]
df.dropna(inplace=True)
df.reset_index(drop=True, inplace=True)
df.head()

Unnamed: 0,Text,Sentiment
0,I have bought several of the Vitality canned d...,Positive
1,Product arrived labeled as Jumbo Salted Peanut...,Negative
2,This is a confection that has been around a fe...,Positive
3,If you are looking for the secret ingredient i...,Negative
4,Great taffy at a great price. There was a wid...,Positive


In [6]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

In [7]:
def clean_text(text):
    text = text.lower()
    text = re.sub(r'[^a-z\s]', '', text)
    words = word_tokenize(text)
    words = [lemmatizer.lemmatize(word) for word in words if word not in stop_words]
    return ' '.join(words)
df['Clean_Text'] = df['Text'].apply(clean_text)

In [8]:
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf = TfidfVectorizer(max_features=5000)  
X = tfidf.fit_transform(df['Clean_Text'])
y = df['Sentiment']

In [9]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
y = le.fit_transform(y)

In [10]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y)

In [11]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))

Accuracy: 0.9290149577322823

Classification Report:
               precision    recall  f1-score   support

           0       0.84      0.68      0.75     16407
           1       0.94      0.98      0.96     88756

    accuracy                           0.93    105163
   macro avg       0.89      0.83      0.85    105163
weighted avg       0.93      0.93      0.93    105163



In [12]:
from sklearn.svm import LinearSVC
svm_model = LinearSVC()
svm_model.fit(X_train, y_train)
svm_pred = svm_model.predict(X_test)
from sklearn.metrics import classification_report, accuracy_score
print("SVM Accuracy:", accuracy_score(y_test, svm_pred))
print("\nSVM Classification Report:\n", classification_report(y_test, svm_pred))

SVM Accuracy: 0.9310974392134116

SVM Classification Report:
               precision    recall  f1-score   support

           0       0.83      0.70      0.76     16407
           1       0.95      0.97      0.96     88756

    accuracy                           0.93    105163
   macro avg       0.89      0.84      0.86    105163
weighted avg       0.93      0.93      0.93    105163



In [13]:
import joblib

In [14]:
joblib.dump(svm_model, 'sentiment_svm_model.pkl')
joblib.dump(tfidf, 'tfidf_vectorizer.pkl')

['tfidf_vectorizer.pkl']