In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import nltk
from nltk.stem.porter import PorterStemmer
nltk.download('stopwords')
from nltk.corpus import stopwords
STOPWORDS = set(stopwords.words('english'))

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import pickle
import re

# Load the data
data = pd.read_csv("D:/CODING/coding/Minor 2/Sentiment-Analysis/Data/amazon_alexa.tsv", delimiter='\t', quoting=3)

# Drop rows with missing reviews
data.dropna(subset=['verified_reviews'], inplace=True)

# Create corpus with cleaned text
corpus = []
ps = PorterStemmer()
for review in data['verified_reviews']:
    review = re.sub('[^a-zA-Z]', ' ', review)
    review = review.lower().split()
    review = [ps.stem(word) for word in review if word not in STOPWORDS]
    corpus.append(' '.join(review))

# Convert text to numerical features
cv = CountVectorizer(max_features=2500)
X = cv.fit_transform(corpus).toarray()

# Set the target variable
y = data['feedback'].values

# Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Scale features
scaler = StandardScaler(with_mean=False)
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Train the model
model = RandomForestClassifier()
model.fit(X_train, y_train)

# Evaluate
y_pred = model.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))

# Save model and vectorizer
pickle.dump(cv, open("D:/CODING/coding/Minor 2/MySentimentAnalysis/Models/countVectorizer.pkl", "wb"))
pickle.dump(scaler, open("D:/CODING/coding/Minor 2/MySentimentAnalysis/Models/scaler.pkl", "wb"))
pickle.dump(model, open("D:/CODING/coding/Minor 2/MySentimentAnalysis/Models/model_rf.pkl", "wb"))
print("Models trained")

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\krish\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Accuracy: 0.9301587301587302
Classification Report:
               precision    recall  f1-score   support

           0       0.83      0.38      0.52        63
           1       0.94      0.99      0.96       567

    accuracy                           0.93       630
   macro avg       0.88      0.69      0.74       630
weighted avg       0.92      0.93      0.92       630

Confusion Matrix:
 [[ 24  39]
 [  5 562]]
Models trained
