<a href="https://colab.research.google.com/github/kella-swarna/Sentiment_Analysis.ipynb/blob/main/sentiment_analysis02.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Download the Kaggle data source used by this notebook via kagglehub.
# NOTE: this notebook environment differs from Kaggle's Python environment,
# so libraries used by the notebook may be missing here.
import kagglehub

# The value returned by dataset_download() is kept in
# `snap_amazon_fine_food_reviews_path`.
dataset_handle = 'snap/amazon-fine-food-reviews'
snap_amazon_fine_food_reviews_path = kagglehub.dataset_download(dataset_handle)

print('Data source import complete.')


In [None]:
# STEP 1: Import Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

import re
import string


In [None]:
# STEP 2: Load the Dataset
# Prefer the directory returned by the kagglehub download cell, so the notebook
# also runs outside Kaggle (e.g. on Colab). Fall back to the Kaggle mount point
# when that variable is not defined in this session.
DATA_DIR = globals().get(
    'snap_amazon_fine_food_reviews_path',
    '/kaggle/input/amazon-fine-food-reviews',
)

# Only the two columns used below are parsed, which keeps memory usage down.
reviews_raw = pd.read_csv(f'{DATA_DIR}/Reviews.csv', usecols=['Text', 'Score'])

# Binarize sentiment (Positive = 1 if Score >= 4, Negative = 0 if Score <= 2).
# Built as a single chain that returns new frames at every step, so no column
# is ever assigned on a filtered slice (avoids SettingWithCopyWarning) and
# re-running the cell always starts from `reviews_raw`.
df = (
    reviews_raw
    .dropna()
    .loc[lambda d: d['Score'] != 3]  # remove neutral reviews
    .assign(sentiment=lambda d: (d['Score'] > 3).astype('int64'))
    .rename(columns={'Text': 'review'})
    .loc[:, ['review', 'sentiment']]
    .sample(frac=1, random_state=42)  # shuffle rows reproducibly
    .reset_index(drop=True)
)
df.head()


Unnamed: 0,review,sentiment
0,This is a very high quality dog food with meat...,1
1,I love this cake mix and the other 3 mixes as ...,1
2,A nice strong brew. I am new to Keurig and hav...,1
3,I just found PB2 and PB2 with chocolate and I ...,1
4,Delightful mint tea as one would expect. Note ...,1


In [None]:
# STEP 3: Text Cleaning Function
def clean_text(text):
    """Normalize a raw review into lowercase, single-space-separated words.

    Steps, in order:
      1. lowercase the text;
      2. replace HTML tags (e.g. ``<br />``) with a space so the tag name does
         not survive as a token such as ``br``;
      3. delete digit runs;
      4. delete apostrophes so contractions stay one token (``don't`` -> ``dont``);
      5. replace every other punctuation character, including ``_``, with a
         space so neighbouring words are not fused (``good.Would`` ->
         ``good would``);
      6. collapse whitespace runs and strip both ends.

    Args:
        text: The raw review. Anything that is not a ``str`` (e.g. ``None`` or
            NaN) is treated as an empty review.

    Returns:
        The cleaned string; ``''`` for non-string input.
    """
    if not isinstance(text, str):
        return ''
    text = text.lower()
    # Tags must start with a letter (optionally after '/'), so "<3" is kept as text.
    text = re.sub(r'</?[a-z][^>]*>', ' ', text)
    text = re.sub(r'\d+', '', text)  # remove numbers
    text = re.sub(r"['’]", '', text)  # drop apostrophes inside contractions
    text = re.sub(r'[^\w\s]|_', ' ', text)  # punctuation (and '_') -> space
    text = re.sub(r'\s+', ' ', text)  # collapse whitespace runs
    return text.strip()

# Run clean_text over every review and keep the result in a new column.
df['clean_review'] = df['review'].map(clean_text)


In [None]:
# STEP 4 & 5: Train-Test Split, then TF-IDF Vectorization
# The split is decided BEFORE the vectorizer is fitted, so the vocabulary and
# IDF weights are learned from training reviews only. Fitting on the full
# corpus would leak test-set statistics into the features and inflate the
# evaluation scores.
y = df['sentiment']

# Split row positions with the same test fraction and seed as before.
train_idx, test_idx = train_test_split(
    np.arange(len(df)), test_size=0.2, random_state=42
)

tfidf = TfidfVectorizer(max_features=5000)
tfidf.fit(df['clean_review'].iloc[train_idx])

# `X` is still the feature matrix for every review, but it is now produced by
# a vectorizer that has only seen the training rows.
X = tfidf.transform(df['clean_review'])

X_train, X_test = X[train_idx], X[test_idx]
y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]


In [None]:
# STEP 6: Logistic Regression Model
# fit() returns the estimator itself, so construction and training are chained;
# the bare `model` line keeps the estimator as the cell's displayed output.
model = LogisticRegression(max_iter=1000).fit(X_train, y_train)
model
