# Import Libraries

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re
import string

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from tqdm import tqdm

# Fetch the NLTK resources used further down: the stop-word lists, the punkt
# tokenizer data and the WordNet corpus. Each call reports its status on
# stdout; the value of the last call is what the cell displays.
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\DELL\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\DELL\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\DELL\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

# Load and Explore the Dataset

In [3]:
# Load the review dataset. The path is relative, so train.csv must sit in the
# kernel's working directory.
df = pd.read_csv("train.csv")

In [4]:
# Plain-text preview of the first ten rows.
print(df.head(10))

                                              review sentiment
0  One of the other reviewers has mentioned that ...  positive
1  A wonderful little production. <br /><br />The...  positive
2  I thought this was a wonderful way to spend ti...  positive
3  Basically there's a family where a little boy ...  negative
4  Petter Mattei's "Love in the Time of Money" is...  positive
5  Probably my all-time favorite movie, a story o...  positive
6  I sure would like to see a resurrection of a u...  positive
7  This show was an amazing, fresh & innovative i...  negative
8  Encouraged by the positive comments about this...  negative
9  If you like original gut wrenching laughter yo...  positive


In [5]:
# DataFrame.info() writes its summary straight to stdout and returns None, so
# it is called on its own; wrapping it in print() only appends a stray "None".
df.info()
# describe() returns a DataFrame, so it still needs print() to be shown from
# the middle of a cell.
print(df.describe())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   review     50000 non-null  object
 1   sentiment  50000 non-null  object
dtypes: object(2)
memory usage: 781.4+ KB
None
                                                   review sentiment
count                                               50000     50000
unique                                              49582         2
top     Loved today's show!!! It was a variety and not...  positive
freq                                                    5     25000


# Text Preprocessing

In [6]:
# Objects shared by preprocess_text: a single lemmatizer instance, and the
# English stop-word list converted to a set for fast membership tests.
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

In [7]:
# Built once when the cell runs instead of once per review.
# The tag pattern requires a letter (optionally preceded by "/") right after
# "<", so comparisons such as "3 < 5" are not mistaken for markup.
_HTML_TAG_RE = re.compile(r'</?[a-z][^<>]*>')
_PUNCT_TABLE = str.maketrans('', '', string.punctuation)


def preprocess_text(text):
    """Normalise one raw review into a space-separated string of lemmas.

    Steps, in order:
      1. lower-case the text;
      2. replace HTML tags (e.g. ``<br />``) with a space so the tag name does
         not survive as a token and neighbouring words are not glued together;
      3. delete every character listed in ``string.punctuation``;
      4. tokenize with ``nltk.word_tokenize``;
      5. drop tokens found in ``stop_words`` and lemmatize the rest.

    Parameters
    ----------
    text : str
        Raw review text. Any non-string value (for example the NaN pandas
        uses for a missing cell) is treated as empty text.

    Returns
    -------
    str
        The cleaned tokens joined by single spaces; ``''`` when nothing is left.
    """
    if not isinstance(text, str):
        return ''

    # Lower-case first so the tag pattern only has to match [a-z].
    text = _HTML_TAG_RE.sub(' ', text.lower())

    # One pass over the string instead of a per-character membership test.
    text = text.translate(_PUNCT_TABLE)

    words = nltk.word_tokenize(text)
    words = [lemmatizer.lemmatize(word) for word in words if word not in stop_words]

    return ' '.join(words)

In [8]:
import nltk
# Extra tokenizer resource, fetched before preprocess_text is first applied.
# NOTE(review): presumably required by nltk.word_tokenize on newer NLTK
# releases — confirm, and consider moving it next to the other downloads.
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\DELL\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [None]:
# Run preprocess_text on every review and keep the result next to the raw text
# in a new 'cleaned_text' column (the 'review' column is left untouched).
df['cleaned_text'] = df['review'].apply(preprocess_text)

In [None]:
# Side-by-side look at raw and cleaned text for the first ten reviews.
print(df[['review', 'cleaned_text']].head(10))

# Exploratory Data Analysis (EDA)

In [None]:
# Bar chart of how many reviews carry each sentiment label.
sns.countplot(x='sentiment', data=df)
plt.title('Sentiment Distribution')
plt.show()

In [None]:
from wordcloud import WordCloud

# One long space-separated string per sentiment class, built from the cleaned
# reviews; these feed the word clouds generated in the next cell.
positive_reviews = ' '.join(df.loc[df['sentiment'].eq('positive'), 'cleaned_text'])
negative_reviews = ' '.join(df.loc[df['sentiment'].eq('negative'), 'cleaned_text'])

In [None]:
# Build one 800x400 word cloud per sentiment class.
positive_wordcloud = WordCloud(width=800, height=400).generate(positive_reviews)
negative_wordcloud = WordCloud(width=800, height=400).generate(negative_reviews)

# Draw both clouds side by side in a single 1x2 figure, axes hidden.
plt.figure(figsize=(10, 5))
panels = (
    (positive_wordcloud, 'Positive Reviews Word Cloud'),
    (negative_wordcloud, 'Negative Reviews Word Cloud'),
)
for position, (cloud, panel_title) in enumerate(panels, start=1):
    plt.subplot(1, 2, position)
    plt.imshow(cloud, interpolation='bilinear')
    plt.title(panel_title)
    plt.axis('off')

plt.show()

# Feature Extraction

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
# TF-IDF vectorizer whose vocabulary is capped at 5000 features.
tfidf = TfidfVectorizer(max_features=5000)

In [None]:
# Learn the vocabulary and IDF weights, then vectorize every cleaned review.
# The result is kept as the sparse matrix fit_transform returns: a dense copy
# of 50000 rows by up to 5000 float64 columns would need roughly 2 GB, while
# train_test_split and LogisticRegression accept sparse input directly.
# NOTE(review): the vectorizer is fit on the full corpus before the train/test
# split further down, so the IDF weights are influenced by test documents —
# consider fitting on the training split only.
X = tfidf.fit_transform(df['cleaned_text'])

In [None]:
# Encode the labels as integers: 'positive' -> 1, 'negative' -> 0.
# Any other label value is not in the mapping and would come out as NaN.
y = df['sentiment'].map({'positive': 1, 'negative': 0}).values

In [None]:
# Sanity check: (number of reviews, number of TF-IDF features).
print("Feature Shape:", X.shape)

In [None]:
# Sanity check: one label per review, so this should match X.shape[0].
print("Labels Shape:", y.shape)

# Model Building & Baseline

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score

In [None]:
# Hold out 20% of the rows for testing; random_state=42 makes the shuffle
# (and therefore the split) repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
# Baseline classifier with an iteration budget of 1000.
# random_state is pinned for reproducibility: the grid search later clones
# this estimator with the 'liblinear' and 'saga' solvers, both of which
# shuffle the data, so without a seed the tuned results vary between runs.
# The value matches the seed used for the train/test split.
model = LogisticRegression(max_iter=1000, random_state=42)

In [None]:
# Train the baseline model on the training split only.
model.fit(X_train, y_train)

In [None]:
# Baseline predictions for the held-out test split.
y_pred = model.predict(X_test)

In [None]:
# Baseline performance on the test split: overall accuracy, then the
# per-class report produced by classification_report.
print('Accuracy:', accuracy_score(y_test, y_pred))
print('Classification Report:')
print(classification_report(y_test, y_pred))

# Hyperparameter Tuning

In [None]:
from sklearn.model_selection import GridSearchCV

In [None]:
# Search space for the grid search: three values of C crossed with two
# solvers, i.e. six candidate configurations.
param_grid = dict(
    C=[0.1, 1, 10],
    solver=['liblinear', 'saga'],
)

In [None]:
# Exhaustive search over param_grid, scoring each candidate by accuracy under
# 5-fold cross-validation. The search starts from the baseline estimator.
grid_search = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    scoring='accuracy',
    cv=5,
)

In [None]:
# Run the search on the training split only; the test split stays unseen
# until the final evaluation.
grid_search.fit(X_train, y_train)

In [None]:
# Parameter combination that achieved the best cross-validated accuracy.
print('Best Hyperparameters:', grid_search.best_params_)

In [None]:
# Take the best estimator found by the search and predict the held-out test split.
best_model = grid_search.best_estimator_
y_pred_tuned = best_model.predict(X_test)

In [None]:
# Same metrics as the baseline cell, now for the tuned model, so the two
# can be compared directly.
print('Tuned Accuracy:', accuracy_score(y_test, y_pred_tuned))
print('Tuned Classification Report:')
print(classification_report(y_test, y_pred_tuned))

# Evaluation & Interpretation

In [None]:
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

In [None]:
# Confusion matrix of the tuned model on the test split.
cm = confusion_matrix(y_test, y_pred_tuned)
# Label order follows the integer encoding of y: 0 = negative, 1 = positive.
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=['Negative', 'Positive'])
disp.plot(cmap=plt.cm.Blues)
plt.title('Confusion Matrix')
plt.show()

# Final Model & Documentation

In [None]:
import joblib

In [None]:
# Persist both artifacts to the working directory. The fitted vectorizer is
# saved alongside the model because the model was trained on features
# produced by this exact TF-IDF transform.
joblib.dump(best_model, 'sentiment_analysis_model.pkl')
joblib.dump(tfidf, 'tfidf_vectorizer.pkl')

In [None]:
# Confirmation message; it is printed unconditionally and does not itself
# verify that the files were written.
print("Model and Vectorizer saved successfully!")