<a href="https://colab.research.google.com/github/laibamushtaq0/data-science-internship-2025/blob/main/task2_SentimentalAnalysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
%pip install pandas numpy scikit-learn nltk datasets joblib --quiet

Note: you may need to restart the kernel to use updated packages.


# Data Preparation

In [None]:
from datasets import load_dataset

# Fetch the "imdb" dataset, then expose its training split as a DataFrame
# whose columns are named `review` (the text) and `sentiment` (the label).
dataset = load_dataset("imdb")

df = (
    dataset["train"]
    .to_pandas()
    .rename(columns={"text": "review", "label": "sentiment"})
)

print(df.head())

README.md:   0%|          | 0.00/7.81k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/21.0M [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/20.5M [00:00<?, ?B/s]

unsupervised-00000-of-00001.parquet:   0%|          | 0.00/42.0M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating unsupervised split:   0%|          | 0/50000 [00:00<?, ? examples/s]

                                              review  sentiment
0  I rented I AM CURIOUS-YELLOW from my video sto...          0
1  "I Am Curious: Yellow" is a risible and preten...          0
2  If only to avoid making this type of film in t...          0
3  This film was probably inspired by Godard's Ma...          0
4  Oh, brother...after hearing about this ridicul...          0


## Split data

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation; the fixed seed makes the split
# repeatable across runs.
train_texts, test_texts, train_labels, test_labels = train_test_split(
    df["review"],
    df["sentiment"],
    test_size=0.2,
    random_state=42,
)

# Text Preprocessing

In [None]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# Fetch the NLTK resources used by the preprocessing below.
# Recent NLTK releases load the Punkt tokenizer from the 'punkt_tab' package
# rather than 'punkt', and word_tokenize raises LookupError when it is
# missing. Both are requested so the cell works on old and new NLTK versions.
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('punkt_tab')

# A set gives constant-time membership checks in the per-token filter.
stop_words = set(stopwords.words('english'))

def preprocess_text(text):
    """Normalise a raw review into a space-separated string of content words.

    Steps, in order:
      1. lowercase the text;
      2. replace HTML tags (e.g. ``<br />``) with a space;
      3. drop apostrophes so contractions stay a single token
         ("don't" -> "dont");
      4. replace every remaining character that is not ``a-z`` or whitespace
         with a space;
      5. tokenise with NLTK and discard English stop words.

    Tags and punctuation are replaced with a space rather than removed
    outright; removing them glued neighbouring words together
    ("this<br />hey" -> "thishey", "brother...after" -> "brotherafter"),
    producing junk vocabulary entries.

    Args:
        text: The raw review text.

    Returns:
        The cleaned tokens joined by single spaces (possibly an empty string).
    """
    text = text.lower()
    # Tags separate words in the rendered review, so keep a separator.
    text = re.sub(r'<[^>]+>', ' ', text)
    # Collapse contractions before the generic punctuation pass splits them.
    text = text.replace("'", '')
    text = re.sub(r'[^a-z\s]', ' ', text)
    words = word_tokenize(text)
    return " ".join(word for word in words if word not in stop_words)

# Clean both partitions with the same routine so the training and test
# features are derived identically.
train_texts, test_texts = (
    texts.apply(preprocess_text) for texts in (train_texts, test_texts)
)

# Preview the first few cleaned training reviews.
print(train_texts.head(5))

[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
23311    borrowed movie despite extremely low rating wa...
23623    unexpected accident killed inexperienced climb...
1020     summer blockbuster hit baseketball one movies ...
12645    scarcely imagine better movie thishey go chick...
1533     still famous decadent actor morgan freeman fil...
Name: review, dtype: object


# Convert Text to Vectors

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF features with the vocabulary capped at 5000 terms.
vectorizer = TfidfVectorizer(max_features=5000)
# Learn the vocabulary and IDF weights from the training texts only ...
X_train = vectorizer.fit_transform(train_texts)
# ... and reuse that fitted vocabulary for the held-out texts.
X_test = vectorizer.transform(test_texts)

# (n_training_reviews, n_features)
print(X_train.shape)

(20000, 5000)


# Model Training

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Fit a logistic-regression classifier on the TF-IDF training matrix.
model = LogisticRegression()
model.fit(X_train, train_labels)

# Score the held-out reviews and report plain accuracy to two decimals.
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_true=test_labels, y_pred=y_pred)
print(f"Accuracy: {accuracy:.2f}")

Accuracy: 0.88


# Testing

In [None]:
def predict_sentiment(text):
    """Classify one piece of raw text as "Positive" or "Negative".

    The text is cleaned with ``preprocess_text``, turned into TF-IDF
    features by the fitted ``vectorizer`` and scored by the fitted ``model``.

    Args:
        text: Raw input text.

    Returns:
        "Positive" when the model predicts label 1, otherwise "Negative".
    """
    features = vectorizer.transform([preprocess_text(text)])
    label = model.predict(features)[0]
    if label == 1:
        return "Positive"
    return "Negative"

In [None]:
import ipywidgets as widgets

# Pieces of the small interactive form: a text box for the review, a button
# to trigger classification, and an output area for the printed result.
text_input = widgets.Text(
    placeholder='Type something here',
    description='Input Text:',
)

submit_btn = widgets.Button(
    button_style='info',
    description='Submit',
)

output = widgets.Output()

def process_input(_):
    """Button callback: classify the text box contents and show the result.

    The previous result is cleared first so only the latest answer is shown.
    A blank (empty or whitespace-only) box prompts the user instead of being
    sent to the classifier, which would otherwise print a label for no input.

    Args:
        _: The clicked button, supplied by ``on_click``; unused.
    """
    text = text_input.value

    with output:
        output.clear_output()
        if not text.strip():
            print("Please enter some text.")
            return
        print(predict_sentiment(text))

# Run the classifier whenever the button is pressed.
submit_btn.on_click(process_input)

# Stack the widgets vertically. Leaving `layout` as the cell's final
# expression is what makes the notebook render the form.
layout = widgets.VBox(children=[text_input, submit_btn, output])

layout

VBox(children=(Text(value='', description='Input Text:', placeholder='Type something here'), Button(button_sty…

# Deployment

In [None]:
import joblib

# Persist the fitted classifier on its own. It expects TF-IDF features, so
# raw text must first go through the fitted `vectorizer` before scoring.
joblib.dump(model, 'sentiment_model.pkl')

['sentiment_model.pkl']

## Create a pipeline that bundles the vectorizer and the model

In [None]:
from sklearn.pipeline import Pipeline
import joblib

# Chain the already-fitted vectorizer and classifier so a single object can
# go from text to a predicted label.
# NOTE(review): `preprocess_text` is not a step of this pipeline, while the
# vectorizer was fitted on text cleaned by it. Callers of the saved pipeline
# presumably need to apply the same cleaning first - confirm.
pipeline = Pipeline([
    ('vectorizer', vectorizer),
    ('model', model)
])


# Save the bundled pipeline to disk as a single artifact.
joblib.dump(pipeline, 'sentiment_analysis_pipeline.pkl')

['sentiment_analysis_pipeline.pkl']