In [1]:
import os
from pathlib import Path
import re, nltk

import joblib
import numpy as np
import pandas as pd
from nltk import word_tokenize, pos_tag
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split

In [2]:
# Folders holding the negative and positive training reviews.
# Both are relative paths, so they resolve against the current working directory.
neg_path = "aclImdb/train/neg" 
pos_path = "aclImdb/train/pos"

In [None]:
# Preview a handful of negative reviews instead of printing every file in the
# folder: dumping them all floods the notebook output and bloats the saved file.
PREVIEW_COUNT = 5

# Sort the listing so the same files are previewed on every run.
neg_files = sorted(
    name for name in os.listdir(neg_path)
    if os.path.isfile(os.path.join(neg_path, name))
)
for filename in neg_files[:PREVIEW_COUNT]:
    file_path = os.path.join(neg_path, filename)
    try:
        # Explicit encoding so decoding does not depend on the platform default.
        with open(file_path, 'r', encoding="utf-8") as f:
            content = f.read()
    except (OSError, UnicodeError) as e:
        # Best effort: report the unreadable file and carry on with the next one.
        print(f"Error reading {filename}: {e}")
    else:
        print(f"Content of {filename}:\n{content}\n---")

In [None]:
# Preview a handful of positive reviews instead of printing every file in the
# folder: dumping them all floods the notebook output and bloats the saved file.
PREVIEW_COUNT = 5

# Sort the listing so the same files are previewed on every run.
pos_files = sorted(
    name for name in os.listdir(pos_path)
    if os.path.isfile(os.path.join(pos_path, name))
)
for filename in pos_files[:PREVIEW_COUNT]:
    file_path = os.path.join(pos_path, filename)
    try:
        # Explicit encoding so decoding does not depend on the platform default.
        with open(file_path, 'r', encoding="utf-8") as f:
            content = f.read()
    except (OSError, UnicodeError) as e:
        # Best effort: report the unreadable file and carry on with the next one.
        print(f"Error reading {filename}: {e}")
    else:
        print(f"Content of {filename}:\n{content}\n---")

Convert the review text files into a DataFrame.

In [None]:
def load_imdb_split(split_dir):
    """Load one IMDb split directory into a DataFrame.

    Reads every ``*.txt`` file found directly under ``<split_dir>/neg`` and
    ``<split_dir>/pos``.

    Parameters
    ----------
    split_dir : str or path-like
        Directory that contains the ``neg`` and ``pos`` sub-folders.

    Returns
    -------
    pandas.DataFrame
        Columns ``review`` (file contents) and ``label`` (0 for ``neg``,
        1 for ``pos``). Rows come out ``neg`` first, then ``pos``, sorted by
        path inside each folder. Both columns exist even when no file is found.
    """
    rows = []
    for label, label_name in enumerate(("neg", "pos")):
        folder = Path(split_dir) / label_name
        # glob() yields entries in arbitrary, filesystem-dependent order; sort
        # them so a seeded sample of the result picks the same rows everywhere.
        for fp in sorted(folder.glob("*.txt")):
            # errors="ignore": undecodable bytes are dropped instead of raising.
            with open(fp, "r", encoding="utf-8", errors="ignore") as f:
                rows.append({"review": f.read(), "label": label})
    # Naming the columns keeps the schema stable for an empty result.
    return pd.DataFrame(rows, columns=["review", "label"])

In [None]:
df = load_imdb_split("aclImdb/train")
# Cap the working set at 10,000 reviews; the fixed seed keeps the draw
# repeatable. Smaller frames are left exactly as loaded.
df = (
    df.sample(n=10_000, random_state=42).reset_index(drop=True)
    if df.shape[0] > 10_000
    else df
)

df

Clean the data: set up the NLTK resources and regular expressions, then define the text cleaner.

In [24]:
# Fetch the NLTK data the cleaner relies on (stop-word list, WordNet, tokenizer).
for resource in ("stopwords", "wordnet", "omw-1.4", "punkt"):
    nltk.download(resource, quiet=True)

stop_words = set(stopwords.words("english"))
lemm = WordNetLemmatizer()

# Pre-compiled patterns used by clean_review(), which lower-cases text first.
HTML_RE = re.compile(r'<.*?>')               # shortest <...> span, one tag at a time
URL_RE = re.compile(r'https?://\S+|www\.\S+')
EMAIL_RE = re.compile(r'\S+@\S+')
NONLETTER = re.compile(r'[^a-z\s]')          # anything that is not a-z or whitespace
MULTISPACE = re.compile(r'\s+')


In [25]:
def clean_review(text: str) -> str:
    """Normalise one raw IMDb review into a space-separated string of lemmas.

    Steps, in order: lower-case; blank out HTML tags, URLs, e-mail addresses
    and every character that is not a-z or whitespace; tokenize; drop
    stop-words and tokens of two characters or fewer; lemmatize each
    remaining token with the lemmatizer's default settings.

    Anything that is not a ``str`` (for example a missing value) yields ``""``.
    """
    if not isinstance(text, str):
        return ""

    # Order matters: tags, URLs and e-mails must go before the non-letter
    # pass, which would otherwise leave their fragments behind as words.
    normalised = text.lower()
    for pattern in (HTML_RE, URL_RE, EMAIL_RE, NONLETTER):
        normalised = pattern.sub(" ", normalised)

    kept = []
    for token in nltk.word_tokenize(normalised):
        if len(token) <= 2 or token in stop_words:
            continue
        kept.append(lemm.lemmatize(token))

    return MULTISPACE.sub(" ", " ".join(kept)).strip()

In [27]:
# Tokenizer data for nltk.word_tokenize, which clean_review() calls.
# NOTE(review): 'punkt_tab' is presumably what newer NLTK releases look up
# instead of 'punkt' — confirm against the installed NLTK version.
# The second call is the cell's last expression, so its return value is displayed.
nltk.download('punkt', quiet=True)
nltk.download('punkt_tab', quiet=True)

True

In [29]:
# Clean every review, then eyeball ten randomly chosen raw/cleaned pairs.
df["cleaned_review"] = df["review"].apply(clean_review)
pairs = df.sample(10, random_state=1)[["review", "cleaned_review"]]
for i, (raw, cleaned) in enumerate(zip(pairs["review"], pairs["cleaned_review"]), 1):
    # Show at most the first 200 raw characters on one line, with an
    # ellipsis when the review was cut short.
    preview = raw[:200].replace("\n"," ")
    suffix = "..." if len(raw) > 200 else ""
    print(f"\n--- Example {i} ---")
    print("Raw:     ", preview + suffix)
    print("Cleaned: ", cleaned)
    


--- Example 1 ---
Raw:      Higher and Higher was one of Rodgers&Hart's lesser Broadway musicals it only had a run of 84 performances on Broadway in 1940. Yet it yielded one of their bigger hits It Never Entered My Mind.<br /><b...
Cleaned:  higher higher one rodgers hart lesser broadway musical run performance broadway yet yielded one bigger hit never entered mind nevertheless except one minor song disgustingly rich entire broadway score scrapped rko bought film right instead whole new score jimmy mchugh harold adamson written mostly accommodate one francis albert sinatra making feature film debut sinatra done vocal cameo previous film take leaf page singing rival bing crosby bing feature film debut big broadcast played bing crosby frank sinatra took role frank sinatra think anyone could done better job chairman board billed third behind star jack haley michele morgan butler scullery maid leon errol fact errol millionaire paid help seven month mainly belly chapter informs staff errol

Save the cleaned DataFrame to CSV.

In [30]:
# Write the raw text, the cleaned text and the label (in that column order),
# without the row index.
df.to_csv(
    "imdb_cleaned.csv",
    columns=["review", "cleaned_review", "label"],
    index=False,
    encoding="utf-8",
)
print("Saved: imdb_cleaned.csv")

Saved: imdb_cleaned.csv


In [31]:
# Peek at the first rows to confirm the cleaned_review column was added.
df.head()

Unnamed: 0,review,label,cleaned_review
0,"Silent Night, Deadly Night 5 is the very last ...",0,silent night deadly night last series like par...
1,The idea ia a very short film with a lot of in...,1,idea short film lot information interesting en...
2,"For me, this movie just seemed to fall on its ...",0,movie seemed fall face main problem casting gl...
3,Was this based on a comic-book? A video-game? ...,1,based comic book video game drawing year old n...
4,Caution: May contain spoilers...<br /><br />I'...,1,caution may contain spoiler seen movie time li...


Quick sanity check of the cleaner on a sample sentence.

In [32]:
# Sample input mixing upper case, punctuation, HTML line breaks, an emoticon
# and digits, to show what clean_review() keeps and what it strips.
demo = "I LOVED this movie!! <br /><br /> It was amazing :) 10/10"
print("Raw:    ", demo)
print("Cleaned:", clean_review(demo)) 

Raw:     I LOVED this movie!! <br /><br /> It was amazing :) 10/10
Cleaned: loved movie amazing


Reload the data, this time adding each review's file name and rating to the DataFrame.

In [34]:
def load_imdb_with_filename_rating(root="aclImdb/train"):
    """Load IMDb reviews together with their file name and rating.

    Reads the ``.txt`` files in ``<root>/neg`` and ``<root>/pos``. The rating
    is parsed from the file name, taken as the integer between the last
    underscore and the following dot (``12345_7.txt`` -> ``7``).

    Parameters
    ----------
    root : str
        Directory that contains the ``neg`` and ``pos`` sub-folders.

    Returns
    -------
    pandas.DataFrame
        Columns ``filename``, ``review``, ``label`` (0 for ``neg``, 1 for
        ``pos``) and ``rating``. Rows come out ``neg`` first, then ``pos``,
        sorted by file name inside each folder. All four columns exist even
        when no file is found.

    Notes
    -----
    ``.txt`` files whose name carries no integer rating are skipped rather
    than aborting the whole load.
    """
    rows = []
    for label, label_name in enumerate(("neg", "pos")):
        folder = os.path.join(root, label_name)
        # listdir() order is arbitrary; sort for a reproducible row order.
        for fn in sorted(os.listdir(folder)):
            if not fn.endswith(".txt"):
                continue
            try:
                rating = int(fn.split("_")[-1].split(".")[0])  # e.g., 12345_7.txt -> 7
            except ValueError:
                # Not an "<id>_<rating>.txt" review file (e.g. a stray README.txt).
                continue
            fp = os.path.join(folder, fn)
            # errors="ignore": undecodable bytes are dropped instead of raising.
            with open(fp, "r", encoding="utf-8", errors="ignore") as f:
                txt = f.read()
            rows.append({
                "filename": fn,
                "review": txt,
                "label": label,
                "rating": rating
            })
    # Naming the columns keeps the schema stable for an empty result.
    return pd.DataFrame(rows, columns=["filename", "review", "label", "rating"])

In [35]:
# Replace the working frame with the full training set, now carrying the
# file name and rating, and print the first rows followed by the column names.
df = load_imdb_with_filename_rating(root="aclImdb/train")
print(df.head(), list(df.columns))

      filename                                             review  label  \
0      0_3.txt  Story of a man who has unnatural feelings for ...      0   
1  10000_4.txt  Airport '77 starts as a brand new luxury 747 p...      0   
2  10001_4.txt  This film lacked something I couldn't put my f...      0   
3  10002_1.txt  Sorry everyone,,, I know this is supposed to b...      0   
4  10003_1.txt  When I was little my parents took me along to ...      0   

   rating  
0       3  
1       4  
2       4  
3       1  
4       1   ['filename', 'review', 'label', 'rating']


In [36]:
# Display the reloaded frame (filename, review, label, rating).
df

Unnamed: 0,filename,review,label,rating
0,0_3.txt,Story of a man who has unnatural feelings for ...,0,3
1,10000_4.txt,Airport '77 starts as a brand new luxury 747 p...,0,4
2,10001_4.txt,This film lacked something I couldn't put my f...,0,4
3,10002_1.txt,"Sorry everyone,,, I know this is supposed to b...",0,1
4,10003_1.txt,When I was little my parents took me along to ...,0,1
...,...,...,...,...
24995,9998_9.txt,"Seeing as the vote average was pretty low, and...",1,9
24996,9999_8.txt,"The plot had some wretched, unbelievable twist...",1,8
24997,999_10.txt,I am amazed at how this movie(and most others ...,1,10
24998,99_8.txt,A Christmas Together actually came before my t...,1,8


Apply the same cleaning to the new DataFrame (the one with file name and rating) and save it to CSV.

In [42]:
# Re-run the cleaner on the reloaded frame, then export everything except the
# raw review text (columns written in the listed order, no row index).
df["cleaned_review"] = df["review"].apply(clean_review)
df.to_csv(
    "imdb_cleaned_with_rating.csv",
    columns=["filename", "cleaned_review", "label", "rating"],
    index=False,
    encoding="utf-8",
)

In [44]:
# Keep only the modelling columns, in a fixed order. Selecting them already
# leaves the raw "review" text behind, so no separate in-place drop is needed
# and the previous frame object is not mutated.
df = df[["filename", "cleaned_review", "label", "rating"]]
df

Unnamed: 0,filename,cleaned_review,label,rating
0,0_3.txt,story man unnatural feeling pig start opening ...,0,3
1,10000_4.txt,airport start brand new luxury plane loaded va...,0,4
2,10001_4.txt,film lacked something put finger first charism...,0,4
3,10002_1.txt,sorry everyone know supposed art film wow hand...,0,1
4,10003_1.txt,little parent took along theater see interior ...,0,1
...,...,...,...,...
24995,9998_9.txt,seeing vote average pretty low fact clerk vide...,1,9
24996,9999_8.txt,plot wretched unbelievable twist however chemi...,1,8
24997,999_10.txt,amazed movie others average star lower crappy ...,1,10
24998,99_8.txt,christmas together actually came time raised j...,1,8


Remove empty and duplicate cleaned reviews.

In [46]:
# Discard rows whose cleaned text is missing, then rows whose cleaned text is
# empty once surrounding whitespace is stripped ("" is falsy under astype(bool)).
df = (
    df.dropna(subset=["cleaned_review"])
      .loc[lambda frame: frame["cleaned_review"].str.strip().astype(bool)]
)

In [47]:
# Keep the first occurrence of each distinct cleaned review and renumber the
# rows from 0 so the index has no gaps.
df = df.drop_duplicates(subset=["cleaned_review"], keep="first", ignore_index=True)

In [48]:
# Display the frame after removing empty and duplicate cleaned reviews.
df

Unnamed: 0,filename,cleaned_review,label,rating
0,0_3.txt,story man unnatural feeling pig start opening ...,0,3
1,10000_4.txt,airport start brand new luxury plane loaded va...,0,4
2,10001_4.txt,film lacked something put finger first charism...,0,4
3,10002_1.txt,sorry everyone know supposed art film wow hand...,0,1
4,10003_1.txt,little parent took along theater see interior ...,0,1
...,...,...,...,...
24895,9998_9.txt,seeing vote average pretty low fact clerk vide...,1,9
24896,9999_8.txt,plot wretched unbelievable twist however chemi...,1,8
24897,999_10.txt,amazed movie others average star lower crappy ...,1,10
24898,99_8.txt,christmas together actually came time raised j...,1,8


In [49]:
# Summary statistics of the rating for each sentiment label, to check how the
# rating ranges line up with the 0/1 labels.
df.groupby("label")["rating"].describe()

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
label,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0,12430.0,2.217458,1.19022,1.0,1.0,2.0,3.0,4.0
1,12470.0,8.73737,1.161955,7.0,8.0,9.0,10.0,10.0


In [50]:
# Hold out 20% of the reviews for validation. Stratifying on the label keeps
# the neg/pos ratio the same in both parts; the fixed seed makes the split
# repeatable. train_test_split is imported with the other dependencies in the
# notebook's top import cell.
X_train, X_val, y_train, y_val = train_test_split(
    df["cleaned_review"], df["label"],
    test_size=0.2, stratify=df["label"], random_state=42
)

In [52]:
# Uni- and bi-gram TF-IDF features. The vectorizer is fitted on the training
# part only and then reused, unchanged, to transform the validation part.
vec = TfidfVectorizer(ngram_range=(1,2), min_df=2, max_features=50000)
Xtr, Xva = vec.fit_transform(X_train), vec.transform(X_val)

# Fit the classifier on the training features and predict the held-out part.
clf = LogisticRegression(max_iter=1000).fit(Xtr, y_train)
pred = clf.predict(Xva)

# Validation metrics: overall accuracy, confusion matrix, per-class report.
print("Accuracy:", accuracy_score(y_val, pred))
print(confusion_matrix(y_val, pred))
print(classification_report(y_val, pred, digits=3))

Accuracy: 0.8883534136546185
[[2194  292]
 [ 264 2230]]
              precision    recall  f1-score   support

           0      0.893     0.883     0.888      2486
           1      0.884     0.894     0.889      2494

    accuracy                          0.888      4980
   macro avg      0.888     0.888     0.888      4980
weighted avg      0.888     0.888     0.888      4980



1 for positive and 0 for negative

In [55]:
# Persist the fitted vectorizer and classifier together, as one (vec, clf) tuple.
joblib.dump((vec, clf), "imdb_tfidf_logreg.joblib")


def predict_sentiment(text: str) -> int:
    """Return the predicted label for one raw review, as a plain ``int``.

    The text goes through the same ``clean_review`` step as the training
    data, is vectorized with the fitted ``vec`` and classified by ``clf``.
    """
    features = vec.transform([clean_review(text)])
    predicted = clf.predict(features)
    return int(predicted[0])


# Smoke test on one clearly positive and one clearly negative sentence.
for sample_text in ("I loved this movie!!", "Terrible and boring"):
    print(predict_sentiment(sample_text))

1
0
