In [64]:
import pandas as pd
import os
import re
import string
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report

## 1. Data Loading

In [52]:
review = []
label = []
positive_file = os.path.join(r"aclImdb_v1/aclImdb/train", 'pos')
for i in os.listdir(positive_file):
    with open(os.path.join(positive_file, i), 'r', encoding='utf-8') as file:
        review.append(file.read())
        label.append(1)

negative_file = os.path.join(r"aclImdb_v1/aclImdb/train", 'neg')
for file_name in os.listdir(negative_file):
    with open(os.path.join(negative_file, file_name), 'r', encoding='utf-8') as file:
        review.append(file.read())
        label.append(0)
train_data = pd.DataFrame({'review': review, 'label': label})
train_data

Unnamed: 0,review,label
0,Bromwell High is a cartoon comedy. It ran at t...,1
1,Homelessness (or Houselessness as George Carli...,1
2,Brilliant over-acting by Lesley Ann Warren. Be...,1
3,This is easily the most underrated film inn th...,1
4,This is not the typical Mel Brooks film. It wa...,1
...,...,...
24995,"Towards the end of the movie, I felt it was to...",0
24996,This is the kind of movie that my enemies cont...,0
24997,I saw 'Descent' last night at the Stockholm Fi...,0
24998,Some films that you pick up for a pound turn o...,0


## 2. Text Cleaning Pipeline

In [55]:
def clean_review(text):
    text = text.lower()
    text = re.sub(r'<.*?>','', text)
    text = re.sub(r'http\S+|www\S+|https\S+|@\S+','', text)
    text = text.translate(str.maketrans('', '', string.punctuation))
    stopword = stopwords.words('english')
    words = text.split()
    words = [word for word in words if word not in stopword]
    lemmatizer = WordNetLemmatizer()
    words = [lemmatizer.lemmatize(word) for word in words if len(word) > 2]
    return ' '.join(words)

## 3. Apply Cleaning

In [56]:
train_data['cleaned_review'] = train_data['review'].apply(clean_review)

In [57]:
train_data.to_csv('cleaned_train_reviews.csv', index=False)

## 4. Evaluation

In [59]:
for i in range(5):
    print(f"Raw: {train_data['review'].iloc[i]}\nCleaned: {train_data['cleaned_review'].iloc[i]}\n")

Raw: bromwell high is a cartoon comedy. it ran at the same time as some other programs about school life, such as "teachers". my 35 years in the teaching profession lead me to believe that bromwell high's satire is much closer to reality than is "teachers". the scramble to survive financially, the insightful students who can see right through their pathetic teachers' pomp, the pettiness of the whole situation, all remind me of the schools i knew and their students. when i saw the episode in which a student repeatedly tried to burn down the school, i immediately recalled ......... at .......... high. a classic line: inspector: i'm here to sack one of your teachers. student: welcome to bromwell high. i expect that many adults of my age think that bromwell high is far fetched. what a pity that it isn't!
Cleaned: bromwell high cartoon comedy ran time program school life teacher year teaching profession lead believe bromwell high satire much closer reality teacher scramble survive financial

## 5. Next Steps (Optional)

In [62]:
X_train = train_data['cleaned_review']
y_train = train_data['label']

In [65]:
vectorizer = CountVectorizer()
X_train_vec = vectorizer.fit_transform(X_train)
clf = MultinomialNB()
clf.fit(X_train_vec, y_train)
y_pred_train = clf.predict(X_train_vec)

print("Training Accuracy:", accuracy_score(y_train, y_pred_train))
print(classification_report(y_train, y_pred_train))

Training Accuracy: 0.92676
              precision    recall  f1-score   support

           0       0.91      0.95      0.93     12500
           1       0.94      0.91      0.93     12500

    accuracy                           0.93     25000
   macro avg       0.93      0.93      0.93     25000
weighted avg       0.93      0.93      0.93     25000

