In [53]:
from pathlib import Path
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
from nltk.tokenize import word_tokenize
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\osamasaid\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\osamasaid\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\osamasaid\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [None]:
def load_reviews(folder_path, category, limit=None):
    folder = Path(folder_path)

    files = sorted(folder.glob("*.txt"), key=lambda p: int(p.stem.split("_")[0]))

    rows = []
    for i, file in enumerate(files):
        if limit and i >= limit:   
            break

        file_id, rating = file.stem.split("_")
        review = file.read_text(encoding="utf-8", errors="replace").strip()

        rows.append({
            "id": int(file_id),
            "rating": int(rating),
            "review": review,
            "category": category
        })

    return pd.DataFrame(rows, columns=["id", "rating", "review", "category"]) \
             .sort_values("id") \
             .reset_index(drop=True)

In [None]:
positive_df = load_reviews("Data/pos", "positive", limit=5000)
print(positive_df.head())

In [None]:
negative_df = load_reviews("Data/neg", "negative", limit=5000)
print(negative_df.head())

In [None]:
pos_df = pd.DataFrame(positive_df)
pos_df

In [None]:
neg_df = pd.DataFrame(negative_df)
neg_df

In [66]:
df = pd.concat([pos_df, neg_df], ignore_index=True)
df

Unnamed: 0,id,rating,review,category
0,0,9,Bromwell High is a cartoon comedy. It ran at t...,positive
1,1,7,"If you like adult comedy cartoons, like South ...",positive
2,2,9,Bromwell High is nothing short of brilliant. E...,positive
3,3,10,"""All the world's a stage and its people actors...",positive
4,4,8,FUTZ is the only show preserved from the exper...,positive
...,...,...,...,...
9995,4995,3,I found it a real task to sit through this fil...,negative
9996,4996,3,I really enjoyed the first half hour of this m...,negative
9997,4997,2,...but it's certainly not without merit. Alrea...,negative
9998,4998,3,this one of the best celebrity's reality shows...,negative


# Cleaning Steps:

### Convert text to lowercase:

In [67]:
df['review'] = df['review'].str.lower()

In [68]:
df['review'].unique()

array(['bromwell high is a cartoon comedy. it ran at the same time as some other programs about school life, such as "teachers". my 35 years in the teaching profession lead me to believe that bromwell high\'s satire is much closer to reality than is "teachers". the scramble to survive financially, the insightful students who can see right through their pathetic teachers\' pomp, the pettiness of the whole situation, all remind me of the schools i knew and their students. when i saw the episode in which a student repeatedly tried to burn down the school, i immediately recalled ......... at .......... high. a classic line: inspector: i\'m here to sack one of your teachers. student: welcome to bromwell high. i expect that many adults of my age think that bromwell high is far fetched. what a pity that it isn\'t!',
       "if you like adult comedy cartoons, like south park, then this is nearly a similar format about the small adventures of three teenage girls at bromwell high. keisha, natell

### URL's and email checking:

##### Email checking:

In [78]:
email_mask = df['review'].str.contains(r'[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}', regex=True)
emails_found = df[email_mask]

print("Number of emails:", email_mask.sum())

Number of emails: 0


##### URL's checking:

In [79]:
url_mask = df['review'].str.contains(r'http[s]?://\S+|www\.\S+', regex=True)
urls_found = df[url_mask]

print("Number of links:", url_mask.sum())

Number of links: 0


### Remove HTML tags, symbols, punctuation, numbers and emojis:

In [71]:
df['review'] = df['review'].str.replace(r'<[^<>]*>', '', regex=True) # Remove HTML tags
df['review'] = df['review'].str.replace(r'[^A-Za-z\s]', '', regex=True) # Remove all possible symbols

In [72]:
df['review'].unique()

array(['bromwell high is a cartoon comedy it ran at the same time as some other programs about school life such as teachers my  years in the teaching profession lead me to believe that bromwell highs satire is much closer to reality than is teachers the scramble to survive financially the insightful students who can see right through their pathetic teachers pomp the pettiness of the whole situation all remind me of the schools i knew and their students when i saw the episode in which a student repeatedly tried to burn down the school i immediately recalled  at  high a classic line inspector im here to sack one of your teachers student welcome to bromwell high i expect that many adults of my age think that bromwell high is far fetched what a pity that it isnt',
       'if you like adult comedy cartoons like south park then this is nearly a similar format about the small adventures of three teenage girls at bromwell high keisha natella and latrina have given exploding sweets and behaved 

### Remove stopwords:

In [73]:
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

# Split → remove stopwords → join back

df['review'] = df['review'].apply(
    lambda x: " ".join([word for word in x.split() if word not in stop_words])
)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\osamasaid\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


### Perform lemmatization and keep the words len are > 2:

In [74]:
import spacy
nlp = spacy.load("en_core_web_sm")

def lemmatize_spacy(text):
    if not isinstance(text, str): return ""
    doc = nlp(text.lower())
    return " ".join([t.lemma_ for t in doc if not t.is_punct and not t.is_space])

df['review_lemmatized_len > 2'] = df['review'].fillna("").apply(lemmatize_spacy)


In [76]:
df['review_lemmatized_len > 2'] = df['review_lemmatized_len > 2'].apply(
    lambda x: " ".join([word for word in x.split() if len(word) > 2])
)

In [77]:
df

Unnamed: 0,id,rating,review,category,review_lemmatized_len > 2
0,0,9,bromwell high cartoon comedy ran time programs...,positive,bromwell high cartoon comedy run time program ...
1,1,7,like adult comedy cartoons like south park nea...,positive,like adult comedy cartoon like south park near...
2,2,9,bromwell high nothing short brilliant expertly...,positive,bromwell high nothing short brilliant expertly...
3,3,10,worlds stage people actors itor something like...,positive,world stage people actor itor something like h...
4,4,8,futz show preserved experimental theatre movem...,positive,futz show preserve experimental theatre moveme...
...,...,...,...,...,...
9995,4995,3,found real task sit film sound track best acce...,negative,find real task sit film sound track good accen...
9996,4996,3,really enjoyed first half hour movie wow turn ...,negative,really enjoy first half hour movie wow turn co...
9997,4997,2,certainly without merit already writerdirector...,negative,certainly without merit already writerdirector...
9998,4998,3,one best celebritys reality shows ever saw see...,negative,one good celebritys reality show ever see see ...
