## Text Data Preprocessing

In [1]:
# Import data
import pandas as pd
df = pd.read_csv('movie.csv')

In [2]:
df.head()
# Columns: text, label

Unnamed: 0,text,label
0,I grew up (b. 1965) watching and loving the Th...,0
1,"When I put this movie in my DVD player, and sa...",0
2,Why do people who do not know what a particula...,0
3,Even though I have great interest in Biblical ...,0
4,Im a die hard Dads Army fan and nothing will e...,1


In [3]:
# Check for missing values
print(df.isnull().sum())

text     0
label    0
dtype: int64


In [4]:
# Drop any duplicates
print(df.shape)
df.drop_duplicates(inplace=True)
print(df.shape)

(40000, 2)
(39723, 2)


In [5]:
# Check for any class imbalances
print(df.info())
print(df['label'].value_counts())
print(df.describe())
# Score: the dataset is balanced and good for ML

<class 'pandas.core.frame.DataFrame'>
Index: 39723 entries, 0 to 39999
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   text    39723 non-null  object
 1   label   39723 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 931.0+ KB
None
label
1    19908
0    19815
Name: count, dtype: int64
              label
count  39723.000000
mean       0.501171
std        0.500005
min        0.000000
25%        0.000000
50%        1.000000
75%        1.000000
max        1.000000


In [6]:
#  Sampling the dataset for computational purposes
sampled_df = df.groupby('label', group_keys=False).apply(lambda x: x.sample(4000))

  sampled_df = df.groupby('label', group_keys=False).apply(lambda x: x.sample(4000))


In [7]:
# Import NLTk and download corpus
import nltk
#nltk.download('all')

In [8]:
# Preprocessing text 
# Imports
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
# Will compare lemmatization vs stemming for accuracy
from nltk.stem import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer
import re

# Initialize stopwords, stemmer, and lemmatizer
stop_words = set(stopwords.words('english'))
stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()

# Functions to process with stemming and lemmatization for testing
def preprocess_text_stem(text):
    text = text.lower() # Lowercase text
    text = re.sub(r'[^a-z0-9\s]', '', text) # Remove punctuation + special characters
    tokens = word_tokenize(text) # Tokenize text
    # Remove stopwords
    tokens = [token for token in tokens if token not in stop_words]
    
    # Perform stemming
    tokens = [stemmer.stem(token) for token in tokens]
    
    # Join tokens back into a string
    text = ' '.join(tokens)
    return text

def preprocess_text_lemmatize(text):
    text = text.lower() # Lowercase text
    text = re.sub(r'[^a-z0-9\s]', '', text) # Remove punctuation + special characters
    tokens = word_tokenize(text) # Tokenize text
    # Remove stopwords
    tokens = [token for token in tokens if token not in stop_words]

    # Perform lemmatization
    tokens = [lemmatizer.lemmatize(token) for token in tokens]

    # Join tokens back into a string
    text = ' '.join(tokens)
    return text

In [9]:
# Apply the pre-processing functions to the text
sampled_df['stemText'] = sampled_df['text'].apply(preprocess_text_stem)
sampled_df['lemmatizeText'] = sampled_df['text'].apply(preprocess_text_lemmatize)

# Print samples of processed text
print("Stemmed samples")
print(sampled_df['stemText'].head())
print("-"*30)
print("Lemmatized samples")
print(sampled_df['lemmatizeText'].head())

Stemmed samples
2401     jack black charact tim dingman dreamer envi fi...
5732     watch one episod program couldnt even get end ...
26186    went see movi alreadi forc choic origin intent...
5895     director jonathan lynn made underr comedi past...
39975    could believ aw film rare watch commerci tv th...
Name: stemText, dtype: object
------------------------------
Lemmatized samples
2401     jack black character tim dingman dreamer envy ...
5732     watched one episode program couldnt even get e...
26186    went see movie already forced choice original ...
5895     director jonathan lynn made underrated comedy ...
39975    could believe awful film rarely watch commerci...
Name: lemmatizeText, dtype: object


In [10]:
# Save prepared data
sampled_df.to_csv('movies_prepared.csv')

In [10]:
# Split dataset and create word embeddings using TF-IDF
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

X_stem = sampled_df['stemText']
X_lemmatize = sampled_df['lemmatizeText']
y = sampled_df['label']

# Dataset splits for both stemmed and lemmatized words
# Random state creates the same splits for both datasets, enusring proper comparison
X_train_stem_text, X_test_stem_text, y_train_stem, y_test_stem = train_test_split(
    X_stem, y, test_size=0.2, random_state=4444, stratify=y
)

X_train_lem_text, X_test_lem_text, y_train_lem, y_test_lem = train_test_split(
    X_lemmatize, y, test_size=0.2, random_state=4444, stratify=y
)

# Fit the TF-IDF Vectorizer
tfidf_stem = TfidfVectorizer(max_features=20000, ngram_range=(1,2))
tfidf_lem = TfidfVectorizer(max_features=20000, ngram_range=(1,2))

X_train_stem = tfidf_stem.fit_transform(X_train_stem_text)
X_test_stem = tfidf_stem.transform(X_test_stem_text)

X_train_lem = tfidf_lem.fit_transform(X_train_lem_text)
X_test_lem = tfidf_lem.transform(X_test_lem_text)

In [12]:
# Export vectors and labels
import joblib

# Save features
joblib.dump(X_train_stem, "X_train_stem.pkl", compress=3)
joblib.dump(X_test_stem, "X_test_stem.pkl", compress=3)
joblib.dump(X_train_lem, "X_train_lem.pkl", compress=3)
joblib.dump(X_test_lem, "X_test_lem.pkl", compress=3)

# Save labels 
joblib.dump(y_train_stem, "y_train_stem.pkl")
joblib.dump(y_test_stem, "y_test_stem.pkl")
joblib.dump(y_train_lem, "y_train_lem.pkl")
joblib.dump(y_test_lem, "y_test_lem.pkl")

# Save vectorizers
joblib.dump(tfidf_stem, "vectorizer_stem.pkl")
joblib.dump(tfidf_lem, "vectorizer_lem.pkl")


['vectorizer_lem.pkl']