# Data cleaning process

In [None]:
#! Import libraries.
import os  # for os related operations
import pandas as pd  # for data manipulation and analysis

from py_modules.data_df_utils import *  # for data manipulation
from py_modules.data_cleaner_utils import *  # for data cleaning

#! Important paths.
# Every location is derived from the directory the notebook is run from.
path = os.getcwd()  # get current working directory
dataset_path = os.path.join(path, "aclImdb")  # get dataset path
cleaned_dataset_path = os.path.join(path, "cleaned_aclImdb")  # get cleaned dataset path

# Create the cleaned dataset directory if it doesn't exist.
# `exist_ok=True` makes this a no-op when the directory is already present, so
# the cell can be re-run safely and there is no window between "check" and
# "create". If a regular file occupies the path, this raises FileExistsError
# instead of continuing silently without a usable output directory.
os.makedirs(cleaned_dataset_path, exist_ok=True)

# Print out the important paths.
important_paths = f"""Important paths:
    Current working directory: {path}
    Dataset path: {dataset_path}
    Cleaned dataset path: {cleaned_dataset_path}
"""
print(important_paths)

In [None]:
#! Load the training data.
# `load_data` is provided by one of the star-imported `py_modules` helpers; it
# is called with the dataset path and the split name "train".
# NOTE(review): presumably returns a pandas DataFrame (suggested by the `_df`
# naming) — confirm against the helper.
train_df = load_data(dataset_path, "train")

#! Visualize the training data.
# NOTE(review): the second argument looks like a label/title for the
# visualization — confirm against the helper.
visualize_data(train_df, "Training data")

#! Print statistics of the training data.
analyze_data(train_df)

In [None]:
#! Clean the training data.
# The result is bound to a new name; `train_df` is still needed below for the
# before/after comparison.
# NOTE(review): whether `clean_data` copies or modifies `train_df` in place is
# not visible here — if it mutates, the comparison below would show identical
# text; confirm against the helper.
cleaned_train_df = clean_data(train_df)

#! Visualize the cleaned training data.
visualize_data(cleaned_train_df, "Cleaned training data")

#! Print statistics of the cleaned training data.
analyze_data(cleaned_train_df)

#! Compare a random review before and after cleaning.
compare_review_before_after_cleaning(train_df, cleaned_train_df)

#! Save the cleaned training data.
# NOTE(review): `cleaned_dataset_path` is never passed in, so the helper
# presumably determines the output location itself — verify it writes into the
# directory created earlier in this notebook.
save_cleaned_data(cleaned_train_df, is_train_data=True)

In [None]:
#! Test split: load, clean and save only (no visualization or statistics).
raw_test_df = load_data(dataset_path, "test")  # load the "test" split
cleaned_test_df = clean_data(raw_test_df)  # apply the same cleaning step as for training
save_cleaned_data(cleaned_test_df, is_train_data=False)  # save, flagged as non-training data