# Data Preprocessing Tools

## Importing the libraries

In [51]:
import os
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

## Importing the dataset

In [52]:
# Load the concatenated pre-/post-COVID hotel review dataset.
# The file location can be overridden with the PRE_POST_COVID_CSV environment
# variable so the notebook is not tied to one machine; when the variable is
# unset, the original absolute path is used unchanged.
input_csv_path = os.environ.get(
    "PRE_POST_COVID_CSV",
    r"C:\Users\LakshmanNavaneethakr\Downloads\lakshmansierra-git\hotel_review_regression\01-concat_data\pre_post_covid.csv",
)
# The file is decoded as latin1, exactly as before.
concat_df = pd.read_csv(input_csv_path, encoding="latin1")

In [53]:
# Show the dimensions of the freshly loaded frame.
loaded_shape = concat_df.shape
print(loaded_shape)

(106655, 61)


In [54]:
# List the column labels of the freshly loaded frame.
loaded_columns = concat_df.columns
print(loaded_columns)

Index(['Index', 'Hotel Category', 'Hotel Name', 'User Name', 'Location',
       'Contributions', 'helpful votes', 'Rating', 'RatingBinary',
       'Review Heading', 'Review Description', 'Trip Type', 'Date of stay',
       'Year', 'Review_Score', 'Review_Category', 'Aggregated_flag_for_stay',
       'Aggregated_flag_for_food', 'Aggregated_flag_for_clean',
       'Aggregated_flag_for_location', 'Aggregated_flag_for_cost',
       'Aggregated_flag_for_safety', 'Aggregated_flag_for_service',
       'Aggregated_flag_for_decoration', 'Aggregated_flag_for_facility',
       'Aggregated_flag_for_ambience', 'Aggregated_flag_for_room',
       'Total_aggregation_for_match_stay', 'Total_aggregation_for_match_food',
       'Total_aggregation_for_match_clean',
       'Total_aggregation_for_match_location',
       'Total_aggregation_for_match_cost',
       'Total_aggregation_for_match_safety',
       'Total_aggregation_for_match_service',
       'Total_aggregation_for_match_decoration',
       'Total_

## Encode month of occurrence

In [55]:
# Parse the stay date; values that cannot be parsed become NaT (errors='coerce').
concat_df['Date of stay'] = pd.to_datetime(concat_df['Date of stay'], errors='coerce')
# Month number of the stay. Wherever the date is NaT this is NaN, which turns
# the whole column into floats.
concat_df['Month'] = concat_df['Date of stay'].dt.month

# One 0/1 indicator column per calendar month, always named M1..M12 and always
# in calendar order, including months that never occur in the data.
# Comparing against each month number (instead of get_dummies) keeps the column
# names stable when 'Month' is float because of NaT rows: get_dummies would then
# label the columns 'M1.0', 'M2.0', ... and the former int(x[1:]) sort key
# raised ValueError on them.
# NOTE: rows with an unparseable date get 0 in every month column, so once M1
# is dropped below they look like the January reference category. Check
# concat_df['Month'].isna().sum() if that matters for the model.
month_dummies = pd.DataFrame(
    {f'M{i}': (concat_df['Month'] == i).astype(int) for i in range(1, 13)},
    index=concat_df.index,
)

# Append the indicators to the original columns, then remove the helper 'Month'
# column and 'M1' (January is the reference category, which avoids the dummy
# variable trap).
concat_month_dummies_df = (
    pd.concat([concat_df, month_dummies], axis=1)
    .drop(columns=['Month', 'M1'])
)


In [56]:
# List the column labels after the month indicators were appended.
encoded_columns = concat_month_dummies_df.columns
print(encoded_columns)

Index(['Index', 'Hotel Category', 'Hotel Name', 'User Name', 'Location',
       'Contributions', 'helpful votes', 'Rating', 'RatingBinary',
       'Review Heading', 'Review Description', 'Trip Type', 'Date of stay',
       'Year', 'Review_Score', 'Review_Category', 'Aggregated_flag_for_stay',
       'Aggregated_flag_for_food', 'Aggregated_flag_for_clean',
       'Aggregated_flag_for_location', 'Aggregated_flag_for_cost',
       'Aggregated_flag_for_safety', 'Aggregated_flag_for_service',
       'Aggregated_flag_for_decoration', 'Aggregated_flag_for_facility',
       'Aggregated_flag_for_ambience', 'Aggregated_flag_for_room',
       'Total_aggregation_for_match_stay', 'Total_aggregation_for_match_food',
       'Total_aggregation_for_match_clean',
       'Total_aggregation_for_match_location',
       'Total_aggregation_for_match_cost',
       'Total_aggregation_for_match_safety',
       'Total_aggregation_for_match_service',
       'Total_aggregation_for_match_decoration',
       'Total_

In [57]:
# Show the dimensions after the month indicators were appended.
encoded_shape = concat_month_dummies_df.shape
print(encoded_shape)

(106655, 72)


## Add Review Description length column 

In [58]:
# Number of characters in each review after trimming surrounding whitespace.
# Missing reviews are replaced by '' first so they count as 0 instead of being
# stringified by astype(str).
concat_month_dummies_df['review_length'] = (
    concat_month_dummies_df['Review Description']
    .fillna('')        # missing review -> empty string
    .astype(str)       # make sure every value is a string
    .str.strip()       # drop leading/trailing whitespace
    .str.len()         # vectorised length; '' already gives 0, no special case needed
)


In [59]:
# List the column labels after 'review_length' was added.
columns_with_length = concat_month_dummies_df.columns
print(columns_with_length)

Index(['Index', 'Hotel Category', 'Hotel Name', 'User Name', 'Location',
       'Contributions', 'helpful votes', 'Rating', 'RatingBinary',
       'Review Heading', 'Review Description', 'Trip Type', 'Date of stay',
       'Year', 'Review_Score', 'Review_Category', 'Aggregated_flag_for_stay',
       'Aggregated_flag_for_food', 'Aggregated_flag_for_clean',
       'Aggregated_flag_for_location', 'Aggregated_flag_for_cost',
       'Aggregated_flag_for_safety', 'Aggregated_flag_for_service',
       'Aggregated_flag_for_decoration', 'Aggregated_flag_for_facility',
       'Aggregated_flag_for_ambience', 'Aggregated_flag_for_room',
       'Total_aggregation_for_match_stay', 'Total_aggregation_for_match_food',
       'Total_aggregation_for_match_clean',
       'Total_aggregation_for_match_location',
       'Total_aggregation_for_match_cost',
       'Total_aggregation_for_match_safety',
       'Total_aggregation_for_match_service',
       'Total_aggregation_for_match_decoration',
       'Total_

In [60]:
# Show the dimensions after 'review_length' was added.
shape_with_length = concat_month_dummies_df.shape
print(shape_with_length)

(106655, 73)


## Saving the CSV files

In [61]:
# Make sure the output folder exists; an already existing folder is not an error.
encoded_output_dir = "02-encoded_month"
os.makedirs(encoded_output_dir, exist_ok=True)

# Write the complete encoded frame, without the row index.
encoded_csv_path = "02-encoded_month/pre_post_covid_month_encoded.csv"
concat_month_dummies_df.to_csv(encoded_csv_path, index=False)

In [62]:
# Make sure the output folder exists; an already existing folder is not an error.
os.makedirs("02-encoded_month", exist_ok=True)

# Small preview files: the first and the last 50 rows of the encoded frame,
# each written without the row index.
preview_exports = (
    ("02-encoded_month/pre_post_covid_month_encoded_head.csv", concat_month_dummies_df.head(50)),
    ("02-encoded_month/pre_post_covid_month_encoded_tail.csv", concat_month_dummies_df.tail(50)),
)
for preview_path, preview_rows in preview_exports:
    preview_rows.to_csv(preview_path, index=False)