In [None]:
from pyspark.sql import SparkSession
import pandas as pd
import re

### English review comments were extracted from the full set of comments in language_filtering.ipynb

In this notebook we clean the English review dataset so that it can be used later for analysis.

In [None]:
# Gzipped CSV holding the English-only reviews that this notebook cleans.
INPUT_ALL_ENGLISH_DATA_FILEPATH = "./../filtered_data/english_data.csv.gz"

# Matches every run of characters that is neither an ASCII letter nor whitespace
# (digits, punctuation, accented letters, emoji, ...).
# Written as a raw string so that `\s` reaches the regex engine verbatim instead of
# being parsed as an (invalid) Python string escape, which emits a warning.
REGEX_CLEANING_PATTERN = r"[^a-zA-Z\s]+"

In [None]:
# Load the English-only reviews. The two free-text columns are read as `str`
# so that they are not type-inferred; `low_memory=False` is passed through to
# pandas unchanged.
all_english_reviews_df = pd.read_csv(
    INPUT_ALL_ENGLISH_DATA_FILEPATH,
    dtype={'title': str, 'text': str},
    low_memory=False,
)

# all_english_reviews_df
# Shape noted when this was last inspected: 147034 rows × 35 columns

In [None]:
# list(all_english_reviews_df.columns)

### Drop unwanted columns from the dataframe and extract the required information from the remaining columns

In [None]:
# Columns that later analyses do not use.
# NOTE(review): the `.1` suffixes look like pandas' renaming of duplicated
# column names in the input file — confirm against the raw CSV header.
cols_to_drop = [
    'num_helpful_votes',
    'date',
    'via_mobile',
    'username',
    'num_cities',
    'num_reviews',
    'num_type_reviews',
    'id.1',
    'region_id',
    'phone',
    'details',
    'type',
    'num_helpful_votes.1',
    'location.1',
]

# Raises KeyError if any listed column is missing; in that case compare the
# list above with the columns of the dataframe as originally read.
all_english_reviews_df = all_english_reviews_df.drop(cols_to_drop, axis="columns")

# all_english_reviews_df

In [None]:
# `date_stayed` is split on single spaces and only the first token is kept as
# `month_stayed` (assumes values look like "<month> <year>" — TODO confirm);
# the original column is then discarded.
all_english_reviews_df = (
    all_english_reviews_df
    .assign(month_stayed=lambda frame: frame['date_stayed'].str.split(' ').str.get(0))
    .drop(columns=['date_stayed'])
)

### Add a review column to the dataframe, derived from the overall rating, which serves as the sentiment label for each hotel review

In [None]:
# The `review` label is derived from `overall`, so first confirm that
# `overall` contains no missing values.
overall_nan_check = all_english_reviews_df['overall'].isna().any()
print(overall_nan_check)  # expected output: False

In [None]:
# Label each review by its overall rating: 3 or above is "positive",
# anything else (including a missing rating, which compares as False)
# is "negative".
all_english_reviews_df['review'] = (
    all_english_reviews_df['overall']
    .ge(3)
    .map({True: "positive", False: "negative"})
)

# all_english_reviews_df

### Performing text cleaning on review comments 

In [None]:
# # Testing
# clean_text_test_pd = all_english_reviews_df.sample(n = 100, replace = False)
# clean_text_test_pd

In [None]:
# Flatten line breaks, strip everything except ASCII letters and whitespace,
# and lower-case the result.
def clean_strings(value, pattern=None):
    """Return a normalised, lower-case version of a review string.

    Steps, in order:
      1. Line breaks are replaced by a single space, so the last word of one
         line is not fused with the first word of the next.
      2. Every substring matching ``pattern`` is removed.
      3. The result is lower-cased.

    Runs of spaces left behind by step 2 are kept as they are.

    Parameters
    ----------
    value : str or missing value
        The raw text. Anything that is not a ``str`` (for example the float
        NaN that pandas uses for an empty CSV cell, or ``None``) is treated
        as missing.
    pattern : str, optional
        Regular expression describing the characters to delete. Defaults to
        the module-level ``REGEX_CLEANING_PATTERN``.

    Returns
    -------
    str
        The cleaned text, or ``""`` when ``value`` is missing.
    """
    # Missing cells arrive as float NaN / None and have no string methods.
    if not isinstance(value, str):
        return ""

    # Resolved at call time so the notebook-level constant can be redefined.
    if pattern is None:
        pattern = REGEX_CLEANING_PATTERN

    # Join with a space (not the empty string) to keep words on adjacent
    # lines separate.
    flattened = " ".join(value.splitlines())

    cleaned_str = re.sub(pattern, "", flattened)

    return cleaned_str.lower()

In [None]:
# Clean the free-text columns element by element.
# Each column is mapped with Series.map rather than DataFrame.applymap,
# which is deprecated in recent pandas releases; Series.map behaves the
# same element-wise and is available in every pandas version.
column_names = ['title','text']
all_english_reviews_df[column_names] = all_english_reviews_df[column_names].apply(
    lambda column: column.map(clean_strings)
)
# all_english_reviews_df

### Export cleaned English review of all hotels

In [None]:
# Persist the cleaned reviews (gzip-compressed, without the index column);
# this file is the input for the machine-learning step.
all_english_reviews_df.to_csv(
    './../filtered_data/all_cities_cleaned_english_reviews.csv.gz',
    compression="gzip",
    index=False,
)

### Next we will perform Machine Learning Analysis on cleaned English review dataset. 

We will split this cleaned data into training and prediction datasets based on a city supplied by the user.

The user-input city is the city for which the user wants recommendations. The review data is split so that reviews of hotels located in the user-input city form the prediction dataset, and the reviews of hotels in all other cities form the training dataset.

The machine learning models are implemented in the **machine_learning.ipynb** notebook.