## Business Intelligence Assignment 2: Data Analytics

### (2) Data Pre-processing

In [2]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
import os

### (a) Loading Data

In [None]:
# Locate the raw data relative to the current working directory.
cwd = os.getcwd()

# Pass the path components separately so os.path.join inserts the separator of
# the running OS; a hard-coded '\\' only resolves correctly on Windows.
data_folder = os.path.join(cwd, 'data', 'raw_data')

file_path = os.path.join(data_folder, 'AB_US_2023.csv')
# 'neighbourhood_group' is forced to str; low_memory=False avoids chunk-wise
# dtype inference, which can otherwise yield mixed-type columns.
df_base = pd.read_csv(file_path, sep=",", dtype={'neighbourhood_group': 'str'}, low_memory=False)

print(df_base.columns)

Index(['id', 'name', 'host_id', 'host_name', 'neighbourhood_group',
       'neighbourhood', 'latitude', 'longitude', 'room_type', 'price',
       'minimum_nights', 'number_of_reviews', 'last_review',
       'reviews_per_month', 'calculated_host_listings_count',
       'availability_365', 'number_of_reviews_ltm', 'city'],
      dtype='object')


### (b) Processing

In [3]:
def pre_processing(data: pd.DataFrame, min_price: float = 5) -> pd.DataFrame:
    """Clean the raw listings frame and return it indexed by listing id.

    The input frame is not modified; every step produces a new frame, so the
    cell can be re-run safely.

    Steps:
      1. Drop the 'host_name' and 'neighbourhood_group' columns.
      2. Fill missing 'last_review' / 'reviews_per_month' values with 0.
      3. Fill missing 'name' values with '-'.
      4. Remove rows whose 'price' is below ``min_price``.
      5. Remove rows with a repeated 'id', keeping the first occurrence.
      6. Use 'id' as the index.

    Parameters
    ----------
    data : pd.DataFrame
        Raw listings. Must contain the columns 'id', 'name', 'host_name',
        'neighbourhood_group', 'price', 'last_review' and 'reviews_per_month'.
    min_price : float, default 5
        Rows with a price strictly below this value are treated as outliers
        and removed.

    Returns
    -------
    pd.DataFrame
        The cleaned listings, indexed by 'id'.

    Raises
    ------
    KeyError
        If one of the required columns is missing from ``data``.
    """
    # Work on the frame that was passed in (not a fresh read of the CSV);
    # drop() without inplace returns a new frame, leaving the caller's untouched.
    cleaned = data.drop(columns=['host_name', 'neighbourhood_group'])  # Drop useless columns

    # Fill columns with 0 where there are no reviews
    review_cols = ['last_review', 'reviews_per_month']
    cleaned[review_cols] = cleaned[review_cols].fillna(value=0)
    cleaned['name'] = cleaned['name'].fillna(value='-')  # Fill empty descriptions with -

    # Remove price outliers. The mask is negated (instead of using >=) so that
    # rows with a missing price are kept, exactly like rows at or above the threshold.
    is_price_outlier = cleaned['price'] < min_price
    cleaned = cleaned[~is_price_outlier]

    # Handling duplicate IDs: a repeated listing id is treated as a listing
    # mistake and only the first occurrence is kept.
    # NOTE(review): the original intent was to retain the row "with more
    # information" - confirm that the first occurrence is that row.
    is_repeated_id = cleaned['id'].duplicated()
    cleaned = cleaned[~is_repeated_id]

    return cleaned.set_index('id')


In [4]:
# Run the cleaning steps defined in pre_processing on the raw listings frame
data_cleaned = pre_processing(df_base)

### (c) Saving Processed Data 

In [6]:
# Export of the cleaned data is currently disabled (the line below is commented out).
# NOTE(review): the target is an absolute, machine-specific Windows path - build it
# relative to the working directory before re-enabling the export.
# data_cleaned.to_csv(r'C:\TU WIEN\SS25\Data Stewardship\Part 2\data\mid_processing\AB_US_2023_DATA_CLEANED.csv')