# Airbnb Datasets Creation

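This notebook enumerates every combination of cleaning strategies (missing-value repair, outlier detection and repair, duplicate repair) and applies each of them to 20 random 70/30 train/test splits of the raw Airbnb dataset. For every split it writes the uncleaned ("dirty") train and test sets to CSV and the corresponding cleaned variants to pickle files.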

In [1]:
import pandas as pd
import itertools
import numpy as np
from cleaner import ErrorCleaner
from sklearn.model_selection import train_test_split

import warnings
warnings.filterwarnings("ignore", category=FutureWarning)

In [2]:
# Candidate strategies for each cleaning stage. The Cartesian product of these
# lists defines the grid of cleaning configurations applied to every split.
mv_repair_methods = ['delete', 'mean-mode', 'median-mode', 'mode-mode', 'mean-dummy', 'median-dummy', 'mode-dummy']

outlier_detection_methods = ['none', 'SD', 'IQR']
outlier_repair_methods = ['mean', 'median', 'mode']

duplicate_repair_methods = ['NA', 'key_val']

training_list = [mv_repair_methods, outlier_detection_methods, outlier_repair_methods, duplicate_repair_methods]
training_combinations = list(itertools.product(*training_list))

cleaning_setups_df = pd.DataFrame(
    training_combinations,
    columns=['mv_repair', 'outlier_detection', 'outlier_repair', 'duplicate_repair'],
)

# When no outlier detection is performed the repair method is irrelevant, so
# collapse it to the 'NA' placeholder. Assigning through .loc updates the frame
# itself; the previous chained `df[col].mask(..., inplace=True)` only modified a
# temporary Series once pandas Copy-on-Write is active, leaving the frame untouched.
no_outlier_detection = cleaning_setups_df['outlier_detection'] == 'none'
cleaning_setups_df.loc[no_outlier_detection, 'outlier_repair'] = 'NA'

# The collapse above makes several rows identical; keep one of each.
# The index is left non-contiguous here.
cleaning_setups_df = cleaning_setups_df.drop_duplicates()

In [3]:
# Renumber the rows 0..n-1 and discard the old index labels, so the setups can
# be addressed by consecutive integer position.
cleaning_setups_df = cleaning_setups_df.reset_index(drop=True)

In [4]:
# Show the grid of cleaning configurations (one row per setup) as the cell output.
cleaning_setups_df

Unnamed: 0,mv_repair,outlier_detection,outlier_repair,duplicate_repair
0,delete,none,,
1,delete,none,,key_val
2,delete,SD,mean,
3,delete,SD,mean,key_val
4,delete,SD,median,
...,...,...,...,...
93,mode-dummy,IQR,mean,key_val
94,mode-dummy,IQR,median,
95,mode-dummy,IQR,median,key_val
96,mode-dummy,IQR,mode,


In [5]:
# Load the raw dataset from a CSV file resolved relative to the current working
# directory. A 'Price' column is required: it is used as the target further below.
airbnb_data = pd.read_csv('airbnb_raw.csv')

In [6]:
# Seed NumPy's global RNG. The splits themselves are seeded per iteration via
# `random_state`, so this only affects code that draws from the global RNG
# (presumably inside ErrorCleaner -- TODO confirm).
np.random.seed(0)

# Separating target and features does not depend on the split index, so it is
# done once. train_test_split returns new objects and leaves its inputs untouched.
target = airbnb_data["Price"]
features = airbnb_data.drop("Price", axis=1)

# Column layout of the result frames: the cleaning configuration plus the
# cleaned dataset produced under that configuration.
result_columns = list(cleaning_setups_df.columns) + ['dataset']

for i in range(20):
    # 70/30 split; using the iteration index as seed makes every split reproducible.
    X_train, X_test, Y_train, Y_test = train_test_split(features, target, test_size=0.3, random_state=i)

    # Re-attach the target so each split is stored and cleaned as one table.
    XY_train = X_train.copy()
    XY_train['Price'] = Y_train

    XY_test = X_test.copy()
    XY_test['Price'] = Y_test

    # Persist the uncleaned splits.
    XY_train.to_csv('airbnb_dirty_train_set_' + str(i) + '.csv', index=False)
    XY_test.to_csv('airbnb_dirty_test_set_' + str(i) + '.csv', index=False)

    # Collect one record per cleaning setup and build each frame once at the
    # end. DataFrame.append was removed in pandas 2.0 and was quadratic anyway.
    train_records = []
    test_records = []

    # Positional labels 0..n-1 are assumed on cleaning_setups_df (index was reset).
    for j in range(len(cleaning_setups_df)):
        cleaning_setup = cleaning_setups_df.loc[j]

        # Same order as before: train first, then test, for every setup.
        # NOTE(review): the test split is cleaned on its own, so any statistics
        # ErrorCleaner derives presumably come from the test data itself --
        # confirm this is intended.
        # NOTE(review): the same XY_train / XY_test objects are handed to every
        # setup; this assumes ErrorCleaner does not modify its input in place.
        for dirty_split, records in ((XY_train, train_records), (XY_test, test_records)):
            error_cleaner = ErrorCleaner(dirty_split, cleaning_setup)
            cleaned_dataset = error_cleaner.clean_all(['latitude', 'longitude'])

            cleaned_df = dict(cleaning_setup)
            cleaned_df['dataset'] = cleaned_dataset
            records.append(cleaned_df)

    cleaned_train_df = pd.DataFrame(train_records, columns=result_columns)
    cleaned_test_df = pd.DataFrame(test_records, columns=result_columns)

    # Pickle is used because the 'dataset' column holds whole Python objects.
    cleaned_train_df.to_pickle('airbnb_cleaned_train_df_' + str(i) + '.pkl')
    cleaned_test_df.to_pickle('airbnb_cleaned_test_df_' + str(i) + '.pkl')