# Cleaning Effects on Distributions

This notebook briefly analyses the effects of the cleaning pipelines on distributions in the Airbnb data.

In [1]:
import pandas as pd
import itertools
import numpy as np
from cleaner import ErrorCleaner
from sklearn.model_selection import train_test_split
from scipy.spatial.distance import jensenshannon
import seaborn as sns
import matplotlib.pyplot as plt

import warnings
warnings.filterwarnings("ignore", category=FutureWarning)

In [2]:
def collect_results(dataset_name, model):
    results = pd.read_csv(dataset_name + '_' + model + '_results.csv')
    
    mean_df = results.groupby(['train_cleaning', 'test_cleaning'], as_index = False).mean(numeric_only=True)
    mean_df = mean_df.drop(['train_test_split'], axis = 1)

    std_df = results.groupby(['train_cleaning', 'test_cleaning'], as_index = False).std(numeric_only=True)
    std_df = results.drop(['train_test_split'], axis = 1)

    return results, mean_df, std_df

In [3]:
airbnb_rfr_results, airbnb_rfr_mean_df, airbnb_rfr_std_df = collect_results('airbnb', 'rfr')
airbnb_gbr_results, airbnb_gbr_mean_df, airbnb_gbr_std_df = collect_results('airbnb', 'gbr')
airbnb_xgb_results, airbnb_xgb_mean_df, airbnb_xgb_std_df = collect_results('airbnb', 'xgb')

## Price distribution

The following code highlights how the cleaning pipelines can affect the target ('Price') variable's distribution.

In [4]:
airbnb_data = pd.read_csv('airbnb_raw.csv')

mv_repair_methods = ['delete', 'mean-mode', 'median-mode', 'mode-mode']

outlier_detection_methods = ['none', 'SD', 'IQR']
outlier_repair_methods = ['mean', 'median', 'mode']

duplicate_repair_methods = ['NA', 'key_val']

training_list = [mv_repair_methods, outlier_detection_methods, outlier_repair_methods, duplicate_repair_methods]
training_combinations = [p for p in itertools.product(*training_list)]

cleaning_setups_df = pd.DataFrame(training_combinations, columns =['mv_repair', 'outlier_detection', 'outlier_repair', 
                'duplicate_repair'])

cleaning_setups_df['outlier_repair'].mask(cleaning_setups_df['outlier_detection'] == 'none', 'NA', inplace=True)

cleaning_setups_df = cleaning_setups_df.drop_duplicates()

cleaning_setups_df = cleaning_setups_df.reset_index()
cleaning_setups_df = cleaning_setups_df.drop(['index'], axis = 1)

airbnb_data_subset = airbnb_data[['Price', 'latitude', 'longitude']]

all_cleaned_dfs = pd.DataFrame(columns = list(cleaning_setups_df.columns) + ['dataset'])

for j in range(len(cleaning_setups_df)):
    cleaning_setup = cleaning_setups_df.loc[j]

    error_cleaner = ErrorCleaner(airbnb_data_subset, cleaning_setup)
    cleaned_dataset = error_cleaner.clean_all(['latitude', 'longitude'])

    cleaned_df = dict(cleaning_setup)
    cleaned_df['dataset'] = cleaned_dataset
    
    all_cleaned_dfs = all_cleaned_dfs.append(cleaned_df, ignore_index = True)

AttributeError: 'DataFrame' object has no attribute 'append'

In [5]:
# MVs deleted, outliers and duplicates unaddressed
ax = sns.histplot(all_cleaned_dfs.dataset.loc[0].Price.values, bins = 100, element = 'step')
ax.set(xlabel='Price', ylabel='Count')
ax.xaxis.label.set_size(20)
ax.yaxis.label.set_size(20)
ax.tick_params(axis='both', labelsize=16)
plt.show()

KeyError: 0

In [None]:
# missing values filled with mean, outliers detected using standard deviation and replaced with mean, duplicates unaddressed
ax = sns.histplot(all_cleaned_dfs.dataset.loc[16].Price.values, bins = 100, element = 'step')
ax.set(xlabel='Price', ylabel='Count')
ax.xaxis.label.set_size(20)
ax.yaxis.label.set_size(20)
ax.tick_params(axis='both', labelsize=16)
plt.show()

In [None]:
# MVs replaced with median, outliers detected with IQR and replaced with median, duplicates deleted
ax = sns.histplot(all_cleaned_dfs.dataset.loc[39].Price.values, bins = 100, element = 'step')
ax.set(xlabel='Price', ylabel='Count')
ax.xaxis.label.set_size(20)
ax.yaxis.label.set_size(20)
ax.tick_params(axis='both', labelsize=16)
plt.show()

## Differences in Distributions

The following code shows the differences in variable distributions (based on Jensen Shannon distance) that can arise through different training and test cleaning pipelines.

In [None]:
def dist_calc(columns, training_df, test_df):
    dist_dic = {}
    for column in columns:
        training_vals = training_df[column].values
        test_vals = test_df[column].values

        testing_dist,bins_numpy = np.histogram(test_vals,bins=100)
        training_dist,bins2 = np.histogram(training_vals,bins=bins_numpy)
    
        js_dist = jensenshannon(testing_dist,training_dist)

        dist_dic = dist_dic | {column:js_dist}
    return dist_dic

In [None]:
columns_to_test = ['Price', 'NumReviews', 'cost_living_index (US avg. = 100)']

In [None]:
distribution_distances_df = pd.DataFrame(columns=['train_test_split', 'training_pipeline', 'test_pipeline'] + columns_to_test)

for train_test_split in range(20):
    all_training_data = pd.read_pickle('airbnb_cleaned_train_df_' + str(train_test_split) + '.pkl')
    all_testing_data = pd.read_pickle('airbnb_cleaned_test_df_' + str(train_test_split) + '.pkl')

    for i in range(len(all_training_data)):
        training_df = all_training_data.dataset.loc[i]
        
        for j in range(len(all_testing_data)):
            test_df = all_testing_data.dataset.loc[j]
        
            dist_dict = dist_calc(columns_to_test, training_df, test_df)

            distribution_distances_df = distribution_distances_df.append({'train_test_split': train_test_split, 
                                                                          'training_pipeline': i, 'test_pipeline': j} | dist_dict,
                                                                         ignore_index=True)

In [None]:
#distribution_distances_df.to_csv('cleaning_effects_distributions.csv')
distribution_distances_df = pd.read_csv('cleaning_effects_distributions.csv')
distribution_distances_df = distribution_distances_df.drop('Unnamed: 0', axis = 1)

In [None]:
distribution_distances_df['average_dist'] = distribution_distances_df[['Price', 'NumReviews', 'cost_living_index (US avg. = 100)']].mean(axis=1)
distribution_distances_df = distribution_distances_df.groupby(['training_pipeline', 'test_pipeline']).mean().reset_index()
distribution_distances_df = distribution_distances_df.drop(['train_test_split'], axis = 1)

#### Training cleaning pipelines that minimise distribution differences:

In [None]:
# using target variable only
distribution_distances_df.groupby(['training_pipeline']).mean().sort_values('Price')

In [None]:
# using target plus features
distribution_distances_df.groupby(['training_pipeline']).mean().sort_values('average_dist')

In [None]:
# best performing training pipelines for model performance
airbnb_rfr_mean_df.groupby(['train_cleaning']).mean().sort_values('score')

In [None]:
airbnb_gbr_mean_df.groupby(['train_cleaning']).mean().sort_values('score')

In [None]:
airbnb_xgb_mean_df.groupby(['train_cleaning']).mean().sort_values('score')

#### Best training pipelines for given test pipeline:

In [None]:
# using target variable only
best_training = distribution_distances_df.loc[distribution_distances_df.groupby('test_pipeline').Price.idxmin()]

In [None]:
best_training.loc[best_training.training_pipeline == best_training.test_pipeline]

In [None]:
np.unique(best_training.training_pipeline.values)

In [None]:
len(np.unique(best_training.training_pipeline.values))

In [None]:
# using target plus features
best_training = distribution_distances_df.loc[distribution_distances_df.groupby('test_pipeline').average_dist.idxmin()]

In [None]:
best_training.loc[best_training.training_pipeline == best_training.test_pipeline]

In [None]:
np.unique(best_training.training_pipeline.values)

In [None]:
len(np.unique(best_training.training_pipeline.values))

In [None]:
# best pipelines for model performance

In [None]:
best_pipelines = airbnb_rfr_mean_df.loc[airbnb_rfr_mean_df.groupby('test_cleaning')['score'].idxmax()]
np.unique(best_pipelines.train_cleaning.values)

In [None]:
best_pipelines = airbnb_gbr_mean_df.loc[airbnb_gbr_mean_df.groupby('test_cleaning')['score'].idxmax()]
np.unique(best_pipelines.train_cleaning.values)

In [None]:
best_pipelines = airbnb_xgb_mean_df.loc[airbnb_xgb_mean_df.groupby('test_cleaning')['score'].idxmax()]
np.unique(best_pipelines.train_cleaning.values)