In [1]:
import pandas as pd
import itertools
import numpy as np
from cleaner import ErrorCleaner

import warnings
# NOTE(review): this silences every FutureWarning for the whole session,
# including deprecation notices raised by pandas; consider narrowing the filter.
warnings.filterwarnings("ignore", category=FutureWarning)

In [2]:
# Options for each cleaning step; the setup grid is the Cartesian product of these lists.
# NOTE(review): names such as 'mean-mode' presumably encode
# "<numeric strategy>-<categorical strategy>" — confirm against cleaner.ErrorCleaner.
mv_repair_methods = ['delete', 'mean-mode', 'median-mode', 'mode-mode', 'mean-dummy', 'median-dummy', 'mode-dummy']

outlier_detection_methods = ['none', 'SD', 'IQR']
outlier_repair_methods = ['mean', 'median', 'mode']

duplicate_repair_methods = ['NA', 'key_val']

training_list = [mv_repair_methods, outlier_detection_methods, outlier_repair_methods, duplicate_repair_methods]
training_combinations = list(itertools.product(*training_list))

cleaning_setups_df = pd.DataFrame(
    training_combinations,
    columns=['mv_repair', 'outlier_detection', 'outlier_repair', 'duplicate_repair'],
)

# With outlier detection set to 'none' the repair method is irrelevant, so it is
# collapsed to 'NA'. Assign through .loc on the frame itself rather than calling an
# in-place method on a column selection: that form is chained assignment, which
# does not update the frame under pandas Copy-on-Write.
cleaning_setups_df.loc[cleaning_setups_df['outlier_detection'] == 'none', 'outlier_repair'] = 'NA'

# The collapse above makes the 'none' rows differ only by a now-identical column,
# so remove the resulting duplicate setups.
cleaning_setups_df = cleaning_setups_df.drop_duplicates()

In [3]:
# Renumber the rows 0..n-1; drop=True discards the old labels instead of
# re-inserting them as an 'index' column that would then have to be removed.
cleaning_setups_df = cleaning_setups_df.reset_index(drop=True)

In [4]:
# Display the grid of cleaning setups (one row per combination of methods).
cleaning_setups_df

Unnamed: 0,mv_repair,outlier_detection,outlier_repair,duplicate_repair
0,delete,none,,
1,delete,none,,key_val
2,delete,SD,mean,
3,delete,SD,mean,key_val
4,delete,SD,median,
...,...,...,...,...
93,mode-dummy,IQR,mean,key_val
94,mode-dummy,IQR,median,
95,mode-dummy,IQR,median,key_val
96,mode-dummy,IQR,mode,


In [5]:
# Read the raw Airbnb data from a CSV file; the path is relative to the working directory.
airbnb_data = pd.read_csv('airbnb_raw.csv')

In [6]:
# Pick the setup with index label 0; .loc returns that row as a Series keyed by column name.
cleaning_setup = cleaning_setups_df.loc[0]

In [7]:
# Display the selected setup.
cleaning_setup

mv_repair            delete
outlier_detection      none
outlier_repair           NA
duplicate_repair         NA
Name: 0, dtype: object

In [8]:
# Run the project's ErrorCleaner on the raw data with the selected setup.
# NOTE(review): the list passed to clean_all presumably names columns that need
# special handling (e.g. excluded from cleaning) — confirm against cleaner.ErrorCleaner.
error_cleaner = ErrorCleaner(airbnb_data, cleaning_setup)
cleaned_dataset = error_cleaner.clean_all(['latitude', 'longitude'])

In [9]:
# Display the value returned by clean_all.
cleaned_dataset

Unnamed: 0,Bathrooms,Bedrooms,Beds,LocationName,NumGuests,NumReviews,Price,Rating,latitude,longitude,...,Number of Homes,Count of Abnb,Density of Abnb (%),Average Abnb Price (by zipcode),Average NumReviews (by zipcode),Average Rating (by zipcode),Average Number of Bathrooms (by zipcode),Average Number of Bedrooms (by zipcode),Average Number of Beds (by zipcode),Average Number of Guests (by zipcode)
0,1.0,1.0,1.0,Atlanta,2.0,7.0,38.0,Y,33.75515,-84.32992,...,5306.0,75,1.413494,104.743243,35.280702,4.944444,1.459459,1.845070,2.146667,2.146667
1,1.0,1.0,1.0,Atlanta,2.0,15.0,38.0,N,33.82613,-84.33963,...,10537.2,96,0.911058,103.673684,21.169231,4.847458,1.281250,1.473118,1.687500,1.687500
2,2.0,2.0,2.0,Atlanta,4.0,17.0,100.0,Y,33.75076,-84.37058,...,9114.4,200,2.194330,119.368687,40.400000,4.937500,1.375000,1.602094,1.925000,1.925000
3,1.0,1.0,1.0,Atlanta,2.0,304.0,78.0,Y,33.77059,-84.33538,...,7808.0,130,1.664959,119.914729,44.063158,4.892857,1.292308,1.603306,1.953846,1.953846
4,1.0,1.0,1.0,Atlanta,2.0,19.0,50.0,Y,33.79030,-84.40027,...,9343.6,190,2.033477,131.058511,28.444444,4.881679,1.326316,1.580838,1.968421,1.968421
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
33140,1.0,1.0,2.0,Washington,5.0,159.0,67.0,N,38.90956,-77.03107,...,5300.0,123,2.320755,135.268293,52.009174,4.800000,1.262295,1.453704,2.203252,2.203252
33141,2.0,1.0,4.0,Washington,1.0,28.0,49.0,Y,38.90920,-77.02622,...,5300.0,123,2.320755,135.268293,52.009174,4.800000,1.262295,1.453704,2.203252,2.203252
33142,1.0,1.0,1.0,Washington,2.0,106.0,120.0,Y,38.88746,-76.99119,...,11794.8,410,3.476108,127.748768,54.344444,4.900289,1.196078,1.442708,2.487805,2.487805
33143,1.0,1.0,2.0,Washington,3.0,296.0,70.0,Y,38.88535,-76.98183,...,11794.8,410,3.476108,127.748768,54.344444,4.900289,1.196078,1.442708,2.487805,2.487805
