# Clean and save data for final model
Description:  
While fitting the models, we found that some data sets still had unnecessary columns or erroneous values that caused some models to fail, despite the data having been cleaned earlier. Here, we remove the problematic data again and save the new data to disk.  
We also create data sets without geo info (state, county, city) so that we can test whether the model could be location-agnostic.  
Not all data sets created in this file were used, but we created them anyway just in case we wanted to use them.

## Save Data without Geo info
Drop columns such as state, city, county.  
We do this so that we can train models using data from one state and test using data from the other state.  

In [1]:
import pandas as pd

In [2]:
# Load the model-2 train/validation splits and the full train/test splits.
X_train = pd.read_csv('../Data/X_train_model2.csv')
y_train = pd.read_csv('../Data/y_train_model2.csv')
X_validate = pd.read_csv('../Data/X_valid_model2.csv')
y_validate = pd.read_csv('../Data/y_valid_model2.csv')

X_train_all = pd.read_csv('../Data/X_train_all.csv')
y_train_all = pd.read_csv('../Data/y_train_all.csv')
X_test = pd.read_csv('../Data/X_test_all.csv')
y_test = pd.read_csv('../Data/y_test_all.csv')


# Strip the 'Unnamed: 0' column from every frame in place
# (presumably the row index written by an earlier save -- TODO confirm).
for frame in (
    X_train, y_train, X_validate, y_validate,
    X_train_all, y_train_all, X_test, y_test,
):
    frame.drop(columns=['Unnamed: 0'], inplace=True)


# LGBM raises an error on this column because its name contains quotation
# marks, so give it a quote-free name in every feature frame.
for features in (X_train, X_validate, X_train_all, X_test):
    features.rename(columns={'city_"oneals"': 'city_oneals'}, inplace=True)


In [3]:
# Build a copy of the test features without geo info: remove every column
# whose name matches one of the 'state_', 'city_' or 'county_' patterns.
X_test_noGeo = X_test.copy()
for geo_pattern in ('state_', 'city_', 'county_'):
    geo_columns = X_test_noGeo.filter(regex=geo_pattern).columns
    X_test_noGeo = X_test_noGeo[X_test_noGeo.columns.drop(geo_columns)]

In [4]:
# Build a copy of the full training features without geo info: remove every
# column whose name matches one of the 'state_', 'city_' or 'county_' patterns.
X_train_all_noGeo = X_train_all.copy()
for geo_pattern in ('state_', 'city_', 'county_'):
    geo_columns = X_train_all_noGeo.filter(regex=geo_pattern).columns
    X_train_all_noGeo = X_train_all_noGeo[X_train_all_noGeo.columns.drop(geo_columns)]

In [None]:
# Write the geo-free feature tables to disk.
for frame, destination in (
    (X_test_noGeo, '../Data/X_test_noGeo.csv'),
    (X_train_all_noGeo, '../Data/X_train_all_noGeo.csv'),
):
    frame.to_csv(destination)


## Save modified state-level data
We do this so that we can more easily test models on different data sets

In [None]:
# Load the GA data, remove the geo columns, and clean it.
# training data
X_train_GA = pd.read_csv('../Data/X_train_GA.csv')
y_train_GA = pd.read_csv('../Data/y_train_GA.csv')

# Remove every column whose name matches a geo pattern.
for geo_pattern in ('state_', 'city_', 'county_'):
    geo_columns = X_train_GA.filter(regex=geo_pattern).columns
    X_train_GA = X_train_GA[X_train_GA.columns.drop(geo_columns)]

X_train_GA.drop(columns=['Unnamed: 0'], inplace=True)
y_train_GA.drop(columns=['Unnamed: 0'], inplace=True)

# NOTE: the 'city_' pass above already removed every column matching 'city_',
# so this rename has nothing left to match at this point.
X_train_GA.rename(columns={'city_"oneals"': 'city_oneals'}, inplace=True)

# test data
X_test_GA = pd.read_csv('../Data/X_test_GA.csv')
y_test_GA = pd.read_csv('../Data/y_test_GA.csv')

# Same geo-column removal as for the training features.
for geo_pattern in ('state_', 'city_', 'county_'):
    geo_columns = X_test_GA.filter(regex=geo_pattern).columns
    X_test_GA = X_test_GA[X_test_GA.columns.drop(geo_columns)]

X_test_GA.drop(columns=['Unnamed: 0'], inplace=True)
y_test_GA.drop(columns=['Unnamed: 0'], inplace=True)

# NOTE: as above, no 'city_' column remains for this rename to match.
X_test_GA.rename(columns={'city_"oneals"': 'city_oneals'}, inplace=True)


In [5]:
# Load the CA data, remove the geo columns, and clean it.
# training data
X_train_CA = pd.read_csv('../Data/X_train_CA.csv')
y_train_CA = pd.read_csv('../Data/y_train_CA.csv')

# Remove every column whose name matches a geo pattern.
for geo_pattern in ('state_', 'city_', 'county_'):
    geo_columns = X_train_CA.filter(regex=geo_pattern).columns
    X_train_CA = X_train_CA[X_train_CA.columns.drop(geo_columns)]

X_train_CA.drop(columns=['Unnamed: 0'], inplace=True)
y_train_CA.drop(columns=['Unnamed: 0'], inplace=True)

# NOTE: the 'city_' pass above already removed every column matching 'city_',
# so this rename has nothing left to match at this point.
X_train_CA.rename(columns={'city_"oneals"': 'city_oneals'}, inplace=True)

# test data
X_test_CA = pd.read_csv('../Data/X_test_CA.csv')
y_test_CA = pd.read_csv('../Data/y_test_CA.csv')

# Same geo-column removal as for the training features.
for geo_pattern in ('state_', 'city_', 'county_'):
    geo_columns = X_test_CA.filter(regex=geo_pattern).columns
    X_test_CA = X_test_CA[X_test_CA.columns.drop(geo_columns)]

X_test_CA.drop(columns=['Unnamed: 0'], inplace=True)
y_test_CA.drop(columns=['Unnamed: 0'], inplace=True)

# NOTE: as above, no 'city_' column remains for this rename to match.
X_test_CA.rename(columns={'city_"oneals"': 'city_oneals'}, inplace=True)


In [None]:
# Write the geo-free GA and CA training features to disk.
for frame, destination in (
    (X_train_GA, '../Data/X_train_GA_noGeo.csv'),
    (X_train_CA, '../Data/X_train_CA_noGeo.csv'),
):
    frame.to_csv(destination)

## Save Data with modified column names to use in LGBM
Some data files have unnecessary columns or columns with errors.  
We removed those columns in the code above. We now save the data files to disk so that we can use them in subsequent modeling steps.

In [None]:
# Write the feature tables with the LGBM-safe column name to disk.
for frame, destination in (
    (X_train, '../Data/X_train_for_LGBM.csv'),
    (X_train_all, '../Data/X_train_all_for_LGBM.csv'),
    (X_test, '../Data/X_test_for_LGBM.csv'),
    (X_validate, '../Data/X_validate_for_LGBM.csv'),
):
    frame.to_csv(destination)