# 0.0 Imports and Functions

In [1]:
import pandas as pd
import numpy as np

In [2]:
def unknown_replace_nans(df, df_att):
    
    """
    With this function we can replace NaN values on columns that take -1 as unknown values
    
    Input:
        df - DataFrame that will have NAs filled
        df_att - Description DataFrame to check which columns have the unkown meaning attributes
        
    Output:
        df - DataFrame with columns with fillna done
        
    """
    # columns that has attributes unknown = -1 
    unknown_att_cols = df_att[df_att['Meaning'] == 'unknown']['Attribute'].tolist()
    
    #columns we can replace NaN for unknown(-1)
    unkown_cols = np.intersect1d(unknown_att_cols, df.columns.tolist())
    
    for col in unkown_cols:
        df[col] = df[col].fillna(-1)
    
    return df

# 1.0 Reading data

## 1.1 Demographics data

In [2]:
azdias = pd.read_csv('data/Udacity_AZDIAS_052018.csv').drop('Unnamed: 0', axis=1)
customers = pd.read_csv('data/Udacity_CUSTOMERS_052018.csv').drop('Unnamed: 0', axis=1)

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


In [4]:
azdias.head()

Unnamed: 0,LNR,AGER_TYP,AKT_DAT_KL,ALTER_HH,ALTER_KIND1,ALTER_KIND2,ALTER_KIND3,ALTER_KIND4,ALTERSKATEGORIE_FEIN,ANZ_HAUSHALTE_AKTIV,...,VHN,VK_DHT4A,VK_DISTANZ,VK_ZG11,W_KEIT_KIND_HH,WOHNDAUER_2008,WOHNLAGE,ZABEOTYP,ANREDE_KZ,ALTERSKATEGORIE_GROB
0,910215,-1,,,,,,,,,...,,,,,,,,3,1,2
1,910220,-1,9.0,0.0,,,,,21.0,11.0,...,4.0,8.0,11.0,10.0,3.0,9.0,4.0,5,2,1
2,910225,-1,9.0,17.0,,,,,17.0,10.0,...,2.0,9.0,9.0,6.0,3.0,9.0,2.0,5,2,3
3,910226,2,1.0,13.0,,,,,13.0,1.0,...,0.0,7.0,10.0,11.0,,9.0,7.0,3,2,4
4,910241,-1,1.0,20.0,,,,,14.0,3.0,...,2.0,3.0,5.0,4.0,2.0,9.0,3.0,4,1,3


## 1.2 Description data

In [5]:
df_att = pd.read_excel('data/attributes_description/DIAS Attributes - Values 2017.xlsx', header=1).drop('Unnamed: 0', axis=1)
df_info = pd.read_excel('data/attributes_description/DIAS Information Levels - Attributes 2017.xlsx', header=1).drop('Unnamed: 0', axis=1)

In [6]:
df_att.head()

Unnamed: 0,Attribute,Description,Value,Meaning
0,AGER_TYP,best-ager typology,-1,unknown
1,,,0,no classification possible
2,,,1,passive elderly
3,,,2,cultural elderly
4,,,3,experience-driven elderly


# 2.0 Data Wrangling

## 2.1 Handling Dtypes

In [7]:
#checking columns that got DtypeWarning while reading
columns_warning = azdias.iloc[:, 18:20].columns.tolist()

for c in columns_warning:
    print(f"{c}: {azdias[c].unique()}")

CAMEO_DEUG_2015: [nan 8.0 4.0 2.0 6.0 1.0 9.0 5.0 7.0 3.0 '4' '3' '7' '2' '8' '9' '6' '5'
 '1' 'X']
CAMEO_INTL_2015: [nan 51.0 24.0 12.0 43.0 54.0 22.0 14.0 13.0 15.0 33.0 41.0 34.0 55.0 25.0
 23.0 31.0 52.0 35.0 45.0 44.0 32.0 '22' '24' '41' '12' '54' '51' '44'
 '35' '23' '25' '14' '34' '52' '55' '31' '32' '15' '13' '43' '33' '45'
 'XX']


**Before handling dtypes, le'ts check the attributes datasets to understand how we can handle this**

In [8]:
df_att.head()

Unnamed: 0,Attribute,Description,Value,Meaning
0,AGER_TYP,best-ager typology,-1,unknown
1,,,0,no classification possible
2,,,1,passive elderly
3,,,2,cultural elderly
4,,,3,experience-driven elderly


In [9]:
df_info.head()

Unnamed: 0,Information level,Attribute,Description,Additional notes
0,,AGER_TYP,best-ager typology,in cooperation with Kantar TNS; the informatio...
1,Person,ALTERSKATEGORIE_GROB,age through prename analysis,modelled on millions of first name-age-referen...
2,,ANREDE_KZ,gender,
3,,CJT_GESAMTTYP,Customer-Journey-Typology relating to the pref...,"relating to the preferred information, marketi..."
4,,FINANZ_MINIMALIST,financial typology: low financial interest,Gfk-Typology based on a representative househo...


In [10]:
#Using foward fillna to fill the rows
df_att.fillna(method="ffill", inplace=True)
df_att.head()

Unnamed: 0,Attribute,Description,Value,Meaning
0,AGER_TYP,best-ager typology,-1,unknown
1,AGER_TYP,best-ager typology,0,no classification possible
2,AGER_TYP,best-ager typology,1,passive elderly
3,AGER_TYP,best-ager typology,2,cultural elderly
4,AGER_TYP,best-ager typology,3,experience-driven elderly


In [11]:
df_info.fillna(method="ffill", inplace=True)
df_info.head()

Unnamed: 0,Information level,Attribute,Description,Additional notes
0,,AGER_TYP,best-ager typology,in cooperation with Kantar TNS; the informatio...
1,Person,ALTERSKATEGORIE_GROB,age through prename analysis,modelled on millions of first name-age-referen...
2,Person,ANREDE_KZ,gender,modelled on millions of first name-age-referen...
3,Person,CJT_GESAMTTYP,Customer-Journey-Typology relating to the pref...,"relating to the preferred information, marketi..."
4,Person,FINANZ_MINIMALIST,financial typology: low financial interest,Gfk-Typology based on a representative househo...


In [12]:
#we still have a NaN on the df_info, let's use back fill now
df_info.fillna(method="bfill", inplace=True)
df_info.head()

Unnamed: 0,Information level,Attribute,Description,Additional notes
0,Person,AGER_TYP,best-ager typology,in cooperation with Kantar TNS; the informatio...
1,Person,ALTERSKATEGORIE_GROB,age through prename analysis,modelled on millions of first name-age-referen...
2,Person,ANREDE_KZ,gender,modelled on millions of first name-age-referen...
3,Person,CJT_GESAMTTYP,Customer-Journey-Typology relating to the pref...,"relating to the preferred information, marketi..."
4,Person,FINANZ_MINIMALIST,financial typology: low financial interest,Gfk-Typology based on a representative househo...


In [13]:
#columns with dtype warnings
columns_warning

['CAMEO_DEUG_2015', 'CAMEO_INTL_2015']

In [14]:
#checking description for the columns with dtype warnings
df_att[df_att['Attribute'] == 'CAMEO_DEUG_2015']

Unnamed: 0,Attribute,Description,Value,Meaning
51,CAMEO_DEUG_2015,CAMEO classification 2015 - Uppergroup,-1,unknown
52,CAMEO_DEUG_2015,CAMEO classification 2015 - Uppergroup,1,upper class
53,CAMEO_DEUG_2015,CAMEO classification 2015 - Uppergroup,2,upper middleclass
54,CAMEO_DEUG_2015,CAMEO classification 2015 - Uppergroup,3,established middleclasse
55,CAMEO_DEUG_2015,CAMEO classification 2015 - Uppergroup,4,consumption-oriented middleclass
56,CAMEO_DEUG_2015,CAMEO classification 2015 - Uppergroup,5,active middleclass
57,CAMEO_DEUG_2015,CAMEO classification 2015 - Uppergroup,6,low-consumption middleclass
58,CAMEO_DEUG_2015,CAMEO classification 2015 - Uppergroup,7,lower middleclass
59,CAMEO_DEUG_2015,CAMEO classification 2015 - Uppergroup,8,working class
60,CAMEO_DEUG_2015,CAMEO classification 2015 - Uppergroup,9,urban working class


We can see that we can use -1 for unkown values

In [15]:
df_att[df_att['Attribute'] == 'CAMEO_INTL_2015']

Unnamed: 0,Attribute,Description,Value,Meaning


In [16]:
df_info[df_info['Attribute'] == 'CAMEO_INTL_2015']

Unnamed: 0,Information level,Attribute,Description,Additional notes


In [17]:
#No column matched "CAMEO_INTL_2015" found on the demographics, let's see if it has a different name on the description
df_att[df_att['Attribute'].str.contains("INTL")]

Unnamed: 0,Attribute,Description,Value,Meaning
105,CAMEO_DEUINTL_2015,CAMEO classification 2015 - international typo...,-1,unknown
106,CAMEO_DEUINTL_2015,(each German CAMEO code belongs to one interna...,11,Wealthy Households-Pre-Family Couples & Singles
107,CAMEO_DEUINTL_2015,(each German CAMEO code belongs to one interna...,12,Wealthy Households-Young Couples With Children
108,CAMEO_DEUINTL_2015,(each German CAMEO code belongs to one interna...,13,Wealthy Households-Families With School Age Ch...
109,CAMEO_DEUINTL_2015,(each German CAMEO code belongs to one interna...,14,Wealthy Households-Older Families & Mature Co...
110,CAMEO_DEUINTL_2015,(each German CAMEO code belongs to one interna...,15,Wealthy Households-Elders In Retirement
111,CAMEO_DEUINTL_2015,(each German CAMEO code belongs to one interna...,21,Prosperous Households-Pre-Family Couples & Sin...
112,CAMEO_DEUINTL_2015,(each German CAMEO code belongs to one interna...,22,Prosperous Households-Young Couples With Children
113,CAMEO_DEUINTL_2015,(each German CAMEO code belongs to one interna...,23,Prosperous Households-Families With School Age...
114,CAMEO_DEUINTL_2015,(each German CAMEO code belongs to one interna...,24,Prosperous Households-Older Families & Mature ...


In [18]:
columns_warning

['CAMEO_DEUG_2015', 'CAMEO_INTL_2015']

In [19]:
#replacing X in the "CAMEO_DEUG_2015" column
azdias['CAMEO_DEUG_2015'] = azdias['CAMEO_DEUG_2015'].replace('X', -1).astype(float)
customers['CAMEO_DEUG_2015'] = customers['CAMEO_DEUG_2015'].replace('X', -1).astype(float) 

#replacing XX in the "CAMEO_DEUG_2015" column
azdias['CAMEO_INTL_2015'] = azdias['CAMEO_INTL_2015'].replace('XX', -1).astype(float)
customers['CAMEO_INTL_2015'] = customers['CAMEO_INTL_2015'].replace('XX', -1).astype(float) 

## 2.2 Chaning NaN for unknown

In [20]:
azdias = unknown_replace_nans(azdias, df_att)
customers = unknown_replace_nans(customers, df_att)

## 2.3 Handling Missing values

In [21]:
#checking missing values with 40%+ from azdias
missing_azdias = azdias.isnull().sum() / azdias.shape[0]
missing_azdias = missing_azdias[missing_azdias > .4].sort_values(ascending=False).index.tolist()
missing_azdias

['ALTER_KIND4',
 'ALTER_KIND3',
 'ALTER_KIND2',
 'ALTER_KIND1',
 'EXTSEL992',
 'KK_KUNDENTYP']

In [22]:
#Let's drop these 40%+ columns
azdias = azdias.drop(columns=missing_azdias)

In [23]:
#checking missing values with 40%+ from customers
missing_customers = customers.isnull().sum() / customers.shape[0]
missing_customers = missing_customers[missing_customers > .4].sort_values(ascending=False).index.tolist()
missing_customers

['ALTER_KIND4',
 'ALTER_KIND3',
 'ALTER_KIND2',
 'ALTER_KIND1',
 'KK_KUNDENTYP',
 'EXTSEL992']

In [24]:
#Let's drop these 40%+ columns
customers = customers.drop(columns=missing_customers)

In [25]:
missing_azdias = azdias.isnull().sum() / azdias.shape[0]
missing_azdias[missing_azdias > .1].sort_values(ascending=False)

ALTERSKATEGORIE_FEIN           0.295041
D19_LOTTO                      0.288495
D19_VERSI_ONLINE_QUOTE_12      0.288495
D19_VERSAND_ONLINE_QUOTE_12    0.288495
D19_TELKO_ONLINE_QUOTE_12      0.288495
D19_SOZIALES                   0.288495
D19_BANKEN_ONLINE_QUOTE_12     0.288495
D19_GESAMT_ONLINE_QUOTE_12     0.288495
D19_KONSUMTYP                  0.288495
D19_LETZTER_KAUF_BRANCHE       0.288495
MOBI_REGIO                     0.149597
VHN                            0.135989
PLZ8_BAUMAX                    0.130736
HH_DELTA_FLAG                  0.120735
KBA13_ANTG2                    0.118714
KBA13_KMH_210                  0.118714
KBA13_HHZ                      0.118714
KBA13_GBZ                      0.118714
KBA13_CCM_1401_2500            0.118714
KBA13_ANZAHL_PKW               0.118714
KBA13_ANTG4                    0.118714
KBA13_ANTG3                    0.118714
KBA13_BAUMAX                   0.118714
KBA13_ANTG1                    0.118714
CAMEO_DEU_2015                 0.111060


In [26]:
missing_customers = customers.isnull().sum() / customers.shape[0]
missing_customers[missing_customers > .1].sort_values(ascending=False)

MOBI_REGIO                     0.292092
VHN                            0.283117
HH_DELTA_FLAG                  0.280415
PLZ8_BAUMAX                    0.275312
ANZ_HH_TITEL                   0.271899
ALTERSKATEGORIE_FEIN           0.270501
KBA13_BAUMAX                   0.267574
KBA13_ANZAHL_PKW               0.267574
KBA13_ANTG4                    0.267574
KBA13_ANTG3                    0.267574
KBA13_ANTG2                    0.267574
KBA13_HHZ                      0.267574
KBA13_KMH_210                  0.267574
KBA13_GBZ                      0.267574
KBA13_ANTG1                    0.267574
KBA13_CCM_1401_2500            0.267574
VERDICHTUNGSRAUM               0.263373
ARBEIT                         0.263373
GEMEINDETYP                    0.263373
STRUKTURTYP                    0.263373
UMFELD_ALT                     0.263227
UMFELD_JUNG                    0.263227
CAMEO_INTL_2015                0.263123
CAMEO_DEU_2015                 0.263123
KONSUMZELLE                    0.260509


### 2.3.1 Handling columns with 40%- missing values

In [27]:
#Let's input 0 where it means unknown
cols_input_zero = ['ALTERSKATEGORIE_FEIN',
                    'D19_LOTTO',
                    'HH_DELTA_FLAG',
                    'D19_TELKO_ONLINE_QUOTE_12',
                    'D19_GESAMT_ONLINE_QUOTE_12',
                    'D19_BANKEN_ONLINE_QUOTE_12',
                    'D19_SOZIALES',
                    'D19_VERSAND_ONLINE_QUOTE_12',
                    'D19_VERSI_ONLINE_QUOTE_12']

for column in cols_input_zero:
    azdias[column] = azdias[column].fillna(0)
    customers[column] = customers[column].fillna(0)


In [28]:
#columns we values correspond to unknown
azdias['MOBI_REGIO'] = azdias['MOBI_REGIO'].fillna(6)
customers['MOBI_REGIO'] = customers['MOBI_REGIO'].fillna(6)

azdias['D19_KONSUMTYP'] = azdias['D19_KONSUMTYP'].fillna(9)
customers['D19_KONSUMTYP'] = customers['D19_KONSUMTYP'].fillna(9)

azdias['D19_LETZTER_KAUF_BRANCHE'] = azdias['D19_LETZTER_KAUF_BRANCHE'].fillna('D19_UNBEKANNT')
customers['D19_LETZTER_KAUF_BRANCHE'] = customers['D19_LETZTER_KAUF_BRANCHE'].fillna('D19_UNBEKANNT')

In [29]:
missing_azdias = azdias.isnull().sum() / azdias.shape[0]
missing_azdias[missing_azdias > .1].sort_values(ascending=False)

VHN                           0.135989
PLZ8_BAUMAX                   0.130736
KBA13_ANTG1                   0.118714
KBA13_KMH_210                 0.118714
KBA13_HHZ                     0.118714
KBA13_GBZ                     0.118714
KBA13_CCM_1401_2500           0.118714
KBA13_BAUMAX                  0.118714
KBA13_ANTG4                   0.118714
KBA13_ANTG3                   0.118714
KBA13_ANTG2                   0.118714
KBA13_ANZAHL_PKW              0.118714
CAMEO_INTL_2015               0.111060
CAMEO_DEU_2015                0.111060
UMFELD_JUNG                   0.109721
UMFELD_ALT                    0.109721
GEMEINDETYP                   0.109147
STRUKTURTYP                   0.109147
VERDICHTUNGSRAUM              0.109147
ARBEIT                        0.109082
ANZ_HH_TITEL                  0.108848
KONSUMZELLE                   0.104525
GEBAEUDETYP_RASTER            0.104525
FIRMENDICHTE                  0.104525
MIN_GEBAEUDEJAHR              0.104517
MOBI_RASTER              

In [30]:
#Now that we have only 14% of rows with NaNs, we will drop those since we couldn't identify unknown values
#and it only represents a small portion of our dataset

#first create a copy of the df
azdias_clean = azdias.copy()
customers_clean = customers.copy()

#dropa NAs
azdias_clean.dropna(inplace=True)
customers_clean.dropna(inplace=True)

In [31]:
#original dataframe rows number
azdias.shape[0]

891221

In [32]:
#cleaned dataframe rows number
azdias_clean.shape[0]

737625

In [33]:
#checking azdias clean dataframe missing values
missing_azdias_clean = azdias_clean.isnull().sum() / azdias_clean.shape[0]
missing_azdias_clean[missing_azdias_clean > .1].sort_values(ascending=False)

Series([], dtype: float64)

In [34]:
#checking customers clean dataframe missing values
missing_customers_clean = customers_clean.isnull().sum() / customers_clean.shape[0]
missing_customers_clean[missing_customers_clean > .1].sort_values(ascending=False)

Series([], dtype: float64)

## 2.4 Feature Selection

In [35]:
azdias_clean.shape[1]

360

We have 360 attributes in total. For our model purpose that is too much. This can get in our way when we try to predict our most important features.

We will perform some cleaning regarding feature selection so we can have the most important attributes for our model.

In [36]:
# First let's get rid of categorical columns that are not relevant for our model
azdias_dropable_columns = azdias_clean.select_dtypes(include=['object']).columns

In [37]:
customers_dropable_columns = customers_clean.select_dtypes(include=['object']).columns

In [38]:
azdias_clean = azdias_clean.drop(columns=azdias_dropable_columns, axis=1)
customers_clean = customers_clean.drop(columns=customers_dropable_columns, axis=1)

In [70]:
customers_clean.shape

(129753, 356)

In [69]:
#Let's drop the LNR column as it works as an index
azdias_clean.drop(columns=['LNR'], axis=1, inplace=True)
customers_clean.drop(columns=['LNR'], axis=1, inplace=True)

## 2.5 Train and Test dataset

Now let's perform the same cleasing to the train and test dataset

In [45]:
df_train = pd.read_csv('data/Udacity_MAILOUT_052018_TRAIN.csv', sep=';')
df_test = pd.read_csv('data/Udacity_MAILOUT_052018_TEST.csv', sep=';')

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


In [46]:
df_train.head()

Unnamed: 0,LNR,AGER_TYP,AKT_DAT_KL,ALTER_HH,ALTER_KIND1,ALTER_KIND2,ALTER_KIND3,ALTER_KIND4,ALTERSKATEGORIE_FEIN,ANZ_HAUSHALTE_AKTIV,...,VK_DHT4A,VK_DISTANZ,VK_ZG11,W_KEIT_KIND_HH,WOHNDAUER_2008,WOHNLAGE,ZABEOTYP,RESPONSE,ANREDE_KZ,ALTERSKATEGORIE_GROB
0,1763,2,1.0,8.0,,,,,8.0,15.0,...,5.0,2.0,1.0,6.0,9.0,3.0,3,0,2,4
1,1771,1,4.0,13.0,,,,,13.0,1.0,...,1.0,2.0,1.0,4.0,9.0,7.0,1,0,2,3
2,1776,1,1.0,9.0,,,,,7.0,0.0,...,6.0,4.0,2.0,,9.0,2.0,3,0,1,4
3,1460,2,1.0,6.0,,,,,6.0,4.0,...,8.0,11.0,11.0,6.0,9.0,1.0,3,0,2,4
4,1783,2,1.0,9.0,,,,,9.0,53.0,...,2.0,2.0,1.0,6.0,9.0,3.0,3,0,1,3


In [48]:
#replacing X in the "CAMEO_DEUG_2015" column
df_train['CAMEO_DEUG_2015'] = df_train['CAMEO_DEUG_2015'].replace('X', -1).astype(float)
df_test['CAMEO_DEUG_2015'] = df_test['CAMEO_DEUG_2015'].replace('X', -1).astype(float) 

#replacing XX in the "CAMEO_DEUG_2015" column
df_train['CAMEO_INTL_2015'] = df_train['CAMEO_INTL_2015'].replace('XX', -1).astype(float)
df_test['CAMEO_INTL_2015'] = df_test['CAMEO_INTL_2015'].replace('XX', -1).astype(float) 

In [51]:
#Chaning NaN for unknown
df_train = unknown_replace_nans(df_train, df_att)
df_test = unknown_replace_nans(df_test, df_att)

In [56]:
#checking missing values with 40%+ from azdias
missing_df_train = df_train.isnull().sum() / df_train.shape[0]
missing_df_train = missing_df_train[missing_df_train > .4].sort_values(ascending=False).index.tolist()

#Let's drop these 40%+ columns
df_train = df_train.drop(columns=missing_df_train)

In [57]:
#checking missing values with 40%+ from azdias
missing_df_test = df_test.isnull().sum() / df_test.shape[0]
missing_df_test = missing_df_test[missing_df_test > .4].sort_values(ascending=False).index.tolist()

#Let's drop these 40%+ columns
df_test = df_test.drop(columns=missing_df_test)

In [59]:
#Let's input 0 where it means unknown
cols_input_zero = ['ALTERSKATEGORIE_FEIN',
                    'D19_LOTTO',
                    'HH_DELTA_FLAG',
                    'D19_TELKO_ONLINE_QUOTE_12',
                    'D19_GESAMT_ONLINE_QUOTE_12',
                    'D19_BANKEN_ONLINE_QUOTE_12',
                    'D19_SOZIALES',
                    'D19_VERSAND_ONLINE_QUOTE_12',
                    'D19_VERSI_ONLINE_QUOTE_12']

for column in cols_input_zero:
    df_train[column] = df_train[column].fillna(0)
    df_test[column] = df_test[column].fillna(0)


In [60]:
#columns we values correspond to unknown
df_train['MOBI_REGIO'] = df_train['MOBI_REGIO'].fillna(6)
df_test['MOBI_REGIO'] = df_test['MOBI_REGIO'].fillna(6)

df_train['D19_KONSUMTYP'] = df_train['D19_KONSUMTYP'].fillna(9)
df_test['D19_KONSUMTYP'] = df_test['D19_KONSUMTYP'].fillna(9)

df_train['D19_LETZTER_KAUF_BRANCHE'] = df_train['D19_LETZTER_KAUF_BRANCHE'].fillna('D19_UNBEKANNT')
df_test['D19_LETZTER_KAUF_BRANCHE'] = df_test['D19_LETZTER_KAUF_BRANCHE'].fillna('D19_UNBEKANNT')

In [61]:
#Now that we have only 14% of rows with NaNs, we will drop those since we couldn't identify unknown values
#and it only represents a small portion of our dataset

#first create a copy of the df
df_train_clean = df_train.copy()
df_test_clean = df_test.copy()

#dropa NAs
df_train_clean.dropna(inplace=True)
df_test_clean.dropna(inplace=True)

In [63]:
# First let's get rid of categorical columns that are not relevant for our model
df_train_dropable_columns = df_train_clean.select_dtypes(include=['object']).columns
df_test_dropable_columns = df_test_clean.select_dtypes(include=['object']).columns

df_train_clean = df_train_clean.drop(columns=df_train_dropable_columns, axis=1)
df_test_clean = df_test_clean.drop(columns=df_test_dropable_columns, axis=1)

# 3.0 Export Cleaned Data

In [64]:
azdias_clean.to_csv('data/clean_AZDIAS.csv')
customers_clean.to_csv('data/clean_CUSTOMERS.csv')
df_train_clean.to_csv('data/clean_TRAIN.csv')
df_test_clean.to_csv('data/clean_TEST.csv')