In [16]:
# Libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import re

In [17]:
def read_attributes_info(path='data/attributes_info.csv'):
    """Read the attributes description table and parse its missing-value codes.

    The table is used to replace the codes that stand for missing values in
    every column of the data frames.

    Parameters
    ----------
    path : str, optional
        Location of the CSV file. Its first column is used as the index and it
        must contain a 'Missing values code' column.

    Returns
    -------
    pandas.DataFrame
        The table with 'Missing values code' converted so that a cell like
        "-1" becomes the int -1 and a cell like "[-1, 0]" becomes the list
        [-1, 0]. Non-string cells (e.g. NaN for empty cells) are left as is.

    Raises
    ------
    ValueError
        If a code cell holds text that is neither an integer nor a
        bracketed, comma-separated list of integers.
    """
    # Auxiliary function for casting column values
    def cast_values(value):
        # Anything that is not text is passed through untouched.
        if not isinstance(value, str):
            return value

        text = value.strip()
        if text.startswith('['):
            # Drop the surrounding brackets; skipping blank items lets "[]"
            # parse to an empty list instead of failing on int('').
            return [int(item) for item in text[1:-1].split(',') if item.strip()]
        return int(text)

    attributes_inf = pd.read_csv(path, index_col=0)

    # Casting values
    attributes_inf['Missing values code'] = (
        attributes_inf['Missing values code'].apply(cast_values)
    )

    return attributes_inf

In [23]:
# Main cleaning function

def clean_data(dataframe, attributes_info=None):
    """
    Main cleaning function for frames such as azdias, customers or the MAILOUT files.

    Steps performed:
      1. Drop the first column (expected to be 'Unnamed: 0').
      2. In the columns at positions 18 and 19 (counted after step 1), turn the
         placeholders 'X' and 'XX' into NaN and cast each column to float.
      3. Replace every attribute's missing-value codes with NaN, using the
         'Missing values code' column of the attributes table.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Raw frame with at least 21 columns and unique column names. It is NOT
        modified; a new frame is returned, so calling the function again on
        the same raw frame gives the same result.
    attributes_info : pandas.DataFrame, optional
        Table indexed by attribute name with a 'Missing values code' column
        holding an int, a list of ints, or NaN. When omitted it is loaded with
        read_attributes_info(). Passing it avoids re-reading the file when
        several frames are cleaned.

    Returns
    -------
    pandas.DataFrame
        The cleaned frame.
    """
    ########### Initial Steps ####################################
    # Remove first column ('Unnamed: 0'). drop() without inplace builds a new
    # frame, which keeps the caller's object intact.
    dataframe = dataframe.drop(columns=dataframe.columns[0])

    # Fixing the mixed type columns: placeholder -> NaN, then cast to float.
    # Each column is converted from its OWN values.
    for position, placeholder in ((18, 'X'), (19, 'XX')):
        column = dataframe.columns[position]
        dataframe[column] = (
            dataframe[column].replace({placeholder: np.nan}).astype(float)
        )

    ########### Missing values  ##############################
    missing_1 = dataframe.isnull().sum().sum()
    print('Initial amount of missing values:', missing_1)

    if attributes_info is None:
        print('\nReading the description of attributes table....')
        attributes_info = read_attributes_info()

    # Replacing code for missing values using attributes_info
    for att in attributes_info.index:
        # Some attributes do not appear in the actual dataframes: skip them
        # explicitly instead of hiding every possible error.
        if att not in dataframe.columns:
            continue

        code = attributes_info.loc[att, 'Missing values code']
        if isinstance(code, list):
            codes = code
        elif (isinstance(code, (int, float, np.integer, np.floating))
              and not isinstance(code, bool)
              and not pd.isna(code)):
            # Also covers numpy scalars, which pandas returns when the code
            # column was stored with a numeric dtype.
            codes = [code]
        else:
            # NaN or anything unexpected: no code to replace for this attribute.
            continue

        if codes:
            dataframe[att] = dataframe[att].replace(codes, np.nan)

    missing_2 = dataframe.isnull().sum().sum()
    print('\nMissing values after including missing codes', missing_2)
    print('Additional missing values:', missing_2 - missing_1)

    return dataframe

### Reading file

In [24]:
# low_memory=False makes pandas infer each column's dtype from the whole file
# rather than chunk by chunk, so a column mixing numbers and text gets one
# consistent dtype instead of raising a mixed-types warning on load.
mailout_train = pd.read_csv('data/Udacity_MAILOUT_052018_TRAIN.csv', low_memory=False)

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


### Try cleaning it

In [26]:
# Clean a copy of the raw frame so this cell can be re-run safely: the first
# step of clean_data removes the leading column, and if that happens in place
# on `mailout_train` every extra execution would strip one more column.
clean = clean_data(mailout_train.copy())

clean

Initial amount of missing values: 2361705

Reading the description of attributes table....

Missing values after including missing codes 2361705
Additional missing values: 0


Unnamed: 0,AGER_TYP,AKT_DAT_KL,ALTER_HH,ALTER_KIND1,ALTER_KIND2,ALTER_KIND3,ALTER_KIND4,ALTERSKATEGORIE_FEIN,ANZ_HAUSHALTE_AKTIV,ANZ_HH_TITEL,...,VK_DHT4A,VK_DISTANZ,VK_ZG11,W_KEIT_KIND_HH,WOHNDAUER_2008,WOHNLAGE,ZABEOTYP,RESPONSE,ANREDE_KZ,ALTERSKATEGORIE_GROB
0,2.0,1.0,8.0,,,,,8.0,15.0,0.0,...,5.0,2.0,1.0,6.0,9.0,3.0,3,0,2,4
1,1.0,4.0,13.0,,,,,13.0,1.0,0.0,...,1.0,2.0,1.0,4.0,9.0,7.0,1,0,2,3
2,1.0,1.0,9.0,,,,,7.0,0.0,,...,6.0,4.0,2.0,,9.0,2.0,3,0,1,4
3,2.0,1.0,6.0,,,,,6.0,4.0,0.0,...,8.0,11.0,11.0,6.0,9.0,1.0,3,0,2,4
4,2.0,1.0,9.0,,,,,9.0,53.0,0.0,...,2.0,2.0,1.0,6.0,9.0,3.0,3,0,1,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
42957,2.0,1.0,,,,,,10.0,1.0,0.0,...,1.0,1.0,1.0,4.0,8.0,7.0,1,0,1,4
42958,,1.0,,,,,,14.0,1.0,0.0,...,1.0,1.0,1.0,5.0,9.0,7.0,1,0,1,3
42959,1.0,1.0,16.0,,,,,10.0,2.0,0.0,...,1.0,2.0,1.0,2.0,9.0,7.0,1,0,1,4
42960,2.0,1.0,18.0,,,,,13.0,3.0,0.0,...,2.0,3.0,4.0,2.0,9.0,2.0,3,0,2,4
