In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
# Show every column when a DataFrame is displayed (no column truncation).
pd.set_option('display.max_columns', None)

In [2]:
# Read the two semicolon-delimited CSV files into DataFrames.
# NOTE(review): the stray output line after this cell looks like the tail of a
# warning raised during these reads — presumably the mixed-dtype warning the
# next heading refers to; confirm.
azdias = pd.read_csv('data/Udacity_AZDIAS_052018.csv', sep=';')
customers = pd.read_csv('data/Udacity_CUSTOMERS_052018.csv', sep=';')

  interactivity=interactivity, compiler=compiler, result=result)


- #### Pandas raises a warning while reading the data, so we need to check the datatype of each column and find the ones with mixed types.

In [12]:
# List the columns whose dtype is `object`.
azdias.dtypes.loc[lambda dtypes: dtypes == object]

CAMEO_DEU_2015              object
CAMEO_DEUG_2015             object
CAMEO_INTL_2015             object
D19_LETZTER_KAUF_BRANCHE    object
EINGEFUEGT_AM               object
OST_WEST_KZ                 object
dtype: object

In [13]:
# Assign every CAMEO_DEU_2015 category (1A, 1B, 2A, ...) an integer code.
# The categories are the group keys of a groupby on that column. 'XX' is
# considered a missing value and gets code 0; every other category gets its
# 1-based position among the group keys.
CAMEO_DEU_2015_value = azdias.groupby(['CAMEO_DEU_2015']).count()['LNR'].index
CAMEO_DEU_2015_value_map = {
    category: 0 if category == 'XX' else position + 1
    for position, category in enumerate(CAMEO_DEU_2015_value)
}

In [14]:
# Replace each CAMEO_DEU_2015 category with its integer code; values that are
# not keys of the map (including NaN) become NaN and are then filled with 0.
# The result is assigned back to the column instead of calling
# `fillna(..., inplace=True)` on a column selection: that chained inplace call
# operates on a temporary under pandas Copy-on-Write and would leave the NaNs
# in `azdias` untouched.
# NOTE: this cell is not idempotent — on a second run the already-encoded
# numbers are not keys of the map, so the whole column would become 0.
azdias['CAMEO_DEU_2015'] = (
    azdias['CAMEO_DEU_2015']
    .map(CAMEO_DEU_2015_value_map)
    .fillna(0)
)

In [15]:
# Consider 'X' a missing value: replace it with 0, convert the column to float,
# then fill the remaining NaNs with 0.
# The filled Series is assigned back to the column rather than using
# `fillna(..., inplace=True)` on a column selection, which acts on a temporary
# under pandas Copy-on-Write and would not update `azdias`.
azdias['CAMEO_DEUG_2015'] = np.where(azdias['CAMEO_DEUG_2015']=='X', 0, azdias['CAMEO_DEUG_2015'])
azdias['CAMEO_DEUG_2015'] = azdias['CAMEO_DEUG_2015'].astype('float').fillna(0)

In [16]:
# Consider 'XX' a missing value: replace it with 0, convert the column to float,
# then fill the remaining NaNs with 0.
# The filled Series is assigned back to the column rather than using
# `fillna(..., inplace=True)` on a column selection, which acts on a temporary
# under pandas Copy-on-Write and would not update `azdias`.
azdias['CAMEO_INTL_2015'] = np.where(azdias['CAMEO_INTL_2015']=='XX', 0, azdias['CAMEO_INTL_2015'])
azdias['CAMEO_INTL_2015'] = azdias['CAMEO_INTL_2015'].astype('float').fillna(0)

In [17]:
# The branch information is already available in other, encoded columns,
# so this free-text column is removed.
azdias.drop(columns=['D19_LETZTER_KAUF_BRANCHE'], inplace=True)

In [19]:
# Keep only the year of the EINGEFUEGT_AM timestamp, then fill missing years
# with the most frequent year.
# The filled Series is assigned back to the column rather than using
# `fillna(..., inplace=True)` on a column selection, which acts on a temporary
# under pandas Copy-on-Write and would not update `azdias`.
azdias['EINGEFUEGT_AM'] = pd.to_datetime(azdias['EINGEFUEGT_AM']).dt.year
azdias['EINGEFUEGT_AM'] = azdias['EINGEFUEGT_AM'].fillna(azdias['EINGEFUEGT_AM'].mode()[0])

In [20]:
# Encode OST_WEST_KZ as integers: 'O' -> 1, 'W' -> 2, anything else
# (including NaN) -> 0.
azdias['OST_WEST_KZ'] = np.select(
    [azdias['OST_WEST_KZ'] == 'O', azdias['OST_WEST_KZ'] == 'W'],
    [1, 2],
    default=0,
)

In [21]:
# Re-list the object-dtype columns; an empty result means none are left.
azdias.dtypes.loc[lambda dtypes: dtypes == object]

Series([], dtype: object)

- #### Check the missing percentage of each column and group the columns that share the same missing rate.

In [25]:
# Fraction of missing values per column: NaN count divided by the row count.
azdias_missing_percent = azdias.isna().sum(axis=0) / len(azdias)

In [26]:
# Turn the Series into a two-column frame: 'index' holds the column names,
# 'missing_percent' holds the missing fraction.
azdias_missing_percent = azdias_missing_percent.reset_index(name='missing_percent')

In [27]:
# Collect the column names that share each distinct missing rate into a list,
# ordered from the highest missing rate to the lowest.
azdias_missing_percent = (
    azdias_missing_percent
    .groupby('missing_percent')['index']
    .apply(list)
    .reset_index()
    .sort_values(by='missing_percent', ascending=False)
    .reset_index(drop=True)
)

In [28]:
# Number of columns in each missing-rate group.
azdias_missing_percent['attribute_cnt'] = azdias_missing_percent['index'].apply(len)

In [29]:
# Display the grouped missing-rate table (one row per distinct missing rate).
azdias_missing_percent

Unnamed: 0,missing_percent,index,attribute_cnt
0,0.998648,[ALTER_KIND4],1
1,0.993077,[ALTER_KIND3],1
2,0.9669,[ALTER_KIND2],1
3,0.909048,[ALTER_KIND1],1
4,0.733996,[EXTSEL992],1
5,0.655967,[KK_KUNDENTYP],1
6,0.295041,[ALTERSKATEGORIE_FEIN],1
7,0.288495,"[D19_BANKEN_ONLINE_QUOTE_12, D19_GESAMT_ONLINE...",8
8,0.149597,"[KBA05_ALTER1, KBA05_ALTER2, KBA05_ALTER3, KBA...",64
9,0.135989,"[KKK, REGIOTYP, VHN]",3


- #### The kids' age columns have too many missing values, so combine these four variables into a single number-of-kids variable.

In [40]:
kind_col = ['ALTER_KIND1', 'ALTER_KIND2', 'ALTER_KIND3', 'ALTER_KIND4']

# Flag each age column as 1 when it holds a positive value and 0 otherwise
# (NaN compares False, so missing ages count as 0), and add the four flags
# up into a single per-row count.
azdias['ALTER_KIND'] = sum(
    np.where(azdias[col] > 0, 1, 0) for col in kind_col
)

# The individual age columns are no longer needed once the count exists.
azdias.drop(columns=kind_col, inplace=True)

In [41]:
# Number of rows (non-null LNR values) for each ALTER_KIND count.
azdias.groupby('ALTER_KIND')['LNR'].count()

ALTER_KIND
0    810163
1     51559
2     23329
3      4965
4      1205
Name: LNR, dtype: int64

- #### Drop columns that contain more than 50% missing values.

In [50]:
# Remove the two remaining columns whose missing rate exceeds 50%.
azdias.drop(columns=['EXTSEL992', 'KK_KUNDENTYP'], inplace=True)

- #### Fill the remaining missing values with 0.

In [51]:
# Replace every remaining NaN in the frame with 0, modifying `azdias` in place.
azdias.fillna(value=0, inplace=True)

- #### Check for remaining missing values.

In [54]:
# Recompute the fraction of missing values per column after filling.
azdias_missing_percent = azdias.isna().sum(axis=0) / len(azdias)

In [55]:
# Columns that still contain missing values; an empty result means none do.
azdias_missing_percent.loc[lambda pct: pct > 0]

Series([], dtype: float64)