In [16]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
pd.options.display.max_columns = None
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

In [17]:
azdias = pd.read_csv('data/Udacity_AZDIAS_052018.csv', sep=';')
customers = pd.read_csv('data/Udacity_CUSTOMERS_052018.csv', sep=';')

  interactivity=interactivity, compiler=compiler, result=result)


- #### Pandas deliver a warning message, so need to check datatype of each column and find mixed types.

In [12]:
azdias.dtypes[azdias.dtypes==object]

CAMEO_DEU_2015              object
CAMEO_DEUG_2015             object
CAMEO_INTL_2015             object
D19_LETZTER_KAUF_BRANCHE    object
EINGEFUEGT_AM               object
OST_WEST_KZ                 object
dtype: object

In [13]:
# Map categorical value(1A,1B,2A, etc.) to numerical value
CAMEO_DEU_2015_value = azdias.groupby(['CAMEO_DEU_2015']).count()['LNR'].index
CAMEO_DEU_2015_value_map = {}
for i, item in enumerate(CAMEO_DEU_2015_value):
    # Conside XX as missing value, fill with 0
    if item== 'XX':
        CAMEO_DEU_2015_value_map[item] = 0
    else:
        CAMEO_DEU_2015_value_map[item] = i + 1        

In [14]:
azdias['CAMEO_DEU_2015'] = azdias['CAMEO_DEU_2015'].apply(lambda x: CAMEO_DEU_2015_value_map.get(x))
azdias['CAMEO_DEU_2015'].fillna(0, inplace=True)

In [15]:
# Conside X as missing value, fill with 0
azdias['CAMEO_DEUG_2015'] = np.where(azdias['CAMEO_DEUG_2015']=='X', 0, azdias['CAMEO_DEUG_2015'])
azdias['CAMEO_DEUG_2015'] = azdias['CAMEO_DEUG_2015'].astype('float')
azdias['CAMEO_DEUG_2015'].fillna(0, inplace=True)

In [16]:
# Conside XX as missing value, fill with 0
azdias['CAMEO_INTL_2015'] = np.where(azdias['CAMEO_INTL_2015']=='XX', 0, azdias['CAMEO_INTL_2015'])
azdias['CAMEO_INTL_2015'] = azdias['CAMEO_INTL_2015'].astype('float')
azdias['CAMEO_INTL_2015'].fillna(0, inplace=True)

In [17]:
# Already has encoded columns contain branch info.
azdias.drop(['D19_LETZTER_KAUF_BRANCHE'], axis=1, inplace=True)

In [19]:
# Only keep year
azdias['EINGEFUEGT_AM'] = pd.to_datetime(azdias['EINGEFUEGT_AM']).dt.year
azdias['EINGEFUEGT_AM'].fillna(azdias['EINGEFUEGT_AM'].mode()[0], inplace=True)

In [20]:
# Map O and W to 1 and 2, fill Nan with 0
azdias['OST_WEST_KZ'] = np.where(
    azdias['OST_WEST_KZ']=='O',
    1,
    np.where(azdias['OST_WEST_KZ']=='W', 2, 0)
)

In [21]:
azdias.dtypes[azdias.dtypes==object]

Series([], dtype: object)

- #### Checking missing percentage and group column with same missing rate.

In [25]:
azdias_missing_percent = (azdias.isna().sum(axis=0)/azdias.shape[0])

In [26]:
azdias_missing_percent = azdias_missing_percent.rename('missing_percent').reset_index()

In [27]:
azdias_missing_percent = azdias_missing_percent.groupby('missing_percent')['index'] \
                          .apply(lambda x: list(x)) \
                          .reset_index() \
                          .sort_values(['missing_percent'], ascending=False).reset_index(drop=True)

In [28]:
azdias_missing_percent['attribute_cnt'] = azdias_missing_percent['index'].apply(lambda x:len(x))

In [29]:
azdias_missing_percent

Unnamed: 0,missing_percent,index,attribute_cnt
0,0.998648,[ALTER_KIND4],1
1,0.993077,[ALTER_KIND3],1
2,0.9669,[ALTER_KIND2],1
3,0.909048,[ALTER_KIND1],1
4,0.733996,[EXTSEL992],1
5,0.655967,[KK_KUNDENTYP],1
6,0.295041,[ALTERSKATEGORIE_FEIN],1
7,0.288495,"[D19_BANKEN_ONLINE_QUOTE_12, D19_GESAMT_ONLINE...",8
8,0.149597,"[KBA05_ALTER1, KBA05_ALTER2, KBA05_ALTER3, KBA...",64
9,0.135989,"[KKK, REGIOTYP, VHN]",3


- #### Kids' age has too many missing value, so combine these four variable to create kid number variable.

In [40]:
kind_col = ['ALTER_KIND1', 'ALTER_KIND2', 'ALTER_KIND3', 'ALTER_KIND4']
for col in kind_col:
    azdias[col] = np.where(azdias[col]>0, 1, 0)
    
azdias['ALTER_KIND'] = (azdias['ALTER_KIND1'] + azdias['ALTER_KIND2']
                        + azdias['ALTER_KIND3'] + azdias['ALTER_KIND4'])
azdias.drop(kind_col, axis=1, inplace = True)  

In [41]:
azdias.groupby(['ALTER_KIND']).count()['LNR']   

ALTER_KIND
0    810163
1     51559
2     23329
3      4965
4      1205
Name: LNR, dtype: int64

- #### Drop columns contain more than 50% missing value.

In [50]:
azdias.drop(['EXTSEL992', 'KK_KUNDENTYP'], axis=1, inplace = True)  

- #### Fillna with 0.

In [51]:
azdias.fillna(0, inplace=True)

- #### Check missing value.

In [54]:
azdias_missing_percent = (azdias.isna().sum(axis=0)/azdias.shape[0])

In [55]:
azdias_missing_percent[azdias_missing_percent>0]

Series([], dtype: float64)

- #### Pack the wraggling method above to function, apply to customer data.

In [18]:
def clean_data(df_raw):
    
    df = df_raw.copy()
    # Map categorical value(1A,1B,2A, etc.) to numerical value
    CAMEO_DEU_2015_value = df.groupby(['CAMEO_DEU_2015']).count()['LNR'].index
    CAMEO_DEU_2015_value_map = {}
    for i, item in enumerate(CAMEO_DEU_2015_value):
        # Conside XX as missing value, fill with 0
        if item== 'XX':
            CAMEO_DEU_2015_value_map[item] = 0
        else:
            CAMEO_DEU_2015_value_map[item] = i + 1 
            
    df['CAMEO_DEU_2015'] = df['CAMEO_DEU_2015'].apply(lambda x: CAMEO_DEU_2015_value_map.get(x))
    df['CAMEO_DEU_2015'].fillna(0, inplace=True)
    
    # Conside X as missing value, fill with 0
    df['CAMEO_DEUG_2015'] = np.where(df['CAMEO_DEUG_2015']=='X', 0, df['CAMEO_DEUG_2015'])
    df['CAMEO_DEUG_2015'] = df['CAMEO_DEUG_2015'].astype('float')
    df['CAMEO_DEUG_2015'].fillna(0, inplace=True)
    
    # Conside XX as missing value, fill with 0
    df['CAMEO_INTL_2015'] = np.where(df['CAMEO_INTL_2015']=='XX', 0, df['CAMEO_INTL_2015'])
    df['CAMEO_INTL_2015'] = df['CAMEO_INTL_2015'].astype('float')
    df['CAMEO_INTL_2015'].fillna(0, inplace=True)
    
    # Already has encoded columns contain branch info.
    df.drop(['D19_LETZTER_KAUF_BRANCHE'], axis=1, inplace=True)
    
    # Only keep year
    df['EINGEFUEGT_AM'] = pd.to_datetime(df['EINGEFUEGT_AM']).dt.year
    df['EINGEFUEGT_AM'].fillna(df['EINGEFUEGT_AM'].mode()[0], inplace=True)
    
    # Map O and W to 1 and 2, fill Nan with 0
    df['OST_WEST_KZ'] = np.where(
        df['OST_WEST_KZ']=='O',
        1,
        np.where(df['OST_WEST_KZ']=='W', 2, 0)
    )
    
    kind_col = ['ALTER_KIND1', 'ALTER_KIND2', 'ALTER_KIND3', 'ALTER_KIND4']
    for col in kind_col:
        df[col] = np.where(df[col]>0, 1, 0)

    df['ALTER_KIND'] = (df['ALTER_KIND1'] + df['ALTER_KIND2']
                            + df['ALTER_KIND3'] + df['ALTER_KIND4'])
    df.drop(kind_col, axis=1, inplace = True)  
    
    df.drop(['EXTSEL992', 'KK_KUNDENTYP'], axis=1, inplace = True)
    
    df.fillna(0, inplace=True)
    
    df_missing = (df.isna().sum()).sum()
    
    print(f'Dataset has {df_missing} missing value.')
    
    return df
    

In [19]:
azdias_clean = clean_data(azdias)

Dataset has 0 missing value.


In [34]:
customers_clean = clean_data(customers)
customers_clean_nolabel = customers_clean.drop(['CUSTOMER_GROUP', 'ONLINE_PURCHASE', 'PRODUCT_GROUP'], axis=1)

  result = method(y)


KeyError: "['D19_LETZTER_KAUF_BRANCHE'] not found in axis"

In [22]:
azdias_clean.shape

(891221, 360)

In [23]:
customers_clean.shape

(191652, 363)

In [30]:
set(customers_clean.columns).difference(set(azdias_clean.columns))

{'CUSTOMER_GROUP', 'ONLINE_PURCHASE', 'PRODUCT_GROUP'}

In [33]:
customers_clean.groupby('PRODUCT_GROUP').count()['LNR']

PRODUCT_GROUP
COSMETIC              43410
COSMETIC_AND_FOOD    100860
FOOD                  47382
Name: LNR, dtype: int64

In [None]:
azdias_clean['y'] = 0
customers_clean['y'] = 1
df = azdias_clean.append(customers_clean, ignore_index=True)
X = df[col for df.columns if col not in ['LNR', 'y' ]]
y = df['y']

In [None]:
scaler = StandardScaler()
X_norm = scaler.fit_transform(X)

In [10]:
ratio_list = []
for i in range(azdias_norm.shape[1]):
    pca = PCA(n_components=i+1)
    pca.fit(azdias_norm)
    explained_variance_ratio_sum = pca.explained_variance_ratio_.sum()
    ratio_list.append(explained_variance_ratio_sum)
    print(f'{i+1} components explains {explained_variance_ratio_sum} of variance.')

In [8]:
scaler = StandardScaler()

In [9]:
azdias_norm = scaler.fit_transform(azdias_clean.iloc[:, 1:])

In [15]:
ratio_list = []
for i in range(azdias_norm.shape[1]):
    pca = PCA(n_components=i+1)
    pca.fit(azdias_norm)
    explained_variance_ratio_sum = pca.explained_variance_ratio_.sum()
    ratio_list.append(explained_variance_ratio_sum)
    print(f'{i+1} components explains {explained_variance_ratio_sum} of variance.')

1 components explains 0.24423062854961294 of variance.
2 components explains 0.31428092304440614 of variance.
3 components explains 0.3674403054423734 of variance.
4 components explains 0.4097256170954655 of variance.
5 components explains 0.439771267072605 of variance.
6 components explains 0.466133715722295 of variance.
7 components explains 0.4841774051708582 of variance.
8 components explains 0.4968959659564498 of variance.
9 components explains 0.5090809512172483 of variance.
10 components explains 0.5197843529279067 of variance.
11 components explains 0.5295972614531287 of variance.
12 components explains 0.5392186255925487 of variance.
13 components explains 0.5479182564123347 of variance.
14 components explains 0.5565068713760396 of variance.
15 components explains 0.5645688857429721 of variance.
16 components explains 0.5719100467268919 of variance.
17 components explains 0.578910104127436 of variance.
18 components explains 0.5855233993544722 of variance.
19 components explai

KeyboardInterrupt: 

In [None]:
file = open("pca_explain_variance.clk", "wb")
pickle.dump(ratio_list, file)
file.close()

In [None]:
mailout_train = pd.read_csv('data/Udacity_MAILOUT_052018_TRAIN.csv', sep=';')
mailout_test = pd.read_csv('data/Udacity_MAILOUT_052018_TEST.csv', sep=';')