# Description

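This notebook applies principal component analysis (PCA) to the merged train and test tables: missing values are filled with training-set medians, PCA is fit on the training data and applied to both sets, and the resulting components are saved as feather files.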

In [1]:
import numpy as np
import pandas as pd
import feather

import os
import warnings
warnings.filterwarnings('ignore')

import gc

In [2]:
# Read the pre-merged training table and set aside its ids and labels
train = pd.read_csv('../input/home-credit-merged/train.csv')
train_ids, train_labels = train['SK_ID_CURR'], train['TARGET']

# Read the pre-merged testing table and set aside its ids (it has no target)
test = pd.read_csv('../input/home-credit-merged/test.csv')
test_ids = test['SK_ID_CURR']

In [3]:
from sklearn.decomposition import PCA
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline

# Make sure to drop the ids and target. errors='ignore' keeps this cell
# re-runnable once the columns have already been removed.
train = train.drop(columns = ['SK_ID_CURR', 'TARGET'], errors = 'ignore')
test = test.drop(columns = ['SK_ID_CURR'], errors = 'ignore')

features = list(train.columns)

# Lay the test set out in the training column order so that every
# transformer below sees the features in the order it was fitted on.
test = test[features]

# Median imputation of missing values: a single imputer learns every
# column's median from the training data in one pass, and the same
# training medians are then used to fill the testing data.
imputer = SimpleImputer(strategy = 'median')
train = pd.DataFrame(imputer.fit_transform(train), columns = features, index = train.index)
test = pd.DataFrame(imputer.transform(test), columns = features, index = test.index)

pca = PCA()

# Fit and transform on the training data
train_pca = pca.fit_transform(train)

# Transform the testing data with the components learned from training
test_pca = pca.transform(test)

In [4]:
# Name the component columns pc_0, pc_1, ... in the order PCA produced them
train_pc_names = ['pc_{}'.format(idx) for idx in range(train_pca.shape[1])]
train_pca_df = pd.DataFrame(train_pca, columns=train_pc_names)

# Attach the labels and ids to the component frame
train_pca_df['TARGET'] = train_labels
train_pca_df['SK_ID_CURR'] = train_ids
train_pca_df.head()

In [5]:
# Name the component columns pc_0, pc_1, ... in the order PCA produced them
test_pc_names = ['pc_{}'.format(idx) for idx in range(test_pca.shape[1])]
test_pca_df = pd.DataFrame(test_pca, columns=test_pc_names)

# Attach the ids to the component frame
test_pca_df['SK_ID_CURR'] = test_ids
test_pca_df.head()

In [6]:
import sys

def return_size(df):
    """Report the size of ``df`` in gigabytes, rounded to two decimals.

    The size is whatever ``sys.getsizeof`` reports for the object, in
    bytes, divided by 1e9.
    """
    bytes_per_gb = 1e9
    size_in_gb = sys.getsizeof(df) / bytes_per_gb
    return round(size_in_gb, 2)

def convert_types(df, print_info = False):
    """Downcast the columns of ``df`` in place to reduce memory usage.

    Each column is handled by the first rule that matches:

    * name contains ``'SK_ID'``: missing values are filled with 0 and the
      column is cast to int32
    * object dtype with fewer distinct values than rows: cast to ``category``
    * holds exactly the two values 0 and 1, in any order and with no
      missing values: cast to bool
    * dtype equal to Python ``float`` (float64): cast to float32
    * dtype equal to Python ``int`` (the platform default integer): cast
      to int32

    Columns matching none of the rules are left untouched.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to convert. Its columns are reassigned in place.
    print_info : bool, default False
        If True, print the memory usage (in gb) before and after conversion.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in.
    """

    original_memory = df.memory_usage().sum()

    # Iterate through each column
    for c in df:

        # Ids: fill missing values with 0 so the column can be stored as int32
        if ('SK_ID' in c):
            df[c] = df[c].fillna(0).astype(np.int32)

        # Convert objects to category
        elif (df[c].dtype == 'object') and (df[c].nunique() < df.shape[0]):
            df[c] = df[c].astype('category')

        # 0/1 indicator columns to booleans. The values are compared as a set
        # so the result does not depend on which value appears first; counting
        # NaN as a value keeps columns with missing entries out of this branch
        # (astype(bool) would silently turn NaN into True).
        elif (df[c].nunique(dropna = False) == 2) and (set(df[c].unique()) == {0, 1}):
            df[c] = df[c].astype(bool)

        # Float64 to float32
        elif df[c].dtype == float:
            df[c] = df[c].astype(np.float32)

        # Int64 to int32
        elif df[c].dtype == int:
            df[c] = df[c].astype(np.int32)

    new_memory = df.memory_usage().sum()

    if print_info:
        print(f'Original Memory Usage: {round(original_memory / 1e9, 2)} gb.')
        print(f'New Memory Usage: {round(new_memory / 1e9, 2)} gb.')

    return df

In [7]:
# Downcast both PCA frames and report the memory usage before and after
train_pca_df = convert_types(df=train_pca_df, print_info=True)
test_pca_df = convert_types(df=test_pca_df, print_info=True)

In [8]:
# Persist both PCA frames as feather files in the working directory
for frame, out_path in ((train_pca_df, 'train.feather'), (test_pca_df, 'test.feather')):
    feather.write_dataframe(frame, out_path)

In [9]:
# Optional CSV export, left disabled; uncomment to also write the PCA frames as CSV.
#train_pca_df.to_csv('train.csv', index=False)
#test_pca_df.to_csv('test.csv', index=False)

In [10]:
# Display the fraction of total variance explained by each principal
# component of the PCA fitted on the training data.
pca.explained_variance_ratio_

In [14]:
import matplotlib.pyplot as plt
plt.rcParams['font.size'] = 22

# Cumulative share of variance explained by the first k components
cum_explained = np.cumsum(pca.explained_variance_ratio_)

# Count components from 1 so the x axis really is "number of PCs", and size
# the axis from the fitted PCA itself rather than from the input frame.
n_pcs = np.arange(1, len(cum_explained) + 1)

plt.figure(figsize = (10, 8))
plt.plot(n_pcs, cum_explained, 'r-')
plt.xlabel('Number of PC'); plt.ylabel('Cumulative Explained Variance');
#plt.title('Cumulative Variance Explained with PCA')

In [12]:
plt.figure(figsize=(10, 8))

xpc = 'pc_0'
ypc = 'pc_1'

# One scatter series per target class, drawn in the order 0 then 1
for target_value in (0, 1):
    # Positional boolean mask, computed once and reused for both axes
    is_class = (train_labels == target_value).to_numpy()
    plt.scatter(train_pca_df.loc[is_class, xpc], train_pca_df.loc[is_class, ypc],
                alpha=0.8, label='target={}'.format(target_value))

plt.xlabel(xpc)
plt.ylabel(ypc)
# The series labels are only drawn when a legend is requested
plt.legend()
plt.show()

In [13]:
# Cumulative explained variance after 1, 2, ... components
cum = np.cumsum(pca.explained_variance_ratio_)

# (message, cumulative-variance threshold) pairs, reported in this order
variance_targets = (
    ('number of PC to achieve 99% explained variance:', 0.99),
    ('number of PC to achieve 99.9% explained variance:', 0.999),
    ('number of PC to achieve 99.99% explained variance:', 0.9999),
    ('number of PC to achieve 99.999% explained variance:', 0.99999),
)

for message, threshold in variance_targets:
    # Zero-based position of the first component whose cumulative variance
    # exceeds the threshold; adding 1 turns it into a component count.
    first_idx = np.where(cum > threshold)[0].min()
    print(message, 1 + first_idx)