In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
from scipy import stats

import missingno as msno
# The bare 'seaborn' style name was deprecated in matplotlib 3.6 (renamed to
# 'seaborn-v0_8') and later removed; prefer the new name when it exists and
# fall back to the old one on older matplotlib versions.
plt.style.use('seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn')

import warnings
warnings.filterwarnings("ignore")

%matplotlib inline

In [2]:
# Read the competition's train and test CSV files into DataFrames.
df_train, df_test = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../input/santander-customer-transaction-prediction/train.csv",
        "../input/santander-customer-transaction-prediction/test.csv",
    )
)

In [3]:
# Do not truncate wide frames: show every column when a DataFrame is displayed.
pd.options.display.max_columns = None
df_train.iloc[:3]

In [4]:
# First three rows of the test set.
df_test.iloc[:3]

In [5]:
# Total number of missing cells in (train, test).
tuple(frame.isna().sum().sum() for frame in (df_train, df_test))

In [6]:
# Summary statistics of the target column.
df_train["target"].describe()

In [7]:
# Bar chart of how many rows fall in each target class.
# The vector must be passed as `x=`: the positional form is deprecated in
# seaborn 0.11 and is interpreted as `data=` from 0.12 on.
sns.countplot(x=df_train["target"])

In [8]:
# Number of rows per target class.
df_train["target"].value_counts()

In [9]:
# Fraction of training rows whose target is 0.
len(df_train[df_train["target"] == 0]) / len(df_train)

In [10]:
# (rows, columns) of the train and test frames.
tuple(frame.shape for frame in (df_train, df_test))

In [11]:
# Summary statistics (count, mean, std, min, quartiles, max) of the training set's numeric columns.
df_train.describe()

In [12]:
# Same summary statistics for the test set, for side-by-side comparison with the training set.
df_test.describe()

- 결측치는 없다.
- target=0이 압도적으로 많다. (약 90%)
- train셋과 test셋의 분포는 거의 비슷하다.

In [13]:
# Compare the train and test distributions of the first four features.
# A scatter of train[col] against test[col] would pair rows purely by position,
# and row i of train is unrelated to row i of test, so it cannot show whether
# the distributions match. Overlaid, density-normalized histograms can.
preview_cols = ["var_0", "var_1", "var_2", "var_3"]

fig, ax = plt.subplots(2, 2, figsize=(12, 8))
for axis, col in zip(ax.ravel(), preview_cols):
    axis.hist(df_train[col], bins=50, density=True, alpha=0.5, color="red", label="train set")
    axis.hist(df_test[col], bins=50, density=True, alpha=0.5, color="blue", label="test set")
    axis.set(title=col, xlabel=col, ylabel="density")
    axis.legend()

fig.tight_layout()
plt.show()

- 수많은 var 변수(200개) 중 target과 연관이 높은 변수들을 찾아야 할 것
    - 근데 너무 많다!

In [14]:
def _mask_diagonal(corr):
    """Return `corr` with its diagonal (self-correlations) replaced by NaN."""
    # Mask by position rather than by `corr != 1`: the computed diagonal is a
    # float that is not guaranteed to equal 1 exactly, and a value test would
    # also discard any genuine off-diagonal perfect correlation.
    return corr.mask(np.eye(len(corr), dtype=bool))


def _unique_pairs(corr):
    """Return every off-diagonal coefficient exactly once as a 1-D array."""
    values = corr.to_numpy()
    # The matrix is symmetric, so the strict upper triangle holds each pair once.
    return values[np.triu_indices_from(values, k=1)]


# Pairwise feature correlations, identifiers/labels excluded, diagonal masked.
train_corr = _mask_diagonal(df_train.drop(columns=["ID_code", "target"]).corr())
test_corr = _mask_diagonal(df_test.drop(columns=["ID_code"]).corr())

fig, ax = plt.subplots(figsize=(12, 5))
sns.distplot(_unique_pairs(train_corr), label="train set", color="red", ax=ax)
sns.distplot(_unique_pairs(test_corr), label="test set", color="blue", ax=ax)
ax.set(title="Distribution of pairwise feature correlations", xlabel="Pearson correlation")
ax.legend()
plt.show()

- 여기서 서로 상관관계가 높은 변수들은 삭제해야 하지 않을까?

In [15]:
# Pairwise Pearson correlations between the feature columns of the training set.
train_corr = df_train.drop(columns=["ID_code", "target"]).corr()
# Keep only coefficients >= 0.005; all other cells (including every negative
# coefficient) become NaN and are drawn blank in the heatmap.
train_corr = train_corr[train_corr >= 0.005]

colormap = plt.cm.PuBu

fig, ax = plt.subplots(figsize=(18, 16))
# The previous title was misspelled and referred to "SalePrice", which is not
# a column of this dataset.
ax.set_title("Correlation between features (train set)", y=1, size=18)
sns.heatmap(train_corr,
            vmax=.8,         # upper limit of the color scale
            linewidths=0.1,  # thin lines separating the cells
            square=True,
            annot=False,     # do not print the coefficient in each cell
            cmap=colormap,
            linecolor="white",
            ax=ax)
plt.show()

- 서로 상관계수가 높은 변수는 없다.

In [16]:
# Split the feature columns into five consecutive groups of 40 for plotting.
feature_cols = df_train.columns.drop(["ID_code", "target"])
features_1, features_2, features_3, features_4, features_5 = (
    feature_cols[start:start + 40] for start in range(0, 200, 40)
)

# Correlate each feature with the target once. Calling df_train.corr() per
# subplot rebuilt the full correlation matrix five times, and it raises on
# pandas >= 2.0 because of the non-numeric ID_code column.
target_corr = df_train[feature_cols].corrwith(df_train["target"])

f, ax = plt.subplots(5, figsize=(12, 20))
feature_groups = (features_1, features_2, features_3, features_4, features_5)
for axis, group in zip(ax, feature_groups):
    target_corr[group].plot.bar(ax=axis)
    axis.set_ylabel("correlation with target")

f.tight_layout()
plt.show()

In [17]:
features = df_train.columns.drop(["ID_code", "target"])

# Absolute Pearson correlation of every feature with the target.
# corrwith() touches only the numeric feature columns, so it avoids building
# the full correlation matrix and does not fail on the string ID_code column
# (df_train.corr() raises on pandas >= 2.0).
correlation = df_train[features].corrwith(df_train["target"]).abs()
correlation = correlation.sort_values()

# The 15 features most strongly correlated with the target (ascending order).
correlation.tail(15)