# 분류 - 고객 성별 예측

#### 데이터 설명

|칼럼명|칼럼 설명|
|---|---|
|회원ID|회원고유번호|
|총구매액|총 구매금액(원)|
|최대구매액|구매건당 최대구매금액(원)|
|환불금액|환불금액(원)|
|주구매상품|주로 구매한 품목(42개 품목 범주)|
|주구매지점|주로 구매한 지점명(24개 지점 범주)|
|방문일수|고객이 방문한 일수(일)|
|방문당구매건수|총구매건수/방문일수|
|주말방문비율|주말에 방문한 비율|
|구매주기|구매 주기(일)|
|성별|고객 성별(0:여자, 1:남자)|

---

In [None]:
# Visual Python: Data Analysis > Import
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

## 데이터 읽기

In [None]:
# Visual Python: Data Analysis > File
# Load the customer purchase records; the path is relative to the working directory.
df = pd.read_csv('./data/customer.csv')
# Bare expression on the last line so the notebook renders the frame.
df

In [None]:
# Visual Python: Data Analysis > Data Info
# Print a structural overview of df (column dtypes and non-null counts).
df.info()

In [None]:
# Visual Python: Data Analysis > Data Info
# Descriptive statistics (count, mean, std, min, quartiles, max) of the numeric columns.
df.describe()

In [None]:
# Visual Python: Data Analysis > Data Info
# Per-column tally of missing vs. present values, shown side by side.
# The mask is computed once; its negation marks the non-null cells.
missing_mask = df.isnull()
pd.DataFrame({
    'Null Count': missing_mask.sum(),
    'Non-Null Count': (~missing_mask).sum(),
})

#### Target 컬럼 확인

In [None]:
# Visual Python: Data Analysis > Data Info
# Class balance of the target column '성별' (gender); the column table above
# documents the coding as 0 = female, 1 = male.
df['성별'].value_counts()

#### 결측치 처리

In [None]:
# Visual Python: Data Analysis > Frame
# Remove the member-ID column in place (the column table describes it as a
# unique per-member number).
df.drop(columns=['회원ID'], inplace=True)
# Fill missing refund amounts with the median of the refund column.
refund_median = df['환불금액'].median()
df['환불금액'] = df['환불금액'].fillna(refund_median)
df

#### 범주형 변수: Label Encoding

In [None]:
# Visual Python: Data Analysis > Frame
# Map each categorical text column to the new column that will hold its
# integer category codes.
label_targets = {
    '주구매상품': '주구매상품_label',
    '주구매지점': '주구매지점_label',
}
for source_col, label_col in label_targets.items():
    # pd.Categorical assigns one integer code per distinct category
    # (missing values, if any, get code -1).
    df[label_col] = pd.Categorical(df[source_col]).codes
# The original text columns are no longer needed once encoded.
df.drop(columns=list(label_targets), inplace=True)
df

#### 수치형 변수: Min-Max Scaling

In [None]:
# Visual Python: Machine Learning > Pipeline
# [1] Data Prep
from sklearn.preprocessing import MinMaxScaler

# The seven numeric columns to rescale; the *_label columns are not included.
numeric_cols = [
    '총구매액', '최대구매액', '환불금액', '방문일수',
    '방문당구매건수', '주말방문비율', '구매주기',
]
numeric_values = df[numeric_cols]

scaler = MinMaxScaler()

# [2] Fit
# Learn the per-column minimum and maximum from the selected columns.
scaler.fit(numeric_values)

# [3] Transform
# `trans` holds the rescaled values in the same column order as numeric_cols.
trans = scaler.transform(numeric_values)

In [None]:
# Column order must match the order passed to the scaler when `trans` was
# produced, because the scaled values are assigned positionally.
cols = ['총구매액','최대구매액','환불금액','방문일수','방문당구매건수','주말방문비율','구매주기']

# Overwrite the raw numeric columns with their scaled values.
df[cols] = trans

#### 성별 예측

In [None]:
# Visual Python: Machine Learning > Pipeline
# [1] Data Split
from sklearn.model_selection import train_test_split

# Fixed seed shared by the split and the forest so that the confusion matrix
# and classification report are reproducible when this cell is re-run.
RANDOM_STATE = 42

# Model inputs: the seven scaled numeric columns plus the two label-encoded
# categorical columns. The target is the '성별' (gender) column.
feature_cols = [
    '총구매액', '최대구매액', '환불금액', '방문일수', '방문당구매건수',
    '주말방문비율', '구매주기', '주구매상품_label', '주구매지점_label',
]

# NOTE(review): the MinMaxScaler earlier in this notebook is fitted on the
# full frame before this split, so held-out rows contribute to the scaling
# parameters. For a strict evaluation, fit the scaler on the training rows only.
# No test_size is given, so scikit-learn's default hold-out fraction applies.
X_train, X_test, y_train, y_test = train_test_split(
    df[feature_cols], df['성별'], random_state=RANDOM_STATE
)

# [2] Classifier
from sklearn.ensemble import RandomForestClassifier

model = RandomForestClassifier(random_state=RANDOM_STATE)

# [3] Fit
model.fit(X_train, y_train)

# [4] Predict
pred = model.predict(X_test)

# [5] Evaluation
from sklearn import metrics
from IPython.display import display, Markdown
# Confusion Matrix
display(Markdown('### Confusion Matrix'))
# Name both axes explicitly; `pred` is an unnamed array, so without colnames
# the column axis would get an auto-generated label.
display(pd.crosstab(y_test, pred, rownames=['Actual'], colnames=['Predicted'], margins=True))
# Classification report
print(metrics.classification_report(y_test, pred))

In [None]:
# Visual Python: Visualization > Chart Style
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
plt.rc('figure', figsize=(8, 6))

from matplotlib import rcParams
rcParams['font.family'] = 'New Gulim'
rcParams['font.size'] = 10
rcParams['axes.unicode_minus'] = False

In [None]:
# Visual Python: Machine Learning > Model Info
def vp_create_feature_importances(model, X_train=None, sort=False):
    """Tabulate a fitted model's feature importances.

    Args:
        model: Object exposing a ``feature_importances_`` sequence.
        X_train: Optional DataFrame whose column labels name the features.
            Any other value (including None) falls back to generated
            labels ``X0``, ``X1``, ...
        sort: When true, order the rows by importance, largest first.

    Returns:
        DataFrame indexed by feature name with the columns
        ``Feature_importance`` and ``Percentage`` (importance * 100),
        both rounded to two decimals.
    """
    importances = model.feature_importances_

    if isinstance(X_train, pd.DataFrame):
        labels = X_train.columns
    else:
        labels = [f'X{pos}' for pos in range(len(importances))]

    table = pd.DataFrame(importances, index=labels, columns=['Feature_importance'])
    table['Percentage'] = 100 * table['Feature_importance']

    # Sort on the unrounded importances, then round for presentation.
    if sort:
        table = table.sort_values(by='Feature_importance', ascending=False)
    return table.round(2)
def vp_plot_feature_importances(model, X_train=None, sort=False, top_count=0):
    """Draw a horizontal bar chart of feature-importance percentages.

    Args:
        model: Object exposing ``feature_importances_``.
        X_train: Optional DataFrame supplying the feature names.
        sort: When true, bars are plotted in ascending percentage order.
        top_count: With ``sort`` true and a positive value, keep only the
            ``top_count`` largest features. Ignored when ``sort`` is false.
    """
    percentages = vp_create_feature_importances(model, X_train, sort)['Percentage']

    if sort:
        # Ascending order, so the largest value is the last bar drawn.
        percentages = percentages.sort_values()
        if top_count > 0:
            percentages = percentages.tail(top_count)

    percentages.plot(kind='barh')
    plt.xlabel('Feature importance Percentage')
    plt.ylabel('Features')

    plt.show()

In [None]:
# Visual Python: Machine Learning > Model Info
# Chart the fitted model's importances, labelled with X_train's column names;
# sort=True orders the bars by percentage.
vp_plot_feature_importances(model, X_train, sort=True)

#### [참고] 확률값 예측하기

In [None]:
# Visual Python: Machine Learning > Fit/Predict
# Class-probability estimates for each test row (one column per class, in the
# order of model.classes_), as opposed to the hard labels from predict().
pred_prob = model.predict_proba(X_test)
# Bare expression on the last line so the notebook displays the array.
pred_prob

---

In [None]:
# End of file