# Library

In [2]:
import random
import warnings

import lightgbm as lgb
import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.express as px
import seaborn as sns
import tqdm
import xgboost as xgb
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import VotingClassifier            # voting ensemble
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, roc_curve, auc, classification_report, confusion_matrix
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier

# Plot / warning configuration
plt.rc('font', family='Malgun Gothic')
mpl.rcParams['axes.unicode_minus'] = False
warnings.filterwarnings('ignore')


# Data

In [None]:
# NOTE: the path below is an empty-string placeholder in this template;
# supply the CSV location before running this cell.
df = pd.read_csv('')

In [None]:
# Frame summary, then the percentage of missing values per column
df.info()
print()
missing_pct = df.isnull().sum() / len(df) * 100
print(missing_pct)
print()

# Per column: value counts, dtype, distinct values and null count
for col in df.columns:
    series = df[col]
    print('\33[103m', col, '\33[0m')
    print(series.value_counts())
    print(series.dtype)
    print('\33[91m' + str(series.unique()) +  '\033[0m')
    print('\33[94m' + 'Null Count :' + '\033[0m', series.isnull().sum())
    print('🌈')
    print()

In [None]:
# Split the column names of df by dtype.
# 'Object' holds the object-dtype columns; 'Float64' holds every other column
# (any non-object dtype lands there, not only float64).
o_lst = [col for col in df.columns if df[col].dtype == 'O']
f_lst = [col for col in df.columns if df[col].dtype != 'O']

type_lst = {'Object': o_lst, 'Float64': f_lst}

# Visualization

In [None]:
# Categorical columns: one plotly histogram per object-dtype column

df_obj = df.select_dtypes(include='object')

# Optional custom titles keyed by column name, e.g. {'Sex': 'Passenger sex'}.
# A column without an entry is titled with its own name, so the loop never
# runs out of titles regardless of how many columns there are.
hist_titles = {}

for col in df_obj.columns:
    fig = px.histogram(df, x=col, text_auto=True, color_discrete_sequence=['chocolate'], width=800, height=400)
    fig.update_layout(bargap=0.2, title=hist_titles.get(col, col))
    fig.show()

#####

# Categorical columns: seaborn histogram for a fixed list of columns

hist_cols = ['Sex', 'Ticket', 'Cabin', 'Embarked']
for col in df[hist_cols].columns:
    ax = sns.histplot(data=df, x=col)
    plt.show()

In [None]:
# Numerical columns: plotly box plot followed by the describe() summary

df_num = df.select_dtypes(include='number')

# Optional custom titles keyed by column name; a column without an entry
# is titled with its own name.
box_titles = {}

for col in df_num.columns:
    fig = px.box(df, x=col, color_discrete_sequence=['maroon'], width=800, height=400)
    fig.update_layout(bargap=0.2, title=box_titles.get(col, col))
    fig.show()
    # Summary statistics of the column that was just plotted
    print(df[col].describe())
    print()
    print('👻')

In [None]:
# Correlation heatmap.
# Only numeric / boolean columns are kept before the float cast, because
# object columns cannot be converted and have no Pearson correlation.
colormap = plt.cm.RdBu
corr_matrix = df.select_dtypes(include=['number', 'bool']).astype(float).corr()

plt.figure(figsize=(14, 12))
plt.title('Pearson Correlation of Features', y=1.05, size=15)
sns.heatmap(corr_matrix, linewidths=0.1, vmax=1.0,
            square=True, cmap=colormap, linecolor='white', annot=True)
plt.show()

# Preprocessing

In [None]:
# Fill missing values by ratio: 60% of the missing rows get 'M', the rest get 'F'.

target_col = 'column_name'   # placeholder: name of the column being filled

ratio_fill = df.replace('X', np.nan)   # 'X' is treated as the missing-value marker

nan_value = list(ratio_fill[ratio_fill[target_col].isnull()].index)       # index labels of the NaN rows
rng = random.Random(777)                                                  # fixed seed: a re-run selects the same rows
select_nan_value = rng.sample(nan_value, k=int(len(nan_value) * 0.6))     # 60% of the NaN rows

# A single .loc[rows, column] assignment writes into the frame itself; the
# chained form .loc[rows][[column]] = ... assigns to a temporary copy.
# NOTE(review): assumes the index labels are unique — confirm for this dataset.
ratio_fill.loc[select_nan_value, target_col] = 'M'
ratio_fill[target_col] = ratio_fill[target_col].fillna('F')

# Write the filled column back to the working frame.
df[target_col] = ratio_fill[target_col]

# Encoding

In [10]:
sample=['TV','냉장고','전자렌지','컴퓨터','선풍기','선풍기','믹서','믹서']

# Label Encoding
encoder = LabelEncoder()
labels = encoder.fit_transform(sample)

# One-Hot Encoding
# OneHotEncoder expects 2-D input, so the list is reshaped to a single column
# and that reshaped array (not the raw list) is what gets fitted / transformed.
items = np.array(sample).reshape(-1, 1)
oh_encoder = OneHotEncoder()
oh_encoder.fit(items)
oh_labels = oh_encoder.transform(items)
# The transform result is a sparse matrix; toarray() turns it into a dense array.
oh_dense = oh_labels.toarray()

# get_dummies
# The demo frame gets its own name so the dataset held in `df` is not overwritten.
# Its only column is labelled 0; dtype=int yields 0/1 indicator columns.
sample_df = pd.DataFrame(sample)
pd.get_dummies(sample_df, columns=[0], dtype=int)

Unnamed: 0,0_TV,0_냉장고,0_믹서,0_선풍기,0_전자렌지,0_컴퓨터
0,1,0,0,0,0,0
1,0,1,0,0,0,0
2,0,0,0,0,1,0
3,0,0,0,0,0,1
4,0,0,0,1,0,0
5,0,0,0,1,0,0
6,0,0,1,0,0,0
7,0,0,1,0,0,0


# Train Test Split

In [None]:
# Placeholders: the right-hand sides of X (features) and y (target) are blank
# in this template and must be filled in before the cell can run.
X = 
y = 

# Hold out 20% of the rows as the validation split; random_state fixes the shuffle.
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=777)

# Shapes of the training and validation splits
print(X_train.shape, y_train.shape)
print(X_val.shape, y_val.shape)

# Feature Scaling

In [24]:
# Scale every column selected by `num`, one column at a time:
# StandardScaler first, then MinMaxScaler on the standardized values.
# Both scalers are fitted on the training split only and then reused,
# in the same order, on the validation split.
# NOTE(review): `num` is not defined in the visible cells — confirm where it comes from.
for col in X_train[num].columns:
    st_scaler = StandardScaler()
    mm_scaler = MinMaxScaler()

    # Training split: fit + transform
    X_train[[col]] = st_scaler.fit_transform(X_train[[col]])
    X_train[[col]] = mm_scaler.fit_transform(X_train[[col]])

    # Validation split: transform only, with the scalers fitted above
    X_val[[col]] = st_scaler.transform(X_val[[col]])
    X_val[[col]] = mm_scaler.transform(X_val[[col]])

# Feature Importance

In [None]:
# Feature Importance
def top_feature_importances(model, feature_names, top_n=20):
    """Return the ``top_n`` largest feature importances as a Series.

    Parameters
    ----------
    model : object
        Any fitted estimator exposing a ``feature_importances_`` attribute.
    feature_names : sequence
        Labels for the importances, in the same order (e.g. ``X_train.columns``).
    top_n : int, default 20
        Number of entries to keep.

    Returns
    -------
    pandas.Series
        Importances indexed by feature name, sorted in descending order.
    """
    importances = pd.Series(model.feature_importances_, index=feature_names)
    return importances.sort_values(ascending=False)[:top_n]


def plot_feature_importances(model, feature_names, top_n=20):
    """Draw a horizontal bar chart of the ``top_n`` feature importances.

    Takes the same arguments as ``top_feature_importances`` and returns the
    Series that was plotted. Example call:
    ``plot_feature_importances(rf, X_train.columns)``.
    """
    ftr_top = top_feature_importances(model, feature_names, top_n)

    plt.figure(figsize=(8, 6))
    plt.title(f'Feature importances Top {top_n}')
    sns.barplot(x=ftr_top.values, y=ftr_top.index)
    plt.show()
    return ftr_top

# SHAP


# Modeling

In [None]:
# Model Load
# The boosted classifiers are bound to their own names (xgb_clf / lgb_clf) so
# the `xgb` / `lgb` module aliases keep pointing at the libraries and this
# cell can be executed more than once.
dt = DecisionTreeClassifier()
rf = RandomForestClassifier()
xgb_clf = xgb.XGBClassifier()
lgb_clf = lgb.LGBMClassifier()

# Training
dt.fit(X_train, y_train)
rf.fit(X_train, y_train)
xgb_clf.fit(X_train, y_train)
lgb_clf.fit(X_train, y_train)

# Predict
dt_pred = dt.predict(X_val)
rf_pred = rf.predict(X_val)
xgb_pred = xgb_clf.predict(X_val)
lgb_pred = lgb_clf.predict(X_val)

In [None]:
# Display name -> validation predictions, in reporting order
model_preds = {
    'Decision Tree': dt_pred,
    'Random Forest': rf_pred,
    'XGBoost': xgb_pred,
    'LightGBM': lgb_pred,
}
model_lst = list(model_preds)
pred_lst = list(model_preds.values())

# Values passed as `average` to precision / recall / F1 in the per-average report
parm = [None, 'micro', 'macro', 'weighted']

In [None]:
# Label printed in the report -> metric function, in print order
metric_fns = (
    ("Accuracy:", accuracy_score),
    ("Precision:", precision_score),
    ("Recall:", recall_score),
    ("F1 Score:", f1_score),
)

for model_name, y_pred in zip(model_lst, pred_lst):
    print('🔻','\33[91m' + model_name + '\033[0m', '🔻')

    # Compute every metric first, then print them together
    scores = [(label, metric(y_val, y_pred)) for label, metric in metric_fns]
    for label, value in scores:
        print(label, value)
    print()

    print('='*50)
    print()

In [None]:
# Same report, repeated for every `average` setting in parm

for m, pred in zip(model_lst, pred_lst):
    print('\33[91m' + m + '\033[0m')

    # accuracy_score takes no `average` argument, so it is computed once per
    # model instead of once per averaging mode.
    accuracy = accuracy_score(y_val, pred)

    for i in parm:
        precision = precision_score(y_val, pred, average=i)
        recall = recall_score(y_val, pred, average=i)
        f1 = f1_score(y_val, pred, average=i)

        print('🔻')
        print(f'parameter : {i}')
        print("Accuracy:", accuracy)
        print("Precision:", precision)
        print("Recall:", recall)
        print("F1 Score:", f1)
        print()

    print('='*50)
    print()

# Parameter Tuning

In [None]:
# Search space for the random forest.
# 'auto' is not listed under max_features: for classifiers it was an alias of
# 'sqrt', and current scikit-learn releases reject it as an invalid value.
parameter_grid = {'max_depth' : [4, 6, 8],
                  'n_estimators': [50, 10],
                  'max_features': ['sqrt', 'log2'],
                  'min_samples_split': [2, 3, 10],
                  'min_samples_leaf': [1, 3, 10],
                  'bootstrap': [True, False],
                  }

forest = RandomForestClassifier()

# 5-fold stratified cross-validation, scored by accuracy
cross_validation = StratifiedKFold(n_splits=5)
grid_search = GridSearchCV(forest,
                           scoring='accuracy',
                           param_grid=parameter_grid,
                           cv=cross_validation,
                           verbose=1
                           )

# Tune on the training split produced by train_test_split
grid_search.fit(X_train, y_train)
model = grid_search
parameters = grid_search.best_params_

print('Best score: {}'.format(grid_search.best_score_))
print('Best parameters: {}'.format(grid_search.best_params_))

# Ensemble

In [1]:
# Voting
# The base estimators are a KNN classifier and a decision tree
knn_clf = KNeighborsClassifier(n_neighbors=4)
dt_clf = DecisionTreeClassifier(random_state=42)

# Ensemble classifier that combines the two base estimators by voting.
# NOTE(review): the original comment described soft voting, but voting='hard' is what is passed — confirm which is intended.
vo_clf = VotingClassifier(estimators=[('KNN', knn_clf),('DT', dt_clf)] , voting='hard')


NameError: name 'KNeighborsClassifier' is not defined

# Evaluation

In [None]:
# accuracy, recall, precision, auc graph, confusion matrix

def eval_model(pred_list, name_list, y_test):
    """Report metrics and draw ROC / confusion-matrix plots for each model.

    For every (prediction, name) pair this prints accuracy, recall, precision
    and ROC AUC, and shows one figure with the ROC curve on the left and the
    confusion matrix heatmap on the right.

    Parameters
    ----------
    pred_list : iterable of array-like
        Predictions, one entry per model, each scored against ``y_test``.
    name_list : iterable of str
        Display names, paired with ``pred_list`` by position.
    y_test : array-like
        True labels. Every metric and the ROC curve are computed against this
        argument.

    Returns
    -------
    None

    Notes
    -----
    ``recall_score`` and ``precision_score`` are called with their default
    arguments, and the ROC curve uses ``pos_label=1``.
    NOTE(review): if the predictions are hard class labels rather than scores,
    the ROC curve has very few points — confirm what callers pass in.
    """
    for pred, name in zip(pred_list, name_list):
        accuracy = accuracy_score(y_test, pred)
        recall = recall_score(y_test, pred)
        precision = precision_score(y_test, pred)
        # Stored as auc_value so the imported `auc` function is not shadowed.
        auc_value = roc_auc_score(y_test, pred)
        matrix = confusion_matrix(y_test, pred)

        fig, (roc_ax, cm_ax) = plt.subplots(1, 2, figsize=(8, 3))

        # Left panel: ROC curve against the y = x reference line
        fpr, tpr, _ = roc_curve(y_test, pred, pos_label=1)
        roc_ax.plot(fpr, tpr, color='darkorange')
        roc_ax.plot([0, 1], [0, 1], color='black', label='y = x')
        roc_ax.legend()

        # Right panel: confusion matrix; fmt='g' prints plain numbers
        # instead of exponent notation
        sns.heatmap(matrix, annot=True, fmt='g', ax=cm_ax)
        cm_ax.set_title('{0} Model Confusion Matrix'.format(name))

        # The layout has to be adjusted before the figure is shown.
        fig.tight_layout()
        plt.show()

        print('{0} 정확도: {1:.4f}'.format(name, accuracy))
        print('{0} Recall: {1:.4f}'.format(name, recall))
        print('{0} Precision: {1:.4f}'.format(name, precision))
        print('{0} AUC: {1:.4f}'.format(name, auc_value))

        print('='* 50)
        print()