<a href="https://colab.research.google.com/github/hsaurs/Projects/blob/main/AI_06_%EC%9C%A0%ED%98%95%EC%84%9D_Project2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install category_encoders==2.*
!pip install eli5
!pip install pandas-profiling==2.*
!pip install pdpbox
!pip install shap

In [None]:
import pandas as pd
import numpy as np
import sklearn

In [None]:
!wget https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data
names = ['age', 'workclass', 'fnlwgt', 'education', 'education num', 
                 'marital status', 'occupation', 'relationship', 'race', 'sex', 
                 'capital gain', 'capital loss', 'hours per week', 
                 'native country', 'income']
df = pd.read_csv('adult.data', header=None, names=names,na_values=['?', ' ?'])

In [None]:
df.shape

In [None]:
df.isna().sum()

In [None]:
df = df.dropna()
df.reset_index(drop=True,inplace=True)
df.isna().sum()

In [None]:
# Number of rows that repeat an earlier row (all columns are compared).
df.duplicated().sum()

In [None]:
# Remove the repeated rows (the first occurrence is kept) and re-count;
# the count should now be 0.
df = df.drop_duplicates()
df.duplicated().sum()

In [None]:
# Shape after removing missing values and duplicates.
df.shape

In [None]:
# First four rows.
df.head(4)

In [None]:
# Summary statistics (by default describe() reports the numeric columns of a mixed frame).
df.describe()

In [None]:
# Column labels.
df.columns

In [None]:
columns = ['workclass', 'education', 'marital status', 
           'occupation', 'relationship', 'race', 'sex',
           'native country']

In [None]:
df.describe(exclude='number').T.sort_values(by='unique')

In [None]:
for i in columns:
    print(i,":",df[i].nunique(), np.unique(df[i]))

In [None]:
for i in columns:
    df[i] =  df[i].str.replace(pat=r'[^\w]', repl=r'', regex=True)

In [None]:
for i in columns:
    print(i,":",df[i].nunique(), np.unique(df[i]))

In [None]:
df['Income'] = df['income'] == ' >50K'
df['Income'].nunique()

In [None]:
df['Income'].value_counts()

In [None]:
##  baseline

# Share of the most frequent class, i.e. the accuracy of always predicting it.
# Computed from the data rather than from hand-copied counts, so it stays
# correct if the cleaning steps or the data file change.
df['Income'].value_counts(normalize=True).max()

In [None]:
# Encode the boolean target as integers (False -> 0, True -> 1).
df['Income'] = df['Income'].astype('int64')

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for validation; the fixed seed makes the split repeatable.
train, val = train_test_split(df, random_state=2, test_size=0.2)
train.shape, val.shape

In [None]:
# Cast the numeric columns and the encoded target of `df` to float.
# NOTE(review): `train` and `val` were already split off from `df` in the previous
# cell, so this cast appears to affect only `df` itself — confirm whether it was
# meant to run before the split.
feature = ['age', 'fnlwgt', 'education num', 'capital gain', 'capital loss', 'hours per week', 'Income']
df = df.astype({name: float for name in feature})

In [None]:
from category_encoders import OrdinalEncoder
from sklearn.pipeline import make_pipeline

# Predictors are all columns except the engineered target and the raw label column.
target = 'Income'
features = df.columns.drop([target, 'income'])

# Feature matrix and target vector for each split.
X_train, y_train = train[features], train[target]
X_val, y_val = val[features], val[target]

In [None]:
## 1. DecisionTreeClassifier Model

from sklearn.tree import DecisionTreeClassifier

# Ordinal-encode the categorical columns, then fit a depth-limited decision tree.
tree_step = DecisionTreeClassifier(max_depth=5, min_samples_leaf=10, random_state=2)
pipe1 = make_pipeline(OrdinalEncoder(), tree_step)
pipe1.fit(X_train, y_train)

# Accuracy on the training and validation splits (the printed labels are Korean).
print('훈련 정확도: ', pipe1.score(X_train, y_train))
print('검증 정확도: ', pipe1.score(X_val, y_val))

In [None]:
## Tune hyperparameters with RandomizedSearchCV

from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint, uniform

# Search space. scipy's randint(low, high) draws integers from [low, high).
dists = {
    'decisiontreeclassifier__min_samples_leaf': randint(1, 50),
    'decisiontreeclassifier__max_depth': randint(5, 10),
    # An integer min_samples_split must be at least 2. Starting the range at 1 lets
    # the search draw 1, which scikit-learn rejects, so those candidates fail to fit
    # and their search iterations are wasted.
    'decisiontreeclassifier__min_samples_split': randint(2, 5),
    'decisiontreeclassifier__max_features': uniform(0, 1)
}

clf1 = RandomizedSearchCV(
    pipe1,
    param_distributions=dists,
    n_iter=50,
    cv=3,
    scoring='accuracy',
    verbose=1,
    n_jobs=-1,
    random_state=2,  # fix the sampled candidates so the search is reproducible
)

clf1.fit(X_train,y_train);
print('최적 하이퍼파라미터: ', clf1.best_params_)

In [None]:
## Refit the tuned model and report training / validation accuracy

# Pipeline.fit returns the pipeline itself, so the refit is chained onto the lookup.
pipe1 = clf1.best_estimator_.fit(X_train, y_train)
print('훈련 정확도: ', pipe1.score(X_train, y_train))
print('검증 정확도: ', pipe1.score(X_val, y_val))

In [None]:
## DecisionTree feature importances

import matplotlib.pyplot as plt

n = 10
# The fitted tree is the final step of the pipeline.
tree_model = pipe1.named_steps['decisiontreeclassifier']
importances = pd.Series(tree_model.feature_importances_, index=X_train.columns)
top_importances = importances.sort_values().tail(n)

plt.figure(figsize=(10, n / 4))
plt.title(f'Top {n} features')
top_importances.plot.barh();

In [None]:
## 2. RandomForestClassifier Model

from sklearn.ensemble import RandomForestClassifier

# Forest with library-default hyperparameters; only the seed is fixed.
forest_step = RandomForestClassifier(random_state=2)
pipe2 = make_pipeline(OrdinalEncoder(), forest_step)
pipe2.fit(X_train, y_train)

# Accuracy on the training and validation splits (the printed labels are Korean).
print('훈련 정확도: ', pipe2.score(X_train, y_train))
print('검증 정확도: ', pipe2.score(X_val, y_val))

In [None]:
## Tune hyperparameters with RandomizedSearchCV

# Search space: number of trees, tree depth (None = unlimited) and the fraction of
# features considered at each split.
dists = {
    'randomforestclassifier__n_estimators': randint(50, 500),
    'randomforestclassifier__max_depth': [5, 10, 15, 20, None],
    'randomforestclassifier__max_features': uniform(0, 1)
}

clf2 = RandomizedSearchCV(
    pipe2,
    param_distributions=dists,
    n_iter=50,
    cv=3,
    scoring='accuracy',
    verbose=1,
    n_jobs=-1,
    random_state=2,  # fix the sampled candidates so the search is reproducible
)

clf2.fit(X_train,y_train);
print('최적 하이퍼파라미터: ', clf2.best_params_)

In [None]:
## Refit the tuned model and report training / validation accuracy

# Pipeline.fit returns the pipeline itself, so the refit is chained onto the lookup.
pipe2 = clf2.best_estimator_.fit(X_train, y_train)
print('훈련 정확도: ', pipe2.score(X_train, y_train))
print('검증 정확도: ', pipe2.score(X_val, y_val))

In [None]:
## RandomForest feature importances

import matplotlib.pyplot as plt

n = 10
# The fitted forest is the final step of the pipeline.
forest_model = pipe2.named_steps['randomforestclassifier']
importances = pd.Series(forest_model.feature_importances_, index=X_train.columns)
top_importances = importances.sort_values().tail(n)

plt.figure(figsize=(10, n / 4))
plt.title(f'Top {n} features')
top_importances.plot.barh();

In [None]:
##3. Gradient Boosting Model

from xgboost import XGBClassifier

# Gradient-boosted trees with library-default hyperparameters; only the seed is fixed.
booster_step = XGBClassifier(random_state=2)
pipe3 = make_pipeline(OrdinalEncoder(), booster_step)
pipe3.fit(X_train, y_train)

# Accuracy on the training and validation splits (the printed labels are Korean).
print('훈련 정확도: ', pipe3.score(X_train, y_train))
print('검증 정확도: ', pipe3.score(X_val, y_val))

In [None]:
## Tune hyperparameters with RandomizedSearchCV

dists = {
    'xgbclassifier__n_estimators': randint(50,500),
    # scipy's uniform(loc, scale) samples from [loc, loc + scale], NOT [loc, scale].
    # colsample_bytree and subsample are sampling ratios that must lie in (0, 1];
    # a scale of 1 would let the search draw values up to 1.5 / 1.6, which are invalid.
    'xgbclassifier__colsample_bytree': uniform(0.5, 0.5),  # samples from [0.5, 1.0]
    'xgbclassifier__subsample': uniform(0.6, 0.4),         # samples from [0.6, 1.0]
    'xgbclassifier__learning_rate': uniform(0,1),
    'xgbclassifier__max_depth': [5, 10, 15, 20, None],
}

clf3 = RandomizedSearchCV(
    pipe3,
    param_distributions=dists,
    n_iter=50,
    cv=3,
    scoring='accuracy',
    verbose=1,
    n_jobs=-1,
    random_state=2,  # fix the sampled candidates so the search is reproducible
)

clf3.fit(X_train,y_train);
print('최적 하이퍼파라미터: ', clf3.best_params_)

In [None]:
## Refit the tuned model and report training / validation accuracy

# Pipeline.fit returns the pipeline itself, so the refit is chained onto the lookup.
pipe3 = clf3.best_estimator_.fit(X_train, y_train)
print('훈련 정확도: ', pipe3.score(X_train, y_train))
print('검증 정확도: ', pipe3.score(X_val, y_val))

In [None]:
## XGBoost feature importances

n = 10
# The fitted booster is the final step of the pipeline.
xgb_model = pipe3.named_steps['xgbclassifier']
importances = pd.Series(xgb_model.feature_importances_, index=X_train.columns)
top_importances = importances.sort_values().tail(n)

plt.figure(figsize=(10, n / 4))
plt.title(f'Top {n} features')
top_importances.plot.barh();

In [None]:
# Import the metric explicitly: `import sklearn` on its own does not guarantee that
# the `sklearn.metrics` submodule is available as an attribute of the package, so
# `sklearn.metrics.roc_auc_score` only works when another import happened to load it.
from sklearn.metrics import roc_auc_score

# Predicted probability of the positive class (last column of predict_proba).
y_pred_proba = pipe3.predict_proba(X_val)[:, -1]
print('AUC score: ', roc_auc_score(y_val, y_pred_proba))

In [None]:
from sklearn.metrics import roc_auc_score, roc_curve

# False-positive rate, true-positive rate and the decision threshold for each ROC point.
fpr, tpr, thresholds = roc_curve(y_val, y_pred_proba)

roc_columns = {
    'FPR(Fall-out)': fpr,
    'TPRate(Recall)': tpr,
    'Threshold': thresholds,
}
roc = pd.DataFrame(roc_columns)
roc

In [None]:
# Choose the ROC point where TPR exceeds FPR by the largest margin (Youden's J statistic).
tpr_fpr_gap = tpr - fpr
optimal_idx = tpr_fpr_gap.argmax()
optimal_threshold = thresholds[optimal_idx]

print('idx:', optimal_idx, ', threshold:', optimal_threshold)

In [None]:
from sklearn.metrics import classification_report

y_pred_05 = y_pred_proba >= 0.5
print(classification_report(y_val, y_pred_05))

In [None]:
y_pred_optimal = y_pred_proba >= optimal_threshold
print(classification_report(y_val, y_pred_optimal))

In [None]:
# One marker per ROC operating point.
plt.scatter(fpr, tpr)
plt.xlabel('FPR(Fall-out)')
plt.ylabel('TPR(Recall)')
plt.title('ROC curve');

In [None]:
# NOTE(review): y_pred_optimal holds thresholded True/False predictions rather than
# scores, so this value describes a single operating point, not the whole ROC
# curve — confirm this is intended.
auc_score = roc_auc_score(y_val, y_pred_optimal)
auc_score

In [None]:
# ROC AUC computed from the predicted probabilities; this overwrites the
# `auc_score` assigned in the previous cell.
auc_score = roc_auc_score(y_val, y_pred_proba)
auc_score

In [None]:
from sklearn.metrics import ConfusionMatrixDisplay, confusion_matrix
import matplotlib.pyplot as plt

# `plot_confusion_matrix` was deprecated in scikit-learn 1.0 and removed in 1.2.
# Building the display from an explicitly computed matrix works on every release
# that shipped the old helper as well as on current ones.
cm = confusion_matrix(y_val, pipe3.predict(X_val), labels=pipe3.classes_)

fig, ax = plt.subplots()
pcm = ConfusionMatrixDisplay(
    confusion_matrix=cm,
    display_labels=pipe3.classes_,
).plot(cmap=plt.cm.Blues, ax=ax, values_format='d')
plt.title(f'Confusion matrix, n = {len(y_val)}', fontsize=15);

In [None]:
from sklearn.metrics import roc_curve
import matplotlib.pyplot as plt

# Recompute the ROC points from the validation-set probabilities.
fpr, tpr, thresholds = roc_curve(y_val, y_pred_proba)

# Operating points as blue markers, joined by a green line.
plt.scatter(fpr, tpr, color='blue')
plt.plot(fpr, tpr, color='green')
plt.title('ROC curve')
plt.xlabel('FPR')
plt.ylabel('TPR')

In [None]:
# Classification report for the labels returned by the pipeline's own predict().
y_pred = pipe3.predict(X_val)
print(classification_report(y_val, y_pred))

In [None]:
# Model-interpretation tooling: partial-dependence plots (pdpbox),
# permutation importance (eli5) and SHAP values (shap).
from pdpbox.pdp import pdp_isolate, pdp_plot
from pdpbox import pdp
import warnings
# Suppress FutureWarning messages that originate from the xgboost module.
warnings.filterwarnings(action='ignore', category=FutureWarning, module='xgboost')
import eli5
from eli5.sklearn import PermutationImportance
import shap

In [None]:
y_train.value_counts(normalize=True)


In [None]:
custom = len(y_train)/(2*np.bincount(y_train))
custom

In [None]:
# Fit the tuned XGBoost pipeline on the training split again. Pipeline.fit returns
# the pipeline itself, so `model` and `pipe3` refer to the same object.
model = pipe3.fit(X_train,y_train)

In [None]:
# Predictor column names: every column except the two target columns.
df.columns.drop([target,'income'])

In [None]:
# Column dtypes of the full frame.
df.dtypes

In [None]:
from category_encoders import OrdinalEncoder
from sklearn.metrics import r2_score
from xgboost import XGBRegressor

# Encode outside of a pipeline so that the encoded frames themselves are available.
encoder = OrdinalEncoder()
X_train_encoded = encoder.fit_transform(X_train)  # training data
X_val_encoded = encoder.transform(X_val)  # validation data

# NOTE(review): a regressor is fitted on the 0/1 target here — presumably on purpose
# for the partial-dependence plots; confirm.
boosting_params = dict(
    n_estimators=1000,
    objective='reg:squarederror',  # default
    learning_rate=0.2,
    n_jobs=-1,
)
boosting = XGBRegressor(**boosting_params)

# Data sets monitored during training.
eval_set = [(X_train_encoded, y_train), (X_val_encoded, y_val)]

# NOTE(review): passing early_stopping_rounds to fit() depends on the installed
# xgboost version — confirm against the environment in use.
boosting.fit(
    X_train_encoded,
    y_train,
    eval_set=eval_set,
    early_stopping_rounds=50,
)

In [None]:
df.dtypes

In [None]:
plt.rcParams['figure.dpi'] = 144
for i in ['age','fnlwgt','education num','capital gain','capital loss','hours per week']:
    feature = i
    isolated = pdp_isolate(
        model=pipe3, 
        dataset=X_val, 
        model_features=X_val.columns, 
        feature=feature,
        grid_type='percentile', 
        num_grid_points=10
    )
    pdp_plot(isolated, feature_name=feature);

In [None]:
from pdpbox.pdp import pdp_interact, pdp_interact_plot

# A dedicated name is used for the feature pair: assigning it to `features` would
# overwrite the predictor-column index that the modelling cells rely on.
interact_features = ['age','hours per week']

# Two-feature partial dependence, computed with the separately fitted `boosting`
# model on the ordinal-encoded validation frame.
interaction = pdp_interact(
    model=boosting,
    dataset=X_val_encoded,
    model_features=X_val.columns,
    features=interact_features
)

pdp_interact_plot(interaction, plot_type='grid',
                  feature_names=interact_features);

In [None]:
interaction.pdp

In [None]:
row = X_val[['age','fnlwgt','education num','capital gain','capital loss','hours per week']].iloc[:500]
row.head()

In [None]:
features = df.columns.drop(['Income','income'])
target = 'Income'
X_train = train[features]
y_train = train[target]
X_val = val[features]
y_val = val[target]

In [None]:
# Ordinal-encode the categorical columns. The category -> integer mapping is learned
# from the training split only and then applied unchanged to the validation split.
# Calling fit_transform on the validation data would re-fit the encoder there and
# could assign different integer codes to the same category.
enc = OrdinalEncoder()
X_train = enc.fit_transform(X_train)
X_val = enc.transform(X_val)

In [None]:
# `explainer` is not created anywhere else in this notebook, so without this line the
# cell raises NameError on a fresh kernel.
# NOTE(review): assumes the tuned XGBoost classifier inside `pipe3` is the model to be
# explained, and that X_train carries the same ordinal encoding it was trained on — confirm.
explainer = shap.TreeExplainer(pipe3.named_steps['xgbclassifier'])

# SHAP values for the first 500 training rows, summarised as a violin plot.
shap_sample = X_train.iloc[:500]
shap_values = explainer.shap_values(shap_sample)
shap.summary_plot(shap_values, shap_sample, plot_type="violin")

In [None]:
# Pass the same 500 rows the SHAP values were computed for, so that the value matrix
# and the feature frame stay aligned row for row.
shap.summary_plot(shap_values, X_train.iloc[:500], plot_type="bar")