In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

# Preprocessing   
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer # melakukan transformasi (fit transform = transformer)
import category_encoders as ce
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline

# Model
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression

# Utilities
import warnings
warnings.filterwarnings("ignore")
from sklearn.utils.testing import ignore_warnings

Melakukan modelling dengan data titanic, dengan menggunakan model :
1. Decision Tree Classifier(apa persyaratan penumpang akan survive berdasarkan model)
2. Logistic Regression (cari feature yang paling berpengaruh)

lakukan prediksi apakah akan survive atau tidak

In [2]:
df = pd.read_csv('4.titanic.csv')
df

Unnamed: 0,sex,age,parch,fare,class,deck,embark_town,alive,alone
0,male,22.0,0,7.2500,Third,,Southampton,no,False
1,female,38.0,0,71.2833,First,C,Cherbourg,yes,False
2,female,26.0,0,7.9250,Third,,Southampton,yes,True
3,female,35.0,0,53.1000,First,C,Southampton,yes,False
4,male,35.0,0,8.0500,Third,,Southampton,no,True
...,...,...,...,...,...,...,...,...,...
886,male,27.0,0,13.0000,Second,,Southampton,no,True
887,female,19.0,0,30.0000,First,B,Southampton,yes,True
888,female,,2,23.4500,Third,,Southampton,no,False
889,male,26.0,0,30.0000,First,C,Cherbourg,yes,True


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 9 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   sex          891 non-null    object 
 1   age          714 non-null    float64
 2   parch        891 non-null    int64  
 3   fare         891 non-null    float64
 4   class        891 non-null    object 
 5   deck         203 non-null    object 
 6   embark_town  889 non-null    object 
 7   alive        891 non-null    object 
 8   alone        891 non-null    bool   
dtypes: bool(1), float64(2), int64(1), object(5)
memory usage: 56.7+ KB


In [4]:
df.isna().sum()

sex              0
age            177
parch            0
fare             0
class            0
deck           688
embark_town      2
alive            0
alone            0
dtype: int64

In [5]:
df.describe(include='all')

Unnamed: 0,sex,age,parch,fare,class,deck,embark_town,alive,alone
count,891,714.0,891.0,891.0,891,203,889,891,891
unique,2,,,,3,7,3,2,2
top,male,,,,Third,C,Southampton,no,True
freq,577,,,,491,59,644,549,537
mean,,29.699118,0.381594,32.204208,,,,,
std,,14.526497,0.806057,49.693429,,,,,
min,,0.42,0.0,0.0,,,,,
25%,,20.125,0.0,7.9104,,,,,
50%,,28.0,0.0,14.4542,,,,,
75%,,38.0,0.0,31.0,,,,,


## Data Preprocessing
Strategy =
* Encoding
>* One Hot Encoding = sex,class, embark town
>* Simple Imputer = age,Embark Town
* Drop : Perch,deck

In [6]:
df=df.drop(['deck'],axis=1)

In [7]:
df['alone']=df['alone'].astype(int) # diubah ke int karena bentuk boolean

## Data Splitting

In [8]:
X = df.drop(['alive'],axis=1)
y = [1 if i=='yes' else 0 for i in df['alive']]

In [9]:
X_train,X_test,y_train,y_test = train_test_split (X,y,
                                                  test_size=0.2,
                                                  stratify=y,
                                                  random_state=2020)

### Handling Missing Value

In [10]:
enc_imp = Pipeline([
    ('most_frequent',SimpleImputer(strategy='most_frequent')),
    ('enc',OneHotEncoder(drop='first'))
])
transformer = ColumnTransformer([
    ('one hot',OneHotEncoder(drop='first'),['sex','class']),
    ('impute one hot',enc_imp,['embark_town']),
    ('mean', SimpleImputer(strategy='mean'),['age'])
],remainder='passthrough')

## Data Transformation

In [11]:
X_train_prep = transformer.fit_transform(X_train)
X_test_prep = transformer.transform(X_test)

In [12]:
X_train_prep = pd.DataFrame(X_train_prep)
X_test_prep = pd.DataFrame(X_test_prep)

In [13]:
transformer.transformers_[0][1].get_feature_names()

array(['x0_male', 'x1_Second', 'x1_Third'], dtype=object)

In [14]:
transformer.transformers_[1][1][1].get_feature_names()

array(['x0_Queenstown', 'x0_Southampton'], dtype=object)

In [15]:
features = list(transformer.transformers_[0][1].get_feature_names()) + list(transformer.transformers_[1][1][1].get_feature_names()) + ['age','parch','fare','alone']

In [16]:
features

['x0_male',
 'x1_Second',
 'x1_Third',
 'x0_Queenstown',
 'x0_Southampton',
 'age',
 'parch',
 'fare',
 'alone']

In [17]:
X_train_prep.columns = features
X_test_prep.columns = features

In [18]:
X_train_prep

Unnamed: 0,x0_male,x1_Second,x1_Third,x0_Queenstown,x0_Southampton,age,parch,fare,alone
0,1.0,1.0,0.0,0.0,1.0,54.000000,0.0,14.0000,1.0
1,1.0,0.0,1.0,0.0,1.0,22.000000,0.0,9.0000,1.0
2,1.0,0.0,1.0,1.0,0.0,29.000000,0.0,7.7500,1.0
3,1.0,0.0,0.0,0.0,1.0,29.665501,0.0,30.5000,1.0
4,1.0,0.0,1.0,1.0,0.0,40.000000,1.0,15.5000,0.0
...,...,...,...,...,...,...,...,...,...
707,1.0,0.0,0.0,0.0,1.0,36.000000,0.0,26.3875,1.0
708,1.0,0.0,0.0,0.0,1.0,29.665501,0.0,26.0000,1.0
709,1.0,0.0,0.0,0.0,1.0,38.000000,0.0,90.0000,0.0
710,1.0,0.0,1.0,0.0,1.0,51.000000,0.0,7.7500,1.0


In [19]:
tree = DecisionTreeClassifier()
reg  = LogisticRegression()

In [20]:
reg.fit(X_train_prep,y_train)
y_pred = reg.predict(X_test_prep)
print('model accuracy with logistic regression =',accuracy_score(y_test,y_pred))

model accuracy with logistic regression = 0.776536312849162


In [26]:
k = range(1,30,1)
testing_accuracy = []
training_accuracy = []
score = 0

for i in k:
    tree = DecisionTreeClassifier(max_depth=i)
    tree.fit(X_train_prep,y_train)
    
    y_predict_train = tree.predict(X_train_prep)
    training_accuracy.append(accuracy_score(y_train,y_predict_train))
    
    y_predict_test  = tree.predict(X_test_prep)
    acc_score = accuracy_score(y_test,y_predict_test)
    testing_accuracy.append(acc_score)
    
    if score < acc_score:
        score = acc_score
        best_depth=i

In [27]:
print('model accuracy with decision tree =',score)

model accuracy with decision tree = 0.7988826815642458


In [28]:
best_depth

7

In [24]:
pd.DataFrame({'feat':features,'coef':reg.coef_.flatten()}).sort_values('coef') #Logistic Regression

Unnamed: 0,feat,coef
0,x0_male,-2.49731
2,x1_Third,-1.710962
1,x1_Second,-0.482573
4,x0_Southampton,-0.466726
8,alone,-0.350288
6,parch,-0.349529
3,x0_Queenstown,-0.078985
5,age,-0.031479
7,fare,0.003456


flattten agar bisa masuk di dataframe

In [25]:
pd.DataFrame({'imp':tree.feature_importances_},index=features).sort_values('imp',ascending=False)

Unnamed: 0,imp
x0_male,0.318959
fare,0.263068
age,0.242429
x1_Third,0.116947
alone,0.016562
parch,0.015927
x0_Southampton,0.012541
x1_Second,0.009159
x0_Queenstown,0.004407
