In [None]:
import pandas as pd

In [None]:
# Load the four semicolon-delimited CSV files, in the same order as before.
bank, bank_add_full, bank_add, bank_full = (
    pd.read_csv(csv_path, sep=';')
    for csv_path in (
        'bank.csv',
        'bank-additional-full.csv',
        'bank-additional.csv',
        'bank-full.csv',
    )
)

# 1. Logistic Regression

In [None]:
# Step 1: Import Packages, Functions, and Classes
import matplotlib.pyplot as plt
import numpy as np

from sklearn.datasets import load_digits
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [None]:
# Print pandas' concise summary of the bank_add_full frame.
bank_add_full.info()

In [None]:
# List the distinct values found in the 'education' column.
bank_add_full['education'].unique()

In [None]:
# Collapse the three 'basic.*' education levels into a single 'Basic' label;
# every other value is left untouched.
basic_levels = ['basic.9y', 'basic.6y', 'basic.4y']
is_basic = bank_add_full['education'].isin(basic_levels)
bank_add_full['education'] = np.where(is_basic, 'Basic', bank_add_full['education'])

In [None]:
# Data exploration: count the rows for each value of the target column 'y'.
bank_add_full['y'].value_counts()

In [None]:
# Share of each outcome among the rows labelled 'no' or 'yes'.
target = bank_add_full['y']
count_no_sub = int((target == 'no').sum())
count_sub = int((target == 'yes').sum())
labelled_total = count_no_sub + count_sub

pct_of_no_sub = count_no_sub / labelled_total
pct_of_sub = count_sub / labelled_total

print("percentage of no subscription is", pct_of_no_sub*100)
print("percentage of subscription", pct_of_sub*100)

Dapat terlihat bahwa rasio no-subscription dan subscription tidak seimbang, yaitu sekitar 89:11.

In [None]:
# Create dummy (one-hot) variables for the categorical predictors.
cat_vars = ['job', 'marital', 'education', 'default', 'housing', 'loan',
            'contact', 'month', 'day_of_week', 'poutcome']

# Encode all categorical columns in one call. Each dummy column is named
# '<column>_<value>', the same names the former per-column loop produced.
cat_dummies = pd.get_dummies(bank_add_full[cat_vars], columns=cat_vars)

# Drop dummies left over from a previous run of this cell before joining, so
# that re-running the cell does not raise on overlapping column names.
bank_add_full = (
    bank_add_full
    .drop(columns=cat_dummies.columns, errors='ignore')
    .join(cat_dummies)
)

# Columns used for modelling: everything except the raw categorical columns.
data_vars = bank_add_full.columns.values.tolist()
to_keep = [col for col in data_vars if col not in cat_vars]

In [None]:
# Keep only the modelling columns, then show their names.
data_final = bank_add_full.loc[:, to_keep]
data_final.columns.values

In [None]:
# Separate the predictors from the target; y is kept as a one-column DataFrame.
X = data_final.drop(columns='y')
y = data_final[['y']]

# Hold out 30% of the rows for testing, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=0
)

# Fit the scaler on the training predictors only, then standardise them.
# X_test is not transformed in this cell.
scaler = StandardScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)

In [None]:
# Create and train the logistic regression model.
# `multi_class` is omitted: it is deprecated in recent scikit-learn releases, and
# with the liblinear solver the default already resolves to one-vs-rest.
model = LogisticRegression(solver='liblinear', C=0.05, random_state=0)
# y_train is a one-column DataFrame; flatten it to the 1-D shape fit() expects
# so scikit-learn does not emit a DataConversionWarning.
model.fit(X_train, np.ravel(y_train))

In [None]:
# Step 4: evaluate the model on the held-out test split.
# The scaled test data gets its own name so that re-running this cell does not
# standardise X_test a second time.
X_test_scaled = scaler.transform(X_test)
y_pred = model.predict(X_test_scaled)

# Mean accuracy on the test split. Scoring X_train/y_train here would report
# the training accuracy instead of the model's performance on unseen rows.
model.score(X_test_scaled, y_test)

Dari hasil observasi dapat diketahui bahwa akurasi model dari data yang diklasifikasikan benar sebesar 91%

### Confusion Matrix

In [None]:
# Confusion matrix of the true test labels against the predicted labels.
from sklearn.metrics import classification_report, confusion_matrix
conf_matrix = confusion_matrix(y_test, y_pred)
print(conf_matrix)

In [None]:
# Get a more comprehensive report with classification_report().
# Report on the held-out test labels and their predictions. The model is
# trained on standardised features, so predicting on the raw, unscaled X
# (which also contains the training rows) gives a misleading report.
print(classification_report(y_test, y_pred))

In [None]:
# Improve the model: same solver with C=10.0 instead of C=0.05 (C is the
# inverse of the regularisation strength, so this regularises less).
# `multi_class` is omitted: it is deprecated in recent scikit-learn releases, and
# with the liblinear solver the default already resolves to one-vs-rest.
model = LogisticRegression(solver='liblinear', C=10.0, random_state=0)
# NOTE(review): this refits on the full, unscaled X and y, so any score later
# computed on X/y is a training-set score — confirm this is intended.
# np.ravel flattens the one-column DataFrame target to the 1-D shape fit() expects.
model.fit(X, np.ravel(y))

print("intercept :",model.intercept_)
print("coefisien :",model.coef_)

In [None]:
# Print the predicted class probabilities for every row of X.
print(model.predict_proba(X))

In [None]:
# Print the predicted class label for every row of X.
print("y pred :",model.predict(X))

In [None]:
# Print the mean accuracy of the model on X and y.
# NOTE(review): X and y appear to be the same data the model was fitted on, so
# this would be a training accuracy rather than a held-out one — confirm.
print("nilai akurasi:",model.score(X,y))

**Diperoleh nilai akurasi model hasil pengamatan sebesar 91%**

# 2. K-Nearest Neighbors

In [None]:
# Print pandas' concise summary of the bank frame.
bank.info()

In [None]:
# Replace every object-typed column of `bank` with integer category codes.
# `bank` itself is modified, and `knn_numeric` is another name for that same
# frame rather than a copy.
knn = bank.select_dtypes(include=['object']).copy()
nRows, nCols = knn.shape
for column_name in knn.columns:
    knn[column_name] = knn[column_name].astype("category").cat.codes
    bank[column_name] = knn[column_name]
knn_numeric = bank
knn_numeric.head()

In [None]:
# Predictors are every column except 'y'; the target stays a one-column DataFrame.
x = knn_numeric.drop(columns='y')
y = knn_numeric[['y']]

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# k-nearest-neighbours classifier with k = 5.
model = KNeighborsClassifier(n_neighbors=5)

# Split the data: 20% of the rows are held out for testing.
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.2, random_state = 123)

# Scale the data. fit_transform() already fits the scaler on the training
# split, so a separate scaler.fit() call before it is redundant.
scaler = StandardScaler()
x_train = scaler.fit_transform(x_train)
x_test = scaler.transform(x_test)

# y_train is a one-column DataFrame; flatten it to the 1-D shape fit() expects
# so scikit-learn does not emit a DataConversionWarning.
model.fit(x_train, np.ravel(y_train))
y_pred = model.predict(x_test)

In [None]:
# Show the predicted class probabilities for the test rows.
model.predict_proba(x_test)

### Confusion Matrix

In [None]:
from sklearn.metrics import classification_report, confusion_matrix

# Confusion matrix of the true test labels against the KNN predictions.
knn_conf_matrix = confusion_matrix(y_test, y_pred)
print(knn_conf_matrix)

**Artinya :**
- Terdapat 788 true negatif pada 788 observasi pertama
- Terdapat 76 false negatif
- Terdapat 20 false positif pada pengamatan
- Terdapat 21 true positif pada 21 observasi terakhir

In [None]:
# Evaluate the model: accuracy of the predictions against the test labels.
from sklearn import metrics

accuracy = metrics.accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

**Diperoleh nilai akurasi model hasil pengamatan sebesar 85.55%**

# 3. Support Vector Machine

In [None]:
# Predictor columns used for the SVM.
feature_cols = [
    'age', 'job', 'marital', 'education', 'default', 'balance', 'housing', 'loan',
    'contact', 'day', 'month', 'duration', 'campaign', 'pdays', 'previous', 'poutcome',
]

X = bank.loc[:, feature_cols]
y = bank[['y']]  # target kept as a one-column DataFrame

# Split the data: 30% of the rows are held out for testing.
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=109
)

In [None]:
# Generate the model.
from sklearn import svm

# Create an SVM classifier with a linear kernel.
clf = svm.SVC(kernel='linear')

# Train the model. y_train is a one-column DataFrame; flatten it to the 1-D
# shape fit() expects so scikit-learn does not emit a DataConversionWarning.
clf.fit(X_train, np.ravel(y_train))

# Predict the labels of the test split.
y_pred = clf.predict(X_test)

### Confusion Matrix

In [None]:
from sklearn.metrics import classification_report, confusion_matrix

# Confusion matrix of the true test labels against the SVM predictions.
svm_conf_matrix = confusion_matrix(y_test, y_pred)
print(svm_conf_matrix)

**Artinya :**
- Terdapat 1093 true negatif
- Terdapat 77 false negatif
- Terdapat 119 false positif pada pengamatan
- Terdapat 68 true positif

In [None]:
# Evaluate the model: accuracy of the SVM predictions against the test labels.
from sklearn import metrics

svm_accuracy = metrics.accuracy_score(y_test, y_pred)
print("Accuracy:", svm_accuracy)

**Diperoleh nilai akurasi model hasil pengamatan sebesar 87.9%**

# 4. Decision Tree

In [None]:
# Predictor columns used for the decision tree.
feature_cols = [
    'age', 'job', 'marital', 'education', 'default', 'balance', 'housing', 'loan',
    'contact', 'day', 'month', 'duration', 'campaign', 'pdays', 'previous', 'poutcome',
]
X = bank.loc[:, feature_cols]
y = bank[['y']]  # target kept as a one-column DataFrame

# Split the data: 30% of the rows are held out for testing.
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=123
)

In [None]:
# Train the decision tree and predict the test split.
from sklearn.tree import DecisionTreeClassifier
# Fix the seed: scikit-learn permutes the features at every split, so without
# random_state the fitted tree (and the reported metrics) can change per run.
classifier = DecisionTreeClassifier(random_state=123)
classifier.fit(X_train, y_train)

y_pred = classifier.predict(X_test)
y_pred

### Confusion Matrix

In [None]:
# Evaluate the model: confusion matrix of true test labels against predictions.
from sklearn.metrics import classification_report, confusion_matrix
tree_conf_matrix = confusion_matrix(y_test, y_pred)
print(tree_conf_matrix)

**Artinya :**
- Terdapat 1093 true negatif
- Terdapat 77 false negatif
- Terdapat 119 false positif pada pengamatan
- Terdapat 68 true positif

In [None]:
# Print the classification report for the test labels and their predictions.
print(classification_report(y_test, y_pred))

In [None]:
from sklearn import metrics

# Model accuracy: predictions compared with the true test labels.
tree_accuracy = metrics.accuracy_score(y_test, y_pred)
print("Accuracy:", tree_accuracy)

**Dapat diketahui bahwa dari total 1357 data observasi terdapat 158 misclassified dengan akurasi model sebesar 88%**

In [None]:
# Visualise the fitted decision tree.
import sklearn.tree as tree
import pydotplus
# StringIO comes from the standard library; the third-party `six` shim is not
# needed on Python 3.
from io import StringIO
from IPython.display import Image

# Export the tree as Graphviz DOT text into an in-memory buffer.
dot_data = StringIO()
tree.export_graphviz(classifier,
                     out_file=dot_data,
                     class_names=['0', '1'],
                     feature_names=feature_cols,
                     filled=True,
                     rounded=True,
                     special_characters=True)

# Render the DOT text to a PNG and display it inline.
graph = pydotplus.graph_from_dot_data(dot_data.getvalue())
Image(graph.create_png())

# 5. Random Forest

In [None]:
# Predictor columns used for the random forest.
feature_cols = [
    'age', 'job', 'marital', 'education', 'default', 'balance', 'housing', 'loan',
    'contact', 'day', 'month', 'duration', 'campaign', 'pdays', 'previous', 'poutcome',
]
X = bank.loc[:, feature_cols]
y = bank[['y']]  # target kept as a one-column DataFrame

# Split the data: 30% of the rows are held out for testing.
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=123
)

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Create a random forest classifier with 100 trees. The seed is fixed because
# the forest is built with randomness, so without random_state the reported
# metrics can change from run to run.
rf=RandomForestClassifier(n_estimators=100, random_state=123)

# Train the model. y_train is a one-column DataFrame; flatten it to the 1-D
# shape fit() expects so scikit-learn does not emit a DataConversionWarning.
rf.fit(X_train, np.ravel(y_train))

# Predict the labels of the test split.
y_pred = rf.predict(X_test)

### Confusion Matrix

In [None]:
# Confusion matrix of the true test labels against the random forest predictions.
from sklearn.metrics import classification_report, confusion_matrix
forest_conf_matrix = confusion_matrix(y_test, y_pred)
print(forest_conf_matrix)

**Artinya :**
- Terdapat 1093 true negatif
- Terdapat 77 false negatif
- Terdapat 119 false positif pada pengamatan
- Terdapat 68 true positif

In [None]:
from sklearn import metrics

# Model accuracy: predictions compared with the true test labels.
forest_accuracy = metrics.accuracy_score(y_test, y_pred)
print("Accuracy:", forest_accuracy)

**Diperoleh nilai akurasi model hasil pengamatan sebesar 90.7%**

# 6. Naive Bayes

In [None]:
# Predictors are every column except 'y'; the target stays a one-column DataFrame.
x = bank.drop(columns='y')
y = bank[['y']]

# Split the data: 30% of the rows are held out for testing.
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.3, random_state=123
)

In [None]:
# Import the Gaussian Naive Bayes model.
from sklearn.naive_bayes import GaussianNB

# Create the Gaussian Naive Bayes classifier.
model = GaussianNB()

# Train the model. y_train is a one-column DataFrame; flatten it to the 1-D
# shape fit() expects so scikit-learn does not emit a DataConversionWarning.
model.fit(x_train, np.ravel(y_train))

# Predict the labels of the test split and print them.
y_pred = model.predict(x_test)
print(y_pred)

### Confusion Matrix

In [None]:
# Confusion matrix of the true test labels against the Naive Bayes predictions.
from sklearn.metrics import classification_report, confusion_matrix
nb_conf_matrix = confusion_matrix(y_test, y_pred)
print(nb_conf_matrix)

**Artinya :**
- Terdapat 1093 true negatif
- Terdapat 77 false negatif
- Terdapat 119 false positif pada pengamatan
- Terdapat 68 true positif

In [None]:
from sklearn import metrics

# Model accuracy: predictions compared with the true test labels.
nb_accuracy = metrics.accuracy_score(y_test, y_pred)
print("Accuracy:", nb_accuracy)

Dapat diketahui bahwa akurasi hasil observasi data sebesar 85.55%

# 7. Algorithm Analysis

Setelah dilakukan analisis dengan berbagai metode pengklasifikasian, diperoleh nilai akurasi dari masing-masing model sebagai berikut:

1. Logistic Regression = 91%
2. K-Nearest Neighbors = 85.55%
3. Support Vector Machine = 87.9%
4. Decision Tree = 88%
5. Random Forest = 90.7%
6. Naive Bayes = 85.55%

Berdasarkan besaran nilai akurasi pada tiap-tiap model, didapatkan model terbaik yaitu model Logistic Regression dengan nilai akurasi sebesar 91%