In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score
from matplotlib.colors import ListedColormap
import matplotlib.pyplot as plt

In [2]:
data = pd.read_csv("data/Social_Network_Ads.csv")

In [4]:
data.head()

Unnamed: 0,User ID,Gender,Age,EstimatedSalary,Purchased
0,15624510,Male,19,19000,0
1,15810944,Male,35,20000,0
2,15668575,Female,26,43000,0
3,15603246,Female,27,57000,0
4,15804002,Male,19,76000,0


In [5]:
data.describe()

Unnamed: 0,User ID,Age,EstimatedSalary,Purchased
count,400.0,400.0,400.0,400.0
mean,15691540.0,37.655,69742.5,0.3575
std,71658.32,10.482877,34096.960282,0.479864
min,15566690.0,18.0,15000.0,0.0
25%,15626760.0,29.75,43000.0,0.0
50%,15694340.0,37.0,70000.0,0.0
75%,15750360.0,46.0,88000.0,1.0
max,15815240.0,60.0,150000.0,1.0


In [6]:
X = data.iloc[:, 2:4].values
y = data.iloc[:, -1].values

In [7]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0) 

In [8]:
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

In [15]:
lr = LogisticRegression()
lr.fit(X_train, y_train)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [22]:
lr_pred = lr.predict(X_test)
lr_pred

array([0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1,
       0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,
       1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1,
       0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 1], dtype=int64)

In [34]:
cf = confusion_matrix(y_test, lr_pred)
TN, FP, FN, TP = cf.ravel()
test_score = (TP + TN ) / (TP + TN + FN + FP) *100
print("Logistics Regression CF", test_score)

Logistics Regression CF 91.25


In [25]:
tree = DecisionTreeClassifier(criterion = 'entropy')
tree.fit(X_train, y_train)
tree_pred = tree.predict(X_test)

DecisionTreeClassifier(class_weight=None, criterion='entropy', max_depth=None,
                       max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort=False,
                       random_state=None, splitter='best')

In [33]:
cf = confusion_matrix(y_test, tree_pred)
TN, FP, FN, TP = cf.ravel()
test_score = (TP + TN ) / (TP + TN + FN + FP) *100
print("DEcision Tree CF", test_score)

DEcision Tree CF 88.75


In [30]:
# Kneigbours
knn = KNeighborsClassifier(n_neighbors = 5, metric = 'minkowski', p = 2)
knn.fit(X_train, y_train)
knn_pred = knn.predict(X_test)
knn

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=5, p=2,
                     weights='uniform')

In [32]:
cf = confusion_matrix(y_test, knn_pred)
TN, FP, FN, TP = cf.ravel()
test_score = (TP + TN ) / (TP + TN + FN + FP) *100
print("Kneigbours CF", test_score)

Kneigbours CF 95.0


In [37]:
#SVC
svc_lin = SVC(kernel = 'linear', random_state = 0)
svc_lin.fit(X_train, y_train)
svc_pred = svc_lin.predict(X_test)
svc_lin

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
    kernel='linear', max_iter=-1, probability=False, random_state=0,
    shrinking=True, tol=0.001, verbose=False)

In [38]:
cf = confusion_matrix(y_test, svc_pred)
TN, FP, FN, TP = cf.ravel()
test_score = (TP + TN ) / (TP + TN + FN + FP) *100
print("SVC CF", test_score)

SVC CF 91.25


In [39]:
#guissian 
naiv = GaussianNB()
naiv.fit(X_train, y_train)
naiv_pred = naiv.predict(X_test)
naiv

GaussianNB(priors=None, var_smoothing=1e-09)

In [40]:
cf = confusion_matrix(y_test, naiv_pred)
TN, FP, FN, TP = cf.ravel()
test_score = (TP + TN ) / (TP + TN + FN + FP) *100
print("Naiv CF", test_score)

Naiv CF 91.25


In [41]:
 #Random forest
randFC = RandomForestClassifier(n_estimators = 10, criterion = 'entropy', random_state = 0)
randFC.fit(X_train, y_train)
rand_pred = randFC.predict(X_test)
randFC

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='entropy',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=10,
                       n_jobs=None, oob_score=False, random_state=0, verbose=0,
                       warm_start=False)

In [43]:
cf = confusion_matrix(y_test, rand_pred)
TN, FP, FN, TP = cf.ravel()
test_score = (TP + TN ) / (TP + TN + FN + FP) *100
print("randfc CF", test_score)

randfc CF 91.25


In [45]:
print('f1 score: {}'.format(f1_score(y_test, knn_pred)))

f1 score: 0.9130434782608695
