# Machine learning - clasifiaction

## Data loading and processing

In [1]:
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.dummy import DummyClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
import pandas_profiling
import numpy as np

In [2]:
column_names = ['A1', 'A2', 'A3', 'A4', 'A5', 'A6', 'A7', 'A8', 'A9', 'A10', 'A11', 'A12', 'A13', 'A14', 'A15', 'A16']
data = pd.read_csv('https://archive.ics.uci.edu/ml/machine-learning-databases/credit-screening/crx.data', names = column_names)

In [3]:
pandas_profiling.ProfileReport(data);

In [4]:
for i in column_names:
    data[i]=data[i].replace('?', -1)

In [5]:
X = data.drop('A16', axis = 1)
y = data['A16']

y = y.factorize()[0]
X['A1'] = X['A1'].factorize()[0]
X['A4'] = X['A4'].factorize()[0]
X['A5'] = X['A5'].factorize()[0]
X['A6'] = X['A6'].factorize()[0]
X['A7'] = X['A7'].factorize()[0]
X['A9'] = X['A9'].factorize()[0]
X['A10'] = X['A10'].factorize()[0]
X['A12'] = X['A12'].factorize()[0]
X['A13'] = X['A13'].factorize()[0]
X['A14'] = X['A14'].factorize()[0]

X.head()

Unnamed: 0,A1,A2,A3,A4,A5,A6,A7,A8,A9,A10,A11,A12,A13,A14,A15
0,0,30.83,0.0,0,0,0,0,1.25,0,0,1,0,0,0,0
1,1,58.67,4.46,0,0,1,1,3.04,0,0,6,0,0,1,560
2,1,24.5,0.5,0,0,1,1,1.5,0,1,0,0,0,2,824
3,0,27.83,1.54,0,0,0,0,3.75,0,0,5,1,0,3,3
4,0,20.17,5.625,0,0,0,0,1.71,0,1,0,0,1,4,0


## Traing of Decision tree using full data

In [6]:
tree = DecisionTreeClassifier()
tree.fit(X=X, y=y);

In [7]:
tree.score(X=X, y=y)

1.0

## Training of Decision tree using learning and test sets

In [8]:
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.2, random_state=1)

In [9]:
tree = DecisionTreeClassifier()
tree.fit(X=X_train, y=y_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')

In [10]:
tree.score(X=X_test, y=y_test)

0.8043478260869565

## Selection of the best hiperparameters for tree

In [11]:
%%time
scores = pd.DataFrame()
the_best = 0

for max_depth in range (2,7):
    print(max_depth)
    for min_samples_split in range(2,10):
        for min_samples_leaf in range(1,10):
            for criterion in ['entropy', 'gini']:
                tree = DecisionTreeClassifier(max_depth = max_depth, min_samples_split=min_samples_split,
                                               min_samples_leaf=min_samples_leaf, criterion = criterion)
                tree.fit(X=X_train, y=y_train)
                score = tree.score(X=X_test, y=y_test)
                scores = scores.append({'wynik': score, 
                                        'max depth': max_depth, 
                                        'min_samples_split': min_samples_split,
                                       'min_samples_leaf': min_samples_leaf, 
                                        'criterion': criterion}, ignore_index=True)
                if the_best < score:
                    the_best = score
                    hiperparameters = {'wynik': score, 
                                        'max depth': max_depth, 
                                        'min_samples_split': min_samples_split,
                                       'min_samples_leaf': min_samples_leaf, 
                                        'criterion': criterion}
print(the_best)
print(hiperparameters)


2
3
4
5
6
0.8623188405797102
{'wynik': 0.8623188405797102, 'max depth': 6, 'min_samples_split': 2, 'min_samples_leaf': 5, 'criterion': 'gini'}
Wall time: 6.73 s


## Using GridSearchCV instead of iterating all possible parameters

In [12]:
%%time
estimator = DecisionTreeClassifier(random_state = 1)
param_grid = {'max_depth': range(1,10),
             'min_samples_split': range(2,10),
             'min_samples_leaf': range(1,10),
             'criterion': ['entropy', 'gini']}

tree = GridSearchCV(estimator = estimator, param_grid = param_grid, cv = 5)

tree.fit(X=X, y=y)
score = tree.score(X=X, y=y)

Wall time: 1min 4s




In [13]:
score

0.855072463768116

In [14]:
tree.best_params_

{'criterion': 'entropy',
 'max_depth': 2,
 'min_samples_leaf': 1,
 'min_samples_split': 2}

## Evaluation of stability and comapring with DummyClassifier

In [15]:
scores = pd.DataFrame()

for random_state in range(0,5):
    X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.2, random_state=random_state)
    drzewo = DecisionTreeClassifier(max_depth=6, min_samples_split=3, min_samples_leaf=5, criterion='gini', random_state=1)
    tree.fit(X=X_train, y=y_train)
    scores_tree = tree.score(X=X_test, y=y_test)
    
    dummy = DummyClassifier(strategy='stratified', random_state=1)
    dummy.fit(X=X_train, y=y_train)
    scores_dummy = dummy.score(X=X_test, y=y_test)
    
    scores = scores.append({'random_state': random_state, 'scores_tree': scores_tree, 'scores_dummy': scores_dummy}, 
                           ignore_index=True);



In [16]:
scores

Unnamed: 0,random_state,scores_dummy,scores_tree
0,0.0,0.485507,0.862319
1,1.0,0.442029,0.818841
2,2.0,0.5,0.855072
3,3.0,0.485507,0.884058
4,4.0,0.543478,0.782609


In [17]:
pandas_profiling.ProfileReport(scores);

## Random Forest training

In [18]:
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.2, random_state=1)

a = int(np.round((np.sqrt(len(X.columns)))))

forest = RandomForestClassifier(max_depth=6, min_samples_split=3, min_samples_leaf=5, criterion='gini', random_state=1,
                            bootstrap=True, max_features=a, n_estimators=100)

forest.fit(X=X_train, y=y_train)
score = forest.score(X=X_test, y=y_test)

score

0.8840579710144928