# Introduction to this Python notebook

In [None]:
"""
What? Classification with scikit-learn and XGBoost

[1] scikit-learn is known for the wide range of methods it offers
[2] XGBoost is known for its speed
"""

# Import python modules

In [1]:
from pandas import read_csv
from xgboost import XGBClassifier
from IPython.display import Markdown, display
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Checking module versions

In [1]:
def printPythonModuleVersion():
    """Print the installed version of each package listed below.

    One line is written to stdout per package, in the form
    ``<module name>: <version>``. A package that cannot be imported is
    reported as ``<module name>: not installed`` instead of aborting the
    whole listing, and a package that exposes no ``__version__``
    attribute is reported as ``unknown``.

    Returns:
        None. The function only prints.
    """
    # Imported locally so the helper stays self-contained in its own cell.
    import importlib

    # Importable module names, in the order they are reported.
    module_names = (
        'scipy',
        'numpy',
        'matplotlib',
        'pandas',
        'statsmodels',
        'sklearn',
        'xgboost',
    )

    for module_name in module_names:
        try:
            module = importlib.import_module(module_name)
        except ImportError:
            # A missing package should not hide the versions of the others.
            print('%s: not installed' % module_name)
            continue
        print('%s: %s' % (module_name, getattr(module, '__version__', 'unknown')))


printPythonModuleVersion()

scipy: 1.4.1
numpy: 1.18.5
matplotlib: 3.3.2
pandas: 1.1.4
statsmodels: 0.12.1
sklearn: 0.23.2
xgboostn: 1.2.1


# Loading the data set

In [5]:
# Column labels handed to read_csv; the ninth one ('class') is the target.
names = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class']
filename = "../DATASETS/pima-indians-diabetes.csv"

# Labels are supplied explicitly via `names` (presumably the CSV has no
# header row of its own -- confirm against the file).
dataframe = read_csv(filename, names=names)

# Split the raw value matrix: columns 0-7 are the inputs X, column 8 is
# the label vector Y.
array = dataframe.values
X, Y = array[:, :8], array[:, 8]

print("Data shape: ", dataframe.shape)

Data shape:  (768, 9)


# Visualisation of the data

In [8]:
# Bare last expression: the notebook renders the loaded DataFrame as the
# cell's output (the recorded output shows the middle rows elided).
dataframe

Unnamed: 0,preg,plas,pres,skin,test,mass,pedi,age,class
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1
...,...,...,...,...,...,...,...,...,...
763,10,101,76,48,180,32.9,0.171,63,0
764,2,122,70,27,0,36.8,0.340,27,0
765,5,121,72,23,112,26.2,0.245,30,0
766,1,126,60,0,0,30.1,0.349,47,1


# Split the data

In [6]:
# Hold out 33% of the rows for evaluation. The fixed random_state keeps
# the split identical from one run to the next.
X_train, X_test, y_train, y_test = train_test_split(
    X, Y, test_size=0.33, random_state=7
)
print("Size of training data: ", X_train.shape)
print("Size of validation data: ", X_test.shape)

Size of training data:  (514, 8)
Size of valisation data:  (254, 8)


# Classification accuracy via LogisticRegression from SKLEARN

In [7]:
# Logistic regression with the iteration cap set to 250.
modelSklearn = LogisticRegression(max_iter=250)

# Fit the model on the training split.
modelSklearn.fit(X_train, y_train)

# Predict the class of every held-out row.
predictions = modelSklearn.predict(X_test)

# Compare the predictions with the true held-out labels.
accuracy = accuracy_score(y_test, predictions)
print("Accuracy: %.2f%%" % (accuracy * 100.0))
# Label spelled out in the same form as the classification-report label.
print("\nConfusion matrix:\n", confusion_matrix(y_test, predictions))
print("\nClassification report:\n", classification_report(y_test, predictions))

Accuracy: 78.74%

Confusio matrix 
' [[142  20]
 [ 34  58]]

Classification report:
               precision    recall  f1-score   support

         0.0       0.81      0.88      0.84       162
         1.0       0.74      0.63      0.68        92

    accuracy                           0.79       254
   macro avg       0.78      0.75      0.76       254
weighted avg       0.78      0.79      0.78       254



# Classification accuracy via XGBOOST

In [9]:
# Gradient-boosted classifier with the library's default hyper-parameters.
modelXgboost = XGBClassifier()

# Fit the model on the training split.
modelXgboost.fit(X_train, y_train)

# Predict the class of every held-out row.
predictions = modelXgboost.predict(X_test)

# Compare the predictions with the true held-out labels.
accuracy = accuracy_score(y_test, predictions)
print("Accuracy: %.2f%%" % (accuracy * 100.0))
# Label spelled out in the same form as the classification-report label.
print("\nConfusion matrix:\n", confusion_matrix(y_test, predictions))
print("\nClassification report:\n", classification_report(y_test, predictions))

Accuracy: 74.02%

Confusio matrix 
' [[131  31]
 [ 35  57]]

Classification report:
               precision    recall  f1-score   support

         0.0       0.79      0.81      0.80       162
         1.0       0.65      0.62      0.63        92

    accuracy                           0.74       254
   macro avg       0.72      0.71      0.72       254
weighted avg       0.74      0.74      0.74       254

