## Glass Classification using XGBoost

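###### This is the Glass Identification Data Set from UCI. The original data set contains 10 attributes including an id (the CSV used here holds the 9 feature columns and the label, without the id column). The response is the glass type (a discrete variable with 7 possible values).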


In [1]:
# import libraries
import numpy as np
import pandas as pd
from xgboost import XGBClassifier
from sklearn.tree import DecisionTreeRegressor
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix

In [2]:
# preview of the data: load the CSV and print per-column summary statistics
data = pd.read_csv("glass.csv")
summary_stats = data.describe()
print(summary_stats)

               RI          Na          Mg          Al          Si           K  \
count  214.000000  214.000000  214.000000  214.000000  214.000000  214.000000   
mean     1.518365   13.407850    2.684533    1.444907   72.650935    0.497056   
std      0.003037    0.816604    1.442408    0.499270    0.774546    0.652192   
min      1.511150   10.730000    0.000000    0.290000   69.810000    0.000000   
25%      1.516523   12.907500    2.115000    1.190000   72.280000    0.122500   
50%      1.517680   13.300000    3.480000    1.360000   72.790000    0.555000   
75%      1.519157   13.825000    3.600000    1.630000   73.087500    0.610000   
max      1.533930   17.380000    4.490000    3.500000   75.410000    6.210000   

               Ca          Ba          Fe        Type  
count  214.000000  214.000000  214.000000  214.000000  
mean     8.956963    0.175047    0.057009    2.780374  
std      1.423153    0.497219    0.097439    2.103739  
min      5.430000    0.000000    0.000000    1

In [3]:
# show the first five rows to get a feel for the columns and their scales
first_rows = data.head(n=5)
print(first_rows)

        RI     Na    Mg    Al     Si     K    Ca   Ba   Fe  Type
0  1.52101  13.64  4.49  1.10  71.78  0.06  8.75  0.0  0.0     1
1  1.51761  13.89  3.60  1.36  72.73  0.48  7.83  0.0  0.0     1
2  1.51618  13.53  3.55  1.54  72.99  0.39  7.78  0.0  0.0     1
3  1.51766  13.21  3.69  1.29  72.61  0.57  8.22  0.0  0.0     1
4  1.51742  13.27  3.62  1.24  73.08  0.55  8.07  0.0  0.0     1


#### There are no null values, so we don't have to worry about that

# DataFrame.info() writes its report to stdout itself and returns None,
# so it is called directly; wrapping it in print() emits a stray "None".
data.info()

In [4]:
# The label column "Type" is described as having 7 classes, but the counts
# below show that class 4 has no data points in this file.
type_counts = data["Type"].value_counts()
print(type_counts)

2    76
1    70
7    29
3    17
5    13
6     9
Name: Type, dtype: int64


#### Write a function to load the dataset from the CSV file and split it into train and test sets (X_train, Y_train, X_test, Y_test)

In [6]:
# load dataset
def load_dataset(csv_name = "glass.csv", train_test_split = 2/3, is_normalization=True, random_state=None):
    """Load the glass CSV, shuffle the rows and split them into train and test arrays.

    Parameters
    ----------
    csv_name : str
        Path of the CSV file. It must contain the feature columns
        RI, Na, Mg, Al, Si, K, Ca, Ba, Fe and the label column Type.
    train_test_split : float
        Fraction of the (shuffled) rows that go into the training set;
        the remaining rows form the test set.
    is_normalization : bool
        If True, standardise every feature column using the mean and standard
        deviation of that column computed on the training rows only.
    random_state : int or None
        Seed for the shuffle. None (the default) uses NumPy's global random
        state, so the result then depends on any earlier np.random.seed call.

    Returns
    -------
    X_train, Y_train, X_test, Y_test : numpy.ndarray
        Feature matrices of shape (n, 9) and 1-D label arrays.
    stats : tuple
        (x_mu, y_mu, x_sigma, y_sigma) when is_normalization is True, where
        x_mu and x_sigma are per-feature arrays of length 9 and y_mu, y_sigma
        are scalars; an empty tuple otherwise.
    """
    feature_columns = ["RI", "Na", "Mg", "Al", "Si", "K", "Ca", "Ba", "Fe"]

    dataset = pd.read_csv(csv_name)

    # Shuffle the rows; a dedicated generator is used only when a seed is given
    # so that the default call keeps drawing from the global NumPy state.
    if random_state is None:
        order = np.random.permutation(len(dataset))
    else:
        order = np.random.RandomState(random_state).permutation(len(dataset))
    dataset = dataset.iloc[order].reset_index(drop=True)

    X = dataset[feature_columns].values
    Y = dataset["Type"].values.reshape(-1,)

    num_train = int(len(X)*train_test_split)
    X_train, Y_train = X[:num_train], Y[:num_train]
    X_test, Y_test = X[num_train:], Y[num_train:]

    if is_normalization:
        # axis=0 -> one mean / std per feature column. Without the axis a single
        # global mean / std would be applied to features on very different scales.
        x_mu, x_sigma = np.mean(X_train, axis=0), np.std(X_train, axis=0)
        # A constant column has zero std; divide by 1 instead to avoid NaN / inf.
        x_sigma = np.where(x_sigma == 0, 1.0, x_sigma)
        y_mu, y_sigma = np.mean(Y_train), np.std(Y_train)
        # Test rows are scaled with the training statistics to avoid leakage.
        X_train = (X_train - x_mu) / x_sigma
        X_test = (X_test - x_mu) / x_sigma
        stats = (x_mu, y_mu, x_sigma, y_sigma)
    else:
        stats = ()

    return X_train, Y_train, X_test, Y_test, stats

# split the dataset in train and test
# load_dataset shuffles the rows with NumPy's global random state, so seed it
# first; otherwise the split (and every metric computed from it) changes on
# each run of the notebook.
np.random.seed(42)
train_test_split = 2/3
X_train, Y_train, X_test, Y_test, stats = load_dataset(csv_name = "glass.csv", train_test_split = train_test_split, is_normalization=False)

print("X_train shape:{}\nY_train shape:{}\nX_test shape:{}\nY_test shape:{}".format(X_train.shape, Y_train.shape, X_test.shape, Y_test.shape))

X_train shape:(142, 9)
Y_train shape:(142,)
X_test shape:(72, 9)
Y_test shape:(72,)


In [7]:
# Train an XGBoost classifier with its default hyperparameters.
# fit() hands back the fitted estimator, so construction and training are chained.
clf = XGBClassifier().fit(X_train, Y_train)

### Print the feature importance, Confusion matrix, Classification report.
#### A confusion matrix is a table that is often used to describe the performance of a classifier on test data for which the true labels are known.

In [8]:
# Per-feature importance scores learned by the fitted model
# (same order as the feature columns passed to fit).
feature_scores = clf.feature_importances_
print("features importance:", feature_scores)

# Predict the held-out rows once and reuse the predictions for both reports.
Y_pred = clf.predict(X_test)

print("=== Confusion Matrix ===")
test_confusion = confusion_matrix(Y_test, Y_pred)
print(test_confusion)
print('\n')

print("=== Classification Report ===")
test_report = classification_report(Y_test, Y_pred)
print(test_report)

features importance: [0.18766999 0.14188577 0.16999093 0.11332729 0.07298277 0.09972802
 0.13236627 0.03762466 0.0444243 ]
=== Confusion Matrix ===
[[22  2  0  0  0  0]
 [ 4 23  1  0  0  0]
 [ 2  1  0  0  0  0]
 [ 0  0  0  3  0  1]
 [ 0  0  0  0  3  0]
 [ 0  0  0  0  0 10]]


=== Classification Report ===
              precision    recall  f1-score   support

           1       0.79      0.92      0.85        24
           2       0.88      0.82      0.85        28
           3       0.00      0.00      0.00         3
           5       1.00      0.75      0.86         4
           6       1.00      1.00      1.00         3
           7       0.91      1.00      0.95        10

   micro avg       0.85      0.85      0.85        72
   macro avg       0.76      0.75      0.75        72
weighted avg       0.83      0.85      0.83        72



In [9]:
# calculate Accuracy
# Vectorised comparison instead of a per-sample Python loop: 1.0 where the
# prediction equals the true label, 0.0 otherwise. The result is kept as an
# (n, 1) column so `accuracy` has the same shape as before.
accuracy = (np.asarray(Y_pred) == np.asarray(Y_test)).astype(float).reshape(-1, 1)
# Share of correct predictions, expressed as a percentage.
Accuracy = (np.sum(accuracy)/len(accuracy)*100)
print("Accuracy:" ,Accuracy)

Accuracy: 84.72222222222221


#### Note: We can increase the accuracy of the classifier with the help of hyperparameter tuning.