In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import random
%matplotlib inline
from sklearn.linear_model import LogisticRegression

In [2]:
glass = pd.read_csv('dati/glass.data.csv')

In [3]:
glass.columns

Index(['Id number', 'RI', 'Na', 'Mg', 'Al', 'Si', 'K', 'Ca', 'Ba', 'Fe', 'Type of glass'], dtype='object')

In [4]:
glass['Type of glass'].unique()

array([1, 2, 3, 5, 6, 7])

In [5]:
del glass['Id number']

In [6]:
glass.columns

Index(['RI', 'Na', 'Mg', 'Al', 'Si', 'K', 'Ca', 'Ba', 'Fe', 'Type of glass'], dtype='object')

In [7]:
pd.get_dummies(glass, columns=['Type of glass'])

Unnamed: 0,RI,Na,Mg,Al,Si,K,Ca,Ba,Fe,Type of glass_1,Type of glass_2,Type of glass_3,Type of glass_5,Type of glass_6,Type of glass_7
0,1.52101,13.64,4.49,1.10,71.78,0.06,8.75,0.00,0.00,1,0,0,0,0,0
1,1.51761,13.89,3.60,1.36,72.73,0.48,7.83,0.00,0.00,1,0,0,0,0,0
2,1.51618,13.53,3.55,1.54,72.99,0.39,7.78,0.00,0.00,1,0,0,0,0,0
3,1.51766,13.21,3.69,1.29,72.61,0.57,8.22,0.00,0.00,1,0,0,0,0,0
4,1.51742,13.27,3.62,1.24,73.08,0.55,8.07,0.00,0.00,1,0,0,0,0,0
5,1.51596,12.79,3.61,1.62,72.97,0.64,8.07,0.00,0.26,1,0,0,0,0,0
6,1.51743,13.30,3.60,1.14,73.09,0.58,8.17,0.00,0.00,1,0,0,0,0,0
7,1.51756,13.15,3.61,1.05,73.24,0.57,8.24,0.00,0.00,1,0,0,0,0,0
8,1.51918,14.04,3.58,1.37,72.08,0.56,8.30,0.00,0.00,1,0,0,0,0,0
9,1.51755,13.00,3.60,1.36,72.99,0.57,8.40,0.00,0.11,1,0,0,0,0,0


In [8]:
glass = pd.read_csv('dati/glass.data.csv')

Multiclass classification is used when predicting 3 or more categories. Fortunately there are techniques to apply binary classification algorithms to these multiclass cases. The first which we will introduce is the one-versus-all technique. The one-versus-all method is a technique where we choose a single category as our true observation and the rest of the categories as false observations, splitting the problem into multiple binary classification problems. The model will then output a probability of whether the observation falls into the chosen class and continued for each other class. At the end, an observation is assigned to the class with the largest probability. 

In [9]:
np.random.seed(1)
random.seed(1)
glass = glass.loc[np.random.permutation(glass.index)]
# Select 70% of the dataset to be training data
highest_train_row = int(glass.shape[0] * .7)
train = glass.iloc[:highest_train_row,:]

# Select 30% of the dataset to be test data.
test = glass.iloc[highest_train_row:,:]

In [10]:
train.columns

Index(['Id number', 'RI', 'Na', 'Mg', 'Al', 'Si', 'K', 'Ca', 'Ba', 'Fe', 'Type of glass'], dtype='object')

In [11]:
model = {}
for i in glass['Type of glass'].unique():
    model[i] = LogisticRegression()
    X_train  = train.drop(['Type of glass'], axis = 1)
    y_train = train['Type of glass'] == i
    model[i].fit(X_train, y_train)
testing_probs = pd.DataFrame(columns=glass['Type of glass'].unique())
for i in glass['Type of glass'].unique():
    X_test = test.drop(['Type of glass'], axis = 1)
    testing_probs[i] = model[i].predict_proba(X_test)[:,1]

In [12]:
testing_probs

Unnamed: 0,2,7,1,3,5,6
0,0.298677,2.895995e-38,1.000000e+00,0.000034,1.471889e-08,0.000029
1,0.064983,5.645808e-02,5.352299e-32,0.014096,1.903949e-01,0.332705
2,0.378609,1.996346e-17,3.912538e-10,0.025586,1.813728e-04,0.008725
3,0.241023,5.492061e-03,3.024001e-29,0.007366,5.949982e-04,0.000517
4,0.582207,1.075853e-10,1.782167e-19,0.376333,2.607396e-03,0.018215
5,0.584316,1.678589e-17,4.082873e-10,0.000334,5.328741e-01,0.000902
6,0.241892,9.968923e-01,1.634781e-37,0.006022,9.980014e-01,0.016058
7,0.372123,3.717618e-26,9.872709e-01,0.005729,2.342224e-06,0.001225
8,0.304010,4.566481e-37,1.000000e+00,0.000046,3.566658e-08,0.000037
9,0.361885,1.812556e-31,1.000000e+00,0.000252,6.798166e-07,0.000120


In [13]:
predicted_origins = testing_probs.idxmax(axis = 1)

In [14]:
predicted_origins

0     1
1     6
2     2
3     2
4     2
5     2
6     5
7     1
8     1
9     1
10    3
11    2
12    2
13    1
14    1
...
50    1
51    3
52    1
53    2
54    2
55    2
56    2
57    7
58    2
59    2
60    7
61    2
62    2
63    2
64    1
Length: 65, dtype: int64

In [29]:
#matrice di confusione:
new_test = test.reset_index() #la crosstab usa l'indexS per fare gli incroci
pd.crosstab(predicted_origins, new_test['Type of glass'])

Type of glass,1,2,3,5,6,7
row_0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,24,0,0,0,0,0
2,0,25,2,1,0,0
3,0,0,2,0,2,0
5,0,0,0,1,1,1
6,0,0,0,0,1,0
7,0,0,0,0,0,5


# Metrics with Sklearn

Since we now understand how to compute precision, recall, and F-scores, we can use builtin sklearn functions to compute them. sklearn.metrics has many scoring metrics for all different kinds of classifiers. Here we can use precision_score, recall_score, and f1_score. Each of these metrics need two inputs, the true class and the predicted class. There are many other options which we recommend you to read through at http://scikit-learn.org/stable/modules/generated/sklearn.metrics.precision_score.html. Most importantly we must pay attention to the option average. average is a parameter which tells the function how to compute the score. The options are  binary -- Only report results for the class specified by pos_label. This is applicable only if targets (y_{true,pred}) are binary.  micro -- Calculate metrics globally by counting the total true positives, false negatives and false positives.  macro -- Calculate metrics for each label, and find their unweighted mean. This does not take label imbalance into account.  weighted -- Calculate metrics for each label, and find their average, weighted by support (the number of true instances for each label). This alters ‘macro’ to account for label imbalance; it can result in an F-score that is not between precision and recall. * samples -- Calculate metrics for each instance, and find their average (only meaningful for multilabel classification where this differs from accuracy_score).

In [33]:
from sklearn.metrics import precision_score, recall_score, f1_score

# Compute precision score with micro averaging
pr_micro = precision_score(test["Type of glass"], predicted_origins, average='micro')
pr_micro

0.89230769230769236

In [34]:
# Compute recall_score with weighted averaging
recall_w = recall_score(test["Type of glass"], predicted_origins, average='weighted')
recall_w

0.89230769230769236