### Part5: One vs All Logistic Regression for Multi class classification

In [16]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

#### Read the data

In [17]:
df=pd.read_csv("wine-quality/data.csv",delimiter=";")
df.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,9.2,0.25,0.34,1.2,0.026,31.0,93.0,0.9916,2.93,0.37,11.3,7
1,6.6,0.2,0.27,10.9,0.038,29.0,130.0,0.99496,3.11,0.44,10.5,7
2,5.7,0.22,0.22,16.65,0.044,39.0,110.0,0.99855,3.24,0.48,9.0,6
3,7.2,0.23,0.39,14.2,0.058,49.0,192.0,0.9979,2.98,0.48,9.0,7
4,7.6,0.35,0.47,13.3,0.037,42.0,116.0,0.99822,3.04,0.5,9.2,5


In [18]:
split = int(0.8 * df.shape[0])
training_data = df[:split] # 80% of the total data
testing_data = df[split:]  # 20% of the total data

### Feature Normalisation
${x_i}$= $\frac{x_i - \mu}{\sigma}$ 

Feature Normalisation is done because data in some columns is very small in comparison to other columns data.

In [19]:
columns=training_data.shape[1]
X_train=training_data.iloc[:,0:columns-1]# features Sets

mu=X_train.mean()
sigma=X_train.std()

# features normalisation
X_train=(X_train-X_train.mean())/X_train.std()

Y_train=training_data.iloc[:,columns-1:columns] # outputSet
X_train.insert(0, 'Ones', 1)
print(X_train.shape)
print(Y_train.shape)
X_train.head()

(3526, 12)
(3526, 1)


Unnamed: 0,Ones,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol
0,1,2.777966,-0.277666,0.045565,-1.04198,-0.887299,-0.257466,-1.090609,-0.840062,-1.706637,-1.037796,0.658073
1,1,-0.315107,-0.770221,-0.528229,0.899437,-0.354345,-0.375658,-0.206833,0.315732,-0.508057,-0.426652,0.002584
2,1,-1.385786,-0.573199,-0.938081,2.050276,-0.087868,0.215298,-0.68455,1.550642,0.357583,-0.077427,-1.226457
3,1,0.398679,-0.474688,0.455418,1.559919,0.533911,0.806254,1.274088,1.327051,-1.373698,-0.077427,-1.226457
4,1,0.874537,0.707444,1.111182,1.379787,-0.398758,0.392585,-0.541235,1.437126,-0.974172,0.097186,-1.062585


In [20]:
X_train = np.matrix(X_train.values)
Y_train = np.matrix(Y_train.values)

#### Sigmoid Function 
${g(z)}$=${(1+e^{-z})^{-1} }$

In [21]:
def sigmoid(z):
    g=0.0
    g=1+np.exp(-z)
    return np.power(g,-1)

#### CostFunction
${J(\theta)} =-{\frac{1}{m}}{\sum_{i=0}^{m}}[{y_i}{log\hat{y_i}}+(1-{y_i}){log(1-\hat{y_i}})]$

In [22]:
def CostFunction(X,y,theta):
    prediction1= np.log(sigmoid(X*theta.T))
    prediction2= np.log(1-sigmoid(X*theta.T))
    prediction1=np.multiply(y,prediction1)
    prediction2=np.multiply((1-y),prediction2)
    return -(sum(prediction1+prediction2))/len(X)

#### Gradient Descent algo
repeat Untill Converges{


${{\theta_j} :=}{{\theta_j}}$-${\alpha}$*${\frac{\partial}{\partial {\theta_j}} J(\theta)}$


}

${\alpha}$: Learning rate constant

In [23]:
def gradientDescent(X, y, theta, alpha, iters):
    
    Jhistory=np.zeros(iters)
    
    m=len(X)
    for i in range(iters):
        pre = (X*theta.T)-y
        #print(pre.shape)
        delta=np.dot(np.transpose(pre),X)
        
        theta=theta-(alpha/m)*delta

        
        Jhistory[i] = CostFunction(X, y, theta)
        
    return theta,Jhistory

In [24]:
noofClasses=11 #given from 0 to 10

##### Inputs:

1. L, a learner (training algorithm for binary classifiers)
2. samples X
3. labels y where ${y_i \in} {1, … K}$ is the label for the sample ${X_i}$

**Output**:
1. a list of classifiers ${f_k}$ for k ∈ {1, …, K}

**Procedure**:
1. For each k in {1, …, K}
2. Construct a new label vector z where ${{z_i} = {y_i}}$ if ${{y_i} = k and  {z_i} = 0}$ otherwise
3. Apply L to X, z to obtain fk
4. Making decisions means applying all classifiers to an unseen sample x and predicting the label k for which the corresponding classifier reports the highest Probability:

${\displaystyle {\hat {y}}={\underset {k\in \{1\ldots K\}}{\arg \!\max }}\;
f_{k}(x)} $

In [25]:
alpha=.009
iters=1000
theta = np.matrix(np.array([0,0,0,0,0,0,0,0,0,0,0,0]))
finalTheta=np.zeros((noofClasses,X_train.shape[1]))
for q in range(0,11):
    Y=(Y_train==q)
    Y=1*Y
    minTheta, cost2= gradientDescent(X_train, Y, theta, alpha, iters)
    finalTheta[q]=minTheta
#print(finalTheta)

In [26]:
print(finalTheta.shape)

(11, 12)


#### Prepare the test data

In [27]:
columns=testing_data.shape[1]
X_test=testing_data.iloc[:,0:columns-1]# features Sets
Y_test=testing_data.iloc[:,columns-1:columns] # outputSet

#feature normalisation
X_test=(X_test-mu)/sigma
X_test.insert(0, 'Ones', 1)

print(X_test.shape)
X_test.head()

(882, 12)


Unnamed: 0,Ones,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol
3526,1,-0.553036,1.495531,-0.118376,-1.001951,0.089783,-1.498474,-1.711641,-0.702467,-0.707821,-0.688571,0.740009
3527,1,-0.672,-1.065754,-0.118376,-1.082009,-0.26552,0.215298,-0.493463,-1.782583,0.823698,-0.775877,2.132922
3528,1,-0.790965,0.707444,-0.69217,2.400532,0.223021,-0.434753,1.03523,1.863669,-0.374882,1.144861,-0.980649
3529,1,-1.385786,-0.277666,-0.610199,1.21967,0.134196,1.013089,-0.445692,0.986504,-0.707821,-0.339345,-0.898713
3530,1,-0.315107,-0.277666,-0.200347,1.199656,0.578324,0.983541,1.011344,1.499044,2.155453,-0.164733,-0.570969


In [28]:
def prediction(test_data):
    z= np.dot(test_data,finalTheta.T)
    z=sigmoid(z)
    return z

In [29]:
holder=np.zeros((X_test.shape[0],11))

X_test=np.matrix(X_test)
holder=prediction(X_test)

output=np.argmax(holder,axis=1)
print(output.shape)

(882, 1)


In [30]:
output=np.matrix(output)
print(Y_test.shape)
print(output.shape)

(882, 1)
(882, 1)


In [31]:
accuracy =np.count_nonzero(Y_test==output)/len(Y_test)*100
accuracy

53.06122448979592

#### one vs one Logistic Regression for Multi class classification

> In one vs one you have to train a separate classifier for each different pair of labels. This leads to ${\frac{N*(N-1)}{2}}$ classifiers.This is much more computationally expensive.



In [32]:
#Y_newtrain=Y_train*1000

In [33]:
datasets=np.hstack((X_train,Y_train))
print(datasets.shape)

(3526, 13)


In [34]:
classes=[3,4,5,6,7,8,9]

In [35]:
N=len(classes)
loop=int(N*(N-1)/2)


In [36]:
def learn_OneVsOne(X_one):
    theta = np.matrix(np.array([0,0,0,0,0,0,0,0,0,0,0,0]))
    one_Vs_oneTheta=np.zeros((loop,X_train.shape[1]))
    print(one_Vs_oneTheta.shape)
    count=0
    for i in range(3,10):
        j = i+1
        while j <10:
            a=X_one[np.where(X_one==i)[0],:]
            b=X_one[np.where(X_one==j)[0],:]
            c=np.vstack((a,b))
            Q=c[:,:c.shape[1]-1]
            Y=c[:,c.shape[1]-1:]
            Y=(Y==i)
            Y=1*Y
            minTheta, cost2= gradientDescent(Q,Y, theta, alpha, iters)
            one_Vs_oneTheta[count]=minTheta
            j+=1
            count+=1
    
    return one_Vs_oneTheta

In [37]:
one_Vs_oneTheta=learn_OneVsOne(datasets)
one_Vs_oneTheta.shape

(21, 12)


(21, 12)

In [40]:
one_Vs_oneTheta

array([[ 1.61595210e-01,  4.28066480e-02, -2.25177092e-02,
        -1.32418032e-02,  3.17869108e-02,  3.56121650e-02,
         8.42964491e-02, -3.26182380e-02,  7.60297501e-03,
         1.16431575e-02, -2.05174121e-02,  5.01598562e-02],
       [ 1.77904976e-02,  1.93191671e-02,  4.87185258e-03,
        -4.56551405e-03, -3.38292690e-03,  3.83138780e-03,
         2.76225814e-02, -9.60905541e-03, -1.68732242e-03,
         4.50887253e-03,  5.65955682e-04,  9.51495436e-03],
       [ 1.07802402e-02,  1.20907518e-02,  9.64737161e-03,
        -3.67588892e-03, -6.26478287e-03,  3.40788500e-03,
         1.84049375e-02, -4.81309630e-03,  1.24608638e-03,
         1.99911624e-03, -6.95496710e-04, -1.20250884e-03],
       [ 5.00792405e-02,  2.78471469e-02,  3.05115978e-02,
        -7.52953828e-03, -1.95874935e-02,  4.51264172e-02,
         4.21716152e-02, -1.40583300e-02,  6.27337919e-03,
         2.82743269e-03, -1.00535749e-02, -1.50025272e-02],
       [ 1.68309989e-01,  6.46093956e-02,  7.8083017

#### Prediction

In [41]:
voteMatrix=np.dot(X_test,one_Vs_oneTheta.T)
voteMatrix.shape

(882, 21)

In [42]:
voteMatrix=sigmoid(voteMatrix)

In [43]:
for i in range(voteMatrix.shape[0]):
    class1=3
    class2=class1
    for j in range(voteMatrix.shape[1]):
        class2=class2+1
        if voteMatrix[i,j] >0.5:
             voteMatrix[i,j]=class1
        else:
            voteMatrix[i,j]=class2
        if(class2>9):
            class1=class1+1
            class2=class1
  

In [44]:
voteMatrix=np.array(voteMatrix)


In [45]:
vote={}
output=[]
for i in range(voteMatrix.shape[0]):
    uni,counts=np.unique(voteMatrix[i,:],return_counts=True)
    vote=dict(zip(uni,counts))
    output.append(list(vote.keys())[2])
    

In [46]:
output=np.matrix(output)
print(output.shape)
print(Y_test.shape)

(1, 882)
(882, 1)


In [52]:
one_Vs_oneTheta=learn_OneVsOne(datasets)
accuracy =np.count_nonzero(Y_test==output.T)/len(Y_test)*100
print(accuracy)

51.93197278911
