# Classification Algorithm on Iris datasets

## Import all necessary Packages

In [11]:
# To support both python 2 and python 3
from __future__ import division, print_function, unicode_literals

# Common imports
import numpy as np
import os
import time

# to make this notebook's output stable across runs
np.random.seed(42)
import pandas as pd
# To plot pretty figures
%matplotlib inline

import matplotlib
import matplotlib.pyplot as plt
plt.rcParams['axes.labelsize'] = 14
plt.rcParams['xtick.labelsize'] = 12
plt.rcParams['ytick.labelsize'] = 12


## Fetching data from disk

In [12]:
data=pd.read_csv('../../datasets/Assignment_data/Data_Q2/iris.csv')
data.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa


## Preprocessing on datasets

### Encode categorical column it into numbers

In [13]:
data['species']=data['species'].astype('category').cat.codes
data1=pd.get_dummies(data,columns=['species'])
data['species']=data['species'].astype('float64')

### Filling null by median

In [14]:
columns=['sepal_length','sepal_width','petal_length','petal_width','species']

for i in columns:
    if(np.where(data.isnull()[i]==True)[0].shape!=(0,)):
        print(i)
        data[i]=data[i].fillna(data[i].median())


### Split data into train and test datasets

In [15]:
train=data.sample(frac=0.8,random_state=200)
test=data.drop(train.index)

trainy=train['species'].as_matrix()
y=trainy
trainy=trainy.reshape(trainy.shape[0],1)
a=time.clock()
#print(train)
mean=[]
#print(train[train['species']==2].mean().as_matrix())
for i in range(0,3):
    #print(i)
    mean.append(train[train['species']==i].mean().as_matrix())
    #print(mean)

std=[]
for i in range(0,3):
    #print(i)
    std.append(train[train['species']==i].std().as_matrix())

mean=np.array(mean)
std=np.array(std)

mean=np.delete(mean,mean.shape[1]-1,axis=1)
std=np.delete(std,std.shape[1]-1,axis=1)
b=time.clock()
traintime=b-a

trainx=train.drop(['species'],axis=1).as_matrix()

testy=test['species'].as_matrix()
testx=test.drop(['species'],axis=1).as_matrix()

y1=testy



## Various Classification Algorithms

### Nearest neighbour

In [16]:
#KNN
t=[]
print(testx.shape)
x=testx.reshape(testx.shape[0],trainx.shape[1])
for i in range(testx.shape[0]):
    x=testx[i].reshape(1,trainx.shape[1])
    dist = np.linalg.norm(trainx-x,axis=1)
    t.append(trainy[np.where(dist==dist.min())[0]][0])
t=np.array(t)
t=t[:,0]
accuracy=np.where(t==testy)[0].shape[0]/testy.shape[0]
print("Accuracy=",accuracy)
print("training Time=",0)

(30, 4)
Accuracy= 0.9333333333333333
training Time= 0


Calculating eucladian distance and apparently classifying points on basis of distance.

### Naive Bayes

In [17]:
# naive bayes



def prob(X,mean,std):
   
    z=np.square((X-mean)/std)/2
    
    return np.exp(-z)/(np.sqrt(2*np.pi)*std)


p1=np.where(trainy==0)[0].shape[0]/trainy.shape[0]
p2=np.where(trainy==1)[0].shape[0]/trainy.shape[0]
p3=np.where(trainy==2)[0].shape[0]/trainy.shape[0]

t=[]
for i in range(testx.shape[0]):
    q1=prob(testx[i],mean[0],std[0]).prod()
    q2=prob(testx[i],mean[1],std[1]).prod()
    q3=prob(testx[i],mean[2],std[2]).prod()
    t.append(np.argmax([p1*q1,p2*q2,q3*p3]))
t=np.array(t)


accuracy=np.where(t==testy)[0].shape[0]/testy.shape[0]
print("Accuracy=",accuracy)

print("training time=",traintime )

Accuracy= 0.8333333333333334
training time= 0.004152447196304365


Calculating the probability using bayes theorem and calculating the accuracy as well as training time.

### Logistic Regression (Library)

In [18]:
# using library logistic regression

from sklearn.linear_model import LogisticRegression
lg=LogisticRegression()
a=time.clock()
lg.fit(trainx,trainy)
b=time.clock()
t=lg.predict(testx)

accuracy=np.where(t==testy)[0].shape[0]/testy.shape[0]
print("Accuracy=",accuracy)
print("training time=",b-a)

  y = column_or_1d(y, warn=True)


Accuracy= 0.9
training time= 0.1775091238174582


### Convert categorical data into one hot vector

In [19]:
train=data1.sample(frac=0.8,random_state=200)
test=data1.drop(train.index)

trainy=train[['species_0','species_1','species_2']].as_matrix()

trainx=train.drop(['species_0','species_1','species_2'],axis=1).as_matrix()

testy=test[['species_0','species_1','species_2']].as_matrix()
testx=test.drop(['species_0','species_1','species_2'],axis=1).as_matrix()
print(trainy.shape)

(120, 3)


### Cross Entropy Loss and Gradient Descent Algorithm

In [20]:
#softmax cross entropy loss

W = 0.01 * np.random.randn(trainx.shape[1],3)
b = np.zeros((1,3))
X=trainx
y=y.astype('int64')
# some hyperparameters
step_size = 1e-0
reg = 1e-3 # regularization strength

# gradient descent loop
num_examples = X.shape[0]

a=time.clock()
for i in range(200):
  
  
    scores = np.dot(X, W) + b 
    exp_scores = np.exp(scores)
    probs = exp_scores / np.sum(exp_scores, axis=1, keepdims=True) 
    corect_logprobs = -np.log(probs[range(num_examples),y])
    data_loss = np.sum(corect_logprobs)/num_examples
    reg_loss = 0.5*reg*np.sum(W*W)
    loss = data_loss + reg_loss
    
  
 
    dscores = probs
    dscores[range(num_examples),y] -= 1
    dscores /= num_examples
  
 
    dW = np.dot(X.T, dscores)
    db = np.sum(dscores, axis=0, keepdims=True)
  
    dW += reg*W 

    W += -step_size * dW
    b += -step_size * db
b=time.clock()
scores = np.dot(testx, W) + b 
exp_scores = np.exp(scores)
probs = exp_scores / np.sum(exp_scores, axis=1, keepdims=True) 
t=np.argmax(probs,axis=1)

accuracy=np.where(t==y1)[0].shape[0]/testy.shape[0]
print("Accuracy=",accuracy)
print("training time=",b-a)

Accuracy= 0.8333333333333334
training time= 0.03195438189965216


### Mean Square Loss by gradient descent algorithm

In [21]:
#gradient descent


W = 0.01 * np.random.randn(trainx.shape[1],3)
b = np.zeros((1,3))
X=trainx
Y=trainy
y=y.astype('int64')
# some hyperparameters
step_size = 1e-0
reg = 1e-3 # regularization strength

# gradient descent loop
num_examples = X.shape[0]
a=time.clock()
for i in range(1000):
  
  
    scores = np.dot(X, W) + b 
    probs = 1/(1+np.exp(-scores))
    
    dW = np.dot(X.T,(probs-Y))/X.shape[0]
    db=np.sum(probs-Y,axis=0)/X.shape[0]
    
    dW += reg*W 

    W += -step_size * dW
    b += -step_size * db

b=time.clock()
scores = np.dot(testx, W) + b 
exp_scores = np.exp(scores)

probs = exp_scores / np.sum(exp_scores, axis=1, keepdims=True) 
t=np.argmax(probs,axis=1)


accuracy=np.where(t==y1)[0].shape[0]/testy.shape[0]
print("Accuracy=",accuracy)
print("training time=",b-a)

Accuracy= 0.6666666666666666
training time= 0.0381433796317765


### Newtons Method

In [51]:
# Newtons method

y=y.astype('int64')



X=np.concatenate((trainx,np.ones((X.shape[0],1))),axis=1)
Y=trainy

W = 0.01 * np.random.randn(X.shape[1],Y.shape[1])
 
q=np.zeros(X.shape[0])
a=time.clock()

scores = np.dot(X, W)
probs = 1/(1+np.exp(-scores))

for i in range(X.shape[0]):
    q[i]=probs[i,y[i]]

dW = np.dot(X.T,(probs-Y))/X.shape[0]


H=np.linalg.inv(np.dot(np.dot(X.T,np.diag((q*(1-q)))),X))

W += np.dot(H,dW)
   
b=time.clock()
xt=np.concatenate((testx,np.ones((testx.shape[0],1))),axis=1)
scores = np.dot(xt, W) 
probs = 1/(1+np.exp(-scores))

t=np.argmax(probs,axis=1)


accuracy=np.where(t==y1)[0].shape[0]/testy.shape[0]
print("Accuracy=",accuracy)

print("training time=",b-a)

Accuracy= 0.4
training time= 0.0006714813842449985
