In [136]:
from PIL import Image
import PIL.ImageOps
import numpy as np
import pandas as pd
import triangle as tr
import time
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder

#the directory of images, contains imeages of train and test set
dataDir = "/Users/chihchichou/Documents/CS/Data_Science/Leaf/Data/"


In [137]:
#Utility function to compute ratio of an image

def ComputeRatio(dataDir, imageNumber):

    '''
    dataDir : the data directory we saved the images
    
    imageNumber: integer
    
    '''
    
    # The gradient weights for L1 norm of gradient for each triangle type
    gradientsize = [ [np.sqrt(j) for j in range(3)]
                   , [np.sqrt(2 - j) for j in range(3)]
                   ]
    gradientsize = 0.5*np.array( gradientsize )


    # The weights for L2 norm of each triangle type. To be applied before
    # square root.
    L2size = [ [0.0, 1/12.0, 11/36.0]
             , [1.0/12.0, 1/4.0, 1/2.0]
             ]
    L2size = np.array(L2size)
    
    #read image into np array
    filename = dataDir + 'images/' + str(imageNumber) + '.jpg'
    myimage = Image.open(filename)
    myimage = myimage.convert('1')
    (imwidth, imheight) = myimage.size
    pixeldata = np.array(myimage.getdata(), dtype = 'uint')
    pixeldata = pixeldata.reshape(imheight, imwidth)
    pixeldata[pixeldata > 0] = 1
    
    #instance of triagle, computes counts
    counter = tr.TriangleCount(pixeldata)
    counter.getcounts(step = 1)
    totalgradient = np.sum( counter.counts * gradientsize ) 
    L2norm = np.sum( counter.counts * L2size)
    L2norm = np.sqrt( L2norm )
    ratio = totalgradient / L2norm
    
    return ratio



In [102]:
#reading train and test data into pandas data frame
trainData = pd.read_csv(dataDir+'train.csv')
#testData = pd.read_csv(dataDir+'test.csv')

In [105]:
#image index of training set
trainIndex = np.array(trainData['id'])

#compute the ratio of the images in the trainng set
ratio_X_train = [ComputeRatio(dataDir, x) for x in trainIndex]


In [132]:
#repalce species by numerical data
trainLabel = LabelEncoder().fit_transform(trainData["species"])
trainData['species'] = trainLabel

#adding column "ratio" into the data frame
trainData['ratio'] = pd.Series(ratio_X_train, index = trainData.index)

#we can check out the last column is the ratio
trainData.head()


Unnamed: 0,id,species,margin1,margin2,margin3,margin4,margin5,margin6,margin7,margin8,...,texture56,texture57,texture58,texture59,texture60,texture61,texture62,texture63,texture64,ratio
0,1,3,0.007812,0.023438,0.023438,0.003906,0.011719,0.009766,0.027344,0,...,0.0,0.00293,0.00293,0.035156,0,0,0.004883,0.0,0.025391,5.481731
1,2,49,0.005859,0.0,0.03125,0.015625,0.025391,0.001953,0.019531,0,...,0.0,0.0,0.000977,0.023438,0,0,0.000977,0.039062,0.022461,6.246674
2,3,65,0.005859,0.009766,0.019531,0.007812,0.003906,0.005859,0.068359,0,...,0.0,0.005859,0.000977,0.007812,0,0,0.0,0.020508,0.00293,5.665914
3,5,94,0.0,0.003906,0.023438,0.005859,0.021484,0.019531,0.023438,0,...,0.000977,0.0,0.0,0.020508,0,0,0.017578,0.0,0.047852,5.766271
4,6,84,0.005859,0.003906,0.048828,0.009766,0.013672,0.015625,0.005859,0,...,0.0,0.021484,0.0,0.0,0,0,0.0,0.0,0.03125,5.942335


In [135]:
#seperate the trainData into X and Y

Y = trainData['species']
X = trainData.drop('species', axis = 1)
#X = trainData[['margin1','texture1','shape1', 'ratio']]


#devide the train data set into 70% train and 30% test
from sklearn.cross_validation import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(X,Y,test_size = 0.2, \
                                                    random_state = 0)

#normalize set
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
sc.fit(X_train)
X_train_std = sc.transform(X_train)
X_test_std = sc.transform(X_test)

#run perceptron
from sklearn.linear_model import Perceptron
iter_ = [40,50,60]
eta_ = 0.1
for i in iter_:
    ppn = Perceptron(n_iter = i, eta0 = eta_, random_state = 0 )
    ppn.fit(X_train_std, Y_train)
    Y_predict = ppn.predict(X_test_std)
    print "Missclassified ratio: %f" %(float((Y_test != Y_predict).sum())/\
                                      Y_test.shape[0])

Missclassified ratio: 0.146465
Missclassified ratio: 0.146465
Missclassified ratio: 0.146465
