## Packages for use

In [1]:
# basic use
from zipfile import ZipFile
from io import BytesIO
from PIL import Image  
import re
import logging
import csv
import pandas as pd
import numpy as np
from io import StringIO
import cv2   # need to do 'pip install opencv-python' in terminal under current conda environment
import matplotlib.pyplot as plt
import warnings

In [2]:
# feature section
import tensorflow as tf # need to do 'conda install -c conda-forge tensorflow' in terminal under current conda environment
import keras # need to do 'pip install keras' in terminal under current conda environment
from keras.applications.densenet import preprocess_input, DenseNet121

Using TensorFlow backend.


In [3]:
# modeling
from sklearn import svm
from sklearn import model_selection
from sklearn import preprocessing
from sklearn.preprocessing import scale
from sklearn.metrics import accuracy_score
from sklearn.ensemble import IsolationForest
from sklearn.preprocessing import MinMaxScaler
from sklearn.neighbors import LocalOutlierFactor
from sklearn.neighbors import KNeighborsClassifier

## Explore the original zip file named petfinder-adoption-prediction.zip

In [4]:
# Create a ZipFile Object and load petfinder-adoption-prediction.zip in it
with ZipFile('petfinder-adoption-prediction.zip', 'r') as zipObj:
    
   # Get detail files infor 
   print('*** Detail info in an order of name, last modification time and name ***')
   zipObj.printdir()

*** Detail info in an order of name, last modification time and name ***
File Name                                             Modified             Size
train.zip                                      2019-04-10 21:21:10      2790732
state_labels.csv                               2019-04-10 21:21:10          285
test_sentiment.zip                             2019-04-10 21:21:10      3026657
train_metadata.zip                             2019-04-10 21:21:10     56196604
train_images.zip                               2019-04-10 21:21:12   1595336815
color_labels.csv                               2019-04-10 21:22:08           88
breed_labels.csv                               2019-04-10 21:22:08         6984
test.zip                                       2019-04-10 21:22:08       593271
test_images.zip                                2019-04-10 21:22:08    389440419
test_metadata.zip                              2019-04-10 21:22:24     13909287
train_sentiment.zip                            

#### Read in csv data

In [5]:
with ZipFile('petfinder-adoption-prediction.zip', 'r') as zipObj:
    
    with zipObj.open('state_labels.csv') as state_labels:
        state = pd.read_csv(state_labels) 
        print('*** state labels partial ***\n', state.head(),'\n')
        
    with zipObj.open('color_labels.csv') as color_labels:
        color = pd.read_csv(color_labels) 
        print('*** color labels partial ***\n', color.head(), '\n')
    
    with zipObj.open('breed_labels.csv') as breed_labels:
        breed = pd.read_csv(breed_labels) 
        print('*** breed labels partial ***\n', breed.head(), '\n')

*** state labels partial ***
    StateID     StateName
0    41336         Johor
1    41325         Kedah
2    41367      Kelantan
3    41401  Kuala Lumpur
4    41415        Labuan 

*** color labels partial ***
    ColorID ColorName
0        1     Black
1        2     Brown
2        3    Golden
3        4    Yellow
4        5     Cream 

*** breed labels partial ***
    BreedID  Type         BreedName
0        1     1     Affenpinscher
1        2     1      Afghan Hound
2        3     1  Airedale Terrier
3        4     1            Akbash
4        5     1             Akita 



In [6]:
with ZipFile("petfinder-adoption-prediction.zip", "r") as zipObj:
    zfiledata_train = BytesIO(zipObj.read('train.zip'))
    zfiledata_test = BytesIO(zipObj.read('test.zip'))
    
    print('\n****** Training data ******')
            
    with ZipFile(zfiledata_train) as zipObj_train:
        print('*** Files in train.zip ***\n', zipObj_train.namelist(),'\n\n*** Partial data from train.csv ***')
        
        with zipObj_train.open('train.csv') as train:
            traindata = pd.read_csv(train) 
            print(traindata.head())
            
    print('\n****** Testing data ******')
            
    with ZipFile(zfiledata_test) as zipObj_test:
        print('*** Files in test.zip ***\n', zipObj_test.namelist(),'\n\n*** Partial data from test.csv ***')
        
        with zipObj_test.open('test.csv') as test:
            testdata = pd.read_csv(test) 
            print(testdata.head())


****** Training data ******
*** Files in train.zip ***
 ['train.csv'] 

*** Partial data from train.csv ***
   Type         Name  Age  Breed1  Breed2  Gender  Color1  Color2  Color3  \
0     2       Nibble    3     299       0       1       1       7       0   
1     2  No Name Yet    1     265       0       1       1       2       0   
2     1       Brisco    1     307       0       1       2       7       0   
3     1         Miko    4     307       0       2       1       2       0   
4     1       Hunter    1     307       0       1       1       0       0   

   MaturitySize  ...  Health  Quantity  Fee  State  \
0             1  ...       1         1  100  41326   
1             2  ...       1         1    0  41401   
2             2  ...       1         1    0  41326   
3             2  ...       1         1  150  41401   
4             2  ...       1         1    0  41326   

                          RescuerID  VideoAmt  \
0  8480853f516546f6cf33aa88cd76c379         0   
1  30

## Read in image data

In [7]:
with ZipFile("petfinder-adoption-prediction.zip", "r") as zipObj:
    train_image = BytesIO(zipObj.read('train_images.zip'))
    test_image = BytesIO(zipObj.read('test_images.zip'))
    
    print('\n****** Training image ******')
    with ZipFile(train_image) as zipObj_train_image:
        print('*** Number of files in train_images.zip ***\n', len(zipObj_train_image.namelist()),'\n\n')
        
        # print one image
        image_data_train = zipObj_train_image.open("de993d8ad-3.jpg")
        image_train = Image.open(image_data_train)
        #image_train.show()
        
        # show all file names
        # print('*** File names in train_images.zip ***\n', zipObj_train_image.namelist(),'\n\n')  # all .jpg files

        
    #print('\n****** Testing image ******')
    with ZipFile(test_image) as zipObj_test_image:
        print('*** Number of files in test_images.zip ***\n', len(zipObj_test_image.namelist()),'\n\n')
        
        # print one image
        image_data_test = zipObj_test_image.open("7d6a74b12-2.jpg")
        img_for_hist = cv2.imread('image.jpg')
        image_test = Image.open(image_data_test)
        #image_test.show()
        
        # show all file names
        print('*** File names in test_images.zip ***\n', zipObj_test_image.namelist(),'\n\n')  # all .jpg files


****** Training image ******
*** Number of files in train_images.zip ***
 58311 


*** Number of files in test_images.zip ***
 14465 


*** File names in test_images.zip ***
 ['c64b3227a-2.jpg', '16f35966b-7.jpg', '28d5ac2cd-3.jpg', '9affe6be5-2.jpg', 'e6e8038c4-1.jpg', 'a3297b927-1.jpg', 'ffc9bd892-1.jpg', 'ec366f165-2.jpg', 'db6c077d0-7.jpg', 'a4b2866af-6.jpg', '1ce3e4cd7-2.jpg', '30e0ceb28-1.jpg', 'fda872629-3.jpg', 'e5d8312cc-4.jpg', '35090cd14-8.jpg', '61dd5b10e-2.jpg', 'c503705b2-1.jpg', 'a331ef890-5.jpg', '9ab8c5dac-2.jpg', '7f3da34ba-1.jpg', '3c3b53478-5.jpg', '39d2cf194-5.jpg', 'a8463f72e-1.jpg', '57a0b3def-2.jpg', 'd7725e1b0-3.jpg', '5ef33a2b3-1.jpg', '2fa0b94b0-1.jpg', '6864737de-1.jpg', '1a84ca4a0-1.jpg', '7fe030a19-1.jpg', '56b369184-7.jpg', '54e8d5179-2.jpg', '74dcc54a6-15.jpg', '4dfd7f46f-1.jpg', '3d9801b54-4.jpg', 'fe43b61db-3.jpg', '0a9f45592-1.jpg', '6b8269c72-1.jpg', '0901d7d0c-2.jpg', '776177c6e-3.jpg', '1aa62fc18-1.jpg', '1a47e586c-3.jpg', 'de7731077-7.jpg', '0d74

#### The basic analysis of one image from test_images.zip

In [8]:
image_test.show()  

In [9]:
# obtain image infor
print('Type of the image : ' , image_test.format)
print('Mode of the image : ', image_test.mode)
width, height = image_test.size
print('Image Height : ', height)
print('Image Width : ', width)

Type of the image :  JPEG
Mode of the image :  RGB
Image Height :  360
Image Width :  203


In [10]:
# obtain the pixel values
# getdata() scans the image horizontally from left to right starting at the top-left corner
# the values got from each pixel is then added into a list
# (R,G,B)
pix_val = list(image_test.getdata())
#len(pix_val)  # 73080 pixels in total
pix_val

[(46, 54, 56),
 (63, 68, 71),
 (80, 84, 85),
 (87, 89, 88),
 (80, 81, 76),
 (69, 68, 63),
 (64, 61, 54),
 (61, 58, 51),
 (53, 50, 43),
 (122, 122, 114),
 (71, 72, 66),
 (122, 127, 121),
 (135, 141, 139),
 (148, 157, 156),
 (164, 174, 175),
 (156, 168, 168),
 (168, 179, 175),
 (168, 179, 175),
 (141, 152, 148),
 (169, 180, 176),
 (168, 177, 174),
 (152, 161, 158),
 (173, 182, 179),
 (149, 159, 158),
 (133, 145, 143),
 (106, 121, 118),
 (88, 104, 101),
 (89, 108, 104),
 (94, 116, 113),
 (100, 125, 121),
 (121, 148, 143),
 (146, 173, 168),
 (146, 168, 179),
 (152, 172, 179),
 (153, 167, 170),
 (149, 158, 155),
 (154, 156, 151),
 (164, 165, 159),
 (166, 168, 163),
 (157, 166, 165),
 (145, 160, 165),
 (88, 110, 123),
 (60, 89, 107),
 (71, 106, 128),
 (56, 92, 116),
 (67, 104, 130),
 (62, 98, 124),
 (67, 103, 129),
 (50, 106, 123),
 (54, 107, 123),
 (57, 108, 125),
 (60, 109, 124),
 (66, 112, 127),
 (80, 122, 136),
 (99, 137, 150),
 (114, 148, 158),
 (118, 148, 158),
 (125, 150, 157),
 (131,

In [11]:
pix_val_flat = [x for sets in pix_val for x in sets]
pix_val_flat

[46,
 54,
 56,
 63,
 68,
 71,
 80,
 84,
 85,
 87,
 89,
 88,
 80,
 81,
 76,
 69,
 68,
 63,
 64,
 61,
 54,
 61,
 58,
 51,
 53,
 50,
 43,
 122,
 122,
 114,
 71,
 72,
 66,
 122,
 127,
 121,
 135,
 141,
 139,
 148,
 157,
 156,
 164,
 174,
 175,
 156,
 168,
 168,
 168,
 179,
 175,
 168,
 179,
 175,
 141,
 152,
 148,
 169,
 180,
 176,
 168,
 177,
 174,
 152,
 161,
 158,
 173,
 182,
 179,
 149,
 159,
 158,
 133,
 145,
 143,
 106,
 121,
 118,
 88,
 104,
 101,
 89,
 108,
 104,
 94,
 116,
 113,
 100,
 125,
 121,
 121,
 148,
 143,
 146,
 173,
 168,
 146,
 168,
 179,
 152,
 172,
 179,
 153,
 167,
 170,
 149,
 158,
 155,
 154,
 156,
 151,
 164,
 165,
 159,
 166,
 168,
 163,
 157,
 166,
 165,
 145,
 160,
 165,
 88,
 110,
 123,
 60,
 89,
 107,
 71,
 106,
 128,
 56,
 92,
 116,
 67,
 104,
 130,
 62,
 98,
 124,
 67,
 103,
 129,
 50,
 106,
 123,
 54,
 107,
 123,
 57,
 108,
 125,
 60,
 109,
 124,
 66,
 112,
 127,
 80,
 122,
 136,
 99,
 137,
 150,
 114,
 148,
 158,
 118,
 148,
 158,
 125,
 150,
 157,
 131,


In [12]:
# returns a histogram for the image as a list of pixel counts, one for each pixel value in the source image
# If the image has more than one band, the histograms for all bands are concatenated 
# (for example, the histogram for an “RGB” image contains 768 values)
# len(image_test.histogram) # returns 768
pix_hist = image_test.histogram()

## Machine Learning Approach （use deep learning outputs from Chengchen's code）
#### Read in new image data after feature selection

In [13]:
train_img = pd.read_csv("/Users/liliya/Desktop/Spring Quarter 2019/STA 160/train_img_features.csv")
test_img = pd.read_csv("/Users/liliya/Desktop/Spring Quarter 2019/STA 160/test_img_features.csv")
print(train_img.head()) 

  Unnamed: 0         0         1         2         3         4         5  \
0  86e1089a3  0.002000  0.167800  0.019715  0.015896  0.068162  0.002216   
1  6296e909a  0.002858  0.107450  0.019916  0.023482  0.174765  0.002297   
2  3422e4906  0.002734  0.072015  0.024455  0.018021  0.154207  0.001946   
3  5842f1ff5  0.002106  0.274519  0.054815  0.013727  0.089969  0.001650   
4  850a43f90  0.002185  0.174022  0.044818  0.016244  0.169775  0.002075   

          6         7         8  ...       246       247       248       249  \
0  0.005042  0.004828  0.050760  ...  0.787699  0.176625  0.575706  1.088627   
1  0.005031  0.006338  0.083378  ...  0.628260  0.686865  0.563999  0.968190   
2  0.004211  0.001576  0.100046  ...  0.579116  0.557624  1.131405  0.720514   
3  0.005506  0.004295  0.118727  ...  1.295853  0.326143  0.291668  1.608086   
4  0.004421  0.004157  0.099671  ...  1.092663  0.669893  0.395784  0.886075   

        250       251       252       253       254       255 

In [14]:
# notice that the first column above is the petid
np.array_equal(train_img.loc[:, 'Unnamed: 0'], traindata.loc[:, 'PetID'])  # returns True

# we then obtain the adoption speed column from the traindata df and add it to the train_img df
train_img['AdoptionSpeed'] = traindata.loc[:, 'AdoptionSpeed']
print(train_img.head()) 

  Unnamed: 0         0         1         2         3         4         5  \
0  86e1089a3  0.002000  0.167800  0.019715  0.015896  0.068162  0.002216   
1  6296e909a  0.002858  0.107450  0.019916  0.023482  0.174765  0.002297   
2  3422e4906  0.002734  0.072015  0.024455  0.018021  0.154207  0.001946   
3  5842f1ff5  0.002106  0.274519  0.054815  0.013727  0.089969  0.001650   
4  850a43f90  0.002185  0.174022  0.044818  0.016244  0.169775  0.002075   

          6         7         8  ...       247       248       249       250  \
0  0.005042  0.004828  0.050760  ...  0.176625  0.575706  1.088627  0.439557   
1  0.005031  0.006338  0.083378  ...  0.686865  0.563999  0.968190  1.070276   
2  0.004211  0.001576  0.100046  ...  0.557624  1.131405  0.720514  1.496672   
3  0.005506  0.004295  0.118727  ...  0.326143  0.291668  1.608086  1.119176   
4  0.004421  0.004157  0.099671  ...  0.669893  0.395784  0.886075  1.219730   

        251       252       253       254       255  AdoptionS

In [15]:
# rename the target dataframe so that it won't conflict with training set and testing set naming in later use
data = train_img

In [16]:
# check the dimension of data and col names
print(data.columns.values)
print(data.shape)

['Unnamed: 0' '0' '1' '2' '3' '4' '5' '6' '7' '8' '9' '10' '11' '12' '13'
 '14' '15' '16' '17' '18' '19' '20' '21' '22' '23' '24' '25' '26' '27'
 '28' '29' '30' '31' '32' '33' '34' '35' '36' '37' '38' '39' '40' '41'
 '42' '43' '44' '45' '46' '47' '48' '49' '50' '51' '52' '53' '54' '55'
 '56' '57' '58' '59' '60' '61' '62' '63' '64' '65' '66' '67' '68' '69'
 '70' '71' '72' '73' '74' '75' '76' '77' '78' '79' '80' '81' '82' '83'
 '84' '85' '86' '87' '88' '89' '90' '91' '92' '93' '94' '95' '96' '97'
 '98' '99' '100' '101' '102' '103' '104' '105' '106' '107' '108' '109'
 '110' '111' '112' '113' '114' '115' '116' '117' '118' '119' '120' '121'
 '122' '123' '124' '125' '126' '127' '128' '129' '130' '131' '132' '133'
 '134' '135' '136' '137' '138' '139' '140' '141' '142' '143' '144' '145'
 '146' '147' '148' '149' '150' '151' '152' '153' '154' '155' '156' '157'
 '158' '159' '160' '161' '162' '163' '164' '165' '166' '167' '168' '169'
 '170' '171' '172' '173' '174' '175' '176' '177' '178' '179' '18

In [17]:
data.isnull().values.any() # there is no NaN in the dataframe

False

#### outlier detection

In [18]:
# first, separate data columns into X-features and Y-output
x = data.iloc[:,1:256]     # responsible variable
y = data.iloc[:,257]     # predicting variable

In [19]:
# partial code adopted from Liya Li's STA141B and ECS171 final projects

warnings.filterwarnings('ignore')

#SVM 
od1 = svm.OneClassSVM(nu=0.1, kernel="rbf", gamma=0.1)
od1.fit(x)
ySVM = od1.predict(x)
outlierSVM = ySVM[ySVM == -1].size
print("Number of outliers detected with One Class SVM: " + str(outlierSVM))
    
#LOF 
od2 = LocalOutlierFactor(n_neighbors = 10, contamination = 0.1)
yLOF = od2.fit_predict(x)
outlierLOF = yLOF[yLOF == -1].size
print("Number of outliers detected with LOF: " + str(outlierLOF))

#Isolation Forest 
rng = np.random.RandomState(42)
od3 = IsolationForest(max_samples=100, random_state=rng, contamination=0.1)
od3.fit(x)
yIF = od3.predict(x)
outlierIF = yIF[yIF == -1].size
print("Number of outliers detected with : " + str(outlierIF))
    
#initializing lists to store indices of outliers
indexSVM = []
indexLOF = []
indexIF = []

#storing the indices of the outliers
for i in range(0, len(x)):
    if ySVM[i] == -1:
        indexSVM.append(i)
    if yLOF[i] == -1:
        indexLOF.append(i)
    if yIF[i] == -1:
        indexIF.append(i)

#finds the intersection of all 3 outlier methods
outliers = sorted(set(indexIF).intersection((set(indexSVM).intersection(indexLOF))))
print("")
print("Number of outliers that detected with SVM and LOF and IF in common is: " + str(len(sorted(set(indexIF).intersection((set(indexSVM).intersection(indexLOF)))))))
print("Their indexes are:")
print(outliers)

Number of outliers detected with One Class SVM: 1459
Number of outliers detected with LOF: 1500
Number of outliers detected with : 1500

Number of outliers that detected with SVM and LOF and IF in common is: 438
Their indexes are:
[19, 29, 49, 61, 70, 91, 124, 139, 145, 166, 179, 227, 301, 331, 337, 407, 425, 488, 497, 545, 568, 579, 675, 793, 802, 820, 870, 895, 909, 967, 970, 1057, 1066, 1069, 1072, 1076, 1114, 1137, 1146, 1183, 1202, 1235, 1247, 1303, 1321, 1338, 1359, 1404, 1421, 1425, 1449, 1470, 1488, 1522, 1539, 1547, 1573, 1578, 1657, 1669, 1699, 1709, 1724, 1729, 1742, 1771, 1788, 1790, 1807, 1828, 1843, 1849, 1910, 1920, 1944, 2084, 2126, 2152, 2214, 2253, 2267, 2292, 2300, 2313, 2324, 2328, 2350, 2403, 2436, 2473, 2527, 2610, 2675, 2685, 2706, 2718, 2796, 2811, 2924, 2927, 2929, 2958, 3004, 3044, 3108, 3119, 3143, 3279, 3294, 3315, 3415, 3421, 3448, 3507, 3521, 3618, 3620, 3652, 3786, 3818, 3821, 3851, 3862, 3868, 3869, 4061, 4086, 4096, 4118, 4128, 4228, 4255, 4258, 4260, 4

In [20]:
# delete outliers
newdata_no_outliers = data.drop(outliers)
newdata_no_outliers.head()

Unnamed: 0.1,Unnamed: 0,0,1,2,3,4,5,6,7,8,...,247,248,249,250,251,252,253,254,255,AdoptionSpeed
0,86e1089a3,0.002,0.1678,0.019715,0.015896,0.068162,0.002216,0.005042,0.004828,0.05076,...,0.176625,0.575706,1.088627,0.439557,0.52046,1.547071,0.832572,0.599095,0.763349,2
1,6296e909a,0.002858,0.10745,0.019916,0.023482,0.174765,0.002297,0.005031,0.006338,0.083378,...,0.686865,0.563999,0.96819,1.070276,1.545739,0.894411,0.838595,0.468236,0.916672,0
2,3422e4906,0.002734,0.072015,0.024455,0.018021,0.154207,0.001946,0.004211,0.001576,0.100046,...,0.557624,1.131405,0.720514,1.496672,0.870955,1.289682,1.184461,0.465113,0.892826,3
3,5842f1ff5,0.002106,0.274519,0.054815,0.013727,0.089969,0.00165,0.005506,0.004295,0.118727,...,0.326143,0.291668,1.608086,1.119176,1.470888,0.591445,0.832753,0.483021,1.134128,2
4,850a43f90,0.002185,0.174022,0.044818,0.016244,0.169775,0.002075,0.004421,0.004157,0.099671,...,0.669893,0.395784,0.886075,1.21973,1.033964,1.065685,0.304053,0.438069,0.676818,2


#### normalization

In [21]:
# apply normalization to x and name it Xnew
xnew = np.asarray(newdata_no_outliers.drop("Unnamed: 0", axis=1))
min_max_scaler = preprocessing.MinMaxScaler()
np_scaled = min_max_scaler.fit_transform(xnew)

X = pd.DataFrame(np_scaled)
Y = np.asarray(newdata_no_outliers["AdoptionSpeed"])
print(X.shape)
print(Y.shape)

(14555, 257)
(14555,)


#### model selection

In [22]:
# apply cross validation
# split normalized no outlier data into trainning and testing sets
validation_size = 0.20
seed = 10
X_train, X_test, Y_train, Y_test = model_selection.train_test_split(X, Y, test_size = validation_size, random_state = seed)

In [23]:
# linear regression
from sklearn import linear_model
from sklearn.metrics import mean_squared_error, r2_score

# Create linear regression object
print("Linear Regression: ")
regr = linear_model.LinearRegression().fit(X_train, Y_train)

# training error and accuracy
Y_pred = np.round(regr.predict(X_train))
print("MSE: " + str(mean_squared_error(Y_train, Y_pred)))

# testing error and accuracy
Y_pred = np.round(regr.predict(X_test))
print("MSE: " + str(mean_squared_error(Y_test, Y_pred)))  

Linear Regression: 
MSE: 0.0
MSE: 0.0


In [24]:
# polynomial regression
#from sklearn.preprocessing import PolynomialFeatures

# Create poly regression
#print("Polynomial Regression: ")
#poly = PolynomialFeatures(degree=2)
#X_train = poly.fit_transform(X_train)
#X_test = poly.fit_transform(X_test)
#regr = linear_model.LinearRegression().fit(X_train, Y_train)

# training error and accuracy
#Y_pred = np.round(regr.predict(X_train))
#print("MSE: " + str(mean_squared_error(Y_train, Y_pred)))

# testing error and accuracy
#Y_pred = np.round(regr.predict(X_test))
#print("MSE: " + str(mean_squared_error(Y_test, Y_pred)))

# doesn't work out due to the forever run-time

In [25]:
# logistic regression
from sklearn.linear_model import LogisticRegression

clf = LogisticRegression(solver='liblinear', multi_class='ovr').fit(X_train, Y_train.ravel())
print("Logistic Regression: ")
print("Training Set Accuracy: " + str(clf.score(X_train, Y_train)))
print("Testing Set Accuracy: " + str(clf.score(X_test, Y_test)))

Logistic Regression: 
Training Set Accuracy: 0.8928203366540708
Testing Set Accuracy: 0.8639642734455514


In [26]:
# LDA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
warnings.filterwarnings('ignore')

clf = LinearDiscriminantAnalysis().fit(X_train, Y_train.ravel())
print("Training Set Accuracy: " + str(clf.score(X_train, Y_train)))
print("Testing Set Accuracy: " + str(clf.score(X_test, Y_test)))

Training Set Accuracy: 0.4081071796633459
Testing Set Accuracy: 0.365166609412573


In [27]:
# SVM
from sklearn.svm import SVC

clf = svm.SVC(kernel='linear', C=1).fit(X_train, Y_train)
print("Training Set Accuracy: " + str(clf.score(X_train, Y_train)))
print("Testing Set Accuracy: " + str(clf.score(X_test, Y_test)))

Training Set Accuracy: 1.0
Testing Set Accuracy: 1.0


In [28]:
# random forest classifier
from sklearn.ensemble import RandomForestClassifier

clf = RandomForestClassifier(n_estimators=50, random_state=0).fit(X_train, Y_train)
print("Training Set Accuracy: " + str(clf.score(X_train, Y_train)))
print("Testing Set Accuracy: " + str(clf.score(X_test, Y_test)))

Training Set Accuracy: 1.0
Testing Set Accuracy: 0.917554105118516


In [29]:
# decision tree
from sklearn.tree import DecisionTreeClassifier

clf = DecisionTreeClassifier(max_leaf_nodes=20, random_state=0).fit(X_train, Y_train)
print("Training Set Accuracy: " + str(clf.score(X_train, Y_train)))
print("Testing Set Accuracy: " + str(clf.score(X_test, Y_test)))

Training Set Accuracy: 1.0
Testing Set Accuracy: 1.0
