# Machine Learning Engineer Nanodegree


## Project: Yelp Restaurant Photo Classification


### Import Libraries

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import h5py
import sys

import caffe
import os
import time

### Load the Caffe Pre-trained Reference Model

In [2]:
# Define caffe and dataset root
caffe_root = '/home/ubuntu/src/caffe/'
dataset_root = '/home/ubuntu/yelp_classification/data/'

test_dataset_root = '/home/ubuntu/'

# Using python, so insert python into caffe root
sys.path.insert(0, caffe_root + 'python')

# Check to see if CaffeNet is already downloaded, otherwise download it
if os.path.isfile(caffe_root + 'models/bvlc_reference_caffenet/bvlc_reference_caffenet.caffemodel'):
    print('CaffeNet found.')
else:
    print('Downloading pre-trained CaffeNet model...')
    !/home/ubuntu/src/caffe/scripts/download_model_binary.py /home/ubuntu/src/caffe/models/bvlc_reference_caffenet

CaffeNet found.


In [3]:
## Use GPU    
caffe.set_device(0)
caffe.set_mode_gpu()

### Get Image features from the second last layer ('fc7') of CaffeNet

In [4]:
def load_features(image_list, layer = 'fc7'):
    
    #######################################################################################
    # Reference: https://github.com/BVLC/caffe/blob/master/examples/00-classification.ipynb
    #######################################################################################

    # BVLC Caffenet model definition (layers etc)
    proto_file = caffe_root + 'models/bvlc_reference_caffenet/deploy.prototxt'
    
    # BVLC Caffenet learned model weights
    caffemodel = caffe_root + 'models/bvlc_reference_caffenet/bvlc_reference_caffenet.caffemodel'

    # Create a caffe 'Net'
    caffe_net = caffe.Net(proto_file, caffemodel, caffe.TEST)
    
    
    
    ## Pre-process the input images in form Caffe expects
    
    # Mean from image net
    mean_imagenet = np.load(caffe_root + 'python/caffe/imagenet/ilsvrc_2012_mean.npy')
    
    # average over pixels to obtain the mean (BGR) pixel values
    mean = mean_imagenet.mean(1).mean(1)
    
    
    # Create a transformer for loading images in form of caffenet and name it 'data'
    
    # By default the caffe transformer returns data shape as (10, 3, 227, 227).
    # This is because 10 random 227x227 crops are supposed to be extracted from a 256x256 image 
    # and passed through the net.
    transformer = caffe.io.Transformer({'data': caffe_net.blobs['data'].data.shape})
    
    # Transform image channels, input is (HxWxC) while caffe expects (CxHxW)
    transformer.set_transpose('data', (2,0,1))

    # subtract the dataset-mean value in each channel
    transformer.set_mean('data', mean)
    
    # rescale image data from [0, 1] to [0, 255] as caffe operates on images in range [0, 255] 
    transformer.set_raw_scale('data', 255)
    
    # Caffe expects images in BGR format while input is in RGB format, so swap it
    transformer.set_channel_swap('data', (2,1,0))

    
    
    total_images = len(image_list)
    
    # set the size of the input (batch_size, channel, height, width)
    caffe_net.blobs['data'].reshape(total_images, 3, 227, 227)
    
    # Load the images and transform them and save it in memory
    caffe_net.blobs['data'].data[...] = map(lambda img: transformer.preprocess('data', caffe.io.load_image(img)), image_list)
    
    ### perform classification using BVLC reference model 'caffe_net' created earlier
    out = caffe_net.forward()

    # Return the 'fc7' layer weights features
    return caffe_net.blobs[layer].data

### Extract features from training images and save it

In [36]:
# Save the extracted features for both test and train images for later use using H5PY library

# Read and close file first
h5py_train_file = h5py.File(data_loc + 'train_images_fc7_features.h5', 'r+')
h5py_train_file.close()



# Open a h5py file for writing training features extracted using bvlc reference model
h5py_train_file = h5py.File(data_loc + 'train_images_fc7_features.h5','w')

# Create dataset for image name
h5py_train_img_name = h5py_train_file.create_dataset('image_name', (0,), maxshape=(None,), dtype='|S54')

# Create dataset for image features
# Save features returned from bvlc fc7 layer (4096 features) to h5py file
h5py_train_img_feature = h5py_train_file.create_dataset('image_feature', (0,4096), maxshape = (None,4096))

h5py_train_file.close()

print("Created Train image features file!")


Created Train image features file!


In [37]:
## Load mapping of Training Photos to Business IDs
train_photos_to_business_id = pd.read_csv(data_loc + 'train_photo_to_biz_ids.csv')


# Get the Image full path using photo list from csv file
train_images = [os.path.join(data_loc + 'train_photos/', str(x) + '.jpg') for x in train_photos_to_business_id['photo_id']]

total_train_images = len(train_images)
print("Total training images: {}".format(total_train_images))


Total training images: 234842


In [38]:
# Batch size to process image features
batch_size = 500

# Get the features for training Images
for count in range(0, total_train_images, batch_size):
    
    # Get the image path for this batch of images
    image_path = train_images[count: min((count + batch_size), total_train_images)]
    
    # Get features for all batch images
    train_img_features = load_features(image_path, layer='fc7')
    
    total_processed = count + train_img_features.shape[0]

    
    # Open the h5 file to store the image features for future use
    h5py_train_file = h5py.File(data_loc + 'train_images_fc7_features.h5','r+')
    
    h5py_train_file['image_name'].resize((total_processed,))
    h5py_train_file['image_name'][count : total_processed] = np.array(image_path)
    
    h5py_train_file['image_feature'].resize((total_processed, train_img_features.shape[1]))
    h5py_train_file['image_feature'][count : total_processed, :] = train_img_features
    h5py_train_file.close()

    if (total_processed % 10000) == 0 or total_processed == total_train_images:
        print("Total images processed: {}".format(total_processed))

Total images processed: 10000
Total images processed: 20000
Total images processed: 30000
Total images processed: 40000
Total images processed: 50000
Total images processed: 60000
Total images processed: 70000
Total images processed: 80000
Total images processed: 90000
Total images processed: 100000
Total images processed: 110000
Total images processed: 120000
Total images processed: 130000
Total images processed: 140000
Total images processed: 150000
Total images processed: 160000
Total images processed: 170000
Total images processed: 180000
Total images processed: 190000
Total images processed: 200000
Total images processed: 210000
Total images processed: 220000
Total images processed: 230000
Total images processed: 234842


### Test training features extracted correctly

In [41]:
# Test features loaded correctly in h5py file
h5py_train_file = h5py.File(data_loc + 'train_images_fc7_features.h5','r')


print('File: train_images_fc7_features.h5:')
for key in h5py_train_file.keys():
    print key, h5py_train_file[key].shape
    
print("A photo: {}".format(h5py_train_file['image_name'][0]))

print("Its feature vector: {}".format(h5py_train_file['image_feature'][0][0:5]))

h5py_train_file.close()

 train_images_fc7_features.h5:
image_feature (234842, 4096)
image_name (234842,)

A photo: /home/ubuntu/yelp_classification/data/train_photos/204
Its feature vector (first 10-dim):  [ 0.          0.          0.          0.          0.          0.10489184
  0.          0.          0.          0.        ]  ...


### Extract features from Testing images and save it

In [5]:
# Save the extracted features for both test and train images for later use using H5PY library

# Read and close file first
h5py_test_file = h5py.File(dataset_root + 'test_images_fc7_features.h5', 'r+')
h5py_test_file.close()



# Open a h5py file for writing testing features extracted using bvlc reference model
h5py_test_file = h5py.File(dataset_root + 'test_images_fc7_features.h5','w')

# Create dataset for image name
h5py_test_img_name = h5py_test_file.create_dataset('image_name', (0,), maxshape=(None,), dtype='|S54')

# Create dataset for image features
# Save features returned from bvlc fc7 layer (4096 features) to h5py file
h5py_test_img_feature = h5py_test_file.create_dataset('image_feature', (0,4096), maxshape = (None,4096))

h5py_test_file.close()



print("Created Test image features file!")


Created Test image features file!


In [6]:
## Load mapping of Testing Photos to Business IDs
test_photos_to_business_id = pd.read_csv(dataset_root + 'test_photo_to_biz.csv')
print("Total test photos to business id: {}".format(len(test_photos_to_business_id)))

print("Total unique test photos: {}".format(len(test_photos_to_business_id['photo_id'].unique())))


# Get the Image full path using photo list from csv file
test_images = [os.path.join(test_dataset_root + 'test_photos/', str(x) + '.jpg') for x in test_photos_to_business_id['photo_id'].unique()]

total_test_images = len(test_images)
print("Total test images: {}".format(total_test_images))


Total test photos to business id: 1190225
Total unique test photos: 237152
Total test images: 237152


In [8]:
for test_img in test_images:
    if "317818" in test_img:
        print('Found: {}'.format(test_img))

Found: /home/ubuntu/test_photos/317818.jpg


In [9]:
# Read and close file first
#h5py_test_file = h5py.File(dataset_root + 'testing_h5py_file.h5', 'r+')
#h5py_test_file.close()



# Open a h5py file for writing testing features extracted using bvlc reference model
#h5py_test_file = h5py.File(dataset_root + 'testing_h5py_file.h5','w')

# Create dataset for image name
#h5py_test_img_name = h5py_test_file.create_dataset('image_name', (0,), maxshape=(None,), dtype='|S54')

# Create dataset for image features
# Save features returned from bvlc fc7 layer (4096 features) to h5py file
#h5py_test_img_feature = h5py_test_file.create_dataset('image_feature', (0,4096), maxshape = (None,4096))

#h5py_test_file.close()



#print("Created Test image features file!")

In [10]:




#test_images = ['/home/ubuntu/test_photos/317818.jpg', '/home/ubuntu/test_photos/306715.jpg']
#total_test_images = len(test_images)
#print("Total test images: {}".format(total_test_images))


# Batch size to process image features
#batch_size = 500

# Get the features for testing Images
#for count in range(0, (batch_size+1), batch_size):
    
    #print('\nCount: {}'.format(count))
    # Get the image path for this batch of images
    #image_path = test_images[count: min((count + batch_size), total_test_images)]
    
    # Get features for all batch images
    #test_img_features = load_features(image_path, layer='fc7')
    
    #for img in image_path:
    #    if "317818" in img:
    #        print('##################################### Found: {}'.format(img))
        
    #total_processed = (count + test_img_features.shape[0])

    #print("***************************************************************")
    #print('Count: {}'.format(count))
    #print("Total: {}".format(total_processed))
    #print("Feature shape: {}".format(test_img_features.shape))
    
    # Open the h5 file to store the image features for future use
    #h5py_test_file = h5py.File(dataset_root + 'testing_h5py_file.h5','r+')
    
    #h5py_test_file['image_name'].resize((total_processed,))
    #h5py_test_file['image_name'][count : total_processed] = np.array(image_path)
    
    #print("H5PY file image name: {}".format(h5py_test_file['image_name'][count : total_processed]))
    
    #h5py_test_file.close()


    #if (total_processed % 1000) == 0 or total_processed == total_test_images:
    #    print("Total test images processed: {}".format(total_processed))
        
        
#print('Done!!')   
        


In [11]:

# Batch size to process image features
batch_size = 500

# Get the features for testing Images
for count in range(0, total_test_images, batch_size):
    
    # Get the image path for this batch of images
    image_path = test_images[count: min((count + batch_size), total_test_images)]
    
    # Get features for all batch images
    test_img_features = load_features(image_path, layer='fc7')
    
    total_processed = (count + test_img_features.shape[0])

    
    # Open the h5 file to store the image features for future use
    h5py_test_file = h5py.File(dataset_root + 'test_images_fc7_features.h5','r+')
    
    h5py_test_file['image_name'].resize((total_processed,))
    h5py_test_file['image_name'][count : total_processed] = np.array(image_path)
    
    h5py_test_file['image_feature'].resize((total_processed, test_img_features.shape[1]))
    h5py_test_file['image_feature'][count : total_processed, :] = test_img_features
    h5py_test_file.close()

    if (total_processed % 1000) == 0 or total_processed == total_test_images:
        print("Total test images processed: {}".format(total_processed))
        
print('Done!!')

  warn("The default mode, 'constant', will be changed to 'reflect' in "


Total test images processed: 1000
Total test images processed: 2000
Total test images processed: 3000
Total test images processed: 4000
Total test images processed: 5000
Total test images processed: 6000
Total test images processed: 7000
Total test images processed: 8000
Total test images processed: 9000
Total test images processed: 10000
Total test images processed: 11000
Total test images processed: 12000
Total test images processed: 13000
Total test images processed: 14000
Total test images processed: 15000
Total test images processed: 16000
Total test images processed: 17000
Total test images processed: 18000
Total test images processed: 19000
Total test images processed: 20000
Total test images processed: 21000
Total test images processed: 22000
Total test images processed: 23000
Total test images processed: 24000
Total test images processed: 25000
Total test images processed: 26000
Total test images processed: 27000
Total test images processed: 28000
Total test images processed: 

Total test images processed: 232000
Total test images processed: 233000
Total test images processed: 234000
Total test images processed: 235000
Total test images processed: 236000
Total test images processed: 237000
Total test images processed: 237152
Done!!


### Test testing features extracted correctly

In [12]:
# Test features loaded correctly in h5py file
h5py_test_file = h5py.File(dataset_root + 'test_images_fc7_features.h5','r')

print('File: test_images_fc7_features.h5:')
for key in h5py_test_file.keys():
    print key, h5py_test_file[key].shape
    
print("A photo: {}".format(h5py_test_file['image_name'][0]))

print("Its feature vector: {}".format(h5py_test_file['image_feature'][0][0:15]))

h5py_test_file.close()

File: test_images_fc7_features.h5:
image_feature (237152, 4096)
image_name (237152,)
A photo: /home/ubuntu/test_photos/317818.jpg
Its feature vector: [ 0.          0.          0.          0.          0.          4.29739428
  3.13467836  0.227485    0.          0.          0.          0.          0.
  0.          0.        ]


### Associate train image features to each business

In [13]:
## Load mapping of Training Photos to Business IDs
train_photos_to_business_id = pd.read_csv(dataset_root + 'train_photo_to_biz_ids.csv')
#print(train_photos_to_business_id[:5])


# Dropping business with no labels
y_training_labels = pd.read_csv(dataset_root + 'train.csv').dropna()
#print(training_labels[:5])


# Convert space delimitted labels into tuples
y_training_labels['labels'] = y_training_labels['labels'].apply(lambda labels: tuple(sorted(int(label) for label in labels.split())))
#print(training_labels[:5])


# Set the main index of labels to business_id
y_training_labels.set_index('business_id', inplace=True)
#print("Training data after setting index to business id")
#print(training_labels[:5])


# Get the unique businesses in train file
train_business_ids = y_training_labels.index.unique()
#print("Business id")
#print(business_ids[:5])
#print('Total businesses: {}'.format(len(business_ids)))



# Load the training features from h5 file
X_train_features_file = h5py.File(dataset_root + 'train_images_fc7_features.h5', 'r')
X_train_features = np.copy(X_train_features_file['image_feature'])
X_train_features_file.close()



# Create a new DataFrame to store business, their features and labels into file
train_data_frame = pd.DataFrame(columns=['business_id', 'label', 'features'])

print('Done!')

Done!


In [14]:

count = 0

for business in train_business_ids:
    if count < 1:
        print('Business: {}'.format(business))
    
    
    label_from_business = y_training_labels.loc[business]['labels']
    if count < 1:
        print('Labels from business: {}'.format(label_from_business))
        
    business_list = train_photos_to_business_id[train_photos_to_business_id['business_id'] == business]
    if count < 1:
        print('business_list:\n')
        print(business_list)
        
    business_list_index = train_photos_to_business_id[train_photos_to_business_id['business_id'] == business].index
    if count < 1:
        print('business_list_index:\n')
        print(business_list_index)
    
    image_list = train_photos_to_business_id[train_photos_to_business_id['business_id'] == business].index.tolist()
    if count < 1:
        print('Image_list:\n')
        print(image_list)
        
    feature_list = X_train_features[image_list]
    if count < 1:
        print('feature_list:\n')
        print(feature_list)
        
    mean_feature = list(np.mean(feature_list, axis=0))
    if count < 1:
        print('mean_feature:\n')
        print(mean_feature)
        
    train_data_frame.loc[count] = [business, label_from_business, mean_feature]
    
    count += 1
    
    if (count == len(train_business_ids)):
        print('Done pass: {}'.format(count))

        
# Store train data frame to file
with open(dataset_root + "train_business_label_fc7_features.csv", 'w') as f:  
    train_data_frame.to_csv(f, index=False)


Business: 1000
Labels from business: (1, 2, 3, 4, 5, 6, 7)
business_list:

        photo_id  business_id
11057     438623         1000
18543     325966         1000
18544     227692         1000
18545     407856         1000
18546     368729         1000
18547     163193         1000
29232     287807         1000
29233     360032         1000
29234      31118         1000
29235     225910         1000
29237     223113         1000
52957      14033         1000
52958      84772         1000
52959      94438         1000
86633      68029         1000
94718     253623         1000
111849    366693         1000
113042    291366         1000
113043    449614         1000
113108    358271         1000
113109    340255         1000
113110    260929         1000
119222     15749         1000
119223     86549         1000
139779    402724         1000
139780    147584         1000
139781    322375         1000
139790     93860         1000
140469    342004         1000
149462    458240         

Done pass: 1996
First five features
   business_id                  label  \
0         1000  (1, 2, 3, 4, 5, 6, 7)   
1         1001           (0, 1, 6, 8)   
2          100     (1, 2, 4, 5, 6, 7)   
3         1006        (1, 2, 4, 5, 6)   
4         1010              (0, 6, 8)   

                                            features  
0  [0.19977085, 0.43287092, 0.22732987, 0.3551694...  
1  [0.0, 0.58893245, 0.53906047, 0.17221628, 0.01...  
2  [0.11155061, 0.034822084, 0.12025276, 0.520122...  
3  [0.078059338, 0.054452635, 0.05638162, 0.69423...  
4  [0.39657032, 0.27962369, 0.0, 0.17205141, 0.36...  
Total length of features
1996


In [4]:
# Test if the features saved correctly
train_feature_csv = pd.read_csv(dataset_root + 'train_business_label_fc7_features.csv')

print('First five features')
print(train_feature_csv[:5])
print('Total length of features')
print(len(train_feature_csv))

First five features
   business_id                  label  \
0         1000  (1, 2, 3, 4, 5, 6, 7)   
1         1001           (0, 1, 6, 8)   
2          100     (1, 2, 4, 5, 6, 7)   
3         1006        (1, 2, 4, 5, 6)   
4         1010              (0, 6, 8)   

                                            features  
0  [0.19977085, 0.43287092, 0.22732987, 0.3551694...  
1  [0.0, 0.58893245, 0.53906047, 0.17221628, 0.01...  
2  [0.11155061, 0.034822084, 0.12025276, 0.520122...  
3  [0.078059338, 0.054452635, 0.05638162, 0.69423...  
4  [0.39657032, 0.27962369, 0.0, 0.17205141, 0.36...  
Total length of features
1996


### Associate testing image features to each business

In [4]:
## Load mapping of testing Photos to Business IDs
test_photo_to_business_id = pd.read_csv(dataset_root + 'test_photo_to_biz.csv')
#print(test_photo_to_business_id[:5])

test_business_ids = test_photo_to_business_id['business_id'].unique()



# Load the testing features from h5 file
X_test_features_file = h5py.File(dataset_root + 'test_images_fc7_features.h5', 'r')
X_test_image_name = list(np.copy(X_test_features_file['image_name']))
X_test_image_features = np.copy(X_test_features_file['image_feature'])
X_test_features_file.close()

print('Done!')

Done!


In [5]:
count = 0
for train_list_imgs in X_test_image_name:
    #print(train_list_imgs)
    if "317818" in train_list_imgs:
        #print(train_list_imgs)
        count += 1

print(count)

1


In [6]:
print(X_test_image_name[:5])

X_test_image_name_short = [name.split('/')[-1] for name in X_test_image_name]

print(X_test_image_name_short[:5])

X_test_image_name_short_without_ext = [name.split('.')[0] for name in X_test_image_name_short]

print(X_test_image_name_short_without_ext[:5])

['/home/ubuntu/test_photos/317818.jpg', '/home/ubuntu/test_photos/30679.jpg', '/home/ubuntu/test_photos/455084.jpg', '/home/ubuntu/test_photos/371381.jpg', '/home/ubuntu/test_photos/86224.jpg']
['317818.jpg', '30679.jpg', '455084.jpg', '371381.jpg', '86224.jpg']
['317818', '30679', '455084', '371381', '86224']


In [7]:
test_data_frame = pd.DataFrame(columns=['business_id', 'features'])
count = 0
t = time.time()

for business in test_business_ids:     
    
    #print('Processing business ID: {}'.format(business))
    
    photo_ids = test_photo_to_business_id[test_photo_to_business_id['business_id'] == business]['photo_id'].tolist()  
    
    #print('Photo IDs: {}'.format(photo_ids))
    
    #for ph in photo_ids:
    #    print('Photo: {}'.format(ph))
    #    print('X test: {}'.format(X_test_image_name_short_without_ext.index(str(ph))))
    
    image_index = [X_test_image_name_short_without_ext.index(str(photo)) for photo in photo_ids]
               
    test_features = X_test_image_features[image_index]
    
    test_mean_feature = list(np.mean(test_features, axis=0))

    test_data_frame.loc[count] = [business, test_mean_feature]
    
    count += 1
    
    if (count % 100) == 0:
        print "Buisness processed: ", count, "Time passed: ", "{0:.1f}".format(time.time()-t), "sec"


with open(dataset_root+"test_business_fc7_features.csv",'w') as f:  
    test_data_frame.to_csv(f, index=False)
    
print('Done!')

Buisness processed:  100 Time passed:  11.9 sec
Buisness processed:  200 Time passed:  26.6 sec
Buisness processed:  300 Time passed:  44.1 sec
Buisness processed:  400 Time passed:  60.6 sec
Buisness processed:  500 Time passed:  79.8 sec
Buisness processed:  600 Time passed:  103.4 sec
Buisness processed:  700 Time passed:  127.2 sec
Buisness processed:  800 Time passed:  153.0 sec
Buisness processed:  900 Time passed:  177.6 sec
Buisness processed:  1000 Time passed:  199.4 sec
Buisness processed:  1100 Time passed:  228.3 sec
Buisness processed:  1200 Time passed:  256.1 sec
Buisness processed:  1300 Time passed:  285.2 sec
Buisness processed:  1400 Time passed:  317.3 sec
Buisness processed:  1500 Time passed:  348.1 sec
Buisness processed:  1600 Time passed:  378.3 sec
Buisness processed:  1700 Time passed:  409.8 sec
Buisness processed:  1800 Time passed:  441.1 sec
Buisness processed:  1900 Time passed:  474.1 sec
Buisness processed:  2000 Time passed:  503.7 sec
Buisness proce

In [5]:
# Check file content
test_feature_csv = pd.read_csv(dataset_root+'test_business_fc7_features.csv')
print test_feature_csv.shape
test_feature_csv[0:5]

(10000, 2)


Unnamed: 0,business_id,features
0,003sg,"[0.19304767, 0.25836322, 0.19439411, 0.4623304..."
1,00er5,"[0.19397034, 0.25547439, 0.18416163, 0.3357919..."
2,00kad,"[0.12130528, 0.12655617, 0.076521836, 0.383440..."
3,00mc6,"[0.28427792, 0.11110595, 0.47849005, 0.4494445..."
4,00q7x,"[0.23811768, 0.33041945, 0.25544992, 0.3258045..."


### Use classifier to train on training dataset

In [7]:
X_train_values = train_feature_csv['features'].values
y_train_values = train_feature_csv['label'].values

X_test_values = test_feature_csv['features'].values

In [8]:
print(X_train_values)

[ '[0.19977085, 0.43287092, 0.22732987, 0.35516945, 0.71615601, 0.48242164, 0.25700608, 0.42042705, 0.11370455, 0.46246436, 0.16720486, 0.054374058, 0.0907932, 0.020175731, 0.18348682, 0.30969673, 0.41197327, 0.0, 0.87686944, 0.10616812, 0.18531185, 0.43613097, 0.13220009, 0.3020573, 0.10058945, 0.17077933, 0.12445344, 0.6613254, 0.2930122, 2.3265631, 0.86510044, 0.09295211, 0.89230907, 0.2286039, 0.053337146, 0.32354406, 0.2988604, 0.60970831, 0.32721448, 0.060392652, 0.28084314, 0.59252781, 0.080102488, 1.0434812, 0.34833702, 0.3408621, 0.065701328, 0.17441741, 0.054028537, 0.99625564, 0.1301274, 0.44597104, 0.082854711, 0.15751827, 0.03807757, 0.4758403, 0.92956358, 0.44328284, 0.43603846, 0.3303746, 0.18727754, 0.37221453, 0.40047896, 0.23861945, 0.27623084, 0.2473051, 0.071081884, 0.26580697, 1.1922792, 0.43229681, 0.12081116, 0.44193053, 0.14388748, 1.0380439, 0.18919745, 0.15494864, 0.07529825, 1.0149684, 0.17115878, 0.3605091, 0.20187415, 0.049020406, 0.64571923, 0.60650545, 0.

In [9]:
print(y_train_values)

['(1, 2, 3, 4, 5, 6, 7)' '(0, 1, 6, 8)' '(1, 2, 4, 5, 6, 7)' ..., '(8,)'
 '(1, 2, 4, 5, 6, 7)' '(1, 2, 5, 6, 7)']


In [10]:
print(X_test_values)

[ '[0.19304767, 0.25836322, 0.19439411, 0.46233049, 0.75282729, 0.86500871, 0.58400166, 0.41278434, 0.23449641, 0.60001111, 0.29269242, 0.080840528, 0.03550576, 0.041964535, 0.14465538, 0.38474914, 0.17774166, 0.11074769, 0.23987161, 0.073897474, 0.58541787, 0.53519577, 0.10356888, 0.11838695, 0.10963564, 0.15986352, 0.14954118, 0.78952217, 0.77878922, 2.9940112, 0.41818535, 0.062092911, 1.0142523, 0.18290678, 0.094595335, 0.16363935, 0.61039197, 0.81394053, 0.13072613, 0.10169758, 0.19369426, 0.50034541, 0.036453877, 0.33766815, 0.53862005, 0.15992759, 0.1732109, 0.14597175, 0.11181356, 0.36157838, 0.15477332, 0.28538972, 0.12908757, 0.37431863, 0.090589203, 0.46309793, 0.72369808, 0.58581841, 0.44954219, 0.28005141, 0.17767884, 0.35141066, 0.42937452, 0.19116759, 0.24073179, 0.28265914, 0.14762995, 0.19889295, 1.2787806, 0.1141357, 0.15681113, 0.26659298, 0.27972651, 0.66890115, 0.10893608, 0.2155828, 0.036637284, 1.0914571, 0.28140718, 0.2174443, 0.32902035, 0.16297308, 0.66946626, 

In [24]:
def convert_label_to_array(str_label):
    str_label = str_label[1:-1]
    str_label = str_label.split(',')
    return [int(x) for x in str_label if len(x)>0]


def convert_feature_to_vector(str_feature):
    str_feature = str_feature[1:-1]
    str_feature = str_feature.split(',')
    return [float(x) for x in str_feature]

In [25]:
X_train_values = np.array([convert_feature_to_vector(y) for y in train_feature_csv['features']])
print(X_train_values)

[[ 0.19977085  0.43287092  0.22732987 ...,  0.17437069  0.03517938
   0.09370409]
 [ 0.          0.58893245  0.53906047 ...,  0.28808415  0.19681689  0.        ]
 [ 0.11155061  0.03482208  0.12025276 ...,  0.20248774  0.26140007
   0.00779067]
 ..., 
 [ 0.13277125  0.56326318  0.26168296 ...,  0.1270705   0.47909904
   0.11014513]
 [ 0.10679593  0.17048742  0.08465095 ...,  0.12400218  0.0319413
   0.06920414]
 [ 0.          0.27781719  0.1645885  ...,  0.09692855  0.17744553
   0.03642482]]


In [26]:
y_train_values = np.array([convert_label_to_array(y) for y in train_feature_csv['label']])

In [27]:
print(y_train_values)

[list([1, 2, 3, 4, 5, 6, 7]) list([0, 1, 6, 8]) list([1, 2, 4, 5, 6, 7])
 ..., list([8]) list([1, 2, 4, 5, 6, 7]) list([1, 2, 5, 6, 7])]


In [28]:
X_test_values = np.array([convert_feature_to_vector(y) for y in test_feature_csv['features']])
print(X_test_values)

[[ 0.19304767  0.25836322  0.19439411 ...,  0.13297254  0.15461819
   0.10284887]
 [ 0.19397034  0.25547439  0.18416163 ...,  0.11142682  0.18038574
   0.24168471]
 [ 0.12130528  0.12655617  0.07652184 ...,  0.11565144  0.27574864
   0.19833703]
 ..., 
 [ 0.11020086  0.18334791  0.08339857 ...,  0.18644631  0.1662205
   0.19536377]
 [ 0.16489188  0.19207232  0.08469325 ...,  0.15259239  0.19171791
   0.11155545]
 [ 0.          0.08239122  0.01744942 ...,  0.02600032  0.8504824
   0.55064517]]


In [29]:


from sklearn import svm, datasets
from sklearn.cross_validation import train_test_split
from sklearn.preprocessing import label_binarize
from sklearn.multiclass import OneVsRestClassifier
from sklearn.preprocessing import MultiLabelBinarizer

import time
t=time.time()

mlb = MultiLabelBinarizer()
#Convert list of labels to binary matrix
y_train_one_hot_encoded = mlb.fit_transform(y_train_values)

random_state = np.random.RandomState(0)

X_train, X_test, y_train, y_test = train_test_split(X_train_values, y_train_one_hot_encoded, test_size=.25, random_state=random_state)

classifier = OneVsRestClassifier(svm.SVC(kernel='linear', probability=True))

classifier.fit(X_train, y_train)

y_predict = classifier.predict(X_test)

print "Time passed: ", "{0:.1f}".format(time.time()-t), "sec"





Time passed:  289.3 sec


In [30]:
from sklearn.metrics import f1_score

print "F1 score: ", f1_score(y_test, y_predict, average='micro') 
print "Individual Class F1 score: ", f1_score(y_test, y_predict, average=None)

F1 score:  0.801305970149
Individual Class F1 score:  [ 0.65882353  0.79268293  0.83730159  0.6185567   0.74074074  0.84848485
  0.91691395  0.76056338  0.87254902]


In [37]:
y_predict = classifier.predict(X_test_values)

#print list(mlb.classes_)
y_predict_label = mlb.inverse_transform(y_predict) #Convert binary matrix back to labels

In [38]:
print('Done!')

Done!


In [39]:
print(len(y_predict))
print(len(y_predict_label))

10000
10000


In [41]:
final_test_df = pd.DataFrame(columns=['business_id', 'labels'])

for i in range(len(test_feature_csv)):
    
    biz = test_feature_csv.loc[i]['business_id']
    label = y_predict_label[i]
    label = str(label)[1:-1].replace(",", " ")
    final_test_df.loc[i] = [str(biz), label]

with open(dataset_root+"submission_fc7.csv",'w') as f:
    final_test_df.to_csv(f, index=False)
    
print('Done')

Done
