In [1]:
from PIL import Image
import numpy as np
import os
from random import shuffle
import matplotlib.pyplot as plt

DIR = 'C:\Yelp\Train'

# Want to know how we should format the height x width image data dimensions
# for inputting to a keras model
def get_size_statistics():
    heights = []
    widths = []
    img_count = 0
    for img in os.listdir(DIR):
        path = os.path.join(DIR, img)
        if "DS_Store" not in path:
            data = np.array(Image.open(path))
            heights.append(data.shape[0])
            widths.append(data.shape[1])
            img_count += 1
    avg_height = sum(heights) / len(heights)
    avg_width = sum(widths) / len(widths)
    print("Average Height: " + str(avg_height))
    print("Max Height: " + str(max(heights)))
    print("Min Height: " + str(min(heights)))
    print('\n')
    print("Average Width: " + str(avg_width))
    print("Max Width: " + str(max(widths)))
    print("Min Width: " + str(min(widths)))
get_size_statistics()

Average Height: 383.958217270195
Max Height: 400
Min Height: 150


Average Width: 482.1309192200557
Max Width: 600
Min Width: 200


In [2]:
def label_img(name):
    word_label = name.split('(')[0]
    #print(word_label)
    if word_label == 'Burger ':  return 1
    elif word_label == 'pizza ': return 2
    elif word_label == 'Chicken ': return 3
    elif word_label == 'Sweet ': return 4
    elif word_label == 'Tacos ': return 5
    elif word_label == 'Drink ': return 6

In [3]:
IMG_SIZE = 256
def load_training_data():
    train_data = []
    for img in os.listdir(DIR):
        label = label_img(img)
        path = os.path.join(DIR, img)
        if "DS_Store" not in path:
            img = Image.open(path)
            img = img.convert('L')
            img = img.resize((IMG_SIZE, IMG_SIZE), Image.ANTIALIAS)
            train_data.append([np.array(img), label])
            X_train=np.array(img)
            Y_train=label
    shuffle(train_data)
    return train_data

In [7]:
train_data = np.array(load_training_data())
y=len(train_data)
X=[]
Y=[]

In [8]:
for i in range(y-1):
    img=np.array((train_data[i][0]).reshape(256*256))
    X.append(img)
    label=np.array(train_data[i][1])
    Y.append(label)
    #count=count+1
    #plt.imshow(some_image)
    #plt.show()

In [9]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, Y,
                                                    random_state=42,
                                                    test_size=0.20)

In [13]:
from sklearn.linear_model import SGDClassifier
sgd = SGDClassifier(random_state=12)
sgd.fit(X_train, y_train)



SGDClassifier(alpha=0.0001, average=False, class_weight=None,
       early_stopping=False, epsilon=0.1, eta0=0.0, fit_intercept=True,
       l1_ratio=0.15, learning_rate='optimal', loss='hinge', max_iter=None,
       n_iter=None, n_iter_no_change=5, n_jobs=None, penalty='l2',
       power_t=0.5, random_state=12, shuffle=True, tol=None,
       validation_fraction=0.1, verbose=0, warm_start=False)

In [14]:
print("model score: %.3f" % sgd.score(X_test,y_test))

model score: 0.306


In [28]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import PolynomialFeatures
from sklearn.svm import LinearSVC
polynomial_svm_clf=LinearSVC(random_state=42)
polynomial_svm_clf.fit(X_train, y_train)



LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=42, tol=0.0001,
     verbose=0)

In [29]:
print("model score: %.3f" % polynomial_svm_clf.score(X_test,y_test))

model score: 0.292


In [26]:
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier(n_neighbors = 6)
knn.fit(X_train, y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=None, n_neighbors=6, p=2,
           weights='uniform')

In [27]:
acc=knn.score(X_test, y_test)
acc

0.3888888888888889

In [23]:
from sklearn.ensemble import RandomForestClassifier
rand_clf=RandomForestClassifier(random_state=42)
rand_clf.fit(X_train,y_train)



RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=None,
            oob_score=False, random_state=42, verbose=0, warm_start=False)

In [24]:
acc=rand_clf.score(X_test, y_test)
acc

0.4027777777777778

In [31]:
from sklearn.tree import DecisionTreeClassifier
dec_clf=DecisionTreeClassifier(random_state=42)
dec_clf.fit(X_train,y_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=42,
            splitter='best')

In [32]:
acc=dec_clf.score(X_test, y_test)
acc

0.3472222222222222

In [36]:
from sklearn.naive_bayes import GaussianNB
gnb = GaussianNB()
gnb.fit(X_train,y_train)

GaussianNB(priors=None, var_smoothing=1e-09)

In [37]:
acc=gnb.score(X_test, y_test)
acc

0.2777777777777778

In [41]:
from sklearn.ensemble import VotingClassifier
voting_clf = VotingClassifier(
    estimators=[('sgd', sgd), ('rf', rand_clf) ,('dt', dec_clf),('svm', polynomial_svm_clf),('knn', knn)],
    voting='hard')
voting_clf.fit(X_train, y_train)



VotingClassifier(estimators=[('sgd', SGDClassifier(alpha=0.0001, average=False, class_weight=None,
       early_stopping=False, epsilon=0.1, eta0=0.0, fit_intercept=True,
       l1_ratio=0.15, learning_rate='optimal', loss='hinge', max_iter=None,
       n_iter=None, n_iter_no_change=5, n_jobs=None, penalty='l2',
   ...ki',
           metric_params=None, n_jobs=None, n_neighbors=6, p=2,
           weights='uniform'))],
         flatten_transform=None, n_jobs=None, voting='hard', weights=None)

In [42]:
acc=voting_clf.score(X_test, y_test)
acc

0.4027777777777778