In [2]:
%matplotlib inline

import numpy as np

import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.offsetbox import OffsetImage, AnnotationBbox
import glob
import os
import sys
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn import svm
from sklearn import tree
from sklearn.cluster import KMeans
import cv2
import time


In [3]:
start_time = time.time()

# appending all labels from data set
labels_total = []
img_dir=glob.glob('.../Training/*')
labels_total={d:i for i,d in enumerate(np.unique([label.split("/")[-1] for label in img_dir]))}

print("--- %s seconds ---" % (time.time() - start_time))

--- 0.007411003112792969 seconds ---


In [4]:
start_time = time.time()

# upload train data from local folder and separte them into two arrays(images, labels)
images_train = []
labels_train = []
img_dir=glob.glob('.../Training/*')

for label in img_dir:
    fruit_label = label.split("/")[-1]
    
    for img_dir in glob.glob(os.path.join(label, "*.jpg")):
        labels_train.append(labels_total[fruit_label])
        image = cv2.imread(img_dir)
        image = cv2.resize(image, (45,45))
        image = cv2.cvtColor(image,cv2.COLOR_BGR2RGB)
        images_train.append(image.flatten())

images_train = np.array(images_train)
labels_train = np.array(labels_train)

print("--- %s seconds ---" % (time.time() - start_time))

--- 9.064826011657715 seconds ---


In [5]:
print("the number of train images : {}".format(labels_train.shape[0]))

the number of train images : 28736


In [6]:
start_time = time.time()

# upload test data from local folder and separte them into two arrays(images, labels)
images_test = []
labels_test = []
img_dir2 = glob.glob('.../Validation/*')

for label in img_dir2:
    fruit_label = label.split("/")[-1]
    
    for img_dir2 in glob.glob(os.path.join(label, "*.jpg")):
        labels_test.append(labels_total[fruit_label])
        image = cv2.imread(img_dir2)
        image = cv2.resize(image, (45,45))
        image = cv2.cvtColor(image,cv2.COLOR_BGR2RGB)
        images_test.append(image.flatten())

images_test = np.array(images_test)
labels_test = np.array(labels_test)


print("--- %s seconds ---" % (time.time() - start_time))

--- 2.9973440170288086 seconds ---


In [7]:
print("the number of train images : {}".format(labels_test.shape[0]))

the number of train images : 9673


In [8]:
# create RandomForest classifier
forest = RandomForestClassifier(n_estimators=50)

In [9]:
start_time = time.time()

# fit train model using Random Forest
forest = forest.fit(images_train, labels_train)

print("--- %s seconds ---" % (time.time() - start_time))

--- 48.95303821563721 seconds ---


In [91]:
forest

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=50, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [20]:
# calculate accuracy score
test_predict = forest.predict(images_test)
rf_accuracy = accuracy_score(test_predict, labels_test)

print("Random Forest accuracy score : {:.2f}".format(rf_accuracy))

Random Forest accuracy score : 0.96


In [12]:
start_time = time.time()

# create Decision Tree Classfier
tree_clf = tree.DecisionTreeClassifier()
tree_fit = tree_clf.fit(images_train, labels_train)

print("--- %s seconds ---" % (time.time() - start_time))

--- 108.29985785484314 seconds ---


In [19]:
start_time = time.time()

tree_prediction = tree_fit.predict(images_test)
tree_accuracy = accuracy_score(tree_prediction, labels_test)

print("Decision Tree accuracy score : {:.2f}".format(tree_accuracy))
print("--- %s seconds ---" % (time.time() - start_time))


Decision Tree accuracy score : 0.78


In [22]:
start_time = time.time()

# K-Means for clustering certain fruits into groups by its look

kmean_clf = KMeans(n_clusters = 15).fit(images_train)
print("--- %s seconds ---" % (time.time() - start_time))

--- 204.49792408943176 seconds ---


In [23]:
kmean_clf

KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=300,
    n_clusters=15, n_init=10, n_jobs=1, precompute_distances='auto',
    random_state=None, tol=0.0001, verbose=0)