In [6]:
'''
Classifier for Bee Wing Identification

Reads in csv file with list of features as well as labeled class,
and uses a specified Classifer to train and test the data. Can save model.

'''
import sklearn
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split, GridSearchCV,cross_val_score, ShuffleSplit
from sklearn import datasets
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
import pandas as pd
import numpy as np
import pickle
from sklearn.externals import joblib

class BeeClassifier:
    
    def __init__(self):
        self.train_ratio = 0.8
        self.model = None
    
    def set_train_ratio(self, ratio):
        self.train_ratio = ratio
        
    def get_classes(self, data):
        bee_data = pd.read_csv(data)
        factorized = pd.factorize(bee_data['species'])
        return factorized[1]
    
    def train(self, data, classifier='random forest', train_ratio=0.8, display_accuracy = True, save=True, num_species = 7):
        '''Classifies the species of bees given a set of data.
        
           data: a csv file containing data and labels
           classifier: the classifier used to train the model
               options: ['random forest'(default), 'svm', 'k-nearest']
           train_ratio: the amount of data split used to train the model
           display_accuracy: displays the accuracy of the model in confusion matrix
           save: saves the model
        '''
        #Loads in data, separates out species labels
        bee_data = pd.read_csv(data)
        labels = bee_data['species']

        #Splits up the bee_data into training and test set, factorizes it into numbers, instead of species
        bee_data['is_train'] = np.random.uniform(0, 1, len(bee_data)) <= train_ratio
        train_data, test_data = bee_data[bee_data['is_train']==True], bee_data[bee_data['is_train']==False]
        features = bee_data.columns[:3]
        print(features)
        self.train_data, self.test_data, features= train_data, test_data, features
        factorized = pd.factorize(train_data['species'])
        training_labels, testing_labels, classes = factorized[0], pd.factorize(test_data['species'])[0], factorized[1]
        self.classes = classes
        train, test = train_data[features], test_data[features]
        
        print("Classes " + classes)

        #Creates a Classifier, fits it on training data, and predicts on test data
        if classifier=='k-nearest':
            clf = KNeighborsClassifier(7)
        elif classifier=='svm':
            clf = SVC()
        else:
            clf = RandomForestClassifier(random_state=42)
            
        clf.fit(train, training_labels)
        predictions = clf.predict(test)
        
        #Saves model as part of BeeClassifier object
        if save:
            self.model = clf
            
        #Displays the accuracy of the model using test.
        if display_accuracy and train_ratio < 1:
            self.test()
            
    
    def test(self, data=None, classes=None):
        if data is not None:
            self.test_data = data
        if classes is not None:
            self.classes = classes
        if self.model is not None and self.classes is not None:
            clf = self.model
            features = self.test_data.columns[:3]
            test = self.test_data[features]

            predictions = clf.predict(test)
            factorized = pd.factorize(self.test_data['species'])
            training_labels, testing_labels = factorized[0], pd.factorize(self.test_data['species'])[0]
            
            #Calculates accuracy of the prediction on the test data
            accuracyCLF = accuracy_score(testing_labels, predictions)
            print("Classifier Accuracy: ", accuracyCLF)

            #Displays a confusion of classified species
            print("\nConfusion Matrix: ")
            print(predictions)
            print(self.classes)
            preds = self.classes[predictions]
            testing_labs = self.classes[testing_labels]
            confusion_mat = pd.crosstab(testing_labs, preds, rownames = ['Actual Species'], colnames = ['Predicted Species'])
            print(confusion_mat, "\n\n")
        else:
            print("No model to test. Please train a model using train ")
        
    
    def cross_validate(self, data, classifier='random forest'):
        #Cross Validation
        print("Performing Cross Validation...")
        bee_data = pd.read_csv(data)
        target = bee_data['species']
        data = bee_data[bee_data.columns[:3]]
        
        #Splits training and test sets
        X_train, X_test, y_train, y_test = train_test_split(data, target, test_size=0.2, random_state=0)
        
        #Chooses a classifier
        if classifier=='k-nearest':
            clf = KNeighborsClassifier(7)
        elif classifier=='svm':
            clf = SVC()
        else:
            clf = RandomForestClassifier(random_state=42)

        scores = cross_val_score(clf, data, target, cv = 4)
        print("Cross Validation Scores (Test Size 0.25): ", scores)
        print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

        cv = ShuffleSplit(n_splits=3, test_size=0.3, random_state=0)
        scores2 = cross_val_score(clf, data, target, cv=cv)
        print("Cross Validation Scores (Shuffled, Test Size 0.3)", cross_val_score(clf, data, target, cv=cv))
        print("Accuracy: %0.2f (+/- %0.2f)" % (scores2.mean(), scores2.std() * 2))
    
    def save_model(self, file_name="model.pkl"):
        '''
        Saves model in saved_models folder.
        '''
        if self.model:
            joblib.dump(clf, "saved_models/" + file_name) 
            print("Model saved successfully!")
        else:
            print("No model to save. Run train first.")
            
    def load_model(self, classes, file_name="model.pkl"):
        '''
        Loads model from saved_models folder, saves it in object.
        '''
        try:
            self.model = joblib.load('saved_models/'+file_name)
            print(self.model)
        except:
            print("Unable to find file named " + file_name)
    
    def find_best_rf_model(self, data, train_ratio=0.8, param_grid=None):
        #Loads in data, separates out species labels
        bee_data = pd.read_csv(data)
        labels = bee_data['species']

        #Splits up the bee_data into training and test set, factorizes it into numbers, instead of species
        bee_data['is_train'] = np.random.uniform(0, 1, len(bee_data)) <= train_ratio
        train_data, test_data = bee_data[bee_data['is_train']==True], bee_data[bee_data['is_train']==False]
        features = bee_data.columns[:3]
        factorized = pd.factorize(train_data['species'])
        training_labels, testing_labels, classes = factorized[0], pd.factorize(test_data['species'])[0], factorized[1]
        train, test = train_data[features], test_data[features]

        rf = RandomForestClassifier(random_state=42)
        if param_grid is None:
            param_grid = {
                'n_estimators': [5, 10, 15, 20, 25, 30],
                'max_depth': [2, 5, 7, 9, 11, 15],
                'max_features':["auto", "sqrt", "log2"],
                #'min_impurity_decrease': [0, 0.1]
            }

        clf = GridSearchCV(rf, param_grid)
        clf.fit(train, training_labels)

        print("Accuracy:", clf.best_score_)
        print(clf.best_estimator_)
        
        best_model.fit(train, training_labels)
        predictions = best_model.predict(test)

        accuracyCLF = accuracy_score(testing_labels, predictions)
        print(accuracyCLF)
        return clf.best_estimator_

In [7]:
# clf = BeeClassifier()
# clf.train("bee_info.csv", classifier='k-nearest', display_accuracy = True)
# clf.cross_validate("bee_info.csv", classifier='k-nearest')
# clf.save_model()
# data = clf.test_data

# classes = clf.get_classes("bee_info.csv")
# clf.test()

In [8]:
# clf2 = BeeClassifier()
# # print(data)
# clf2.load_model(classes=classes)
# #print(clf2.model)
# clf2.test(data, classes)

In [9]:
# clf = BeeClassifier()
# clf.train("bee_wing_features.csv", classifier='random forest', display_accuracy=True)
# # clf.cross_validate("bee_wing_features.csv", classifier='random forest')

In [10]:
clf = BeeClassifier()
clf.find_best_rf_model("bee_wing_features.csv")

Accuracy: 0.684824902724
RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=5, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=25, n_jobs=1,
            oob_score=False, random_state=42, verbose=0, warm_start=False)
0.0718562874251


RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=5, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=25, n_jobs=1,
            oob_score=False, random_state=42, verbose=0, warm_start=False)

In [11]:
# best_model = RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
#             max_depth=7, max_features='auto', max_leaf_nodes=None,
#             min_impurity_decrease=0.0, min_impurity_split=None,
#             min_samples_leaf=1, min_samples_split=2,
#             min_weight_fraction_leaf=0.0, n_estimators=25, n_jobs=1,
#             oob_score=False, random_state=42, verbose=0, warm_start=False)
# best_model.fit(train, training_labels)
# predictions = best_model.predict(test)

# accuracyCLF = accuracy_score(testing_labels, predictions)
# print(accuracyCLF)

In [20]:
from sklearn.model_selection import GridSearchCV

#Loads in data, separates out species labels
bee_data = pd.read_csv("bee_wing_features.csv")
labels = bee_data['species']
train_ratio = 0.8

#Splits up the bee_data into training and test set, factorizes it into numbers, instead of species
bee_data['is_train'] = np.random.uniform(0, 1, len(bee_data)) <= train_ratio
train_data, test_data = bee_data[bee_data['is_train']==True], bee_data[bee_data['is_train']==False]
features = bee_data.columns[:3]
factorized = pd.factorize(train_data['species'])
training_labels, testing_labels, classes = factorized[0], pd.factorize(test_data['species'])[0], factorized[1]
train, test = train_data[features], test_data[features]

rf = RandomForestClassifier(random_state=42)
param_grid = {
    'n_estimators': [5, 10, 15, 20, 25, 30],
    'max_depth': [2, 5, 7, 9, 11, 15],
    'max_features':["auto", "sqrt", "log2"],
    #'min_impurity_decrease': [0, 0.1]
}

clf = GridSearchCV(rf, param_grid)
clf.fit(train, training_labels)

print("Accuracy:", clf.best_score_)
print(clf.best_estimator_)

In [19]:
best_model = RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=5, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=20, n_jobs=1,
            oob_score=False, random_state=42, verbose=0, warm_start=False)
best_model.fit(train, training_labels)
predictions = best_model.predict(test)

accuracyCLF = accuracy_score(testing_labels, predictions)
print(accuracyCLF)