# Final Project
Import modules

In [17]:
#!/usr/bin/python

import pickle
import sys

from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import make_scorer, f1_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import BernoulliRBM
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC
from sklearn.tree import DecisionTreeClassifier
sys.path.append("../tools/")
from feature_format import featureFormat, targetFeatureSplit
from tester import dump_classifier_and_data, test_classifier

Define the feature that I want to use in the classifier. I will use 'bonus','poi_related_messages' and 'from_messages_poi_ratio'

In [18]:
features_list = ['poi', 'bonus', 'poi_related_messages', 'from_messages_poi_ratio']  # You will need to use more features

Load the dataset and create a dictionary

In [19]:
with open("final_project_dataset.pkl", "r") as data_file:
    data_dict = pickle.load(data_file)

Remove outliers

In [20]:
data_dict.pop("TOTAL", 0)
data_dict.pop("LOCKHART EUGENE E", 0)
data_dict.pop("THE TRAVEL AGENCY IN THE PARK", 0)

{'bonus': 'NaN',
 'deferral_payments': 'NaN',
 'deferred_income': 'NaN',
 'director_fees': 'NaN',
 'email_address': 'NaN',
 'exercised_stock_options': 'NaN',
 'expenses': 'NaN',
 'from_messages': 'NaN',
 'from_poi_to_this_person': 'NaN',
 'from_this_person_to_poi': 'NaN',
 'loan_advances': 'NaN',
 'long_term_incentive': 'NaN',
 'other': 362096,
 'poi': False,
 'restricted_stock': 'NaN',
 'restricted_stock_deferred': 'NaN',
 'salary': 'NaN',
 'shared_receipt_with_poi': 'NaN',
 'to_messages': 'NaN',
 'total_payments': 362096,
 'total_stock_value': 'NaN'}

In [21]:
def add_features(original_data_dict):
    """
    adds new features to de dictionary
    """
    for name in original_data_dict:
        try:
            if original_data_dict[name]["from_poi_to_this_person"] != 'NaN':
                original_data_dict[name]['from_messages_poi_ratio'] = 1. * original_data_dict[name]["from_poi_to_this_person"] / original_data_dict[name][
                    "from_messages"]
            else:
                original_data_dict[name]['from_messages_poi_ratio'] = 0

            if original_data_dict[name]["from_this_person_to_poi"] != 'NaN':
                original_data_dict[name]['person_to_poi_ratio'] = 1. * original_data_dict[name]["from_this_person_to_poi"] / original_data_dict[name][
                    "to_messages"]
            else:
                original_data_dict[name]['person_to_poi_ratio'] = 0

            poi_related_messages = original_data_dict[name]["from_poi_to_this_person"] + original_data_dict[name]["from_this_person_to_poi"] + \
                                   original_data_dict[name]["shared_receipt_with_poi"]
            if 'NaN' not in poi_related_messages:
                original_data_dict[name]['poi_related_messages'] = poi_related_messages
            else:
                original_data_dict[name]['poi_related_messages'] = 0
        except:
            original_data_dict[name]['poi_related_messages'] = 0

    return original_data_dict

In [22]:
def setup_clf_list():
    """
    Instantiates all classifiers of interstes to be used.
    """
    # List of tuples of a classifier and its parameters.
    clf_list = []

    #
    clf_naive = GaussianNB()
    params_naive = {}
    clf_list.append((clf_naive, params_naive))

    #
    clf_tree = DecisionTreeClassifier()
    params_tree = {"min_samples_split": [2, 5, 10, 20],
                   "criterion": ('gini', 'entropy')
                   }
    clf_list.append((clf_tree, params_tree))

    #
    clf_linearsvm = LinearSVC()
    params_linearsvm = {"C": [0.5, 1, 5, 10, 100, 10 ** 10],
                        "tol": [10 ** -1, 10 ** -10],
                        "class_weight": ['balanced']

                        }
    clf_list.append((clf_linearsvm, params_linearsvm))

    #
    clf_adaboost = AdaBoostClassifier()
    params_adaboost = {"n_estimators": [20, 25, 30, 40, 50, 100]
                       }
    clf_list.append((clf_adaboost, params_adaboost))

    #
    clf_random_tree = RandomForestClassifier()
    params_random_tree = {"n_estimators": [2, 3, 5],
                          "criterion": ('gini', 'entropy')
                          }
    clf_list.append((clf_random_tree, params_random_tree))

    #
    clf_knn = KNeighborsClassifier()
    params_knn = {"n_neighbors": [2, 3, 4, 5, 6, 7, 8, 9], "p": [2, 3, 4]}
    clf_list.append((clf_knn, params_knn))

    #
    clf_log = LogisticRegression()
    params_log = {"C": [0.05, 0.5, 1, 10, 10 ** 2, 10 ** 5, 10 ** 10, 10 ** 20],
                  "tol": [10 ** -1, 10 ** -5, 10 ** -10],
                  "class_weight": ['balanced']
                  }
    clf_list.append((clf_log, params_log))

    #
    clf_lda = LinearDiscriminantAnalysis()
    params_lda = {"n_components": [0, 1, 2, 5, 10]}
    clf_list.append((clf_lda, params_lda))

    #
    logistic = LogisticRegression()
    rbm = BernoulliRBM()
    clf_rbm = Pipeline(steps=[('rbm', rbm), ('logistic', logistic)])
    params_rbm = {
        "logistic__tol": [10 ** -10, 10 ** -20],
        "logistic__C": [0.05, 0.5, 1, 10, 10 ** 2, 10 ** 5, 10 ** 10, 10 ** 20],
        "logistic__class_weight": ['balanced'],
        "rbm__n_components": [2, 3, 4]
    }
    clf_list.append((clf_rbm, params_rbm))

    return clf_list

In [23]:
def search_best_clf():
    # type: () -> GridSearchCV
    """
    Search for the best classifier and return it
    :rtype: GridSearchCV
    """
    f1score = 0
    best_clf = None
    for clftemp, params in setup_clf_list():
        scorer = make_scorer(f1_score)
        clftemp = GridSearchCV(clftemp, params, scoring=scorer)
        clftemp = clftemp.fit(features_train, labels_train)
        pred = clftemp.predict(features_test)
        f1_score_temp = f1_score(labels_test, pred)
        if f1_score_temp > f1score:
            f1score = f1_score_temp
            best_clf = clftemp.best_estimator_

    pred = best_clf.predict(features_test)
    return best_clf

In [24]:
my_dataset = add_features(data_dict)

Extract features and labels from dataset for local testing

In [25]:
data = featureFormat(my_dataset, features_list, sort_keys=True)
labels, features = targetFeatureSplit(data)

Split the data into test and feature

In [26]:
features_train, features_test, labels_train, labels_test = \
    train_test_split(features, labels, test_size=0.3, random_state=42)

Perform the search for a best classifier and then call the ***test_classifier*** function to get the results

In [27]:
clf = search_best_clf()
dump_classifier_and_data(clf, my_dataset, features_list)
test_classifier(clf, my_dataset, features_list)

LogisticRegression(C=1, class_weight='balanced', dual=False,
          fit_intercept=True, intercept_scaling=1, max_iter=100,
          multi_class='ovr', n_jobs=1, penalty='l2', random_state=None,
          solver='liblinear', tol=1e-10, verbose=0, warm_start=False)
	Accuracy: 0.77690	Precision: 0.45444	Recall: 0.57600	F1: 0.50805	F2: 0.54675
	Total predictions: 10000	True positives: 1152	False positives: 1383	False negatives:  848	True negatives: 6617

