In [27]:
#!/usr/bin/python

# General imports
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pickle
import pprint
import seaborn as sns
import sys

# Udacity module imports
sys.path.append("../tools/")
from feature_format import featureFormat, targetFeatureSplit
import tester
from tester import dump_classifier_and_data
from tester import test_classifier

# Scikit-Learn imports
from sklearn.decomposition import PCA
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.metrics import f1_score, fbeta_score
from sklearn.metrics import make_scorer
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import SelectFromModel
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.ensemble import AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC

# Imbalanced-Learn imports
from imblearn.over_sampling import ADASYN



In [None]:
# Load the dictionary containing the dataset
with open("final_project_dataset.pkl", "r") as data_file:
    data_dict = pickle.load(data_file)

In [3]:
# Task 1: Select what features you'll use.
# features_list is a list of strings, each of which is a feature name.
# The first feature must be "poi".

feature_list = [
    'poi',  
    'bonus',
    'deferral_payments',
    'deferred_income',
    'director_fees',
    #'email_address',
    'exercised_stock_options',
    'expenses',
    'from_messages',
    'from_poi_to_this_person',
    'from_this_person_to_poi',
    'loan_advances',
    'long_term_incentive',
    'other',
    'restricted_stock',
    'restricted_stock_deferred',
    'salary',
    'shared_receipt_with_poi',
    'to_messages',
    'total_payments',
    'total_stock_value'
]  



In [4]:
# Task 2: Remove outliers
data_dict.pop('TOTAL')

{'bonus': 97343619,
 'deferral_payments': 32083396,
 'deferred_income': -27992891,
 'director_fees': 1398517,
 'email_address': 'NaN',
 'exercised_stock_options': 311764000,
 'expenses': 5235198,
 'from_messages': 'NaN',
 'from_poi_to_this_person': 'NaN',
 'from_this_person_to_poi': 'NaN',
 'loan_advances': 83925000,
 'long_term_incentive': 48521928,
 'other': 42667589,
 'poi': False,
 'restricted_stock': 130322299,
 'restricted_stock_deferred': -7576788,
 'salary': 26704229,
 'shared_receipt_with_poi': 'NaN',
 'to_messages': 'NaN',
 'total_payments': 309886585,
 'total_stock_value': 434509511}

In [None]:
# Import data into dataframe for EDA
df = pd.DataFrame.from_records(list(data_dict.values()))
employees = pd.Series(list(data_dict.keys()))
df.set_index(employees, inplace=True)
df.replace('NaN', 0, inplace=True)

In [None]:
# EDA

#print('Shape: {}'.format(df.shape))
#print(df.describe())
#sns.boxplot(x='poi', y='bonus', data=df)
sns.boxplot(x='poi', y='to_messages', data=df)
plt.show()

In [None]:
# Task 3: Create new feature(s)



In [39]:
# Task 4: Try a varity of classifiers
# Please name your classifier clf for easy export below.
# Note that if you want to do PCA or other multi-stage operations,
# you'll need to use Pipelines. For more info:
# http://scikit-learn.org/stable/modules/pipeline.html

# Extract features and labels from dataset for local testing
dataset = featureFormat(data_dict, feature_list, sort_keys=True)
labels, features = targetFeatureSplit(dataset)

# Scale features 
scaler = MinMaxScaler()
features = scaler.fit_transform(features)

# Split data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(
    features, labels, test_size=0.3, random_state=42)
    
# Perform additional features selection
feature_selection = SelectKBest(score_func=chi2, k=10)
feature_selection.fit(X_train, y_train)
pprint.pprint(np.array(feature_list)[1:][feature_selection.get_support()])
X_train = feature_selection.transform(X_train)
X_test = feature_selection.transform(X_test)

array(['bonus', 'exercised_stock_options', 'expenses', 'loan_advances',
       'long_term_incentive', 'other', 'salary', 'shared_receipt_with_poi',
       'total_payments', 'total_stock_value'],
      dtype='|S25')


In [None]:
# Create first classifier
clf = AdaBoostClassifier(
    base_estimator=GaussianNB(), 
    n_estimators=300, 
    random_state=42
)
clf.fit(X_train, y_train)

# Test first classifier
y_pred = clf.predict(X_test)
print(classification_report(y_test, y_pred, target_names=['non-poi', 'poi']))

In [None]:
# Create second classifier
upsampler = ADASYN(ratio='minority', random_state=42)
X_train, y_train = upsampler.fit_sample(X_train, y_train)

clf = AdaBoostClassifier(
    base_estimator=DecisionTreeClassifier(class_weight='balanced'), 
    n_estimators=300, 
    random_state=42
)
clf.fit(X_train, y_train)

# Test second classifier
y_pred = clf.predict(X_test)
print(classification_report(y_test, y_pred, target_names=['non-poi', 'poi']))

In [13]:
# Create second classifier
upsampler = ADASYN(ratio='minority', random_state=42)
print(X_train.shape)
print(sum(y_train))
X_train, y_train = upsampler.fit_sample(X_train, y_train)
print(X_train.shape)
print(sum(y_train))

clf = AdaBoostClassifier(
    base_estimator=DecisionTreeClassifier(class_weight='balanced'), 
    n_estimators=300, 
    random_state=42
)
clf.fit(X_train, y_train)

# Test second classifier
y_pred = clf.predict(X_test)
print(classification_report(y_test, y_pred, target_names=['non-poi', 'poi']))

(100, 10)
13.0
(171, 10)
84.0
             precision    recall  f1-score   support

    non-poi       0.97      0.77      0.86        39
        poi       0.31      0.80      0.44         5

avg / total       0.89      0.77      0.81        44



In [25]:
# Refresh
del(dataset, labels, features, X_train, X_test, y_train, y_test, y_pred,
    scaler, upsampler, feature_selection, classifier, pipe, scorer, param_grid, clf)


# Start with data from scratch
dataset = featureFormat(data_dict, feature_list, sort_keys=True)
labels, features = targetFeatureSplit(dataset)

# Scale features before splitting into train/test sets
scaler = MinMaxScaler()
features = scaler.fit_transform(features)

# Split data into train/test
X_train, X_test, y_train, y_test = train_test_split(
    features, labels, test_size=0.3, random_state=42)

# Upsample the 'poi' class using the ADASYN algorithm
upsampler = ADASYN(ratio='minority', random_state=42)
X_train, y_train = upsampler.fit_sample(X_train, y_train)

# Define pipeline
feature_selection = SelectKBest(score_func=chi2)
classifier = AdaBoostClassifier(
    base_estimator=DecisionTreeClassifier(class_weight='balanced'), 
    random_state=42
)
pipe = Pipeline([
    ('feature_selection', feature_selection),
    ('classifier', classifier)
])

# Tune classifier with grid search 
scorer = make_scorer(fbeta_score, beta=0.5, average='weighted')
param_grid = [
    {
        'feature_selection': [PCA()],
        'feature_selection__n_components': [2, 5, 10, 15]
    },
    {
        'feature_selection': [SelectKBest(score_func=chi2)],
        'feature_selection__k': [2, 5, 10, 15], 
        'classifier__n_estimators': [200, 300, 400, 1000]
    }
]

clf = GridSearchCV(pipe, param_grid=param_grid, scoring=scorer)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
print(classification_report(y_test, y_pred, target_names=['non-poi', 'poi']))
#pprint.pprint(clf.cv_results_)

             precision    recall  f1-score   support

    non-poi       0.94      0.77      0.85        39
        poi       0.25      0.60      0.35         5

avg / total       0.86      0.75      0.79        44



In [31]:
dump_classifier_and_data(clf, dataset, feature_list)

AttributeError: 'numpy.ndarray' object has no attribute 'keys'

In [None]:
# Create second classifier
scaler = MinMaxScaler()
feature_selection = SelectKBest(score_func=chi2, k=5)
classifier = AdaBoostClassifier(base_estimator=GaussianNB(), n_estimators=300, random_state=42)

clf = Pipeline([
    ('scaler', scaler),
    ('feature_selection', feature_selection),
    ('classifier', classifier)
])

clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
print(classification_report(y_test, y_pred, target_names=['non-poi', 'poi']))

In [9]:
pprint.pprint(dir())

['ADASYN',
 'AdaBoostClassifier',
 'DecisionTreeClassifier',
 'GaussianNB',
 'GridSearchCV',
 'In',
 'MinMaxScaler',
 'Out',
 'PCA',
 'Pipeline',
 'RandomForestClassifier',
 'SVC',
 'SelectFromModel',
 'SelectKBest',
 'X_test',
 'X_train',
 '_',
 '_4',
 '__',
 '___',
 '__builtin__',
 '__builtins__',
 '__doc__',
 '__name__',
 '__package__',
 '_dh',
 '_i',
 '_i1',
 '_i2',
 '_i3',
 '_i4',
 '_i5',
 '_i6',
 '_i7',
 '_i8',
 '_i9',
 '_ih',
 '_ii',
 '_iii',
 '_oh',
 '_sh',
 'chi2',
 'classification_report',
 'classifier',
 'clf',
 'data_dict',
 'data_file',
 'dataset',
 'dump_classifier_and_data',
 'exit',
 'f1_score',
 'fbeta_score',
 'featureFormat',
 'feature_list',
 'feature_selection',
 'features',
 'get_ipython',
 'labels',
 'make_scorer',
 'np',
 'param_grid',
 'pd',
 'pickle',
 'pipe',
 'plt',
 'pprint',
 'quit',
 'scaler',
 'scorer',
 'sns',
 'sys',
 'targetFeatureSplit',
 'test_classifier',
 'train_test_split',
 'upsampler',
 'y_pred',
 'y_test',
 'y_train']


In [None]:




# Task 5: Tune your classifier to achieve better than .3 precision and recall
# using our testing script. Check the tester.py script in the final project
# folder for details on the evaluation method, especially the test_classifier
# function. Because of the small size of the dataset, the script uses
# stratified shuffle split cross validation. For more info:
# http://scikit-learn.org/stable/modules/generated/sklearn.cross_validation.StratifiedShuffleSplit.html

#test_classifier(clf, data_dict, feature_list, folds = 1000)

# Task 6: Dump your classifier, dataset, and features_list so anyone can
# check your results. You do not need to change anything below, but make sure
# that the version of poi_id.py that you submit can be run on its own and
# generates the necessary .pkl files for validating your results.

# dump_classifier_and_data(clf, my_dataset, features_list)


if __name__ == '__main__':
    pass
    #test_multiple(classifier_types, features_train, features_test, labels_train,
    #labels_test)


