In [1]:
# Import the usual suspects. Any new functions will be introduced individually for clarity.
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.datasets import make_classification #generates a random classification problem
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import LinearSVC, SVC
from xgboost import XGBClassifier
from collections import Counter

import matplotlib.pyplot as plt
import seaborn as sns
from mlxtend.plotting import plot_decision_regions #plotting regions
%matplotlib inline

from sklearn.neighbors import KNeighborsClassifier
from sklearn import metrics

from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.ensemble import (RandomForestClassifier, ExtraTreesClassifier, VotingClassifier, 
                              AdaBoostClassifier, BaggingRegressor)

from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB, MultinomialNB, BernoulliNB
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, make_scorer
from sklearn.model_selection import train_test_split, GridSearchCV, cross_validate
from sklearn.datasets.california_housing import fetch_california_housing
import sklearn.linear_model
from sklearn.naive_bayes import GaussianNB
from sklearn import svm
import sklearn.svm


# make prettier plots
# %config InlineBackend.figure_format = 'svg' 

In [2]:
df = pd.read_pickle("./raw_data.pkl")

In [3]:
# df = df[pd.notnull(df['last_funding_amount'])]

In [4]:
# df['days_since_last_funding'] = df['days_since_last_funding'].astype(int)

In [5]:
# df = df.fillna(0)

In [6]:
X = df.loc[:, df.columns != 'target']
y = df.target

In [7]:
print(X.shape)
print(y.value_counts())

(7245, 55)
0    7020
1     225
Name: target, dtype: int64


#### Split

In [8]:
X_train, X_test, y_train, y_test = train_test_split(X,y , test_size=0.2, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_train,y_train , test_size=0.25, random_state=42)

In [9]:
clf = XGBClassifier().fit(X_train, y_train)
y_pred = clf.predict(X_val)

In [10]:
recall = metrics.recall_score(y_val,y_pred)
print(recall)

0.5116279069767442


#### KNN

In [11]:
knn_param = {'n_neighbors' : range(1,100), 'algorithm':['auto', 'ball_tree','kd_tree','brute'], 
             'weights':['uniform', 'distance']}

In [12]:
knn = KNeighborsClassifier()
rand_search = RandomizedSearchCV(knn,knn_param, cv=5,scoring='recall',iid=True)
rand_search.fit(X_train, y_train)
print(rand_search.best_score_)
print(rand_search.best_params_)

0.5928554996877977
{'weights': 'distance', 'n_neighbors': 61, 'algorithm': 'brute'}


In [13]:
# Train on training set, and Test on testing set
knn = KNeighborsClassifier(n_neighbors=25, weights = 'distance', algorithm='ball_tree')
knn.fit(X_train, y_train)
y_pred = knn.predict(X_val)

print(f"Accuracy Score: {metrics.accuracy_score(y_val, y_pred)}")
print(f"Recall Score: {metrics.recall_score(y_val,y_pred)}")
print(f"F1 Score: {metrics.f1_score(y_val,y_pred)}")

Accuracy Score: 0.9710144927536232
Recall Score: 0.5116279069767442
F1 Score: 0.5116279069767442


#### Decision Tree

In [14]:
decision_tree_param = {"class_weight": ['balanced',None],"max_depth": [3, None],
              "max_features": ['auto','sqrt','log2'],
              "min_samples_split": range(10,500,10),
              "criterion": ["gini", "entropy"],
                       'max_depth':range(1,10,2),
                      'random_state': [42]}

In [15]:
decision_tree = DecisionTreeClassifier()
rand_search = RandomizedSearchCV(decision_tree, decision_tree_param, cv=5,scoring='recall',iid=True)

rand_search.fit(X_train, y_train)
print(rand_search.best_score_)
print(rand_search.best_params_)

0.8571346412961319
{'random_state': 42, 'min_samples_split': 10, 'max_features': 'auto', 'max_depth': 5, 'criterion': 'gini', 'class_weight': 'balanced'}


In [16]:
decision_tree = DecisionTreeClassifier(criterion='entropy',class_weight='balanced'
                                       ,random_state = 42, max_features = 'sqrt', min_samples_leaf = 2
                                       ,min_samples_split = 400, max_depth = 9)
decision_tree.fit(X_train, y_train)
y_pred = decision_tree.predict(X_val)


print(f"Accuracy Score: {metrics.accuracy_score(y_val, y_pred)}")
print(f"Recall Score: {metrics.recall_score(y_val,y_pred)}")
print(f"F1 Score: {metrics.f1_score(y_val,y_pred)}")


Accuracy Score: 0.8992408557625949
Recall Score: 0.813953488372093
F1 Score: 0.32407407407407407


#### Logistic Regression

In [17]:
logistic_regression_param = {"C":np.logspace(-4, 4, 20), "penalty":["l2"]}

In [18]:
logistic_regression = LogisticRegression(solver="lbfgs")
rand_search = RandomizedSearchCV(logistic_regression, logistic_regression_param, cv=5,scoring='recall',iid=True)

rand_search.fit(X_train, y_train)
print(rand_search.best_score_)
print(rand_search.best_params_)

0.14285714285714285
{'penalty': 'l2', 'C': 0.00026366508987303583}


In [19]:
logistic_regression = LogisticRegression(penalty= 'l2', C=1.623776739188721)
logistic_regression.fit(X_train, y_train)
y_pred = logistic_regression.predict(X_val)

print(f"Accuracy Score: {metrics.accuracy_score(y_val, y_pred)}")
print(f"Recall Score: {metrics.recall_score(y_val,y_pred)}")
print(f"F1 Score: {metrics.f1_score(y_val,y_pred)}")

Accuracy Score: 0.9675638371290545
Recall Score: 0.13953488372093023
F1 Score: 0.20338983050847456




#### Random Forest

In [20]:
random_forest_param = {'n_estimators': [4,5,6,7,8, 9], 
              'max_features': ['log2', 'sqrt','auto',None],
              'criterion': ['entropy', 'gini'], 
              'max_depth': [2, 3, 5, 10], 
              'min_samples_split': [2, 3, 5],
              'min_samples_leaf': [1,5,8], 'class_weight' : ['balanced','balanced_subsample', None]}


In [21]:
random_forest = RandomForestClassifier(n_jobs = -1)
rand_search = RandomizedSearchCV(random_forest, random_forest_param, cv=5,scoring='recall',iid=True)

rand_search.fit(X_train, y_train)
print(rand_search.best_score_)
print(rand_search.best_params_)

0.8428390679943476
{'n_estimators': 4, 'min_samples_split': 5, 'min_samples_leaf': 8, 'max_features': None, 'max_depth': 3, 'criterion': 'gini', 'class_weight': 'balanced'}


In [22]:
random_forest = RandomForestClassifier(n_estimators= 7, min_samples_split=5
                                       ,min_samples_leaf= 8, max_features= 'log2'
                                       , max_depth= 10, criterion= 'entropy'
                                       ,class_weight='balanced_subsample')
random_forest.fit(X_train, y_train)
y_pred_adasyn = random_forest.predict(X_val)

print(f"Accuracy Score: {metrics.accuracy_score(y_val, y_pred)}")
print(f"Recall Score: {metrics.recall_score(y_val,y_pred)}")
print(f"F1 Score: {metrics.f1_score(y_val,y_pred)}")

Accuracy Score: 0.9675638371290545
Recall Score: 0.13953488372093023
F1 Score: 0.20338983050847456


#### Support Vector Machine

In [25]:
svc_model_param = {'kernel':['linear', 'poly', 'rbf','sigmoid']
                   ,'gamma':[0.1, 1, 10, 100]
                   ,'C':[0.1, 1, 10, 100, 1000]
                   , 'degree':[0, 1, 2, 3, 4, 5, 6]}


In [None]:
svc_model = svm.SVC()
rand_search = RandomizedSearchCV(svc_model, svc_model_param, cv=5,scoring='recall',iid=True)

rand_search.fit(X_train, y_train)
print(rand_search.best_score_)
print(rand_search.best_params_)


In [None]:
svc_model = svm.SCV()

scv_model.fit(X_train, y_train)
y_pred = scv_model.predict(X_val)

print(f"Accuracy Score: {metrics.accuracy_score(y_val, y_pred)}")
print(f"Recall Score: {metrics.recall_score(y_val,y_pred)}")
print(f"F1 Score: {metrics.f1_score(y_val,y_pred)}")