In [2]:
# - f beta
# - predic_proba
# - look at categories more likely to succeed
# - classifying what stage the startup is on, whether or not you will invest on 


# Import the usual suspects. Any new functions will be introduced individually for clarity.
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.datasets import make_classification #generates a random classification problem
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import LinearSVC, SVC
from xgboost import XGBClassifier
from collections import Counter

import matplotlib.pyplot as plt
import seaborn as sns
from mlxtend.plotting import plot_decision_regions #plotting regions
%matplotlib inline

from sklearn.neighbors import KNeighborsClassifier
from sklearn import metrics

from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.ensemble import (RandomForestClassifier, ExtraTreesClassifier, VotingClassifier, 
                              AdaBoostClassifier, BaggingRegressor)

from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB, MultinomialNB, BernoulliNB
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, make_scorer
from sklearn.model_selection import train_test_split, GridSearchCV, cross_validate
from sklearn.datasets.california_housing import fetch_california_housing
import sklearn.linear_model
from sklearn.naive_bayes import GaussianNB
from sklearn import svm
import sklearn.svm
from imblearn.over_sampling import ADASYN

# make prettier plots
# %config InlineBackend.figure_format = 'svg' 

Using TensorFlow backend.


In [3]:
df = pd.read_pickle("./raw_data.pkl")

In [4]:
df = df.drop(columns=('number_of_funding_rounds'))
df.shape

(7245, 55)

In [5]:
X = df.loc[:, df.columns != 'target']
y = df.target

#### Create ADASYN

In [6]:
X_adasyn, y_adasyn = ADASYN(random_state=42).fit_sample(X,y)

In [7]:
X_train_adasyn, X_test_adasyn, y_train_adasyn, y_test_adasyn = train_test_split(X_adasyn,y_adasyn 
                                                                                , test_size=0.2
                                                                                , random_state=42)

X_train_adasyn, X_val_adasyn, y_train_adasyn, y_val_adasyn = train_test_split(X_train_adasyn
                                                                              ,y_train_adasyn 
                                                                              , test_size=0.25
                                                                              , random_state=42)

In [8]:
Counter(y_adasyn)

Counter({0: 7020, 1: 7021})

#### XGBoost

In [9]:
clf = XGBClassifier().fit(X_train_adasyn, y_train_adasyn)
y_pred_adasyn = clf.predict(X_val_adasyn)

In [10]:
recall = metrics.recall_score(y_val_adasyn,y_pred_adasyn)
print(f"Recall: {recall}")
print(f"FBeta: {metrics.fbeta_score(y_val_adasyn, y_pred_adasyn, average='macro', beta=0.4)}")

Recall: 0.992226148409894
FBeta: 0.9861866077600197


#### KNN

In [13]:
knn_param = {'n_neighbors' : range(1,100), 'algorithm':['auto', 'ball_tree','kd_tree','brute'], 
             'weights':['uniform', 'distance']}

In [14]:
knn = KNeighborsClassifier()
rand_search = RandomizedSearchCV(knn,knn_param, cv=5,scoring='recall',iid=True)
rand_search.fit(X_train_adasyn, y_train_adasyn)
print(rand_search.best_score_)
print(rand_search.best_params_)

0.9270092841750222
{'weights': 'distance', 'n_neighbors': 22, 'algorithm': 'ball_tree'}


In [15]:
# Train on training set, and Test on testing set
knn = KNeighborsClassifier(n_neighbors=24, weights = 'distance', algorithm='ball_tree')
knn.fit(X_train_adasyn, y_train_adasyn)
y_pred_adasyn_knn = knn.predict(X_val_adasyn)

print(f"Accuracy Score: {metrics.accuracy_score(y_val_adasyn, y_pred_adasyn_knn)}")
print(f"Recall Score: {metrics.recall_score(y_val_adasyn,y_pred_adasyn_knn)}")
print(f"F1 Score: {metrics.f1_score(y_val_adasyn,y_pred_adasyn_knn)}")
print(f"FBeta: {metrics.fbeta_score(y_val_adasyn, y_pred_adasyn_knn, average='macro', beta=0.4)}")

Accuracy Score: 0.9202279202279202
Recall Score: 0.9335689045936396
F1 Score: 0.9218422889043963
FBeta: 0.9204345509211385


#### Decision Tree

In [18]:
decision_tree_param = {"class_weight": ['balanced',None],"max_depth": [3, None],
              "max_features": ['auto','sqrt','log2'],
              "min_samples_split": range(10,500,10),
              "criterion": ["gini", "entropy"],
                       'max_depth':range(1,10,2),
                      'random_state': [42]}



In [19]:
decision_tree = DecisionTreeClassifier()
rand_search = RandomizedSearchCV(decision_tree, decision_tree_param, cv=5,scoring='recall',iid=True)

rand_search.fit(X_train_adasyn, y_train_adasyn)
print(rand_search.best_score_)
print(rand_search.best_params_)

0.9334234831056001
{'random_state': 42, 'min_samples_split': 20, 'max_features': 'log2', 'max_depth': 7, 'criterion': 'gini', 'class_weight': None}


In [20]:
decision_tree = DecisionTreeClassifier(criterion='entropy',class_weight= None
                                       ,random_state = 42, max_features = 'auto', min_samples_leaf = 2
                                       ,min_samples_split = 10
                                       ,max_depth = 9)
decision_tree.fit(X_train_adasyn, y_train_adasyn)
y_pred_adasyn = decision_tree.predict(X_val_adasyn)


print(f"Accuracy Score: {metrics.accuracy_score(y_val_adasyn, y_pred_adasyn)}")
print(f"Recall Score: {metrics.recall_score(y_val_adasyn,y_pred_adasyn)}")
print(f"F1 Score: {metrics.f1_score(y_val_adasyn,y_pred_adasyn)}")
print(f"FBeta: {metrics.fbeta_score(y_val_adasyn, y_pred_adasyn, average='macro', beta=0.4)}")

Accuracy Score: 0.9433760683760684
Recall Score: 0.9434628975265018
F1 Score: 0.9437963944856841
FBeta: 0.9433712536832974


#### Logistic Regression

In [21]:
logistic_regression_param = {"C":np.logspace(-4, 4, 20), "penalty":["l2"]}


In [22]:
logistic_regression = LogisticRegression(solver="lbfgs")
rand_search = RandomizedSearchCV(logistic_regression, logistic_regression_param, cv=5,scoring='recall',iid=True)

rand_search.fit(X_train_adasyn, y_train_adasyn)
print(rand_search.best_score_)
print(rand_search.best_params_)

0.9360496163771823
{'penalty': 'l2', 'C': 0.0006951927961775605}


In [23]:
logistic_regression = LogisticRegression(penalty= 'l2', C=0.0001)
logistic_regression.fit(X_train_adasyn, y_train_adasyn)
y_pred_adasyn = logistic_regression.predict(X_val_adasyn)

print(f"Accuracy Score: {metrics.accuracy_score(y_val_adasyn, y_pred_adasyn)}")
print(f"Recall Score: {metrics.recall_score(y_val_adasyn,y_pred_adasyn)}")
print(f"F1 Score: {metrics.f1_score(y_val_adasyn,y_pred_adasyn)}")
print(f"FBeta: {metrics.fbeta_score(y_val_adasyn, y_pred_adasyn, average='macro', beta=0.4)}")

Accuracy Score: 0.7183048433048433
Recall Score: 0.9194346289752651
F1 Score: 0.7668729737695256
FBeta: 0.736729212689692




In [24]:
log_coef = pd.DataFrame(logistic_regression.coef_, columns=X.columns)

log_coef = log_coef.melt()
log_coef

Unnamed: 0,variable,value
0,last_funding_amount,-3.188496e-08
1,number_of_founders,-3.12663e-12
2,number_of_investors,-7.086589e-12
3,number_of_lead_investors,-3.897882e-13
4,total_funding_amount,2.991418e-08
5,artificial_intelligence,-2.077445e-13
6,biotechnology,-9.359788e-15
7,data_and_analytics,-2.893374e-13
8,health_care,-1.670524e-13
9,science_and_engineering,-2.068581e-13


#### Random Forest

In [25]:
random_forest_param = {'n_estimators': [4,5,6,7,8, 9], 
              'max_features': ['log2', 'sqrt','auto',None],
              'criterion': ['entropy', 'gini'], 
              'max_depth': [2, 3, 5, 10], 
              'min_samples_split': [2, 3, 5],
              'min_samples_leaf': [1,5,8], 'class_weight' : ['balanced','balanced_subsample', None]}


In [26]:
random_forest = RandomForestClassifier(n_jobs = -1)
rand_search = RandomizedSearchCV(random_forest, random_forest_param, cv=5,scoring='recall',iid=True)

rand_search.fit(X_train_adasyn, y_train_adasyn)
print(rand_search.best_score_)
print(rand_search.best_params_)


0.9859727882935749
{'n_estimators': 6, 'min_samples_split': 2, 'min_samples_leaf': 8, 'max_features': None, 'max_depth': 10, 'criterion': 'entropy', 'class_weight': None}


In [27]:
random_forest = RandomForestClassifier(n_estimators= 9, min_samples_split=2
                                       ,min_samples_leaf=1, max_features= None, max_depth= 10
                                       , criterion= 'entropy'
                                       ,class_weight='balanced_subsample')

random_forest.fit(X_train_adasyn, y_train_adasyn)
y_pred_adasyn = random_forest.predict(X_val_adasyn)

print(f"Accuracy Score: {metrics.accuracy_score(y_val_adasyn, y_pred_adasyn)}")
print(f"Recall Score: {metrics.recall_score(y_val_adasyn,y_pred_adasyn)}")
print(f"F1 Score: {metrics.f1_score(y_val_adasyn,y_pred_adasyn)}")
print(f"FBeta: {metrics.fbeta_score(y_val_adasyn, y_pred_adasyn, average='macro', beta=0.4)}")

Accuracy Score: 0.9821937321937322
Recall Score: 0.9929328621908127
F1 Score: 0.9825174825174825
FBeta: 0.9823847120094136


In [28]:
feat_importances = pd.Series(random_forest.feature_importances_, index=X.columns)

feat_import = pd.DataFrame(random_forest.feature_importances_, index=X.columns)


In [29]:
feat_import.sort_values(0, ascending = False)

Unnamed: 0,0
total_funding_amount,0.536139
advertising,0.073845
number_of_investors,0.061834
days_since_founded,0.037323
software,0.034311
information_technology,0.030977
biotechnology,0.03081
number_of_lead_investors,0.023072
health_care,0.021152
days_since_last_funding,0.01981


#### SVC Model

In [201]:
svc_model_param = {'kernel':['linear', 'poly', 'rbf','sigmoid']
                   ,'gamma':[0.1, 1, 10, 100]
                   ,'C':[0.1, 1, 10, 100, 1000]
                   , 'degree':[0, 1, 2, 3, 4, 5, 6]}


In [202]:
svc_model = svm.SVC()
rand_search = RandomizedSearchCV(svc_model, svc_model_param, cv=5,scoring='recall',iid=True)

rand_search.fit(X_train_adasyn, y_train_adasyn)
print(rand_search.best_score_)
print(rand_search.best_params_)

In [203]:
svc_model = svm.SCV()

scv_model.fit(X_train_adasyn, y_train_adasyn)
y_pred_adasyn = scv_model.predict(X_val_adasyn)

print(f"Accuracy Score: {metrics.accuracy_score(y_val_adasyn, y_pred_adasyn)}")
print(f"Recall Score: {metrics.recall_score(y_val_adasyn,y_pred_adasyn)}")
print(f"F1 Score: {metrics.f1_score(y_val_adasyn,y_pred_adasyn)}")