# This is going to be the final notebook to present all findings

In [13]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import time

import src.visualization as vs
import src.process as process
import models.lib as modlib

from sklearn.model_selection import train_test_split, cross_val_predict, cross_val_score, cross_validate, KFold
from sklearn.model_selection import StratifiedKFold, GridSearchCV, RandomizedSearchCV
from sklearn.metrics import make_scorer, accuracy_score, fbeta_score, recall_score, precision_score
from sklearn.metrics import classification_report, roc_curve, confusion_matrix

# models
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier

# Silence every warning globally so the cell outputs stay readable.
# NOTE(review): this also hides deprecation warnings raised by the libraries
# used below — consider narrowing the filter once the notebook is stable.
import warnings
warnings.filterwarnings('ignore')

# NOTE(review): duplicate of the xgboost import in the "models" group above.
from xgboost import XGBClassifier

# Shared random seed: handed to the sampling, train/test split, model and
# search calls in this notebook so that results are repeatable.
RSEED = 42

In [37]:
# Load the processed Kickstarter CSV; the path is relative to the notebook's working directory.
data = pd.read_csv('data/processed/kickstarter_clean.csv')

In [38]:
# Per-column overview from the project helper: dtype, unique / missing / zero counts and descriptive statistics.
vs.nice_summary(data)

Unnamed: 0,Columns,Dtype,nunique,Non-Null Count,Missing,Missing %,Zero Count,mean,std,min,25%,50%,75%,max
0,country,object,22,168979,-,-,0,-,-,-,-,-,-,-
1,staff_pick,bool,2,168979,-,-,147323,-,-,-,-,-,-,-
2,state,int64,2,168979,-,-,74200,0.56,0.5,0.0,0.0,1.0,1.0,1.0
3,usd_pledged,float64,69803,168979,-,-,14821,12110.97,83739.32,0.0,100.0,1457.0,6235.0,8596474.58
4,usd_type,object,2,168959,20,0.01,0,-,-,-,-,-,-,-
5,converted_goal,float64,45238,168979,-,-,0,41096.88,1110956.93,0.01,1500.0,5000.0,13297.7,152350076.0
6,pledge_per_backer,float64,25712,168979,-,-,14821,71.83,136.32,0.0,20.25,46.0,83.73,10000.0
7,len_blurb,float64,35,168977,2,-,0,19.01,4.98,1.0,16.0,20.0,22.0,35.0
8,len_name,int64,20,168979,-,-,0,5.71,2.71,1.0,4.0,6.0,8.0,27.0
9,category_name,object,159,168979,-,-,0,-,-,-,-,-,-,-


In [29]:
# Reduce `data` to the feature set used by the models in the next sections.
# The columns are listed in one place, grouped the same way they were removed
# before, so the feature selection can be read (and changed) at a glance.
columns_to_drop = [
    # staff pick flag and pledge amounts
    'staff_pick', 'usd_pledged', 'pledge_per_backer',
    # launch date parts
    'month_launch', 'weekday_launch', 'day_hour_launch',
    # deadline date parts
    'month_deadline', 'weekday_deadline', 'day_hour_deadline',
    # category labels
    'category_name', 'parent_category',
]

# No string cast of day_hour_launch / day_hour_deadline is needed here:
# both columns are removed by this very drop, so a cast would have no effect.
# NOTE: `drop` raises KeyError when a column is already gone, so reload `data`
# before re-running this cell.
data = data.drop(columns=columns_to_drop)

# Reproducible peek at the remaining columns.
data.sample(5, random_state=RSEED)

Unnamed: 0,country,state,usd_type,converted_goal,len_blurb,len_name,launch_to_deadline,creation_to_launch
10790,US,1,international,2000.0,8.0,8,2694818,1742779
26459,CA,1,domestic,4743.59,13.0,7,2592000,349404
168147,US,1,domestic,4000.0,18.0,7,3024000,16619516
32580,US,0,international,35000.0,17.0,2,2592000,6379447
156442,US,0,domestic,4000.0,26.0,4,5130896,1349404


## Preprocessing
- Performing train-test-split
- defining target variable = 'state' -> binary success/failure
- splitting the features into numerical and categorical groups for separate preprocessing
- load models from /models/lib.py
- Applying StandardScaler() to numerical features and imputing missing values with the median using SimpleImputer() -> the summary above lists 2 missing `len_blurb` values, so this imputation does apply to this dataset
- Applying OneHotEncoder() to categorical features and dropping the first level. Imputing missing values with "missing" using SimpleImputer() -> the summary above lists 20 missing `usd_type` values, so this imputation does apply to this dataset

In [35]:
# Name of the binary success/failure label; defined first so that every
# statement below refers to the same column instead of repeating the literal.
target = 'state'

y = data[target]
X = data.drop(columns=target)

# 80/20 split, stratified on the label and seeded for repeatability.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify=y, test_size=0.2, random_state=RSEED
)

# Project helpers: split the column names by type and fetch the candidate models.
num_features, cat_features = process.num_cat_features(data, target=target)
models = modlib.models()

# One shared preprocessor is wrapped around every candidate model; the
# resulting pipelines are keyed with the 'scaled' prefix (e.g. 'scaledXGB').
preprocessor = process.create_preprocessor(num_features, cat_features)
scaled_models = process.model_process_pipeline(models, preprocessor, prefix='scaled')

# Stand-alone XGBoost pipeline. `random_state` is the documented scikit-learn
# style seed parameter of XGBClassifier (`seed` is only a legacy alias).
xgb_scaled = process.model_process_pipeline(
    {'XGB': XGBClassifier(random_state=RSEED)}, preprocessor, prefix='scaled'
)

Categorical Features: ['country', 'usd_type'] 
Numerical Features: ['converted_goal', 'len_blurb', 'len_name', 'launch_to_deadline', 'creation_to_launch']


## Testing a single Model

In [60]:
# Single-model experiment on the FULL feature set (date parts and category
# labels are kept; only the staff pick flag and pledge amounts are removed).
#
# Every object created here carries a `_full` suffix so that this experiment
# does not overwrite the reduced-feature split, preprocessor and pipelines
# that the cross-validation and search sections rely on.
data_full = pd.read_csv('data/processed/kickstarter_clean.csv')

# Cast the hour-of-day columns to strings; the feature lists printed below
# show them among the categorical features.
data_full['day_hour_launch'] = data_full['day_hour_launch'].astype(str)
data_full['day_hour_deadline'] = data_full['day_hour_deadline'].astype(str)

data_full = data_full.drop(columns=['staff_pick', 'usd_pledged', 'pledge_per_backer'])

target = 'state'
y_full = data_full[target]
X_full = data_full.drop(columns=target)

# Same split settings as the reduced-feature split: 80/20, stratified, seeded.
X_full_train, X_full_test, y_full_train, y_full_test = train_test_split(
    X_full, y_full, stratify=y_full, test_size=0.2, random_state=RSEED
)

num_features_full, cat_features_full = process.num_cat_features(data_full, target=target)

preprocessor_full = process.create_preprocessor(num_features_full, cat_features_full)
# `random_state` is the documented seed parameter of XGBClassifier.
xgb_full = process.model_process_pipeline(
    {'XGB': XGBClassifier(random_state=RSEED)}, preprocessor_full, prefix='scaled'
)

# Fit on the training part, predict the held-out part and tabulate the scores.
xgb_predictions_dict = process.model_test_predict(X_full_train, X_full_test, y_full_train, xgb_full)
display(vs.nice_scores(y_full_test, xgb_predictions_dict))

Categorical Features: ['country', 'usd_type', 'category_name', 'parent_category', 'month_launch', 'weekday_launch', 'day_hour_launch', 'month_deadline', 'weekday_deadline', 'day_hour_deadline'] 
Numerical Features: ['converted_goal', 'len_blurb', 'len_name', 'launch_to_deadline', 'creation_to_launch']
scaledXGB - Time taken: 3.75 seconds


Unnamed: 0,FBeta,Accuracy,Recall,Precision
scaledXGB,0.817,0.795,0.817,0.817


## Cross Validation
- to get a rough idea which models might perform best

In [22]:
# Cross-validate every pipeline in `scaled_models` on the training split only,
# asking the project helper for 10 shuffled, seeded folds on a single worker.
predicted_y_dict = process.model_cv_scores(
    X_train,
    y_train,
    scaled_models,
    kfolds=10,
    RSEED=RSEED,
    shuffle=True,
    n_jobs=1,
)

# Score table of the cross-validated predictions against the training labels.
display(vs.nice_scores(y_train, predicted_y_dict))

KeyboardInterrupt: 

## Predicting y_test using the above models

In [5]:
# Run every pipeline in `scaled_models` through the project helper with the
# training split and the held-out features; one entry per model comes back.
predictes_y_test_dict = process.model_test_predict(
    X_train,
    X_test,
    y_train,
    scaled_models,
)

# Score table of those predictions against the held-out labels.
display(vs.nice_scores(y_test, predictes_y_test_dict))

scaledDT - Time taken: 1.73 seconds
scaledRFC - Time taken: 96.43 seconds
scaledXGB - Time taken: 3.65 seconds
scaledABC - Time taken: 7.39 seconds
scaledLR - Time taken: 1.09 seconds


Unnamed: 0,FBeta,Accuracy,Recall,Precision
scaledDT,0.997,0.998,0.999,0.997
scaledRFC,0.982,0.986,0.997,0.978
scaledXGB,0.999,0.999,1.0,0.998
scaledABC,0.977,0.983,0.998,0.972
scaledLR,0.921,0.895,0.875,0.933


## Running a randomized hyperparameter search (RandomizedSearchCV) on the most promising models
 - RandomForestClassifier()
 - XGBoost()

In [7]:
# Only the XGBoost pipeline goes into this search.
nice_models = {'scaledXGB': scaled_models['scaledXGB']}

# F-beta with beta=0.5 as the selection metric (precision weighted above recall).
scorer = make_scorer(fbeta_score, beta=0.5)

# Randomized search: 100 sampled parameter settings x 5 folds = 500 fits.
best_models = process.model_selection_search(
    X_train,
    y_train,
    nice_models,
    modlib.random_grids(),
    search_method=RandomizedSearchCV,
    cv=5,
    scoring=scorer,
    n_iter=100,
    random_state=RSEED,
    verbose=2,
    n_jobs=1,
)

Fitting 5 folds for each of 100 candidates, totalling 500 fits
[CV] END XGB__colsample_bytree=0.5, XGB__gamma=0.4, XGB__learning_rate=0.1, XGB__max_depth=3, XGB__reg_alpha=10, XGB__reg_lambda=1; total time=   1.3s
[CV] END XGB__colsample_bytree=0.5, XGB__gamma=0.4, XGB__learning_rate=0.1, XGB__max_depth=3, XGB__reg_alpha=10, XGB__reg_lambda=1; total time=   1.3s
[CV] END XGB__colsample_bytree=0.5, XGB__gamma=0.4, XGB__learning_rate=0.1, XGB__max_depth=3, XGB__reg_alpha=10, XGB__reg_lambda=1; total time=   1.3s
[CV] END XGB__colsample_bytree=0.5, XGB__gamma=0.4, XGB__learning_rate=0.1, XGB__max_depth=3, XGB__reg_alpha=10, XGB__reg_lambda=1; total time=   1.3s
[CV] END XGB__colsample_bytree=0.5, XGB__gamma=0.4, XGB__learning_rate=0.1, XGB__max_depth=3, XGB__reg_alpha=10, XGB__reg_lambda=1; total time=   1.3s
[CV] END XGB__colsample_bytree=0.3, XGB__gamma=0.0, XGB__learning_rate=0.1, XGB__max_depth=18, XGB__reg_alpha=100, XGB__reg_lambda=0.1; total time=   4.2s
[CV] END XGB__colsample_byt

In [8]:
# Report each finished search: its best cross-validated score and parameters,
# followed by the refitted best estimator's metrics on the held-out test split.
for model_name, model in best_models.items():
    print(f"GridSearch results for {model_name}:\n------")
    print(f"Best score:{model.best_score_:.2f}")
    print(f"Best parameters:{model.best_params_}")

    best_predictions = model.best_estimator_.predict(X_test)

    # (label, metric function, extra keyword arguments) — each metric is
    # computed and printed in turn, in this order.
    test_metrics = [
        ("Final accuracy score on the testing data", accuracy_score, {}),
        ("Final F-score on the testing data", fbeta_score, {"beta": 0.5}),
        ("Final recall score on the testing data", recall_score, {}),
        ("Final precision on the testing data", precision_score, {}),
    ]
    for label, metric, extra in test_metrics:
        print(f"{label}: {metric(y_test, best_predictions, **extra):.4f}")

    print("------\n")

GridSearch results for bestscaledXGB:
------
Best score:0.82
Best parameters:{'XGB__reg_lambda': 100, 'XGB__reg_alpha': 1, 'XGB__max_depth': 9, 'XGB__learning_rate': 1, 'XGB__gamma': 0.0, 'XGB__colsample_bytree': 0.6}
Final accuracy score on the testing data: 0.7950
Final F-score on the testing data: 0.8192
------



In [10]:
# Same randomized search over the XGBoost pipeline, this time selecting on
# plain accuracy. Note that `best_models` is reassigned here, replacing the
# result of any earlier search stored under that name.
nice_models = {'scaledXGB': scaled_models['scaledXGB']}

best_models = process.model_selection_search(
    X_train,
    y_train,
    nice_models,
    modlib.random_grids(),
    search_method=RandomizedSearchCV,
    cv=5,
    scoring='accuracy',
    n_iter=100,
    random_state=RSEED,
    verbose=2,
    n_jobs=1,
)

Fitting 5 folds for each of 100 candidates, totalling 500 fits
[CV] END XGB__colsample_bytree=0.5, XGB__gamma=0.4, XGB__learning_rate=0.1, XGB__max_depth=3, XGB__reg_alpha=10, XGB__reg_lambda=1; total time=   1.3s
[CV] END XGB__colsample_bytree=0.5, XGB__gamma=0.4, XGB__learning_rate=0.1, XGB__max_depth=3, XGB__reg_alpha=10, XGB__reg_lambda=1; total time=   1.3s
[CV] END XGB__colsample_bytree=0.5, XGB__gamma=0.4, XGB__learning_rate=0.1, XGB__max_depth=3, XGB__reg_alpha=10, XGB__reg_lambda=1; total time=   1.3s
[CV] END XGB__colsample_bytree=0.5, XGB__gamma=0.4, XGB__learning_rate=0.1, XGB__max_depth=3, XGB__reg_alpha=10, XGB__reg_lambda=1; total time=   1.3s
[CV] END XGB__colsample_bytree=0.5, XGB__gamma=0.4, XGB__learning_rate=0.1, XGB__max_depth=3, XGB__reg_alpha=10, XGB__reg_lambda=1; total time=   1.3s
[CV] END XGB__colsample_bytree=0.3, XGB__gamma=0.0, XGB__learning_rate=0.1, XGB__max_depth=18, XGB__reg_alpha=100, XGB__reg_lambda=0.1; total time=   4.2s
[CV] END XGB__colsample_byt

In [69]:
# Naive baseline: predict "successful" exactly when the campaign was staff picked.
#
# `staff_pick` is removed from `data` during feature selection, so the baseline
# reads its own copy of the processed file instead of depending on whichever
# version of `data` happens to be in the kernel.
baseline_data = pd.read_csv('data/processed/kickstarter_clean.csv')

y_true = baseline_data['state']
# Boolean flag expressed as 0/1 so both label vectors share the same encoding.
y_baseline = baseline_data['staff_pick'].astype(int)

fbeta = round(fbeta_score(y_true, y_baseline, beta=0.5), 3)
accuracy = round(accuracy_score(y_true, y_baseline), 3)
recall = round(recall_score(y_true, y_baseline), 3)
precision = round(precision_score(y_true, y_baseline), 3)

# Rows, in order: F-beta (beta=0.5), accuracy, recall, precision.
scores = pd.DataFrame({'base model': [fbeta, accuracy, recall, precision]})

# few false positives: staff_picked it, but it wasn't successful (high precision)
# many false negatives: staff didn't pick it, but they were successful (low recall)

display(scores)

# about 12.8% of the campaigns are staff picked (21,656 of 168,979 in the recorded run)
display(baseline_data.query('staff_pick == True').state.value_counts())

display(baseline_data.shape)

Unnamed: 0,base model
0,0.535
1,0.541
2,0.205
3,0.896


1    19404
0     2252
Name: state, dtype: int64

(168979, 19)