# Final notebook: presentation of all findings

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import time

import src.visualization as vs
import src.process as process
import models.lib as modlib

from sklearn.model_selection import train_test_split, cross_val_predict, cross_val_score, cross_validate, KFold
from sklearn.model_selection import StratifiedKFold, GridSearchCV, RandomizedSearchCV
from sklearn.metrics import make_scorer, accuracy_score, fbeta_score, recall_score, precision_score
from sklearn.metrics import classification_report, roc_curve, confusion_matrix

# models
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier

# Silence every warning so the presented outputs stay uncluttered.
# NOTE(review): this blanket filter also hides deprecation and convergence
# warnings raised by the libraries used below — consider narrowing it.
import warnings
warnings.filterwarnings('ignore')

# NOTE(review): duplicate of the XGBClassifier import in the "models" group above.
from xgboost import XGBClassifier

# Notebook-wide random seed; passed as random_state to the row sample, the
# train/test split and the hyper-parameter searches.
RSEED = 42

In [3]:
# Load the cleaned Kickstarter dataset from the processed-data folder.
data = pd.read_csv(filepath_or_buffer='data/processed/kickstarter_clean.csv')

In [4]:
# Render the per-column overview shown below (dtype, unique / missing / zero
# counts and descriptive statistics) for the freshly loaded frame.
vs.nice_summary(data)

Unnamed: 0,Columns,Dtype,nunique,Non-Null Count,Missing,Missing %,Zero Count,mean,std,min,25%,50%,75%,max
0,backers_count,int64,3206,168979,-,-,14821,140.07,894.93,0.0,3.0,25.0,85.0,105857.0
1,country,object,22,168979,-,-,0,-,-,-,-,-,-,-
2,staff_pick,bool,2,168979,-,-,147323,-,-,-,-,-,-,-
3,state,int64,2,168979,-,-,74200,0.56,0.5,0.0,0.0,1.0,1.0,1.0
4,usd_pledged,float64,69802,168979,-,-,14821,12110.97,83739.29,0.0,100.0,1457.0,6235.0,8596474.58
5,usd_type,object,2,168959,20,0.01,0,-,-,-,-,-,-,-
6,converted_goal,float64,45238,168979,-,-,0,41096.88,1110956.93,0.01,1500.0,5000.0,13297.7,152350076.0
7,len_blurb,float64,35,168977,2,-,0,19.01,4.98,1.0,16.0,20.0,22.0,35.0
8,len_name,int64,20,168979,-,-,0,5.71,2.71,1.0,4.0,6.0,8.0,27.0
9,launch_to_deadline,int64,47963,168979,-,-,0,2821257.19,1015877.11,86400.0,2588400.0,2592000.0,2948818.5,8044324.0


In [5]:
# Remove usd_pledged and backers_count from the working frame.
# NOTE(review): presumably dropped because both are only known once a campaign
# has run and would leak the outcome into the features — confirm.
# errors='ignore' keeps this cell idempotent: `data` is reassigned to the same
# name, so re-running the cell would otherwise raise KeyError on the columns
# that are already gone.
data = data.drop(columns=['usd_pledged', 'backers_count'], errors='ignore')

# Reproducible peek at five random rows of the remaining columns.
data.sample(5, random_state=RSEED)

Unnamed: 0,country,staff_pick,state,usd_type,converted_goal,len_blurb,len_name,launch_to_deadline,creation_to_launch,month,weekday,day_hour,category_name,parent_category
10790,US,False,0,domestic,50.0,24.0,4,1814400,230715,July,Monday,16,Small Batch,food
26459,US,False,0,domestic,5000.0,20.0,5,1209600,1677421,April,Wednesday,6,Classical Music,music
168147,US,False,0,international,2500.0,17.0,7,1325705,81507200,August,Monday,15,Poetry,publishing
32580,US,False,0,international,5000.0,17.0,3,2592000,1972320,April,Friday,21,Web,journalism
156442,US,False,1,international,6000.0,19.0,1,2592000,4594741,April,Monday,20,Music,music


## Preprocessing
- Performing train-test-split
- defining target variable = 'state' -> binary success/failure
- splitting the features into numerical and categorical groups for separate preprocessing
- load models from /models/lib.py
- Applying StandardScaler() to numerical features and imputing missing values with the median using SimpleImputer() -> rarely needed in this dataset (the summary above shows only 2 missing values in `len_blurb`)
- Applying OneHotEncoder() to categorical features and dropping the first category of each. Missing values are imputed with the constant "missing" using SimpleImputer() -> rarely needed in this dataset (the summary above shows only 20 missing values in `usd_type`)

In [6]:
# Name of the binary label column; reused for the label/feature separation and
# for the feature-type helper below.
target = 'state'

# Separate the label from the feature columns.
y = data[target]
X = data.drop(columns=target)

# Hold out 20% of the rows for testing, stratified on the label.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=RSEED,
)

# Column-name lists for the numerical and categorical preprocessing branches
# (the two lists are printed in the cell output below).
num_features, cat_features = process.num_cat_features(data, target=target)

# Candidate estimators supplied by models/lib.py.
models = modlib.models()

# Build the shared preprocessor and wrap every estimator with it; the resulting
# dict is keyed with the 'scaled' prefix (e.g. 'scaledRFC', 'scaledXGB').
preprocessor = process.create_preprocessor(num_features, cat_features)
scaled_models = process.model_process_pipeline(models, preprocessor, prefix='scaled')

Categorical Features: ['country', 'staff_pick', 'usd_type', 'month', 'weekday', 'category_name', 'parent_category'] 
Numerical Features: ['converted_goal', 'len_blurb', 'len_name', 'launch_to_deadline', 'creation_to_launch', 'day_hour']


## Cross Validation
- to get a rough idea of which models might perform best

In [None]:
# 10-fold, shuffled cross-validation of every pipeline on the training split.
# The seed is taken from the notebook-wide RSEED constant so that changing it in
# the configuration cell re-seeds the folds as well (no second hard-coded 42).
# NOTE(review): the helper presumably returns the cross-validated predictions
# per model name — confirm against src/process.py.
predicted_y_dict = process.model_cv_scores(
    X_train,
    y_train,
    scaled_models,
    kfolds=10,
    RSEED=RSEED,
    shuffle=True,
    n_jobs=-1,
)

# Score table comparing the cross-validated predictions with the training labels.
display(vs.nice_scores(y_train, predicted_y_dict))

## Predicting y_test using the above models

In [18]:
# Predictions for the held-out test split, one entry per pipeline.
# NOTE(review): the helper presumably fits each pipeline on the training split
# first; the per-model "Time taken" lines appear in the cell output below.
predictes_y_test_dict = process.model_test_predict(
    X_train,
    X_test,
    y_train,
    scaled_models,
)

# Score table comparing the test-set predictions with the true test labels.
display(
    vs.nice_scores(y_test, predictes_y_test_dict)
)

scaledDT - Time taken: 11.71 seconds
scaledRFC - Time taken: 198.89 seconds
scaledXGB - Time taken: 3.79 seconds
scaledABC - Time taken: 4.43 seconds
scaledKNN - Time taken: 137.68 seconds
scaledLR - Time taken: 0.92 seconds


Unnamed: 0,FBeta,Accuracy,Recall,Precision
scaledDT,0.819,0.787,0.846,0.812
scaledRFC,0.859,0.84,0.894,0.851
scaledXGB,0.842,0.813,0.859,0.839
scaledABC,0.826,0.785,0.819,0.827
scaledKNN,0.806,0.765,0.81,0.805
scaledLR,0.834,0.789,0.806,0.842


## Running RandomizedSearchCV() on the most promising models
 - RandomForestClassifier()
 - XGBClassifier()

In [31]:
# Shortlist the two pipelines named in the section heading above.
nice_models = {mod: scaled_models[mod] for mod in ['scaledRFC','scaledXGB']}

# F-beta with beta=0.5 weights precision more heavily than recall.
scorer = make_scorer(fbeta_score,beta=0.5)

# Keep every fitted search, keyed by model name. Binding each search to one
# shared variable inside the loop would discard the random-forest result as
# soon as the XGBoost search starts.
random_searches = {}
for model_name, model in nice_models.items():
    search = RandomizedSearchCV(
        estimator=model,
        param_distributions=modlib.random_grid(model_name),
        scoring=scorer,
        n_iter=100,
        cv=5,
        verbose=2,
        random_state=RSEED,
        n_jobs=-1,
    )
    # fit() returns the fitted search object itself.
    random_searches[model_name] = search.fit(X_train, y_train)

# Backward-compatible name: after the loop this has always referred to the
# search of the last shortlisted model, i.e. the XGBoost pipeline.
xgb_random = random_searches['scaledXGB']



dict_keys(['memory', 'steps', 'verbose', 'scaled', 'RFC', 'scaled__n_jobs', 'scaled__remainder', 'scaled__sparse_threshold', 'scaled__transformer_weights', 'scaled__transformers', 'scaled__verbose', 'scaled__verbose_feature_names_out', 'scaled__num', 'scaled__cat', 'scaled__num__memory', 'scaled__num__steps', 'scaled__num__verbose', 'scaled__num__imputer_num', 'scaled__num__std_scaler', 'scaled__num__imputer_num__add_indicator', 'scaled__num__imputer_num__copy', 'scaled__num__imputer_num__fill_value', 'scaled__num__imputer_num__missing_values', 'scaled__num__imputer_num__strategy', 'scaled__num__imputer_num__verbose', 'scaled__num__std_scaler__copy', 'scaled__num__std_scaler__with_mean', 'scaled__num__std_scaler__with_std', 'scaled__cat__memory', 'scaled__cat__steps', 'scaled__cat__verbose', 'scaled__cat__imputer_cat', 'scaled__cat__1hot', 'scaled__cat__imputer_cat__add_indicator', 'scaled__cat__imputer_cat__copy', 'scaled__cat__imputer_cat__fill_value', 'scaled__cat__imputer_cat__miss

dict_keys(['memory', 'steps', 'verbose', 'scaled', 'XGB', 'scaled__n_jobs', 'scaled__remainder', 'scaled__sparse_threshold', 'scaled__transformer_weights', 'scaled__transformers', 'scaled__verbose', 'scaled__verbose_feature_names_out', 'scaled__num', 'scaled__cat', 'scaled__num__memory', 'scaled__num__steps', 'scaled__num__verbose', 'scaled__num__imputer_num', 'scaled__num__std_scaler', 'scaled__num__imputer_num__add_indicator', 'scaled__num__imputer_num__copy', 'scaled__num__imputer_num__fill_value', 'scaled__num__imputer_num__missing_values', 'scaled__num__imputer_num__strategy', 'scaled__num__imputer_num__verbose', 'scaled__num__std_scaler__copy', 'scaled__num__std_scaler__with_mean', 'scaled__num__std_scaler__with_std', 'scaled__cat__memory', 'scaled__cat__steps', 'scaled__cat__verbose', 'scaled__cat__imputer_cat', 'scaled__cat__1hot', 'scaled__cat__imputer_cat__add_indicator', 'scaled__cat__imputer_cat__copy', 'scaled__cat__imputer_cat__fill_value', 'scaled__cat__imputer_cat__miss

Fitting 5 folds for each of 100 candidates, totalling 500 fits


RandomizedSearchCV(cv=5,
                   estimator=Pipeline(steps=[('scaled',
                                              ColumnTransformer(transformers=[('num',
                                                                               Pipeline(steps=[('imputer_num',
                                                                                                SimpleImputer(strategy='median')),
                                                                                               ('std_scaler',
                                                                                                StandardScaler())]),
                                                                               ['converted_goal',
                                                                                'len_blurb',
                                                                                'len_name',
                                                                                '

In [6]:
# Pipelines that go into the hyper-parameter search.
nice_models = {
    name: scaled_models[name]
    for name in ('scaledRFC', 'scaledXGB')
}

# Selection metric: F-beta with beta=0.5 (precision weighted above recall).
scorer = make_scorer(fbeta_score, beta=0.5)

# Randomized search with 5-fold CV for each shortlisted pipeline; the helper
# prints the time taken per model (see the cell output below).
best_models = process.model_selection_search(
    X_train,
    y_train,
    nice_models,
    modlib.random_grids(),
    search_method=RandomizedSearchCV,
    cv=5,
    scoring=scorer,
    n_jobs=-1,
    random_state=RSEED,
)


scaledRFC - Time taken: 7517.93 seconds
scaledXGB - Time taken: 37.35 seconds


In [8]:
# Report, per tuned model, the best cross-validation score and parameters plus
# accuracy and F-beta (beta=0.5) on the held-out test split.
# NOTE(review): the header text says "GridSearch" although best_models was built
# with RandomizedSearchCV — the label is kept unchanged here.
for model_name, model in best_models.items():
    # Predict the test split with the best pipeline found by the search.
    best_predictions = model.best_estimator_.predict(X_test)

    # Collect the report lines first and emit them with a single print call.
    report_lines = [
        f"GridSearch results for {model_name}:\n------",
        'Best score:{:.2f}'.format(model.best_score_),
        "Best parameters:{}".format(model.best_params_),
        "Final accuracy score on the testing data: {:.4f}".format(accuracy_score(y_test, best_predictions)),
        "Final F-score on the testing data: {:.4f}".format(fbeta_score(y_test, best_predictions, beta = 0.5)),
        "------\n",
    ]
    print("\n".join(report_lines))

GridSearch results for bestscaledRFC:
------
Best score:0.85
Best parameters:{'RFC__n_estimators': 1800, 'RFC__min_samples_split': 5, 'RFC__min_samples_leaf': 1, 'RFC__max_features': 'auto', 'RFC__max_depth': 100, 'RFC__bootstrap': True}
Final accuracy score on the testing data: 0.8382
Final F-score on the testing data: 0.8571
------

GridSearch results for bestscaledXGB:
------
Best score:0.84
Best parameters:{'XGB__reg_lambda': 0.01, 'XGB__reg_alpha': 1e-05, 'XGB__max_depth': 3, 'XGB__learning_rate': 1, 'XGB__gamma': 0.0, 'XGB__colsample_bytree': 0.4}
Final accuracy score on the testing data: 0.8116
Final F-score on the testing data: 0.8458
------



In [7]:
# Naive baseline: predict "successful" exactly when the project was a staff pick.
# NOTE(review): scored on the full `data` frame rather than on the test split,
# so these numbers are not directly comparable with the model score tables —
# confirm this is intended.
fbeta = round(fbeta_score(data.state, data.staff_pick, beta=0.5), 3)
accuracy = round(accuracy_score(data.state, data.staff_pick), 3)
recall = round(recall_score(data.state, data.staff_pick), 3)
precision = round(precision_score(data.state, data.staff_pick), 3)

# Label the rows with the metric names (the same names the model score tables
# use) so the baseline table can be read without looking at this code.
scores = pd.DataFrame(
    {'base model': [fbeta, accuracy, recall, precision]},
    index=['FBeta', 'Accuracy', 'Recall', 'Precision'],
)

# Reading the baseline:
# few false positives: staff picked it, but it wasn't successful
# many false negatives: staff didn't pick it, but it was successful

display(scores)

# Outcome counts among staff-picked projects (presumably 1 = successful,
# 0 = failed). The share of staff picks can be read from
# data.staff_pick.mean(); a figure of 13.5% was noted here for an earlier run.
data.query('staff_pick == True').state.value_counts()

Unnamed: 0,base model
0,0.535
1,0.502
2,0.202
3,0.912


1    23713
0     2280
Name: state, dtype: int64