In [37]:
# Import everything

import sys
import pandas as pd
import numpy as np


import sklearn
from sklearn.model_selection import train_test_split
from category_encoders import OneHotEncoder, OrdinalEncoder
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.pipeline import make_pipeline
from sklearn.metrics import accuracy_score

import xgboost
from xgboost import XGBClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier

import pickle
import joblib


In [4]:
df = pd.read_csv('KickstarterCleanedv4.csv')
df.head()

Unnamed: 0.1,Unnamed: 0,backers_count,category,goal,pledged,spotlight,state,blurb_length,goal_in_usd,campaign_duration,sub_category
0,3692,128,publishing,4250.0,4718.0,1,1,17.0,5770.03,40,zines
1,3721,0,publishing,5000.0,0.0,0,0,22.0,3804.7,30,zines
2,3751,1,publishing,1500.0,25.0,0,0,20.0,1705.15,30,zines
3,3798,2,publishing,4000.0,120.0,0,0,19.0,5371.42,60,zines
4,3863,0,publishing,10.0,0.0,0,0,16.0,9.15,30,zines


In [5]:
df.drop(columns=['Unnamed: 0','goal','spotlight'],inplace=True)

In [6]:
df.columns

Index(['backers_count', 'category', 'pledged', 'state', 'blurb_length',
       'goal_in_usd', 'campaign_duration', 'sub_category'],
      dtype='object')

In [7]:
df = df.drop_duplicates()

In [8]:
df.shape

(8317, 8)

In [23]:
df.to_csv('Kickstarter_FinalCleaned.csv')

In [9]:
# Extracting the target and feature matrix
target = 'state'
y = df[target]
X = df.drop(columns=target)

print(X.shape)
print(y.shape)

(8317, 7)
(8317,)


In [10]:
# Splitting into train test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = .4)
print(X_train.shape,X_test.shape)
print(y_train.shape,y_test.shape)

(4990, 7) (3327, 7)
(4990,) (3327,)


In [11]:
#Baseline

print('baseline accuracy', y.value_counts(normalize=True).max())


baseline accuracy 0.6059877359624865


In [12]:
#  Random Forest

model_rf = make_pipeline(OrdinalEncoder(),
                       SimpleImputer(strategy="mean"),
                       RandomForestClassifier( n_jobs=-1, random_state=42))

In [13]:
# Decision Tree
model_dt = make_pipeline(OrdinalEncoder(),
                      SimpleImputer(strategy="mean"),
                      DecisionTreeClassifier(random_state=42))

In [18]:
# XGBoost

model_xgb = make_pipeline(OrdinalEncoder(),
                       SimpleImputer(strategy="mean"),
                       XGBClassifier(random_state=42))

In [19]:
# Gradient Boost

model_gb = make_pipeline(OrdinalEncoder(),
                       SimpleImputer(strategy="mean"),
                       GradientBoostingClassifier(random_state=42))

In [20]:
model_rf.fit(X_train,y_train)
model_dt.fit(X_train,y_train)
model_xgb.fit(X_train,y_train)
model_gb.fit(X_train,y_train)

  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical(cols):


Pipeline(steps=[('ordinalencoder',
                 OrdinalEncoder(cols=['category', 'sub_category'],
                                mapping=[{'col': 'category',
                                          'data_type': dtype('O'),
                                          'mapping': fashion          1
technology       2
film & video     3
art              4
food             5
design           6
publishing       7
theater          8
dance            9
comics          10
music           11
crafts          12
photography     13
NaN             -2
dtype: int64},
                                         {'col': 'sub_category',
                                          'data_type': dtype('O'),
                                          'mapping': footwear               1
robots                 2
thrillers              3
social practice        4
community gardens      5
                    ... 
restaurants          119
translations         120
literary journals    121
letterpress          122
N

In [21]:
#Check Metrics on training
print('model_dt accuracy score', accuracy_score(y_train, model_dt.predict(X_train)))
print('model_rf accuracy score', accuracy_score(y_train, model_rf.predict(X_train)))
print('model_xgb accuracy score', accuracy_score(y_train, model_xgb.predict(X_train)))
print('model_gb accuracy score', accuracy_score(y_train, model_gb.predict(X_train)))

model_dt accuracy score 1.0
model_rf accuracy score 0.9997995991983968
model_xgb accuracy score 1.0
model_gb accuracy score 0.9849699398797596


In [22]:
# Metrics with test data
# print('model_dt accuracy score', accuracy_score(y_test, model_dt.predict(X_test)))
# print('model_rf accuracy score', accuracy_score(y_test, model_rf.predict(X_test)))
# print('model_xgb accuracy score', accuracy_score(y_test, model_xgb.predict(X_test)))
# print('model_gb accuracy score', accuracy_score(y_test, model_gb.predict(X_test)))

model_dt accuracy score 0.9690411782386534
model_rf accuracy score 0.974150886684701
model_xgb accuracy score 0.9846708746618575
model_gb accuracy score 0.9774571686203787


In [28]:
# saving models using pickle
saved_model_rf = pickle.dumps(model_rf)
saved_model_xgb = pickle.dumps(model_xgb)


In [36]:

joblib_file = "joblib_RF_Model.pkl"  
joblib.dump(model_rf, 'assets/model_rf')


['assets/model_rf']

In [35]:
joblib_file = "joblib_XGB_Model.pkl"  
joblib.dump(model_xgb, 'assets/model_xgb')

['assets/model_xgb']

In [38]:
#Testing if model saved and working correctly
# # Load from file
# load_xgb_model = joblib.load('assets/model_xgb')
# load_xgb_model



Pipeline(steps=[('ordinalencoder',
                 OrdinalEncoder(cols=['category', 'sub_category'],
                                mapping=[{'col': 'category',
                                          'data_type': dtype('O'),
                                          'mapping': fashion          1
technology       2
film & video     3
art              4
food             5
design           6
publishing       7
theater          8
dance            9
comics          10
music           11
crafts          12
photography     13
NaN             -2
dtype: int64},
                                         {'col': 'sub_category',
                                          'data_type': dtype('O'),
                                          'mapping': footwear               1
robots                 2
thrillers              3
so...
                               colsample_bytree=1, gamma=0, gpu_id=-1,
                               importance_type='gain',
                               interaction_c

In [39]:
# # Use the Reloaded Joblib Model to 
# # Calculate the accuracy score and predict target values

# # Calculate the Score 
# score = load_xgb_model.score(X_test, y_test)  
# # Print the Score
# print("Test score: {0:.2f} %".format(100 * score))  

# # Predict the Labels using the reloaded Model
# Ypredict = load_xgb_model.predict(X_test)  

# Ypredict

Test score: 98.47 %


array([1, 1, 1, ..., 0, 0, 1])