In [34]:
############################# Import Library ############################# 

import pandas as pd
import numpy as np
from scipy.sparse import hstack
import datetime
from sklearn import model_selection as ms
from sklearn.preprocessing import StandardScaler, CategoricalEncoder
from sklearn.linear_model import LogisticRegressionCV
from sklearn.model_selection import KFold
from sklearn.metrics import cohen_kappa_score, f1_score, accuracy_score, roc_auc_score, precision_score
from sklearn.dummy import DummyClassifier

In [2]:
############################# Import Data & Cleasing ############################# 

raw_df = pd.read_csv('ks-projects-201801.csv')
raw_df['launched'] = pd.to_datetime(raw_df['launched'])  
raw_df['deadline'] = pd.to_datetime(raw_df['deadline']) 
#raw_df.head()
raw_df = raw_df.loc[raw_df['launched'] < datetime.date(2018,1,1)]
raw_df = raw_df.loc[raw_df['launched'] > datetime.date(2009,12,31)]
raw_df = raw_df.loc[raw_df['launched'] < raw_df['deadline']]
raw_df = raw_df.loc[raw_df['usd_goal_real'] > 0]
raw_df = raw_df.loc[raw_df['state'].isin(['failed','canceled','suspended','successful'])]
raw_df['period'] = (raw_df['deadline'] - raw_df['launched']).astype('timedelta64[D]')
d = {'successful': True, 'failed': False, 'canceled': False, 'suspended': False}
raw_df['successful'] = raw_df['state'].map(d)

#raw_df = raw_df[['successful','country','category','usd_goal_real','period']]
#raw_df.head()

In [3]:
############################# Test & Training split ############################# 

oot_df = raw_df.loc[raw_df['launched'] > datetime.date(2016,12,31)]
oot_df = oot_df[['successful','country','category','usd_goal_real','period']]
model_df = raw_df.loc[raw_df['launched'] < datetime.date(2017,1,1)]
model_df = model_df[['successful','country','category','usd_goal_real','period']]

oot_dev_x, oot_test_x, oot_dev_y, oot_test_y = ms.train_test_split(oot_df.drop(['successful'], axis=1), oot_df['successful'], test_size=0.5, random_state=123)
train_x, test_x, train_y, test_y = ms.train_test_split(model_df.drop(['successful'], axis=1), model_df['successful'], test_size=0.3, random_state=456)


In [4]:
##############################################################################
############################# Logi Model fitting #############################
##############################################################################
scaler = StandardScaler()
enc = CategoricalEncoder(handle_unknown='ignore')
scaler.fit(train_x[['usd_goal_real','period']])
enc.fit(train_x[['country','category']])

CategoricalEncoder(categories='auto', dtype=<class 'numpy.float64'>,
          encoding='onehot', handle_unknown='ignore')

In [5]:
scaled_train_x = hstack((
    enc.transform(train_x[['country','category']]),
    scaler.transform(train_x[['usd_goal_real','period']])
))

In [8]:
dummymodel = DummyClassifier(random_state = 8764)
dummymodel.fit(scaled_train_x,train_y)
#rkf = RepeatedKFold(n_splits=10, n_repeats=10, random_state=2652124)
kf = KFold(n_splits=10, shuffle = True, random_state = 2652124)
logmodelcv = LogisticRegressionCV(
    Cs = 50,
    fit_intercept = True,
    cv = kf,
    penalty = 'l2',
    scoring = 'roc_auc',
    random_state = 8764,
    max_iter = 1000,
    n_jobs = 4
)

In [None]:
logmodelcv.fit(scaled_train_x,train_y)

In [13]:
from sklearn.externals import joblib
#joblib.dump(logmodelcv, 'logmodelcv_20180414.pkl')
#joblib.dump(scaler, 'logiscaler_20180414.pkl')
#joblib.dump(enc, 'logienc_20180414.pkl')

logmodelcv = joblib.load('logmodelcv_20180414.pkl') 
scaler = joblib.load('logiscaler_20180414.pkl')
enc = joblib.load('logienc_20180414.pkl')

In [14]:
logmodelcv.score(scaled_train_x,train_y),dummymodel.score(scaled_train_x,train_y)

(0.6889799473796487, 0.5403452321695229)

In [15]:
############################# OOS Testing ############################# 
scaled_test_x = hstack((
    enc.transform(test_x[['country','category']]),
    scaler.transform(test_x[['usd_goal_real','period']])
))

In [16]:
logmodelcv.score(scaled_test_x,test_y),dummymodel.score(scaled_test_x,test_y)

(0.6915267595117854, 0.5433409724886709)

In [17]:
############################# OOT Testing ############################# 
scaled_ootdev_x = hstack((
    enc.transform(oot_dev_x[['country','category']]),
    scaler.transform(oot_dev_x[['usd_goal_real','period']])
))

In [18]:
logmodelcv.score(scaled_ootdev_x,oot_dev_y),dummymodel.score(scaled_ootdev_x,oot_dev_y)

(0.6742393672827052, 0.5313937535307884)

In [19]:
############################# cohen_kappa_score #############################
cohen_kappa_score(train_y,logmodelcv.predict(scaled_train_x)),cohen_kappa_score(train_y,dummymodel.predict(scaled_train_x))

(0.2630430442558327, -0.0002528663294800726)

In [20]:
cohen_kappa_score(test_y,logmodelcv.predict(scaled_test_x)),cohen_kappa_score(test_y,dummymodel.predict(scaled_test_x))

(0.26768574601223094, 0.005876974511747068)

In [21]:
cohen_kappa_score(oot_dev_y,logmodelcv.predict(scaled_ootdev_x)),cohen_kappa_score(oot_dev_y,dummymodel.predict(scaled_ootdev_x))

(0.2310030782080431, -0.007896910740168073)

In [22]:
############################# f1_score #############################
f1_score(train_y,logmodelcv.predict(scaled_train_x)),f1_score(train_y,dummymodel.predict(scaled_train_x))

(0.46693377614600634, 0.3576024844720497)

In [23]:
f1_score(test_y,logmodelcv.predict(scaled_test_x)),f1_score(test_y,dummymodel.predict(scaled_test_x))

(0.47015656448711335, 0.3612178882780429)

In [24]:
f1_score(oot_dev_y,logmodelcv.predict(scaled_ootdev_x)),f1_score(oot_dev_y,dummymodel.predict(scaled_ootdev_x))

(0.4304761904761905, 0.362378520836765)

In [25]:
############################# roc_auc_score #############################
roc_auc_score(train_y,logmodelcv.predict(scaled_train_x)),roc_auc_score(train_y,dummymodel.predict(scaled_train_x))

(0.6207501184272698, 0.4998736057130218)

In [26]:
roc_auc_score(test_y,logmodelcv.predict(scaled_test_x)),roc_auc_score(test_y,dummymodel.predict(scaled_test_x))

(0.6229049693399552, 0.5029425910536589)

In [27]:
roc_auc_score(oot_dev_y,logmodelcv.predict(scaled_ootdev_x)),roc_auc_score(oot_dev_y,dummymodel.predict(scaled_ootdev_x))

(0.6045813774860254, 0.49607615556713214)

In [None]:
precision_score

In [36]:
precision_score(oot_dev_y,logmodelcv.predict(scaled_ootdev_x)),precision_score(oot_dev_y,dummymodel.predict(scaled_ootdev_x))

(0.6193666260657734, 0.36813922356091033)

In [33]:
logmodelcv.Cs_,logmodelcv.C_

(array([1.00000000e-04, 1.45634848e-04, 2.12095089e-04, 3.08884360e-04,
        4.49843267e-04, 6.55128557e-04, 9.54095476e-04, 1.38949549e-03,
        2.02358965e-03, 2.94705170e-03, 4.29193426e-03, 6.25055193e-03,
        9.10298178e-03, 1.32571137e-02, 1.93069773e-02, 2.81176870e-02,
        4.09491506e-02, 5.96362332e-02, 8.68511374e-02, 1.26485522e-01,
        1.84206997e-01, 2.68269580e-01, 3.90693994e-01, 5.68986603e-01,
        8.28642773e-01, 1.20679264e+00, 1.75751062e+00, 2.55954792e+00,
        3.72759372e+00, 5.42867544e+00, 7.90604321e+00, 1.15139540e+01,
        1.67683294e+01, 2.44205309e+01, 3.55648031e+01, 5.17947468e+01,
        7.54312006e+01, 1.09854114e+02, 1.59985872e+02, 2.32995181e+02,
        3.39322177e+02, 4.94171336e+02, 7.19685673e+02, 1.04811313e+03,
        1.52641797e+03, 2.22299648e+03, 3.23745754e+03, 4.71486636e+03,
        6.86648845e+03, 1.00000000e+04]), array([4714.86636346]))