In [49]:
import pandas as pd
import numpy as np
import category_encoders as ce
from xgboost import XGBClassifier
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import train_test_split
# suppress warning messages
import warnings
warnings.filterwarnings("ignore")

In [50]:
df = pd.read_csv(r"https://raw.githubusercontent.com/JonathanBechtel/dat-02-22/main/ClassMaterial/Unit3/data/ks2.csv")

In [51]:
df.head()

Unnamed: 0,ID,name,category,main_category,currency,deadline,launched,state,country,goal
0,1000002330,The Songs of Adelaide & Abullah,Poetry,Publishing,GBP,2015-10-09 00:00:00,2015-08-11 12:12:28,0,GB,1533.95
1,1000003930,Greeting From Earth: ZGAC Arts Capsule For ET,Narrative Film,Film & Video,USD,2017-11-01 00:00:00,2017-09-02 04:43:57,0,US,30000.0
2,1000004038,Where is Hank?,Narrative Film,Film & Video,USD,2013-02-26 00:00:00,2013-01-12 00:20:50,0,US,45000.0
3,1000007540,ToshiCapital Rekordz Needs Help to Complete Album,Music,Music,USD,2012-04-16 00:00:00,2012-03-17 03:24:11,0,US,5000.0
4,1000011046,Community Film Project: The Art of Neighborhoo...,Film & Video,Film & Video,USD,2015-08-29 00:00:00,2015-07-04 08:35:03,0,US,19500.0


In [52]:
df['launched'] = pd.to_datetime(df['launched'])
df['deadline'] = pd.to_datetime(df['deadline'])

In [54]:
df['launched_month']   = df['launched'].dt.month
df['launched_day']     = df['launched'].dt.day
df['launched_year']    = df['launched'].dt.year
df['launched_quarter'] = df['launched'].dt.quarter
df['campaign_duration']= (df['deadline'] - df['launched']).dt.days

In [55]:
#helper functions
def split_data(df, split_frac=0.2, random_state=42):
    df = df.drop(['deadline', 'launched'], axis = 1)#remember to drop date columns
    X  = df.drop('state', axis=1)
    y  = df['state']
    # notice the use of 'stratify' -- makes sure y values are in equal proportions in train + test important for classification data sets
    return train_test_split(X, y, test_size = split_frac, stratify = y, random_state = random_state)

# helper function to pull out feature importances_
def get_feature_importances(pipe, X_train, onehot=False):
    if onehot:
        X_train = pipe[0].transform(X_train)
        X_train = pipe[1].transform(X_train)
    return pd.DataFrame({
        'Col': X_train.columns,
        'Importance': pipe[-1].feature_importances_
    }).sort_values(by='Importance', ascending=False)

In [56]:
def get_model_scores(mod, X_train, y_train, X_test, y_test, val_score = True, test_score=False):
    if val_score:
        X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, 
                                                          test_size = 0.2, 
                                                          stratify = y_train, 
                                                          random_state= 42)
 
    mod.fit(X_train, y_train)
    
    results = {}
    
    results['train_score'] = mod.score(X_train, y_train)
    if val_score:
        results['val_score'] = mod.score(X_val, y_val)
        
    if test_score:
        results['test_score'] = mod.score(X_test, y_test)
        
    return results

In [57]:
#split data
X_train, X_test, y_train, y_test = split_data(df)

te = ce.TargetEncoder(min_samples_leaf = 30)

mod = XGBClassifier(eval_metric = 'logloss')

pipe = make_pipeline(te, mod)

scores = get_model_scores(pipe, X_train, y_train, X_test, y_test, test_score = True)

scores

{'train_score': 0.701446708001181,
 'val_score': 0.6994921802507044,
 'test_score': 0.7019071142243998}

In [58]:
#val score says cclassifier will get the right answer about 68.5% of the time

In [59]:
feats = get_feature_importances(pipe, X_train)
feats

Unnamed: 0,Col,Importance
6,goal,0.283046
11,campaign_duration,0.221264
2,category,0.196839
9,launched_year,0.106322
3,main_category,0.071839
5,country,0.043103
7,launched_month,0.041667
8,launched_day,0.020115
4,currency,0.011494
0,ID,0.00431


In [60]:
ore = ce.OrdinalEncoder()

pipe = make_pipeline(ore, mod)

scores = get_model_scores(pipe, X_train, y_train, X_test, y_test)

scores

{'train_score': 0.6929984394111941, 'val_score': 0.6898081757292528}

In [61]:
feats = get_feature_importances(pipe, X_train)
feats

Unnamed: 0,Col,Importance
6,goal,0.22
2,category,0.211429
3,main_category,0.204286
11,campaign_duration,0.16
9,launched_year,0.117143
4,currency,0.031429
5,country,0.017143
7,launched_month,0.015714
8,launched_day,0.015714
0,ID,0.004286


In [62]:
ohe = ce.OneHotEncoder(cols = ['category', 'country', 'currency'], use_cat_names = True)
te  = ce.TargetEncoder()

pipe = make_pipeline(ore, te, mod)

scores = get_model_scores(pipe, X_train, y_train, X_test, y_test)

scores

{'train_score': 0.6929984394111941, 'val_score': 0.6898081757292528}

In [63]:
feats = get_feature_importances(pipe, X_train, onehot = True)
feats

Unnamed: 0,Col,Importance
6,goal,0.22
2,category,0.211429
3,main_category,0.204286
11,campaign_duration,0.16
9,launched_year,0.117143
4,currency,0.031429
5,country,0.017143
7,launched_month,0.015714
8,launched_day,0.015714
0,ID,0.004286
