import stuff (a lot of stuff) & read in data

In [7]:
import pandas as pd
import numpy as np
import category_encoders as ce
import datetime as dt
pd.options.mode.chained_assignment = None
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
pd.options.display.max_rows = 1000
pd.options.display.max_columns= 1000
from sklearn.linear_model import LinearRegression,LogisticRegression
from sklearn import metrics
import scipy.stats as stats
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor

In [8]:
data = pd.read_csv('../datasets/ks-projects-201801.csv')

In [9]:
data.head()

Unnamed: 0,ID,name,category,main_category,currency,deadline,goal,launched,pledged,state,backers,country,usd pledged,usd_pledged_real,usd_goal_real
0,1000002330,The Songs of Adelaide & Abullah,Poetry,Publishing,GBP,2015-10-09,1000.0,2015-08-11 12:12:28,0.0,failed,0,GB,0.0,0.0,1533.95
1,1000003930,Greeting From Earth: ZGAC Arts Capsule For ET,Narrative Film,Film & Video,USD,2017-11-01,30000.0,2017-09-02 04:43:57,2421.0,failed,15,US,100.0,2421.0,30000.0
2,1000004038,Where is Hank?,Narrative Film,Film & Video,USD,2013-02-26,45000.0,2013-01-12 00:20:50,220.0,failed,3,US,220.0,220.0,45000.0
3,1000007540,ToshiCapital Rekordz Needs Help to Complete Album,Music,Music,USD,2012-04-16,5000.0,2012-03-17 03:24:11,1.0,failed,1,US,1.0,1.0,5000.0
4,1000011046,Community Film Project: The Art of Neighborhoo...,Film & Video,Film & Video,USD,2015-08-29,19500.0,2015-07-04 08:35:03,1283.0,canceled,14,US,1283.0,1283.0,19500.0


# data cleaning / transforming

create a new column denoting length of project name (number of characters)

In [10]:
data['titleLength'] = data['name'].apply(lambda x: len(str(x)))

drop columns that won't be used at the moment (main_category, ID, name, state, pledged, usd pledged, goal)

In [11]:
data.drop(columns=['main_category','ID','name','state','pledged','usd pledged','goal'],inplace=True)

format date columns properly and get rid of records with invalid dates (precede the year 2000)

In [12]:
data['launched'] = pd.to_datetime(data['launched'])
data['deadline'] = pd.to_datetime(data['deadline'])
df = data[data['launched']>'2000-01-01']

eliminate projects with usd pledged real > 20000 or usd pledged real = 0

TODO: write a brief explanation as to why you are doing this (double-click on this cell to edit the text)

In [13]:
df = data[(data['usd_pledged_real']<20000) & (data['usd_pledged_real']>0)]

create logged versions of usd pledged real and usd goal real + drop the original nonlogged versions

TODO: write a brief explanation as to why you are doing this



In [14]:
df['logPledged'] = np.log(df['usd_pledged_real'])
df['logGoal'] = np.log(df['usd_goal_real'])

df.drop(columns=['usd_goal_real','usd_pledged_real'], inplace=True)

create columns to encode the 'launched' column -- I chose to break it down into month, day of week, and hour

In [15]:
df['launchMonth'] = df['launched'].dt.month
df['launchDay'] = df['launched'].dt.dayofweek
df['launchHour'] = df['launched'].dt.hour

calculate the duration of each project campaign + drop the launched & deadline columns

In [16]:
df['duration'] = (df['deadline']-df['launched'])/dt.timedelta(minutes=1)
df.drop(columns=['deadline','launched'],inplace=True)

# model preparation

divide dataset into features (X) and target (y)

In [17]:
X = df.drop(columns=['logPledged'])
y = df['logPledged']

prepare the pipeline (for help, see bottom of this article https://kiwidamien.github.io/encoding-categorical-variables.html)


In [18]:
#some helper code for you
import joblib
location = 'cache'
memory = joblib.Memory(location=location, verbose=10)

encoding_pipeline = Pipeline([
    ('encode_category', ce.HashingEncoder(cols=['category'], return_df=True)),
    ('encode_other', ce.OneHotEncoder(cols=['currency','country','launchMonth','launchDay','launchHour'], return_df=True)),
], memory=memory)

train-test-split your data

In [19]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2)

encode your training set and test set (for reference, see documentation here https://scikit-learn.org/stable/modules/generated/sklearn.pipeline.Pipeline.html)

TODO: explain when to use fit_transform( ) vs transform( ) and why

In [20]:
X_train_encoded = encoding_pipeline.fit_transform(X_train, y_train)
X_test_encoded = encoding_pipeline.transform(X_test)

________________________________________________________________________________
[Memory] Calling sklearn.pipeline._fit_transform_one...
_fit_transform_one(HashingEncoder(cols=['category'], drop_invariant=False, hash_method='md5',
               max_process=2, max_sample=0, n_components=8, return_df=True,
               verbose=0), 
              category currency  backers country  titleLength    logGoal  \
132449    Performances      USD       39      US           55   8.517193   
80239      Documentary      USD       89      US           37   8.699515   
281941  Product Design      USD        5      US           42  11.225243   
254197            Food      USD        9      US           17   8.517193   
303507  Tabletop Games      USD        8      US           15   9.510445   
...                ...      ...      ...     ...          ...        ...   
22070           Drinks      USD       23      US           22   8.699515   
313017  Product Design      USD       29      US         

If this happens often in your code, it can cause performance problems 
(results will be correct in all cases). 
The reason for this is probably some large input arguments for a wrapped
 function (e.g. large strings).
THIS IS A JOBLIB ISSUE. If you can, kindly provide the joblib's team with an
 example so that they can fix the problem.
  **fit_params_steps[name])


______________________________________________fit_transform_one - 195.2s, 3.3min


check to make sure that the dimensions align between test & training

In [21]:
print(X_train_encoded.shape)
print(X_test_encoded.shape)

(239189, 92)
(59798, 92)


In [22]:
X_test_encoded.head()

Unnamed: 0,col_0,col_1,col_2,col_3,col_4,col_5,col_6,col_7,currency_1,currency_2,currency_3,currency_4,currency_5,currency_6,currency_7,currency_8,currency_9,currency_10,currency_11,currency_12,currency_13,currency_14,backers,country_1,country_2,country_3,country_4,country_5,country_6,country_7,country_8,country_9,country_10,country_11,country_12,country_13,country_14,country_15,country_16,country_17,country_18,country_19,country_20,country_21,country_22,country_23,titleLength,logGoal,launchMonth_1,launchMonth_2,launchMonth_3,launchMonth_4,launchMonth_5,launchMonth_6,launchMonth_7,launchMonth_8,launchMonth_9,launchMonth_10,launchMonth_11,launchMonth_12,launchDay_1,launchDay_2,launchDay_3,launchDay_4,launchDay_5,launchDay_6,launchDay_7,launchHour_1,launchHour_2,launchHour_3,launchHour_4,launchHour_5,launchHour_6,launchHour_7,launchHour_8,launchHour_9,launchHour_10,launchHour_11,launchHour_12,launchHour_13,launchHour_14,launchHour_15,launchHour_16,launchHour_17,launchHour_18,launchHour_19,launchHour_20,launchHour_21,launchHour_22,launchHour_23,launchHour_24,duration
0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,150,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,54,9.602382,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,42102.716667
1,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,2,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,5,9.740969,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,63591.333333
2,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,3,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,21,8.23194,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,77768.266667
3,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,104,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,34,7.718685,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,43160.533333
4,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,33,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,42,8.987197,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,41817.366667


# linear regression

create & fit linear regression model on training set

In [33]:
linreg = LinearRegression()
linreg.fit(X_train_encoded,y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

calculate & output MAE, MSE, RMSE based on test set AND interpret them in the context of this project

In [34]:
mae = metrics.mean_absolute_error(y_test,linreg.predict(X_test_encoded))
mse = metrics.mean_squared_error(y_test,linreg.predict(X_test_encoded))
rmse = np.sqrt(metrics.mean_squared_error(y_test,linreg.predict(X_test_encoded)))
r2 = metrics.r2_score(y_test, linreg.predict(X_test_encoded))

In [35]:
print("mae: ", mae)
print("mse: ", mse)
print("rmse: ", rmse)
print("r2: ", r2)

mae:  1.5432393024437199
mse:  3.892822526894883
rmse:  1.9730237015542624
r2:  0.32942781866418036


In [43]:
list(zip(X_train_encoded.columns, linreg.coef_))

[('col_0', 0.04558383950316178),
 ('col_1', -0.272015088120959),
 ('col_2', -0.24427716623720866),
 ('col_3', 0.0684485708659164),
 ('col_4', 0.2859418654124542),
 ('col_5', 0.1864497153756942),
 ('col_6', -0.09470994122875936),
 ('col_7', 0.02457820442885715),
 ('currency_1', 0.7626665189567149),
 ('currency_2', 4.326245774899373e-05),
 ('currency_3', -1.0603899457945862),
 ('currency_4', 0.5423838903692689),
 ('currency_5', -0.6183348903086006),
 ('currency_6', 0.48007248157555393),
 ('currency_7', -0.12422721139216601),
 ('currency_8', -0.36364437430914864),
 ('currency_9', 0.5164070678890118),
 ('currency_10', 0.08125595692414966),
 ('currency_11', 0.48877468831143844),
 ('currency_12', -0.6664143089922602),
 ('currency_13', -0.03970782136788448),
 ('currency_14', 0.0011146856812512262),
 ('backers', 0.016954072548968094),
 ('country_1', -0.769631162896362),
 ('country_2', -0.39506341862481986),
 ('country_3', 0.29785834889189333),
 ('country_4', -0.706418437106037),
 ('country_5',

# DTR / RF
I'm assuming you get the hang of things by now -- just repeat the process

*don't worry about using GridSearch at the moment

In [36]:
dtr = DecisionTreeRegressor()
dtr.fit(X_train_encoded,y_train)

DecisionTreeRegressor(criterion='mse', max_depth=None, max_features=None,
                      max_leaf_nodes=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=2, min_weight_fraction_leaf=0.0,
                      presort=False, random_state=None, splitter='best')

In [37]:
mae = metrics.mean_absolute_error(y_test,dtr.predict(X_test_encoded))
mse = metrics.mean_squared_error(y_test,dtr.predict(X_test_encoded))
rmse = np.sqrt(metrics.mean_squared_error(y_test,dtr.predict(X_test_encoded)))
r2 = metrics.r2_score(y_test, dtr.predict(X_test_encoded))

In [38]:
print("mae: ", mae)
print("mse: ", mse)
print("rmse: ", rmse)
print("r2: ", r2)

mae:  0.9389532433958205
mse:  1.9707283450846926
rmse:  1.4038263229775585
r2:  0.6605250827507203


In [39]:
rfr = RandomForestRegressor()
rfr.fit(X_train_encoded,y_train)



RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
                      max_features='auto', max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=1, min_samples_split=2,
                      min_weight_fraction_leaf=0.0, n_estimators=10,
                      n_jobs=None, oob_score=False, random_state=None,
                      verbose=0, warm_start=False)

In [40]:
mae = metrics.mean_absolute_error(y_test,rfr.predict(X_test_encoded))
mse = metrics.mean_squared_error(y_test,rfr.predict(X_test_encoded))
rmse = np.sqrt(metrics.mean_squared_error(y_test,rfr.predict(X_test_encoded)))
r2 = metrics.r2_score(y_test, rfr.predict(X_test_encoded))

In [41]:
print("mae: ", mae)
print("mse: ", mse)
print("rmse: ", rmse)
print("r2: ", r2)

mae:  0.710467817728967
mse:  1.0928443407674713
rmse:  1.0453919555685662
r2:  0.8117481574395088


In [42]:
# compute feature importances
pd.DataFrame({'feature':X_test_encoded.columns, 
              'importance':rfr.feature_importances_}).sort_values(by='importance',ascending=False)

Unnamed: 0,feature,importance
22,backers,0.79021
47,logGoal,0.044444
91,duration,0.031752
28,country_6,0.023247
46,titleLength,0.021389
60,launchDay_1,0.002545
64,launchDay_5,0.002406
61,launchDay_2,0.002349
63,launchDay_4,0.002332
62,launchDay_3,0.002291


# using gridsearch with pipelines,  creating a custom scaler

In [45]:
#no need to TTS when using GridSearch
X = df.drop(columns=['logPledged'])
y = df['logPledged']

In [46]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_validate
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV

In [48]:
df.describe()

Unnamed: 0,backers,titleLength,logPledged,logGoal,launchMonth,launchDay,launchHour,duration
count,298987.0,298987.0,298987.0,298987.0,298987.0,298987.0,298987.0,298987.0
mean,42.005783,34.963704,6.26479,8.492713,6.437173,2.449829,13.029215,48232.831613
std,73.313672,15.900229,2.40832,1.626035,3.340454,1.771127,7.753882,18440.349669
min,0.0,1.0,-0.798508,-4.60517,1.0,0.0,0.0,7.283333
25%,3.0,22.0,4.70048,7.600902,4.0,1.0,5.0,41887.525
50%,15.0,34.0,6.729824,8.517193,6.0,2.0,16.0,42737.6
75%,50.0,49.0,8.168608,9.431401,9.0,4.0,20.0,52079.625
max,4015.0,96.0,9.903487,18.517533,12.0,6.0,23.0,132426.216667


## Standard Scaler
- standardize features by removing the mean and scaling to unit variance
- calculated as: z = (x - u) / s
- https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.StandardScaler.html

### Implementation

In [None]:
scaler = StandardScaler()
scaler.fit(df)

- as you can see, the basic implementation doesn't really work here since it can only apply the scaler to the entire dataset, which contains categorical columns
- thus, we are going to wrap it into a function that allows us to apply the scaler to specific functions

### ColumnTransformer
- applies transformers to columns of an array or dataframe
- allows different columns to be transformed separately, which is useful for heterogeneous data

In [None]:
#why we need to wrap this into a function
scaler = ColumnTransformer([
            ('standardize', StandardScaler(), ['duration','titleLength'])
        ])

scaler.fit_transform(X,y)

- you can think of 'standardize' as a step that applies StandardScaler() to the duration and titleLength columns
- unfortunately, this returns a single array corresponding to the newly-scaled columns, but we want it to result in an entire dataframe containing the newly-scaled columns
- why is this? check out the output of running each step in your original pipeline -- they're all dataframes, so we want to keep things consistent
- thus, we are going to wrap everything into yet another function

In [3]:
def scaling(X,y):
    scaler = ColumnTransformer([
            ('standardize', StandardScaler(), ['duration','titleLength'])
        ])
    X[['duration','titleLength']] = scaler.fit_transform(X,y)

in this function, we...
- pass in the feature and target dataframes
- create a "scaler" object as described earlier
- fit the scaler & transform the data to get the newly-scaled columns (in the form of an array) and then replace the original unscaled columns

## Pipeline
- the first two steps are the same as before
- we've added the additional step 'scale' that runs the scaling function we defined above
- in order to run gridsearch, we also need another step that refers to the model
- in this case, the step is called 'forest', and it runs the random forest regressor defined earlier

In [None]:
pipeline = Pipeline([
    ('hash', ce.HashingEncoder(cols=['category'])),
    ('onehot', ce.OneHotEncoder(cols=['currency','country','launchMonth','launchDay','launchHour'])),
    ('scale', scaling(X,y)),
    ('forest', rfr)
], memory=memory)

## GridSearch
- as you may know, gridsearch allows you to try different values and combinations of parameters
- thus, we need to create a grid of paramters to test (in the form of a dictionary)
- the syntax of each dictionary entry is 'pipelineModelStepName__parameterName' : [values to test]
- you can find the list of possible parameterNames in the documentation for DT/RF

In [None]:
parameters = {'forest__max_depth':[10,20],
              'forest__min_samples_leaf':[25,50]}

In [None]:
gs = GridSearchCV(pipeline,parameters)

gs.fit(X_train,y_train)

scores = pd.DataFrame(gs.cv_results_).filter(regex='param_+|mean_test_score'
                                            ).sort_values('mean_test_score',
                                                          ascending=False).reset_index().drop(['index'],axis=1)
scores.head(20)