import stuff (a lot of stuff) & read in data

In [None]:
import pandas as pd
import numpy as np
import category_encoders as ce
import datetime as dt
pd.options.mode.chained_assignment = None
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
pd.options.display.max_rows = 1000
pd.options.display.max_columns= 1000
from sklearn.linear_model import LinearRegression,LogisticRegression
from sklearn import metrics
import scipy.stats as stats
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor

In [None]:
# Read the ks-projects CSV export into a DataFrame.
csv_path = '../S20_insight_team/datasets/ks-projects-201801.csv'
data = pd.read_csv(csv_path)

# data cleaning / transforming

create a new column denoting length of project name (number of characters)

In [None]:
# Number of characters in each project name. Every value is passed through
# str() first, so non-string entries still yield a length.
data['titleLength'] = data['name'].map(str).map(len)

drop columns that won't be used at the moment (main_category, ID, name, state, pledged, usd pledged, goal)

In [None]:
# Columns that are not used in the rest of this notebook.
unused_columns = [
    'main_category',
    'ID',
    'name',
    'state',
    'pledged',
    'usd pledged',
    'goal',
]
data.drop(columns=unused_columns, inplace=True)

format the date columns properly and get rid of records with invalid dates (those launched before the year 2000)

In [None]:
# Parse both timestamp columns into pandas datetimes.
for date_column in ('launched', 'deadline'):
    data[date_column] = pd.to_datetime(data[date_column])

# Keep only the rows whose launch timestamp is after 2000-01-01.
launched_after_2000 = data['launched'] > '2000-01-01'
df = data[launched_after_2000]

eliminate projects with usd pledged real >= 20000 or usd pledged real = 0 (i.e. keep only 0 < usd pledged real < 20000)

TODO: write a brief explanation as to why you are doing this (double-click on this cell to edit the text)

In [None]:
# Keep projects that raised a positive amount below 20,000 (usd_pledged_real).
# Filter `df`, not `data`: filtering `data` again would throw away the
# launch-date filter that produced `df`.
pledged = df['usd_pledged_real']
# .copy() gives an independent frame so the column assignments that follow
# do not write into a slice of another DataFrame.
df = df[(pledged > 0) & (pledged < 20000)].copy()

create logged versions of usd pledged real and usd goal real + drop the original nonlogged versions

TODO: write a brief explanation as to why you are doing this



In [None]:
# Add natural-log versions of the pledged and goal amounts...
for log_column, raw_column in (('logPledged', 'usd_pledged_real'),
                               ('logGoal', 'usd_goal_real')):
    df[log_column] = np.log(df[raw_column])

# ...and remove the raw (non-logged) columns.
df.drop(columns=['usd_goal_real', 'usd_pledged_real'], inplace=True)

create columns to encode the 'launched' column -- I chose to break it down into month, day of week, and hour

In [None]:
# Break the launch timestamp into calendar/time-of-day components.
launch_time = df['launched'].dt
df['launchMonth'] = launch_time.month
df['launchDay'] = launch_time.dayofweek  # day of week as an integer
df['launchHour'] = launch_time.hour

calculate the duration of each project campaign + drop the launched & deadline columns

In [None]:
# Campaign length expressed in minutes (deadline minus launch).
campaign_length = df['deadline'] - df['launched']
df['duration'] = campaign_length / dt.timedelta(minutes=1)

# The raw timestamps are no longer needed once duration is derived.
df.drop(columns=['deadline', 'launched'], inplace=True)

# model preparation

divide dataset into features (X) and target (y)

In [None]:
# Target is the logged pledged amount; every other column is a feature.
target_column = 'logPledged'
y = df[target_column]
X = df.drop(columns=[target_column])

prepare the pipeline (for help, see bottom of this article https://kiwidamien.github.io/encoding-categorical-variables.html)


In [None]:
# Helper setup: a joblib.Memory object (backed by the 'cache' directory) is
# handed to the pipeline through its `memory` argument.
import joblib

location = 'cache'
memory = joblib.Memory(location=location, verbose=10)

# 'category' is hash-encoded; the remaining categorical columns are one-hot encoded.
one_hot_columns = ['currency', 'country', 'launchMonth', 'launchDay', 'launchHour']
category_hasher = ce.HashingEncoder(cols=['category'], return_df=True)
one_hot_encoder = ce.OneHotEncoder(cols=one_hot_columns, return_df=True)

encoding_pipeline = Pipeline(
    steps=[
        ('encode_category', category_hasher),
        ('encode_other', one_hot_encoder),
    ],
    memory=memory,
)

train-test-split your data

In [None]:
# Hold out 20% of the rows for testing. A fixed random_state makes the split,
# and therefore every metric computed from it, reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

encode your training set and test set (for reference, see documentation here https://scikit-learn.org/stable/modules/generated/sklearn.pipeline.Pipeline.html)

TODO: explain when to use fit_transform( ) vs transform( ) and why

In [None]:
# Fit the encoders on the training split and encode it in the same call.
X_train_encoded = encoding_pipeline.fit_transform(X_train, y_train)
# Encode the test split with the already-fitted encoders (transform only, no
# refit), so nothing is learned from the held-out rows.
X_test_encoded = encoding_pipeline.transform(X_test)

check to make sure that the dimensions align between test & training

In [None]:
# Shapes of the encoded training and test sets, in that order.
for encoded_split in (X_train_encoded, X_test_encoded):
    print(encoded_split.shape)

In [None]:
# Preview the first five rows of the encoded test set.
X_test_encoded.head(n=5)

# linear regression

create & fit linear regression model on training set

In [None]:
# Linear regression model fitted on the encoded training features against
# the logged pledged amount (y_train).
linreg = LinearRegression()
linreg.fit(X_train_encoded,y_train)

calculate & output MAE, MSE, RMSE based on test set AND interpret them in the context of this project

In [None]:
# Predict once and reuse the result for every metric instead of re-running
# the model on the test set four times.
linreg_pred = linreg.predict(X_test_encoded)

# The target is logPledged, so these errors are in log-dollar units.
mae = metrics.mean_absolute_error(y_test, linreg_pred)
mse = metrics.mean_squared_error(y_test, linreg_pred)
rmse = np.sqrt(mse)  # root of the MSE computed on the line above
r2 = metrics.r2_score(y_test, linreg_pred)

In [None]:
# Report the linear regression test-set metrics.
for label, value in (("mae: ", mae), ("mse: ", mse), ("rmse: ", rmse), ("r2: ", r2)):
    print(label, value)

# DTR / RF
I'm assuming you've gotten the hang of things by now -- just repeat the process

*don't worry about using GridSearch at the moment

In [None]:
# Decision tree regressor fitted on the encoded training set.
# random_state is fixed so the fitted tree (and its metrics) can be
# reproduced between runs.
dtr = DecisionTreeRegressor(random_state=42)
dtr.fit(X_train_encoded, y_train)

In [None]:
# Predict once and reuse the result for every metric instead of re-running
# the tree on the test set four times.
dtr_pred = dtr.predict(X_test_encoded)

# The target is logPledged, so these errors are in log-dollar units.
mae = metrics.mean_absolute_error(y_test, dtr_pred)
mse = metrics.mean_squared_error(y_test, dtr_pred)
rmse = np.sqrt(mse)  # root of the MSE computed on the line above
r2 = metrics.r2_score(y_test, dtr_pred)

In [None]:
# Report the decision tree test-set metrics.
dtr_report = {"mae: ": mae, "mse: ": mse, "rmse: ": rmse, "r2: ": r2}
for label, value in dtr_report.items():
    print(label, value)

In [None]:
# Random forest regressor fitted on the encoded training set.
# random_state is fixed so the fitted forest (metrics and feature
# importances) can be reproduced between runs.
rfr = RandomForestRegressor(random_state=42)
rfr.fit(X_train_encoded, y_train)

In [None]:
# Predict once and reuse the result for every metric instead of re-running
# the whole forest on the test set four times.
rfr_pred = rfr.predict(X_test_encoded)

# The target is logPledged, so these errors are in log-dollar units.
mae = metrics.mean_absolute_error(y_test, rfr_pred)
mse = metrics.mean_squared_error(y_test, rfr_pred)
rmse = np.sqrt(mse)  # root of the MSE computed on the line above
r2 = metrics.r2_score(y_test, rfr_pred)

In [None]:
# Report the random forest test-set metrics.
rfr_labels = ("mae: ", "mse: ", "rmse: ", "r2: ")
rfr_values = (mae, mse, rmse, r2)
for label, value in zip(rfr_labels, rfr_values):
    print(label, value)

In [None]:
# Pair each encoded column with the fitted forest's importance score and
# list the features from most to least important.
importance_table = pd.DataFrame({
    'feature': X_test_encoded.columns,
    'importance': rfr.feature_importances_,
})
importance_table.sort_values(by='importance', ascending=False)

# using gridsearch with pipelines, creating a custom scaler

In [None]:
# No separate train/test split is made here: GridSearchCV cross-validates
# internally.
# Build X/y from `df`, the cleaned and feature-engineered frame. `logPledged`
# was added to `df` only, so dropping it from `data` raises a KeyError.
X = df.drop(columns=['logPledged'])
y = df['logPledged']

In [None]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_validate
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV

In [None]:
# Summary statistics of the prepared modeling frame. `df` (not the raw
# `data`) is the frame that holds the engineered columns such as `duration`,
# which the scaling discussion below refers to.
df.describe()

## Standard Scaler
- standardize features by removing the mean and scaling to unit variance
- calculated as: z = (x - u) / s
- https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.StandardScaler.html

### Implementation

In [None]:
# Naive attempt: fit a StandardScaler on the whole modeling frame.
# This is expected to fail, because `df` still contains the string-valued
# categorical columns (category, currency, country). `df` is used rather than
# the raw `data` so the demonstration runs on the frame that is actually modeled.
scaler = StandardScaler()
scaler.fit(df)

- as you can see, the basic implementation doesn't really work here since it can only apply the scaler to the entire dataset, which contains categorical columns
- thus, we are going to wrap it into a function that allows us to apply the scaler to specific columns

### ColumnTransformer
- applies transformers to columns of an array or dataframe
- allows different columns to be transformed separately, which is useful for heterogeneous data

In [None]:
# Demonstrates why this needs to be wrapped into a function: the transformer
# below standardizes only the listed columns, and the call at the bottom
# shows what it returns.
numeric_columns = ['duration', 'titleLength']
scaler = ColumnTransformer(
    transformers=[('standardize', StandardScaler(), numeric_columns)]
)

scaler.fit_transform(X, y)

- you can think of 'standardize' as a step that applies StandardScaler() to the duration and titleLength columns
- unfortunately, this returns a single array corresponding to the newly-scaled columns, but we want it to result in an entire dataframe containing the newly-scaled columns
- why is this? check out the output of running each step in your original pipeline -- they're all dataframes, so we want to keep things consistent
- thus, we are going to wrap everything into yet another function

In [3]:
def scaling(X, y, columns=('duration', 'titleLength')):
    """Standardize selected columns of ``X`` in place.

    A ``ColumnTransformer`` wrapping ``StandardScaler`` is fitted on ``X``
    and the resulting scaled values overwrite the original columns.

    Parameters
    ----------
    X : DataFrame of features; must contain every name in ``columns``.
        It is modified in place.
    y : target values, forwarded unchanged to ``fit_transform``.
    columns : names of the columns to standardize. Defaults to
        ``('duration', 'titleLength')``, the columns this function
        originally hard-coded.

    Returns
    -------
    None. The result is only visible through the mutation of ``X``.
    """
    # Use a list so the names work both as a column selector for the
    # transformer and as a multi-column indexer on the DataFrame.
    columns = list(columns)
    scaler = ColumnTransformer([
        ('standardize', StandardScaler(), columns)
    ])
    # The scaler is fitted on whatever rows X holds at call time; the scaled
    # output replaces the original columns.
    X[columns] = scaler.fit_transform(X, y)

in this function, we...
- pass in the feature and target dataframes
- create a "scaler" object as described earlier
- fit the scaler & transform the data to get the newly-scaled columns (in the form of an array) and then replace the original unscaled columns

## Pipeline
- the first two steps are the same as before
- we've added the additional step 'scale' that runs the scaling function we defined above
- in order to run gridsearch, we also need another step that refers to the model
- in this case, the step is called 'forest', and it runs the random forest regressor defined earlier

In [None]:
# The 'scale' step must be a transformer object. Calling `scaling(X, y)` here
# would run once while the list is being built, rescale the global X in place,
# and hand its return value (None) to the pipeline, so nothing would be scaled
# inside the pipeline itself. A ColumnTransformer is used instead so the scaler
# is fitted as part of the pipeline, i.e. on the training portion of each
# cross-validation fold during grid search.
# remainder='passthrough' keeps the columns that are not standardized.
pipeline = Pipeline([
    ('hash', ce.HashingEncoder(cols=['category'])),
    ('onehot', ce.OneHotEncoder(cols=['currency','country','launchMonth','launchDay','launchHour'])),
    ('scale', ColumnTransformer(
        [('standardize', StandardScaler(), ['duration','titleLength'])],
        remainder='passthrough',
    )),
    ('forest', rfr)
], memory=memory)

## GridSearch
- as you may know, gridsearch allows you to try different values and combinations of parameters
- thus, we need to create a grid of parameters to test (in the form of a dictionary)
- the syntax of each dictionary entry is 'pipelineModelStepName__parameterName' : [values to test]
- you can find the list of possible parameterNames in the documentation for DT/RF

In [None]:
# Grid of random-forest settings to try; the 'forest__' prefix routes each
# entry to the pipeline step named 'forest'.
parameters = dict(
    forest__max_depth=[10, 20],
    forest__min_samples_leaf=[25, 50],
)

In [None]:
# Run the grid search over the pipeline on the training split.
gs = GridSearchCV(pipeline, parameters)

gs.fit(X_train, y_train)

# Keep only the parameter columns and the mean test score, best score first.
cv_results = pd.DataFrame(gs.cv_results_)
score_columns = cv_results.filter(regex='param_+|mean_test_score')
ranked = score_columns.sort_values('mean_test_score', ascending=False)
scores = ranked.reset_index(drop=True)
scores.head(20)