# Crowdfunding Success Prediction

## Data Preprocessing

In [12]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [27]:
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, make_scorer
from sklearn.model_selection import cross_val_score, LeaveOneOut
from sklearn.preprocessing import StandardScaler

import utils

In [14]:
# load the excel file
df = pd.read_excel('../data/Crowdfunding campaign.xlsx', sheet_name='Sheet1')
df

Unnamed: 0,cid,pre_order_perk,amt_goal,frac_raised,tech_campaign,sbiz_campaign,fixed_fund,state,year_start,iot_plus,delivery
0,71762,0,2500,0.040000,0,1,0.0,Kentucky,2011,0.0,
1,72367,0,1500,1.733333,0,0,0.0,Indiana,2012,0.0,
2,73646,1,100000,1.007510,1,0,0.0,Louisiana,2012,0.0,
3,74867,1,7000,0.000000,0,1,0.0,North Carolina,2012,0.0,
4,82829,1,2000,1.000000,0,1,0.0,Pennsylvania,2011,0.0,
...,...,...,...,...,...,...,...,...,...,...,...
434,242139,1,100000,2.139170,1,0,1.0,California,2013,1.0,
435,242139,1,100000,2.139170,1,0,1.0,California,2013,1.0,
436,242139,1,100000,2.139170,1,0,1.0,California,2013,1.0,
437,250311,1,5000,1.007000,0,1,0.0,Kentucky,2012,0.0,


In [15]:
# Check for missing values
df.isnull().sum()

cid                 0
pre_order_perk      0
amt_goal            0
frac_raised         0
tech_campaign       0
sbiz_campaign       0
fixed_fund          6
state               0
year_start          0
iot_plus            4
delivery          289
dtype: int64

In [16]:
# find duplicates
df[df.duplicated()]

Unnamed: 0,cid,pre_order_perk,amt_goal,frac_raised,tech_campaign,sbiz_campaign,fixed_fund,state,year_start,iot_plus,delivery
224,71762,0,2500,0.040000,0,1,0.0,Kentucky,2011,0.0,
225,72367,0,1500,1.733333,0,0,0.0,Indiana,2012,0.0,
226,72367,0,1500,1.733333,0,0,0.0,Indiana,2012,0.0,
227,72367,0,1500,1.733333,0,0,0.0,Indiana,2012,0.0,
228,73646,1,100000,1.007510,1,0,0.0,Louisiana,2012,0.0,
...,...,...,...,...,...,...,...,...,...,...,...
434,242139,1,100000,2.139170,1,0,1.0,California,2013,1.0,
435,242139,1,100000,2.139170,1,0,1.0,California,2013,1.0,
436,242139,1,100000,2.139170,1,0,1.0,California,2013,1.0,
437,250311,1,5000,1.007000,0,1,0.0,Kentucky,2012,0.0,


In [17]:
# drop duplicates
df = df.drop_duplicates()
df

Unnamed: 0,cid,pre_order_perk,amt_goal,frac_raised,tech_campaign,sbiz_campaign,fixed_fund,state,year_start,iot_plus,delivery
0,71762,0,2500,0.040000,0,1,0.0,Kentucky,2011,0.0,
1,72367,0,1500,1.733333,0,0,0.0,Indiana,2012,0.0,
2,73646,1,100000,1.007510,1,0,0.0,Louisiana,2012,0.0,
3,74867,1,7000,0.000000,0,1,0.0,North Carolina,2012,0.0,
4,82829,1,2000,1.000000,0,1,0.0,Pennsylvania,2011,0.0,
...,...,...,...,...,...,...,...,...,...,...,...
219,248410,0,1500,1.088000,0,1,0.0,Illinois,2012,0.0,
220,248622,0,10000,1.005000,0,1,0.0,New York,2012,0.0,
221,250311,1,5000,1.007000,0,1,0.0,Kentucky,2012,0.0,
222,256062,1,2000,8.188500,1,0,0.0,Georgia,2013,0.0,1.0


In [18]:
# drop the "delivery" column
df = df.drop(columns=['delivery'])

In [19]:
# drop rows with missing values
df = df.dropna()

## Exploratory Data Analysis

In [20]:
# Describe the data
df.describe()

Unnamed: 0,cid,pre_order_perk,amt_goal,frac_raised,tech_campaign,sbiz_campaign,fixed_fund,year_start,iot_plus
count,216.0,216.0,216.0,216.0,216.0,216.0,216.0,216.0,216.0
mean,164216.592593,0.509259,17533.518519,1.801634,0.462963,0.412037,0.212963,2012.648148,0.12037
std,41130.626378,0.501076,31481.563736,3.699447,0.499785,0.493345,0.410353,0.56757,0.32615
min,71762.0,0.0,500.0,0.0,0.0,0.0,0.0,2011.0,0.0
25%,137707.75,0.0,1462.5,0.008788,0.0,0.0,0.0,2012.0,0.0
50%,158916.0,1.0,5000.0,1.019715,0.0,0.0,0.0,2013.0,0.0
75%,194429.5,1.0,15000.0,1.750967,1.0,1.0,0.0,2013.0,0.0
max,256198.0,1.0,200000.0,25.91808,1.0,1.0,1.0,2013.0,1.0


## Feature Engineering

In [21]:
# Split the data into features and target
X = df.drop(columns=['cid', 'state', 'frac_raised'])
y = df['frac_raised']

### Remove Highly Correlated Features

In [22]:
# remove columns with high multicollinearity
X_vif = utils.calculate_vif(X, threshold=5)
X_vif

Removed variables with high VIF:
year_start: 10.02


Unnamed: 0,pre_order_perk,amt_goal,tech_campaign,sbiz_campaign,fixed_fund,iot_plus
0,0,2500,0,1,0.0,0.0
1,0,1500,0,0,0.0,0.0
2,1,100000,1,0,0.0,0.0
3,1,7000,0,1,0.0,0.0
4,1,2000,0,1,0.0,0.0
...,...,...,...,...,...,...
219,0,1500,0,1,0.0,0.0
220,0,10000,0,1,0.0,0.0
221,1,5000,0,1,0.0,0.0
222,1,2000,1,0,0.0,0.0


### Feature Scaling (for linear models)

In [24]:
# scale the features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
X_scaled = pd.DataFrame(X_scaled, columns=X.columns)
X_scaled

Unnamed: 0,pre_order_perk,amt_goal,tech_campaign,sbiz_campaign,fixed_fund,year_start,iot_plus
0,-1.018693,-0.478643,-0.928477,1.194557,-0.520181,-2.910611,-0.369922
1,-1.018693,-0.510482,-0.928477,-0.837130,-0.520181,-1.144622,-0.369922
2,0.981650,2.625602,1.077033,-0.837130,-0.520181,-1.144622,-0.369922
3,0.981650,-0.335370,-0.928477,1.194557,-0.520181,-1.144622,-0.369922
4,0.981650,-0.494563,-0.928477,1.194557,-0.520181,-2.910611,-0.369922
...,...,...,...,...,...,...,...
211,-1.018693,-0.510482,-0.928477,1.194557,-0.520181,-1.144622,-0.369922
212,-1.018693,-0.239855,-0.928477,1.194557,-0.520181,-1.144622,-0.369922
213,0.981650,-0.399047,-0.928477,1.194557,-0.520181,-1.144622,-0.369922
214,0.981650,-0.494563,1.077033,-0.837130,-0.520181,0.621366,-0.369922


## Model Selection

### Scoring and Cross-Validation

In [28]:
# Define a custom scorer (mean absolute error)
mae_scorer = make_scorer(mean_absolute_error)

In [29]:
# Define the cross-validation strategy
cv = LeaveOneOut()

### Linear Regression

In [31]:
# create a linear regression model
lr_model = LinearRegression()

# evaluate the model
scores = cross_val_score(lr_model, X_scaled, y, scoring=mae_scorer, cv=cv)

# print the mean MAE
scores.mean()

1.8696389013657067

### Random Forest

In [32]:
from sklearn.ensemble import RandomForestRegressor

# create a random forest model
rf_model = RandomForestRegressor(random_state=42)

# evaluate the model
scores = cross_val_score(rf_model, X, y, scoring=mae_scorer, cv=cv)

# print the mean MAE
scores.mean()

2.0841105120144956

#### Hyperparameter Tuning

In [38]:
from sklearn.model_selection import RandomizedSearchCV

# define the hyperparameter grid
from scipy.stats import randint

param_distributions = {
    'n_estimators': randint(100, 1000),  # Number of trees in the forest
    'max_depth': randint(10, 100),  # Maximum depth of each tree
    'min_samples_split': randint(2, 20),  # Minimum number of samples to split a node
    'min_samples_leaf': randint(1, 10),  # Minimum number of samples required at a leaf node
    'max_features': ['sqrt', 'log2', None],  # Number of features to consider when looking for the best split
    'bootstrap': [True, False]  # Whether bootstrap samples are used when building trees
}

In [39]:
# create a random search object
rf_random_search = RandomizedSearchCV(
    estimator=rf_model,
    param_distributions=param_distributions,
    n_iter=50,
    scoring=mae_scorer,
    cv=cv,
    verbose=1,
    random_state=42,
    n_jobs=-1
)

rf_random_search.fit(X, y)

Fitting 216 folds for each of 50 candidates, totalling 10800 fits


In [40]:
# Print the best parameters and score
print("Best Parameters: ", rf_random_search.best_params_)
print("Best Score: ", rf_random_search.best_score_)

Best Parameters:  {'bootstrap': False, 'max_depth': 90, 'max_features': None, 'min_samples_leaf': 1, 'min_samples_split': 3, 'n_estimators': 741}
Best Score:  2.195203838096976
