In [1]:
import pandas as pd
import numpy as np

from pathlib import Path

pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

In [2]:
dataset_path = Path().absolute() / "Data"

In [3]:
# Reading in the dataset

def read_dataset(path: Path, filename: str) -> pd.DataFrame:
    
    """Reading in the provided dataset as a pandas dataframe."""
    
    dataframe = pd.read_excel(path / filename)
    
    dataframe.columns = dataframe.columns.str.lower().str.replace(' ', '_')
    
    return dataframe

In [4]:
kickstarter_df = read_dataset(dataset_path, "Kickstarter.xlsx")

In [5]:
# Casting types properly

def validate_dtypes(type_dict: dict, df: pd.DataFrame) -> pd.DataFrame:
    
    return df.astype(type_dict)

In [6]:
kickstarter_data_types_conversion: dict[str, str] = {
    "id": "int64",
    "name": "string",
    "goal": "float64",
    "pledged": "float64",
    "state": "category",
    "disable_communication": "bool",
    "country": "string",
    "currency": "string",
    "deadline": "datetime64[ns]",
    "state_changed_at": "datetime64[ns]",
    "created_at": "datetime64[ns]",
    "launched_at": "datetime64[ns]",
    "staff_pick": "bool",
    "backers_count": "int64",
    "static_usd_rate": "float64",
    "usd_pledged": "float64",
    "category": "category",
    "spotlight": "bool",
    "name_len": "float64",
    "name_len_clean": "float64",
    "blurb_len": "float64",
    "blurb_len_clean": "float64",
    "deadline_weekday": "string",
    "state_changed_at_weekday": "string",
    "created_at_weekday": "string",
    "launched_at_weekday": "string",
    "deadline_month": "int64",
    "deadline_day": "int64",
    "deadline_yr": "int64",
    "deadline_hr": "int64",
    "state_changed_at_month": "int64",
    "state_changed_at_day": "int64",
    "state_changed_at_yr": "int64",
    "state_changed_at_hr": "int64",
    "created_at_month": "int64",
    "created_at_day": "int64",
    "created_at_yr": "int64",
    "created_at_hr": "int64",
    "launched_at_month": "int64",
    "launched_at_day": "int64",
    "launched_at_yr": "int64",
    "launched_at_hr": "int64",
    "create_to_launch_days": "int64",
    "launch_to_deadline_days": "int64",
    "launch_to_state_change_days": "int64"
}

In [7]:
kickstarter_df = validate_dtypes(kickstarter_data_types_conversion, kickstarter_df)

In [8]:
kickstarter_df.shape

(15474, 45)

In [9]:
# REMOVING ANY ATTRIBUTES THAT WERE NOT AVAILABLE AT THE MOMENT THE PROJECT WAS LAUNCHED

kickstarter_df.drop(['id', 'pledged', 'disable_communication', 'state_changed_at', 'staff_pick', 'backers_count', 'static_usd_rate', 'usd_pledged', 'spotlight', 'state_changed_at_weekday', 'state_changed_at_month', 'state_changed_at_day', 'state_changed_at_yr', 'state_changed_at_hr', 'launch_to_state_change_days'], axis = 1, inplace = True)

In [10]:
# dropping this since the last columns capture the time relationship better. not productive to have time-series data 
kickstarter_df.drop(['deadline','created_at', 'launched_at'], axis = 1, inplace = True)

In [11]:
# Only Including Observations where the variable 'state' takes the value 'successful' or 'failure'
kickstarter_df = kickstarter_df[(kickstarter_df['state'] == 'successful') | (kickstarter_df['state'] == 'failed')]

In [12]:
kickstarter_df.reset_index(drop = True, inplace = True)

In [14]:
kickstarter_df.rename(columns = {"launch_to_deadline_days": "campaign_length_days"}, inplace = True)

In [15]:
kickstarter_df.shape

(13435, 27)

## Pre-Processing

In [16]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import classification_report

In [17]:
label_encoders = {}

In [18]:
for col in kickstarter_df.select_dtypes(include = ['string', 'category']).columns:
    
    if col != 'state':
        le = LabelEncoder()
        kickstarter_df[col] = le.fit_transform(kickstarter_df[col].astype(str))
        label_encoders[col] = le

## Separating Predictors and Target

In [19]:
X = kickstarter_df.drop('state', axis = 1)

In [20]:
Y = kickstarter_df['state'].apply(lambda x: 0 if x == 'failed' else 1)

## Train-Test Split

In [21]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.3, random_state = 42)

## Random-Forest Classifier Setup

In [22]:
randomForestClassifier = RandomForestClassifier(random_state = 42)

In [23]:
randomForestClassifier.fit(X_train, Y_train)

In [24]:
randomForestPredictions = randomForestClassifier.predict(X_test)

In [25]:
rf_report = classification_report(Y_test, randomForestPredictions)

In [26]:
print(rf_report)

              precision    recall  f1-score   support

           0       0.75      0.89      0.82      2652
           1       0.68      0.44      0.53      1379

    accuracy                           0.74      4031
   macro avg       0.72      0.66      0.67      4031
weighted avg       0.73      0.74      0.72      4031



In [27]:
from sklearn.metrics import accuracy_score

In [28]:
print(accuracy_score(Y_test, randomForestPredictions) * 100)

73.70379558422228


## Gradient Boosting Classifier Setup

In [29]:
gb_classifier = GradientBoostingClassifier(random_state = 42)

In [30]:
gb_classifier.fit(X_train, Y_train)

In [31]:
Y_pred_gb = gb_classifier.predict(X_test)

In [32]:
gb_report = classification_report(Y_test, Y_pred_gb)

In [33]:
print(gb_report)

              precision    recall  f1-score   support

           0       0.77      0.87      0.82      2652
           1       0.67      0.50      0.57      1379

    accuracy                           0.74      4031
   macro avg       0.72      0.69      0.69      4031
weighted avg       0.73      0.74      0.73      4031



In [34]:
accuracy_score(Y_test, Y_pred_gb) * 100

74.32398908459439

## Hyperparameter Tuning

In [35]:
from sklearn.model_selection import GridSearchCV

In [38]:
# Define the parameter grid for Random Forest
param_grid_rf = {
    'n_estimators': [100, 200],
    'max_depth': [10, 20, None],
    'min_samples_split': [2, 5],
    'min_samples_leaf': [1, 2],
    'bootstrap': [True, False]
}

# Initialize the GridSearchCV object for Random Forest
grid_search_rf = GridSearchCV(estimator=RandomForestClassifier(random_state=42),
                              param_grid=param_grid_rf,
                              cv=5, # 3-fold cross-validation
                              n_jobs=-1, # Use all available cores
                              verbose=2)

# Perform grid search
grid_search_rf.fit(X_train, Y_train)

# Best parameters and best score
best_params_rf = grid_search_rf.best_params_
best_score_rf = grid_search_rf.best_score_
(best_params_rf, best_score_rf)

Fitting 5 folds for each of 48 candidates, totalling 240 fits


({'bootstrap': False,
  'max_depth': 20,
  'min_samples_leaf': 1,
  'min_samples_split': 2,
  'n_estimators': 100},
 0.7320289682943658)

In [39]:
param_grid_gb = {
    'n_estimators': [100, 200],
    'learning_rate': [0.01, 0.1],
    'max_depth': [3, 5, 7],
    'min_samples_split': [2, 5],
    'min_samples_leaf': [1, 2]
}

# Initialize the GridSearchCV object for Gradient Boosting
grid_search_gb = GridSearchCV(estimator=GradientBoostingClassifier(random_state=42),
                              param_grid=param_grid_gb,
                              cv=5, # 3-fold cross-validation
                              n_jobs=-1, # Use all available cores
                              verbose=2)

# Perform grid search
grid_search_gb.fit(X_train, Y_train)

# Best parameters and best score
best_params_gb = grid_search_gb.best_params_
best_score_gb = grid_search_gb.best_score_
(best_params_gb, best_score_gb)

Fitting 5 folds for each of 48 candidates, totalling 240 fits


({'learning_rate': 0.1,
  'max_depth': 5,
  'min_samples_leaf': 1,
  'min_samples_split': 2,
  'n_estimators': 100},
 0.7421306005180586)

In [40]:
# Retrain the Random Forest classifier with the best parameters
rf_classifier_optimized = RandomForestClassifier(**best_params_rf, random_state=42)
rf_classifier_optimized.fit(X_train, Y_train)

# Predict on the testing set with the optimized model
rf_predictions_optimized = rf_classifier_optimized.predict(X_test)

# Evaluate the optimized Random Forest classifier
rf_report_optimized = classification_report(Y_test, rf_predictions_optimized)


'              precision    recall  f1-score   support\n\n           0       0.76      0.87      0.81      2652\n           1       0.66      0.46      0.54      1379\n\n    accuracy                           0.73      4031\n   macro avg       0.71      0.67      0.68      4031\nweighted avg       0.72      0.73      0.72      4031\n'

In [41]:
print(rf_report_optimized)

              precision    recall  f1-score   support

           0       0.76      0.87      0.81      2652
           1       0.66      0.46      0.54      1379

    accuracy                           0.73      4031
   macro avg       0.71      0.67      0.68      4031
weighted avg       0.72      0.73      0.72      4031



In [43]:
print(accuracy_score(Y_test, rf_predictions_optimized))

0.7333167948399901


In [44]:
# Retrain the Gradient Boosting classifier with the best parameters
gb_classifier_optimized = GradientBoostingClassifier(**best_params_gb, random_state=42)
gb_classifier_optimized.fit(X_train, Y_train)

# Predict on the testing set with the optimized model
gb_predictions_optimized = gb_classifier_optimized.predict(X_test)

# Evaluate the optimized Gradient Boosting classifier
gb_report_optimized = classification_report(Y_test, gb_predictions_optimized)
gb_report_optimized

'              precision    recall  f1-score   support\n\n           0       0.78      0.86      0.82      2652\n           1       0.67      0.54      0.60      1379\n\n    accuracy                           0.75      4031\n   macro avg       0.73      0.70      0.71      4031\nweighted avg       0.74      0.75      0.74      4031\n'