In [1]:
import pandas as pd
import numpy as np

from pathlib import Path

# Lift pandas' display truncation so every column and every row is rendered.
for display_option in ('display.max_columns', 'display.max_rows'):
    pd.set_option(display_option, None)

In [2]:
# Absolute path of the "Data" folder inside the current working directory.
dataset_path = Path.cwd() / "Data"

In [3]:
# Reading in the dataset

def read_dataset(path: Path, filename: str) -> pd.DataFrame:
    """Load an Excel workbook into a DataFrame with normalised column names.

    Args:
        path: Directory that contains the workbook.
        filename: Name of the workbook file inside ``path``.

    Returns:
        The frame returned by ``pd.read_excel``, with every column name
        lower-cased and each space replaced by an underscore.
    """
    workbook_location = path / filename
    frame = pd.read_excel(workbook_location)

    # Normalise the headers in two steps: lower-case first, then swap spaces.
    lowered_names = frame.columns.str.lower()
    frame.columns = lowered_names.str.replace(' ', '_')

    return frame

In [4]:
# Load the raw Kickstarter workbook from the data folder.
kickstarter_df = read_dataset(path=dataset_path, filename="Kickstarter.xlsx")

In [5]:
# Casting types properly

def validate_dtypes(type_dict: dict, df: pd.DataFrame) -> pd.DataFrame:
    """Cast the columns of ``df`` to the dtypes listed in ``type_dict``.

    Args:
        type_dict: Mapping of column name to target dtype, handed to
            ``DataFrame.astype`` unchanged.
        df: Frame whose columns should be converted.

    Returns:
        The new frame produced by ``DataFrame.astype``.
    """
    converted = df.astype(dtype=type_dict)
    return converted

In [6]:
# Timestamp columns; each one also has a "<timestamp>_weekday" column and
# "<timestamp>_<part>" columns for the calendar parts listed below.
_timestamp_columns = ("deadline", "state_changed_at", "created_at", "launched_at")
_calendar_parts = ("month", "day", "yr", "hr")
_length_columns = ("name_len", "name_len_clean", "blurb_len", "blurb_len_clean")

# Target dtype per column, keyed by the normalised (lower-case, underscored)
# column name. Repeated families of columns are generated rather than listed.
kickstarter_data_types_conversion: dict[str, str] = {
    "id": "int64",
    "name": "string",
    "goal": "float64",
    "pledged": "float64",
    "state": "category",
    "disable_communication": "bool",
    "country": "string",
    "currency": "string",
    **{stamp: "datetime64[ns]" for stamp in _timestamp_columns},
    "staff_pick": "bool",
    "backers_count": "int64",
    "static_usd_rate": "float64",
    "usd_pledged": "float64",
    "category": "category",
    "spotlight": "bool",
    **{length: "float64" for length in _length_columns},
    **{f"{stamp}_weekday": "string" for stamp in _timestamp_columns},
    **{
        f"{stamp}_{part}": "int64"
        for stamp in _timestamp_columns
        for part in _calendar_parts
    },
    "create_to_launch_days": "int64",
    "launch_to_deadline_days": "int64",
    "launch_to_state_change_days": "int64",
}

In [7]:
# Apply the dtype mapping to the freshly loaded frame.
kickstarter_df = validate_dtypes(
    type_dict=kickstarter_data_types_conversion,
    df=kickstarter_df,
)

In [8]:
# (rows, columns) of the typed frame, before any column or row is removed.
kickstarter_df.shape

(15474, 45)

In [9]:
# Remove every attribute that was not available at the moment the project was launched.
post_launch_columns = [
    'id',
    'pledged',
    'disable_communication',
    'state_changed_at',
    'staff_pick',
    'backers_count',
    'static_usd_rate',
    'usd_pledged',
    'spotlight',
    'state_changed_at_weekday',
    'state_changed_at_month',
    'state_changed_at_day',
    'state_changed_at_yr',
    'state_changed_at_hr',
    'launch_to_state_change_days',
]
kickstarter_df = kickstarter_df.drop(columns=post_launch_columns)

In [10]:
# Drop the raw timestamps: the remaining derived columns capture the time
# relationship better, and keeping time-series data is not productive here.
raw_timestamp_columns = ['deadline', 'created_at', 'launched_at']
kickstarter_df = kickstarter_df.drop(columns=raw_timestamp_columns)

In [11]:
# Keep only the observations whose 'state' is 'successful' or 'failed'.
finished_states = ['successful', 'failed']
kickstarter_df = kickstarter_df[kickstarter_df['state'].isin(finished_states)]

In [12]:
# Replace the index with a fresh 0..n-1 range, discarding the old labels.
kickstarter_df = kickstarter_df.reset_index(drop=True)

In [14]:
# Give the launch-to-deadline duration a more descriptive name.
kickstarter_df = kickstarter_df.rename(
    columns={"launch_to_deadline_days": "campaign_length_days"}
)

In [15]:
# (rows, columns) left after the column drops and the state filter.
kickstarter_df.shape

(13435, 27)

## Pre-Processing

In [16]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import classification_report

In [17]:
# Fitted LabelEncoder per encoded column, keyed by column name; filled in by the encoding loop.
label_encoders = {}

In [18]:
# Integer-encode every string/category predictor; the target column 'state' is
# left untouched. Each fitted encoder is stored under its column name.
# NOTE(review): the encoders are fitted on the whole frame, i.e. before the
# train/test split further down.
text_like_columns = kickstarter_df.select_dtypes(include=['string', 'category']).columns
feature_columns_to_encode = [name for name in text_like_columns if name != 'state']

for feature in feature_columns_to_encode:
    encoder = LabelEncoder()
    kickstarter_df[feature] = encoder.fit_transform(kickstarter_df[feature].astype(str))
    label_encoders[feature] = encoder

## Separating Predictors and Target

In [19]:
# Predictors: every column except the target.
X = kickstarter_df.drop(columns='state')

In [20]:
# Binary target: 0 where the state is 'failed', 1 for any other state.
Y = kickstarter_df['state'].ne('failed').astype('int64')

## Train-Test Split

In [21]:
# Hold out 30% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.3,
    random_state=42,
)

## Random-Forest Classifier Setup

In [22]:
# Random forest with no hyperparameters overridden; only the seed is pinned.
randomForestClassifier = RandomForestClassifier(random_state = 42)

In [23]:
# Train the random forest on the training partition.
randomForestClassifier.fit(X=X_train, y=Y_train)

In [24]:
# Predicted labels for the held-out test rows.
randomForestPredictions = randomForestClassifier.predict(X=X_test)

In [25]:
# Per-class precision, recall and F1 for the random forest on the test set.
rf_report = classification_report(
    y_true=Y_test,
    y_pred=randomForestPredictions,
)

In [26]:
# Printed (rather than echoed) so the report's line breaks render as a table.
print(rf_report)

              precision    recall  f1-score   support

           0       0.75      0.89      0.82      2652
           1       0.68      0.44      0.53      1379

    accuracy                           0.74      4031
   macro avg       0.72      0.66      0.67      4031
weighted avg       0.73      0.74      0.72      4031



In [27]:
from sklearn.metrics import accuracy_score

In [28]:
# Overall test accuracy of the random forest, expressed as a percentage.
rf_accuracy_pct = accuracy_score(y_true=Y_test, y_pred=randomForestPredictions) * 100
print(rf_accuracy_pct)

73.70379558422228


## Gradient Boosting Classifier Setup

In [29]:
# Gradient boosting with no hyperparameters overridden; only the seed is pinned.
gb_classifier = GradientBoostingClassifier(random_state = 42)

In [30]:
# Train the gradient boosting model on the training partition.
gb_classifier.fit(X=X_train, y=Y_train)

In [31]:
# Predicted labels for the held-out test rows.
Y_pred_gb = gb_classifier.predict(X=X_test)

In [32]:
# Per-class precision, recall and F1 for gradient boosting on the test set.
gb_report = classification_report(
    y_true=Y_test,
    y_pred=Y_pred_gb,
)

In [33]:
# Printed (rather than echoed) so the report's line breaks render as a table.
print(gb_report)

              precision    recall  f1-score   support

           0       0.77      0.87      0.82      2652
           1       0.67      0.50      0.57      1379

    accuracy                           0.74      4031
   macro avg       0.72      0.69      0.69      4031
weighted avg       0.73      0.74      0.73      4031



In [34]:
# Overall test accuracy of the gradient boosting model, expressed as a percentage.
accuracy_score(y_true=Y_test, y_pred=Y_pred_gb) * 100

74.32398908459439