In [1]:
import pandas as pd
import numpy as np

from pathlib import Path

# Lift pandas' column and row display limits (None = no limit).
for display_option in ("display.max_columns", "display.max_rows"):
    pd.set_option(display_option, None)

In [4]:
# The "Data" folder is resolved against the current working directory.
dataset_path = Path.cwd() / "Data"

In [5]:
# Reading in the dataset

def read_dataset(path: Path, filename: str) -> pd.DataFrame:
    """Load an Excel file into a DataFrame and normalise its column labels.

    Args:
        path: Directory that contains the file.
        filename: Name of the Excel file inside ``path``.

    Returns:
        The frame produced by ``pd.read_excel(path / filename)``, with every
        column label lower-cased and its spaces replaced by underscores.
    """
    workbook_file = path / filename
    frame = pd.read_excel(workbook_file)

    # Lower-case first, then swap each space for an underscore.
    normalised_labels = frame.columns.str.lower().str.replace(' ', '_')
    frame.columns = normalised_labels

    return frame

In [6]:
# Load the Kickstarter workbook from the data directory.
kickstarter_df = read_dataset(path=dataset_path, filename="Kickstarter.xlsx")

In [7]:
# Casting types properly

def validate_dtypes(type_dict: dict, df: pd.DataFrame) -> pd.DataFrame:
    """Cast columns of ``df`` to the dtypes listed in ``type_dict``.

    Args:
        type_dict: Mapping of column name to the dtype it is cast to.
        df: Frame whose columns are converted.

    Returns:
        The frame returned by ``df.astype(type_dict)``.
    """
    converted = df.astype(dtype=type_dict)
    return converted

In [8]:
# The four timestamp columns, and the calendar parts derived from each of them.
_timestamp_fields = ("deadline", "state_changed_at", "created_at", "launched_at")
_timestamp_parts = ("month", "day", "yr", "hr")

# Target dtype for every column, passed to `DataFrame.astype`.
# Repeated timestamp-derived entries are generated; key order matches the
# hand-written listing this replaces.
kickstarter_data_types_conversion: dict[str, str] = {
    "id": "int64",
    "name": "string",
    "goal": "float64",
    "pledged": "float64",
    "state": "category",
    "disable_communication": "bool",
    "country": "string",
    "currency": "string",
    # Raw timestamps
    **{field: "datetime64[ns]" for field in _timestamp_fields},
    "staff_pick": "bool",
    "backers_count": "int64",
    "static_usd_rate": "float64",
    "usd_pledged": "float64",
    "category": "category",
    "spotlight": "bool",
    "name_len": "float64",
    "name_len_clean": "float64",
    "blurb_len": "float64",
    "blurb_len_clean": "float64",
    # <timestamp>_weekday columns
    **{f"{field}_weekday": "string" for field in _timestamp_fields},
    # <timestamp>_month / _day / _yr / _hr columns
    **{
        f"{field}_{part}": "int64"
        for field in _timestamp_fields
        for part in _timestamp_parts
    },
    "create_to_launch_days": "int64",
    "launch_to_deadline_days": "int64",
    "launch_to_state_change_days": "int64",
}

In [9]:
# Apply the dtype mapping to the loaded frame.
kickstarter_df = validate_dtypes(
    type_dict=kickstarter_data_types_conversion,
    df=kickstarter_df,
)

In [12]:
# Removing any attributes that were not available at the moment the project was launched.
# One name per line so additions/removals are easy to review.
post_launch_columns = [
    'id',
    'pledged',
    'disable_communication',
    'state_changed_at',
    'staff_pick',
    'backers_count',
    'static_usd_rate',
    'usd_pledged',
    'spotlight',
    'state_changed_at_weekday',
    'state_changed_at_month',
    'state_changed_at_day',
    'state_changed_at_yr',
    'state_changed_at_hr',
    'launch_to_state_change_days',
]

# Reassign instead of `inplace=True`. The drop still raises KeyError on an
# unknown name, so a typo here cannot silently leave a leaky column behind.
kickstarter_df = kickstarter_df.drop(columns=post_launch_columns)

In [16]:
# Only include observations where 'state' is 'successful' or 'failed'.
outcome_states = ['successful', 'failed']

# .copy() makes the filtered frame independent of the one it was sliced from.
kickstarter_df = kickstarter_df[kickstarter_df['state'].isin(outcome_states)].copy()

# 'state' is categorical: filtering rows leaves the excluded states behind as
# empty categories, which would still appear in value_counts / groupby /
# get_dummies. Trim the category list to the states that remain.
kickstarter_df['state'] = kickstarter_df['state'].cat.remove_unused_categories()

In [22]:
# Renumber rows 0..n-1 and discard the old index (drop=True).
# Reassigned rather than mutated with `inplace=True`.
kickstarter_df = kickstarter_df.reset_index(drop=True)

In [24]:
# Per-column count of null entries

missing_values = kickstarter_df.isna().sum()

print(missing_values)


name                          1
goal                          0
state                         0
country                       0
currency                      0
deadline                      0
created_at                    0
launched_at                   0
category                   1254
name_len                      0
name_len_clean                0
blurb_len                     0
blurb_len_clean               0
deadline_weekday              0
created_at_weekday            0
launched_at_weekday           0
deadline_month                0
deadline_day                  0
deadline_yr                   0
deadline_hr                   0
created_at_month              0
created_at_day                0
created_at_yr                 0
created_at_hr                 0
launched_at_month             0
launched_at_day               0
launched_at_yr                0
launched_at_hr                0
create_to_launch_days         0
launch_to_deadline_days       0
dtype: int64
