In [3]:
import pandas as pd

In [16]:
all_cols = ['Unnamed: 0', 'id', 'photo', 'name', 'blurb', 'goal', 'pledged',
       'state', 'slug', 'disable_communication', 'country', 'currency',
       'currency_symbol', 'currency_trailing_code', 'deadline',
       'state_changed_at', 'created_at', 'launched_at', 'staff_pick',
       'backers_count', 'static_usd_rate', 'usd_pledged', 'creator',
       'location', 'category', 'profile', 'spotlight', 'urls', 'source_url',
       'friends', 'is_starred', 'is_backing', 'permissions', 'name_len',
       'name_len_clean', 'blurb_len', 'blurb_len_clean', 'deadline_weekday',
       'state_changed_at_weekday', 'created_at_weekday', 'launched_at_weekday',
       'deadline_month', 'deadline_day', 'deadline_yr', 'deadline_hr',
       'state_changed_at_month', 'state_changed_at_day', 'state_changed_at_yr',
       'state_changed_at_hr', 'created_at_month', 'created_at_day',
       'created_at_yr', 'created_at_hr', 'launched_at_month',
       'launched_at_day', 'launched_at_yr', 'launched_at_hr',
       'create_to_launch', 'launch_to_deadline', 'launch_to_state_change']
cols_to_drop = ['Unnamed: 0','urls','name','blurb','id','slug',
                'photo','currency_trailing_code','friends','is_starred','is_backing','permissions','name_len','blurb_len','profile','static_usd_rate']
datetime_columns = ['state_changed_at','created_at', 'launched_at','deadline']

year_cols = ['created_at_yr', 'deadline_yr','state_changed_at_yr', 'launched_at_yr']

month_day_hr_cols = ['state_changed_at_month','created_at_day','created_at_month', 'created_at_hr', 'launched_at_month','launched_at_day','launched_at_hr',
        'deadline_month','deadline_day','deadline_hr', 'state_changed_at_day', 'state_changed_at_hr', ]
cat = ['deadline_weekday', 'created_at_weekday', 'state_changed_at_weekday', 'currency_symbol'
       'country', 'currency','source_url',
       'state','launched_at_weekday'
       ]

dtype_dict = {
    **{col: 'int16' for col in year_cols},
    **{col: 'int8' for col in month_day_hr_cols},
    **{col: 'category' for col in cat},
    'backers_count':'int32'
}

for i in datetime_columns:
    cols_to_drop.append(i)
cols_to_keep = [col for col in all_cols if col not in cols_to_drop]

In [17]:
df = pd.read_csv("kickstarter_data_with_features.csv", usecols=cols_to_keep, dtype=dtype_dict)

df.columns

Index(['goal', 'pledged', 'state', 'disable_communication', 'country',
       'currency', 'currency_symbol', 'staff_pick', 'backers_count',
       'usd_pledged', 'creator', 'location', 'category', 'spotlight',
       'source_url', 'name_len_clean', 'blurb_len_clean', 'deadline_weekday',
       'state_changed_at_weekday', 'created_at_weekday', 'launched_at_weekday',
       'deadline_month', 'deadline_day', 'deadline_yr', 'deadline_hr',
       'state_changed_at_month', 'state_changed_at_day', 'state_changed_at_yr',
       'state_changed_at_hr', 'created_at_month', 'created_at_day',
       'created_at_yr', 'created_at_hr', 'launched_at_month',
       'launched_at_day', 'launched_at_yr', 'launched_at_hr',
       'create_to_launch', 'launch_to_deadline', 'launch_to_state_change'],
      dtype='object')

In [18]:
df['currency_symbol'] = df['currency_symbol'].astype('category').cat.codes
df['country'] = df['country'].astype('category').cat.codes
df['currency'] = df['currency'].astype('category').cat.codes
df['state'] = df['state'].astype('category').cat.codes
df['source_url'] = df['source_url'].astype('category').cat.codes #this probably has high correlation with category

In [19]:
df['launch_to_deadline'] = df['launch_to_deadline'].astype(str).str.extract(r'(\d+ days)')
df['create_to_launch'] = df['create_to_launch'].astype(str).str.extract(r'(\d+ days)')
df['launch_to_state_change'] = df['launch_to_state_change'].astype(str).str.extract(r'(\d+ days)')

In [20]:
df['name_len_clean'] = df['name_len_clean'].fillna(df['name_len_clean'].median())
df['name_len_clean'] = df['name_len_clean'].astype('int16')
df['blurb_len_clean'] = df['blurb_len_clean'].fillna(df['blurb_len_clean'].median())
df['blurb_len_clean'] = df['blurb_len_clean'].astype('int16')



In [21]:
df['launch_to_deadline'] = df['launch_to_deadline'].astype(str).str.extract(r'(\d+)')
df['create_to_launch'] = df['create_to_launch'].astype(str).str.extract(r'(\d+)')
df['launch_to_state_change'] = df['launch_to_state_change'].astype(str).str.extract(r'(\d+)')

In [22]:
df['launch_to_deadline'] = pd.to_numeric(df['launch_to_deadline'])
df['create_to_launch'] = pd.to_numeric(df['create_to_launch'])
df['launch_to_state_change'] = pd.to_numeric(df['launch_to_state_change'])

In [38]:
df['category'] = df['category'].astype('category').cat.codes

df['deadline_weekday'] = df['deadline_weekday'].cat.codes
df['state_changed_at_weekday'] = df['state_changed_at_weekday'].cat.codes
df['created_at_weekday'] = df['created_at_weekday'].cat.codes
df['launched_at_weekday'] = df['launched_at_weekday'].cat.codes


In [39]:
print(df.dtypes)

goal                        float64
pledged                     float64
state                          int8
disable_communication          bool
country                        int8
currency                       int8
currency_symbol                int8
staff_pick                     bool
backers_count                 int32
usd_pledged                 float64
creator                      object
location                     object
category                       int8
spotlight                      bool
source_url                     int8
name_len_clean                int16
blurb_len_clean               int16
deadline_weekday               int8
state_changed_at_weekday       int8
created_at_weekday             int8
launched_at_weekday            int8
deadline_month                 int8
deadline_day                   int8
deadline_yr                   int16
deadline_hr                    int8
state_changed_at_month         int8
state_changed_at_day           int8
state_changed_at_yr         

In [26]:
print(df.nunique())

goal                         1204
pledged                      8722
state                           5
disable_communication           2
country                        21
currency                       13
currency_symbol                 5
staff_pick                      2
backers_count                1505
usd_pledged                 12235
creator                     20522
location                     5178
category                       25
spotlight                       2
source_url                     33
name_len_clean                 14
blurb_len_clean                29
deadline_weekday                7
state_changed_at_weekday        7
created_at_weekday              7
launched_at_weekday             7
deadline_month                 12
deadline_day                   31
deadline_yr                     9
deadline_hr                    24
state_changed_at_month         12
state_changed_at_day           31
state_changed_at_yr             9
state_changed_at_hr            24
created_at_mon

In [118]:
df['percent_of_goal'] = df['pledged']/df['goal']
df['percent_of_goal'].describe()

count    20632.000000
mean         3.232088
std        177.318223
min          0.000000
25%          0.001500
50%          0.065439
75%          1.030000
max      22603.000000
Name: percent_of_goal, dtype: float64

In [124]:
achieved_goal = df[df['percent_of_goal'] >=1]
achieved_goal.head()

Unnamed: 0,goal,pledged,state,disable_communication,country,currency,currency_symbol,staff_pick,backers_count,usd_pledged,...,created_at_yr,created_at_hr,launched_at_month,launched_at_day,launched_at_yr,launched_at_hr,create_to_launch,launch_to_deadline,launch_to_state_change,percent_of_goal
180,5000.0,5087.0,3,False,20,12,0,False,74,5087.0,...,2015,20,3,3,2015,7,33 days,45 days,45 days,1.0174
181,3000.0,3007.56,3,False,20,12,0,False,72,3007.56,...,2013,21,5,1,2013,9,8 days,30 days,30 days,1.00252
182,3000.0,4410.0,3,False,20,12,0,False,73,4410.0,...,2012,16,6,6,2012,17,4 days,10 days,10 days,1.47
183,2000.0,2041.0,3,False,20,12,0,False,34,2041.0,...,2014,7,2,28,2014,11,0 days,30 days,30 days,1.0205
184,2500.0,2524.0,3,False,20,12,0,False,32,2524.0,...,2015,8,6,12,2015,13,22 days,21 days,21 days,1.0096


In [130]:
num_success_goal = achieved_goal[achieved_goal['state'] == 3]['state'].count()
total_reached_goal = achieved_goal.shape[0]
failure_rate_when_goal_reached = (total_reached_goal-num_success_goal) / total_reached_goal
num_success_total = df[df['state'] == 3]['state'].count()
num_success_no_goal = num_success_total-num_success_goal
percent_success = num_success_total/df.shape[0]

print(f"Total success rate {percent_success}")
print(f"Success rate when goal reached {num_success_goal/total_reached_goal}")
print(f"Failure rate when goal reached: {failure_rate_when_goal_reached}")
print(f"numbers of success when goal reached: {num_success_goal}")
print(f"number of failure when goal reached: {total_reached_goal-num_success_goal}")
print(f'Number of goals reached: {total_reached_goal}')
print(f"Percent that reached goal {total_reached_goal/df.shape[0]}")
print(f'Success that didnt reach goal {num_success_no_goal}')
print(f"Success rate when goal isn't reached {num_success_no_goal/(df.shape[0]-achieved_goal.shape[0])}")


Total success rate 0.2916828227995347
Success rate when goal reached 0.9717422896818989
Failure rate when goal reached: 0.028257710318101082
numbers of success when goal reached: 6018
number of failure when goal reached: 175
Number of goals reached: 6193
Percent that reached goal 0.30016479255525397
Success that didnt reach goal 0
Success rate when goal isn't reached 0.0


In [40]:
X = df.drop(columns=['state','creator','location'], axis=1)
Y = df['state']

In [41]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
# scaler = StandardScaler()
# X1 = scaler.fit_transform(X)
# X_scaled = pd.DataFrame(X1, columns=X.columns)
numeric_cols = X.select_dtypes(include=['number']).columns
non_numeric_cols = X.select_dtypes(exclude=['number']).columns

# Scale only numeric columns
scaler = StandardScaler()
X_scaled_numeric = pd.DataFrame(scaler.fit_transform(X[numeric_cols]), columns=numeric_cols, index=X.index)

# Combine scaled numeric columns with untransformed object columns
X_scaled = pd.concat([X_scaled_numeric, X[non_numeric_cols]], axis=1)
X_train, X_test, y_train, y_test = train_test_split(X_scaled, Y, test_size=0.2, random_state=42)

In [30]:
X_train.head()

Unnamed: 0,goal,pledged,country,currency,currency_symbol,backers_count,usd_pledged,category,source_url,name_len_clean,...,create_to_launch,launch_to_deadline,launch_to_state_change,disable_communication,staff_pick,spotlight,deadline_weekday,state_changed_at_weekday,created_at_weekday,launched_at_weekday
6244,-0.033026,-0.177541,-2.061587,-2.067867,-0.514113,-0.15031,-0.181139,1.405795,0.614846,-1.775362,...,-0.419271,-0.397284,,False,False,False,Wednesday,Wednesday,Friday,Monday
9380,-0.044258,0.197473,0.626138,0.644606,-0.514113,0.145112,0.210196,0.735533,-0.091248,-0.12098,...,-0.437274,-0.313058,-0.152387,False,False,True,Monday,Monday,Thursday,Friday
6105,-0.062977,-0.177541,0.626138,0.644606,-0.514113,-0.15031,-0.181139,1.405795,0.614846,1.946998,...,-0.437274,-0.565736,,False,False,False,Friday,Friday,Wednesday,Thursday
15825,-0.045755,-0.066636,0.626138,0.644606,-0.514113,0.075553,-0.065407,-0.60499,-0.797341,0.706212,...,0.489886,0.023845,,False,False,False,Monday,Monday,Tuesday,Monday
19026,-0.048001,-0.176794,0.626138,0.644606,-0.514113,-0.147855,-0.180359,-1.677408,-1.150388,0.292616,...,-0.194232,0.866102,,False,False,False,Sunday,Sunday,Wednesday,Thursday


In [139]:
from sklearn.svm import SVC, LinearSVC
models = []
models.append(('Linear SVC', LinearSVC(C=100, loss = 'hinge', random_state=1,
max_iter=1000000, dual = False)))
models.append(('Kernel SVC', SVC(kernel = 'rbf', degree = 2, C=100.0,
random_state=1, max_iter = 1000000)))


In [42]:
from sklearn.model_selection import cross_val_score, KFold
results = []
names = []
scoring = 'accuracy'
for name, model in models:
    kfold = KFold(n_splits=10, random_state=7, shuffle = True)
    cv_results = cross_val_score(model, X_scaled_numeric, Y, cv=kfold, scoring=scoring)
    results.append(cv_results)
    names.append(name)
    msg = "%s: %f (%f)" % (name, cv_results.mean(), cv_results.std())
    print(msg)

NameError: name 'models' is not defined

In [35]:
y_train.dtypes
X_train.drop('launch_to_state_change', axis=1, inplace=True)

In [45]:
X_train.isnull().sum()
X_train.drop('launch_to_state_change', axis=1, inplace=True)

In [49]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.model_selection import cross_val_score, KFold
for i in range(1,10):
    ada_clf = AdaBoostClassifier(
    DecisionTreeClassifier(max_depth=2), n_estimators=200,
    algorithm="SAMME.R", learning_rate=(i/10), random_state=42)
    score = cross_val_score(ada_clf, X_train, y_train, cv=5, n_jobs=-1)
    print(f'{score.mean()} achienved for AdaBoost with learning rate of {i/10}')

0.8043017267494699 achienved for AdaBoost with learning rate of 0.1
0.7952741593456528 achienved for AdaBoost with learning rate of 0.2
0.8087246289003331 achienved for AdaBoost with learning rate of 0.3
0.7740078764010906 achienved for AdaBoost with learning rate of 0.4
0.7813995758860951 achienved for AdaBoost with learning rate of 0.5
0.7884277491669192 achienved for AdaBoost with learning rate of 0.6
0.7243865495304453 achienved for AdaBoost with learning rate of 0.7
0.7763707967282641 achienved for AdaBoost with learning rate of 0.8
0.8030293850348379 achienved for AdaBoost with learning rate of 0.9


In [None]:
from sklearn.ensemble import GradientBoostingClassifier

In [48]:
from sklearn.ensemble import RandomForestClassifier
for i in range(2, 11):
    rnd_forest = RandomForestClassifier(max_leaf_nodes=16, n_jobs=-1, random_state=42, max_depth=i, n_estimators=500)
    score = cross_val_score(rnd_forest, X_train, y_train,cv=5, n_jobs=-1)
    print(f"Random forest with {i} depth {score.mean()}")


Random forest with 2 depth 0.8394425931535897
Random forest with 3 depth 0.8580430172674947
Random forest with 4 depth 0.8656770675552863
Random forest with 5 depth 0.8667070584671311
Random forest with 6 depth 0.8694335049984854
Random forest with 7 depth 0.8720387761284458
Random forest with 8 depth 0.8724628900333233
Random forest with 9 depth 0.8724628900333233
Random forest with 10 depth 0.8726446531354135


In [None]:
from sklearn.ensemble import BaggingClassifier

for i in range(500,2001, 250):
    for a in range(1,5):
        bag_clf = BaggingClassifier(
            DecisionTreeClassifier(random_state=42, max_depth=a), n_estimators=i,  bootstrap=True, random_state=42, n_jobs=-1)
        score = cross_val_score(bag_clf, X_train, y_train, cv=5, n_jobs=-1)
        print(f"{score} achieved for Bagging with {i} estimators, {a} depth {score.mean()}")
