In [2]:
import pandas as pd

In [3]:
all_cols = ['Unnamed: 0', 'id', 'photo', 'name', 'blurb', 'goal', 'pledged',
       'state', 'slug', 'disable_communication', 'country', 'currency',
       'currency_symbol', 'currency_trailing_code', 'deadline',
       'state_changed_at', 'created_at', 'launched_at', 'staff_pick',
       'backers_count', 'static_usd_rate', 'usd_pledged', 'creator',
       'location', 'category', 'profile', 'spotlight', 'urls', 'source_url',
       'friends', 'is_starred', 'is_backing', 'permissions', 'name_len',
       'name_len_clean', 'blurb_len', 'blurb_len_clean', 'deadline_weekday',
       'state_changed_at_weekday', 'created_at_weekday', 'launched_at_weekday',
       'deadline_month', 'deadline_day', 'deadline_yr', 'deadline_hr',
       'state_changed_at_month', 'state_changed_at_day', 'state_changed_at_yr',
       'state_changed_at_hr', 'created_at_month', 'created_at_day',
       'created_at_yr', 'created_at_hr', 'launched_at_month',
       'launched_at_day', 'launched_at_yr', 'launched_at_hr',
       'create_to_launch', 'launch_to_deadline', 'launch_to_state_change']
cols_to_drop = ['Unnamed: 0','urls','name','blurb','id','slug',
                'photo','currency_trailing_code','friends','is_starred','is_backing','permissions','name_len','blurb_len','profile','static_usd_rate']
datetime_columns = ['state_changed_at','created_at', 'launched_at','deadline']

year_cols = ['created_at_yr', 'deadline_yr','state_changed_at_yr', 'launched_at_yr']

month_day_hr_cols = ['state_changed_at_month','created_at_day','created_at_month', 'created_at_hr', 'launched_at_month','launched_at_day','launched_at_hr',
        'deadline_month','deadline_day','deadline_hr', 'state_changed_at_day', 'state_changed_at_hr', ]
cat = ['deadline_weekday', 'created_at_weekday', 'state_changed_at_weekday', 'currency_symbol'
       'country', 'currency','source_url',
       'state','launched_at_weekday'
       ]

dtype_dict = {
    **{col: 'int16' for col in year_cols},
    **{col: 'int8' for col in month_day_hr_cols},
    **{col: 'category' for col in cat},
    'backers_count':'int32'
}

for i in datetime_columns:
    cols_to_drop.append(i)
cols_to_keep = [col for col in all_cols if col not in cols_to_drop]

In [4]:
df = pd.read_csv("kickstarter_data_with_features.csv", usecols=cols_to_keep, dtype=dtype_dict)

df.columns

Index(['goal', 'pledged', 'state', 'disable_communication', 'country',
       'currency', 'currency_symbol', 'staff_pick', 'backers_count',
       'usd_pledged', 'creator', 'location', 'category', 'spotlight',
       'source_url', 'name_len_clean', 'blurb_len_clean', 'deadline_weekday',
       'state_changed_at_weekday', 'created_at_weekday', 'launched_at_weekday',
       'deadline_month', 'deadline_day', 'deadline_yr', 'deadline_hr',
       'state_changed_at_month', 'state_changed_at_day', 'state_changed_at_yr',
       'state_changed_at_hr', 'created_at_month', 'created_at_day',
       'created_at_yr', 'created_at_hr', 'launched_at_month',
       'launched_at_day', 'launched_at_yr', 'launched_at_hr',
       'create_to_launch', 'launch_to_deadline', 'launch_to_state_change'],
      dtype='object')

In [5]:
df['currency_symbol'] = df['currency_symbol'].astype('category').cat.codes
df['country'] = df['country'].astype('category').cat.codes
df['currency'] = df['currency'].astype('category').cat.codes
df['state'] = df['state'].astype('category').cat.codes
df['source_url'] = df['source_url'].astype('category').cat.codes #this probably has high correlation with category

In [6]:
df['launch_to_deadline'] = df['launch_to_deadline'].astype(str).str.extract(r'(\d+ days)')
df['create_to_launch'] = df['create_to_launch'].astype(str).str.extract(r'(\d+ days)')
df['launch_to_state_change'] = df['launch_to_state_change'].astype(str).str.extract(r'(\d+ days)')

In [7]:
df['name_len_clean'] = df['name_len_clean'].fillna(df['name_len_clean'].median())
df['name_len_clean'] = df['name_len_clean'].astype('int16')
df['blurb_len_clean'] = df['blurb_len_clean'].fillna(df['blurb_len_clean'].median())
df['blurb_len_clean'] = df['blurb_len_clean'].astype('int16')



In [8]:
df['launch_to_deadline'] = df['launch_to_deadline'].astype(str).str.extract(r'(\d+)')
df['create_to_launch'] = df['create_to_launch'].astype(str).str.extract(r'(\d+)')
df['launch_to_state_change'] = df['launch_to_state_change'].astype(str).str.extract(r'(\d+)')

In [9]:
df['launch_to_deadline'] = pd.to_numeric(df['launch_to_deadline'])
df['create_to_launch'] = pd.to_numeric(df['create_to_launch'])
df['launch_to_state_change'] = pd.to_numeric(df['launch_to_state_change'])

In [10]:
df['category'] = df['category'].astype('category').cat.codes

df['deadline_weekday'] = df['deadline_weekday'].cat.codes
df['state_changed_at_weekday'] = df['state_changed_at_weekday'].cat.codes
df['created_at_weekday'] = df['created_at_weekday'].cat.codes
df['launched_at_weekday'] = df['launched_at_weekday'].cat.codes


In [138]:
print(df.dtypes)

goal                        float64
pledged                     float64
state                          int8
disable_communication          bool
country                        int8
currency                       int8
currency_symbol                int8
staff_pick                     bool
backers_count                 int32
usd_pledged                 float64
creator                      object
location                     object
category                       int8
spotlight                      bool
source_url                     int8
name_len_clean                int16
blurb_len_clean               int16
deadline_weekday               int8
state_changed_at_weekday       int8
created_at_weekday             int8
launched_at_weekday            int8
deadline_month                 int8
deadline_day                   int8
deadline_yr                   int16
deadline_hr                    int8
state_changed_at_month         int8
state_changed_at_day           int8
state_changed_at_yr         

In [139]:
print(df.nunique())

goal                         1204
pledged                      8722
state                           5
disable_communication           2
country                        21
currency                       13
currency_symbol                 5
staff_pick                      2
backers_count                1505
usd_pledged                 12235
creator                     20522
location                     5178
category                       25
spotlight                       2
source_url                     33
name_len_clean                 14
blurb_len_clean                29
deadline_weekday                7
state_changed_at_weekday        7
created_at_weekday              7
launched_at_weekday             7
deadline_month                 12
deadline_day                   31
deadline_yr                     9
deadline_hr                    24
state_changed_at_month         12
state_changed_at_day           31
state_changed_at_yr             9
state_changed_at_hr            24
created_at_mon

In [15]:
df['percent_of_goal'] = df['pledged']/df['goal']
df['percent_of_goal'].describe()

count    20632.000000
mean         3.232088
std        177.318223
min          0.000000
25%          0.001500
50%          0.065439
75%          1.030000
max      22603.000000
Name: percent_of_goal, dtype: float64

In [141]:
achieved_goal = df[df['percent_of_goal'] >=1]
achieved_goal.head()

Unnamed: 0,goal,pledged,state,disable_communication,country,currency,currency_symbol,staff_pick,backers_count,usd_pledged,...,created_at_yr,created_at_hr,launched_at_month,launched_at_day,launched_at_yr,launched_at_hr,create_to_launch,launch_to_deadline,launch_to_state_change,percent_of_goal
180,5000.0,5087.0,3,False,20,12,0,False,74,5087.0,...,2015,20,3,3,2015,7,33,45,45.0,1.0174
181,3000.0,3007.56,3,False,20,12,0,False,72,3007.56,...,2013,21,5,1,2013,9,8,30,30.0,1.00252
182,3000.0,4410.0,3,False,20,12,0,False,73,4410.0,...,2012,16,6,6,2012,17,4,10,10.0,1.47
183,2000.0,2041.0,3,False,20,12,0,False,34,2041.0,...,2014,7,2,28,2014,11,0,30,30.0,1.0205
184,2500.0,2524.0,3,False,20,12,0,False,32,2524.0,...,2015,8,6,12,2015,13,22,21,21.0,1.0096


In [142]:
num_success_goal = achieved_goal[achieved_goal['state'] == 3]['state'].count()
total_reached_goal = achieved_goal.shape[0]
failure_rate_when_goal_reached = (total_reached_goal-num_success_goal) / total_reached_goal
num_success_total = df[df['state'] == 3]['state'].count()
num_success_no_goal = num_success_total-num_success_goal
percent_success = num_success_total/df.shape[0]

print(f"Total success rate {percent_success}")
print(f"Success rate when goal reached {num_success_goal/total_reached_goal}")
print(f"Failure rate when goal reached: {failure_rate_when_goal_reached}")
print(f"numbers of success when goal reached: {num_success_goal}")
print(f"number of failure when goal reached: {total_reached_goal-num_success_goal}")
print(f'Number of goals reached: {total_reached_goal}')
print(f"Percent that reached goal {total_reached_goal/df.shape[0]}")
print(f'Success that didnt reach goal {num_success_no_goal}')
print(f"Success rate when goal isn't reached {num_success_no_goal/(df.shape[0]-achieved_goal.shape[0])}")


Total success rate 0.2916828227995347
Success rate when goal reached 0.9717422896818989
Failure rate when goal reached: 0.028257710318101082
numbers of success when goal reached: 6018
number of failure when goal reached: 175
Number of goals reached: 6193
Percent that reached goal 0.30016479255525397
Success that didnt reach goal 0
Success rate when goal isn't reached 0.0


In [11]:
X = df.drop(columns=['state','creator','location','launch_to_state_change', 'percent_of_goal'], axis=1)
Y = df['state']

In [12]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
# scaler = StandardScaler()
# X1 = scaler.fit_transform(X)
# X_scaled = pd.DataFrame(X1, columns=X.columns)
numeric_cols = X.select_dtypes(include=['number']).columns
non_numeric_cols = X.select_dtypes(exclude=['number']).columns

# Scale only numeric columns
scaler = StandardScaler()
X_scaled_numeric = pd.DataFrame(scaler.fit_transform(X[numeric_cols]), columns=numeric_cols, index=X.index)

# Combine scaled numeric columns with untransformed object columns
X_scaled = pd.concat([X_scaled_numeric, X[non_numeric_cols]], axis=1)
X_train, X_test, y_train, y_test = train_test_split(X_scaled, Y, test_size=0.2, random_state=42)

In [13]:
# X_train.head()
# X_train.drop('launch_to_state_change', axis=1, inplace=True)
# X_test.drop('launch_to_state_change', axis=1, inplace=True)


In [14]:
from sklearn.svm import SVC, LinearSVC
models = []
models.append(('Linear SVC', LinearSVC(C=100, loss = 'hinge', random_state=1,
max_iter=1000000)))
models.append(('Kernel SVC', SVC(kernel = 'rbf', degree = 2, C=100.0,
random_state=1, max_iter = 1000000)))


In [147]:
X_train.isnull().sum()


goal                        0
pledged                     0
country                     0
currency                    0
currency_symbol             0
backers_count               0
usd_pledged                 0
category                    0
source_url                  0
name_len_clean              0
blurb_len_clean             0
deadline_weekday            0
state_changed_at_weekday    0
created_at_weekday          0
launched_at_weekday         0
deadline_month              0
deadline_day                0
deadline_yr                 0
deadline_hr                 0
state_changed_at_month      0
state_changed_at_day        0
state_changed_at_yr         0
state_changed_at_hr         0
created_at_month            0
created_at_day              0
created_at_yr               0
created_at_hr               0
launched_at_month           0
launched_at_day             0
launched_at_yr              0
launched_at_hr              0
create_to_launch            0
launch_to_deadline          0
percent_of

In [148]:
from sklearn.model_selection import cross_val_score, KFold
results = []
names = []
scoring = 'accuracy'
for name, model in models:
    kfold = KFold(n_splits=10, random_state=7, shuffle = True)
    cv_results = cross_val_score(model, X_train, y_train, cv=kfold, scoring=scoring)
    results.append(cv_results)
    names.append(name)
    msg = "%s: %f (%f)" % (name, cv_results.mean(), cv_results.std())
    print(msg)



Linear SVC: 0.957407 (0.004322)
Kernel SVC: 0.968192 (0.004590)


In [14]:
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.svm import SVC
kernel_svc = SVC(kernel = 'rbf', degree = 2, C=100.0, random_state=1, max_iter = 1000000)
kernel_svc.fit(X_train, y_train)
y_pred = kernel_svc.predict(X_test)
score = accuracy_score(y_test, y_pred)
print(confusion_matrix(y_test, y_pred))

[[ 406   66    4    1    0]
 [  16 2274    5    1    0]
 [   2    1   87    0    0]
 [   0    0    0 1220    0]
 [   1    1    1    0   41]]


In [None]:
from sklearn.model_selection import GridSearchCV
param_grid = {'C': [0.1, 1, 10, 100, 1000],
              'gamma': [1, 0.1, 0.01, 0.001, 0.0001],
              'kernel': ['rbf'],
              'max_iter': [10000, 100000,1000000, 2000000, 3000000, 5000000, 10000000]}
grid = GridSearchCV(SVC(), param_grid, refit=True, verbose=3)
grid.fit(X_train, y_train)
print(grid.best_params_)

print(grid.best_estimator_)

Fitting 5 folds for each of 25 candidates, totalling 125 fits
[CV 1/5] END ........C=0.1, gamma=1, kernel=rbf;, score=0.553 total time=  23.8s
[CV 2/5] END ........C=0.1, gamma=1, kernel=rbf;, score=0.553 total time=  24.3s
[CV 3/5] END ........C=0.1, gamma=1, kernel=rbf;, score=0.553 total time=  23.7s
[CV 4/5] END ........C=0.1, gamma=1, kernel=rbf;, score=0.553 total time=  28.3s
[CV 5/5] END ........C=0.1, gamma=1, kernel=rbf;, score=0.553 total time=  27.5s
[CV 1/5] END ......C=0.1, gamma=0.1, kernel=rbf;, score=0.688 total time=  13.0s
[CV 2/5] END ......C=0.1, gamma=0.1, kernel=rbf;, score=0.682 total time=  11.8s
[CV 3/5] END ......C=0.1, gamma=0.1, kernel=rbf;, score=0.691 total time=  11.7s
[CV 4/5] END ......C=0.1, gamma=0.1, kernel=rbf;, score=0.688 total time=  11.9s
[CV 5/5] END ......C=0.1, gamma=0.1, kernel=rbf;, score=0.678 total time=  13.5s
[CV 1/5] END .....C=0.1, gamma=0.01, kernel=rbf;, score=0.849 total time=   8.7s
[CV 2/5] END .....C=0.1, gamma=0.01, kernel=rbf

In [26]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.model_selection import cross_val_score, KFold
for i in range(1,10):
    ada_clf = AdaBoostClassifier(
    DecisionTreeClassifier(max_depth=4), n_estimators=200,
    algorithm="SAMME.R", learning_rate=(i/10), random_state=42)
    score = cross_val_score(ada_clf, X_train, y_train, cv=5, n_jobs=-1)
    print(f'{score.mean()} achienved for AdaBoost with learning rate of {i/10}')

0.9398364132081187 achienved for AdaBoost with learning rate of 0.1
0.8704634959103302 achienved for AdaBoost with learning rate of 0.2
0.9082702211451075 achienved for AdaBoost with learning rate of 0.3
0.922932444713723 achienved for AdaBoost with learning rate of 0.4
0.9369887912753712 achienved for AdaBoost with learning rate of 0.5
0.9517116025446836 achienved for AdaBoost with learning rate of 0.6
0.9418963950318087 achienved for AdaBoost with learning rate of 0.7
0.9291729778854891 achienved for AdaBoost with learning rate of 0.8
0.9219630415025749 achienved for AdaBoost with learning rate of 0.9


In [27]:
ada = AdaBoostClassifier(DecisionTreeClassifier(max_depth=4), n_estimators=200,algorithm="SAMME.R", learning_rate=.6, random_state=42)
ada.fit(X_train, y_train)
y_pred = ada.predict(X_test)
score = accuracy_score(y_test, y_pred)
print(score)

In [32]:
from sklearn.ensemble import RandomForestClassifier
for i in range(2, 11):
    rnd_forest = RandomForestClassifier(max_leaf_nodes=20, n_jobs=-1, random_state=42, max_depth=i, n_estimators=500)
    score = cross_val_score(rnd_forest, X_train, y_train,cv=5, n_jobs=-1)
    print(f"Random forest with {i} depth {score.mean()}")


Random forest with 2 depth 0.8394425931535897
Random forest with 3 depth 0.8580430172674947
Random forest with 4 depth 0.8656770675552863
Random forest with 5 depth 0.8671917600727053
Random forest with 6 depth 0.8734322932444714
Random forest with 7 depth 0.8750681611632839
Random forest with 8 depth 0.8754316873674645
Random forest with 9 depth 0.8752499242653741
Random forest with 10 depth 0.8752499242653741


In [21]:
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_val_score
results = []
for i in range(1500,3001, 750):
    bag_clf = BaggingClassifier(DecisionTreeClassifier(random_state=42, max_depth=6), n_estimators=i,  bootstrap=True, random_state=42, n_jobs=-1)
    score = cross_val_score(bag_clf, X_train, y_train, cv=5, n_jobs=-1)
    results.append((score.mean(), i))
for score,  estimator in results:
    print(f'{score:.4f} estimator: {estimator}')

0.8825 estimator: 1500
0.8825 estimator: 2250
0.8825 estimator: 3000


In [None]:
from sklearn.linear_model import LogisticRegression, Lasso, Ridge, SGDClassifier
from sklearn.feature_selection import RFE
lr_scores = []
for i in range(X_train.shape[1]-1, 0, -1):
    logistic_regression = RFE(LogisticRegression(random_state=42, max_iter=500), n_features_to_select=i)
    logistic_regression.fit(X_train, y_train)
    score = cross_val_score(logistic_regression, X_train, y_train, cv=10, n_jobs=-1)
    lr_scores.append((i, score.mean()))

for features, score in lr_scores:
    print(f'num features: {features} score: {score:.4f}')


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt