In [1]:
from pathlib import Path
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import numpy as np
from sklearn.metrics import classification_report
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.datasets import make_multilabel_classification
from sklearn.multioutput import MultiOutputClassifier
from sklearn.neighbors import KNeighborsClassifier

In [2]:
df = pd.read_csv(
    "ks-projects-201801.csv",
)

In [3]:
df.drop(columns=['name', 'ID', 'deadline', 'launched'], inplace=True)

In [4]:
enc = OneHotEncoder(sparse=False)

In [5]:
categorical_variables = ['category', 'main_category', 'currency', 'country']

In [6]:
encoded_data = enc.fit_transform(df[categorical_variables])

In [7]:
encoded_df = pd.DataFrame(encoded_data,columns=enc.get_feature_names_out(categorical_variables))

In [8]:
encoded_df

Unnamed: 0,category_3D Printing,category_Academic,category_Accessories,category_Action,category_Animals,category_Animation,category_Anthologies,category_Apparel,category_Apps,category_Architecture,...,country_JP,country_LU,country_MX,"country_N,0""",country_NL,country_NO,country_NZ,country_SE,country_SG,country_US
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
378656,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
378657,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
378658,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
378659,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [9]:
df.reset_index(inplace=True)

In [10]:
numerical_df = df.loc[:, ['goal', 'pledged', 'backers', 'usd pledged', 'usd_pledged_real', 'usd_goal_real', 'state']]

In [11]:
combined_df = pd.concat([encoded_df, numerical_df], axis=1)

In [12]:
combined_df["usd pledged"] = combined_df["usd pledged"].fillna(combined_df["usd_pledged_real"])

In [13]:
combined_df.isnull().sum()

category_3D Printing    0
category_Academic       0
category_Accessories    0
category_Action         0
category_Animals        0
                       ..
backers                 0
usd pledged             0
usd_pledged_real        0
usd_goal_real           0
state                   0
Length: 218, dtype: int64

In [14]:
X = combined_df.drop('state', axis=1)
X.head()

y = combined_df['state']
y.head()

combined_df.dtypes

category_3D Printing    float64
category_Academic       float64
category_Accessories    float64
category_Action         float64
category_Animals        float64
                         ...   
backers                   int64
usd pledged             float64
usd_pledged_real        float64
usd_goal_real           float64
state                    object
Length: 218, dtype: object

In [15]:
logistic = LogisticRegression()

In [16]:
X_test, X_train, y_test, y_train = train_test_split(X, y)

In [17]:
scaler = StandardScaler()
# Fit the scaler to the features training dataset
X_scaler = scaler.fit(X_train)
# Fit the scaler to the features training dataset
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [18]:
clf = GradientBoostingClassifier(n_estimators=400, learning_rate=0.8,max_depth=1, random_state=1).fit(X_train, y_train)

In [19]:
y_pred = clf.predict(X_test_scaled)

  "X does not have valid feature names, but"


In [20]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

    canceled       0.19      0.06      0.09     29296
      failed       0.53      0.98      0.69    148175
        live       0.16      0.00      0.00      2135
  successful       1.00      0.00      0.00    100323
   suspended       0.09      0.02      0.03      1378
   undefined       0.93      1.00      0.97      2688

    accuracy                           0.53    283995
   macro avg       0.48      0.34      0.30    283995
weighted avg       0.66      0.53      0.38    283995

