In [54]:
from pathlib import Path
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import numpy as np
from sklearn.metrics import classification_report

In [2]:
df = pd.read_csv(
    "ks-projects-201801.csv",
)

In [3]:
df.head()
df.isnull().sum()

ID                     0
name                   4
category               0
main_category          0
currency               0
deadline               0
goal                   0
launched               0
pledged                0
state                  0
backers                0
country                0
usd pledged         3797
usd_pledged_real       0
usd_goal_real          0
dtype: int64

In [4]:
df.drop(columns=['name', 'ID', 'deadline', 'launched'], inplace=True)

Unnamed: 0,category,main_category,currency,goal,pledged,state,backers,country,usd pledged,usd_pledged_real,usd_goal_real
0,Poetry,Publishing,GBP,1000.0,0.0,failed,0,GB,0.0,0.0,1533.95
1,Narrative Film,Film & Video,USD,30000.0,2421.0,failed,15,US,100.0,2421.0,30000.00
2,Narrative Film,Film & Video,USD,45000.0,220.0,failed,3,US,220.0,220.0,45000.00
3,Music,Music,USD,5000.0,1.0,failed,1,US,1.0,1.0,5000.00
4,Film & Video,Film & Video,USD,19500.0,1283.0,canceled,14,US,1283.0,1283.0,19500.00
...,...,...,...,...,...,...,...,...,...,...,...
378656,Documentary,Film & Video,USD,50000.0,25.0,canceled,1,US,25.0,25.0,50000.00
378657,Narrative Film,Film & Video,USD,1500.0,155.0,failed,5,US,155.0,155.0,1500.00
378658,Narrative Film,Film & Video,USD,15000.0,20.0,failed,1,US,20.0,20.0,15000.00
378659,Technology,Technology,USD,15000.0,200.0,failed,6,US,200.0,200.0,15000.00


In [6]:
df['state'].value_counts()

failed        197719
successful    133956
canceled       38779
undefined       3562
live            2799
suspended       1846
Name: state, dtype: int64

In [7]:
df['state'].replace('suspended', 'failed', inplace=True)

In [8]:
df['state'].value_counts()

failed        199565
successful    133956
canceled       38779
undefined       3562
live            2799
Name: state, dtype: int64

In [9]:
df['state'].replace('canceled', 'failed', inplace=True)

In [10]:
df.drop(df.index[df['state'] == 'undefined'], inplace = True)

In [11]:
df.drop(df.index[df['state'] == 'live'], inplace = True)

In [12]:
df['state'].value_counts()

failed        238344
successful    133956
Name: state, dtype: int64

In [13]:
enc = OneHotEncoder(sparse=False)

In [14]:
categorical_variables = ['category', 'main_category', 'currency', 'country']

In [15]:
encoded_data = enc.fit_transform(df[categorical_variables])

In [16]:
encoded_df = pd.DataFrame(encoded_data,columns=enc.get_feature_names_out(categorical_variables))

In [17]:
encoded_df

Unnamed: 0,category_3D Printing,category_Academic,category_Accessories,category_Action,category_Animals,category_Animation,category_Anthologies,category_Apparel,category_Apps,category_Architecture,...,country_JP,country_LU,country_MX,"country_N,0""",country_NL,country_NO,country_NZ,country_SE,country_SG,country_US
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
372295,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
372296,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
372297,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
372298,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [18]:
encoded_data

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 1.],
       [0., 0., 0., ..., 0., 0., 1.],
       ...,
       [0., 0., 0., ..., 0., 0., 1.],
       [0., 0., 0., ..., 0., 0., 1.],
       [0., 0., 0., ..., 0., 0., 1.]])

In [19]:
df['state'].replace('successful', 1 , inplace=True)

In [20]:
df['state'].replace('failed', 0, inplace=True)

In [21]:
df['state'].value_counts()
df.reset_index(inplace=True)
df.tail()

Unnamed: 0,index,category,main_category,currency,goal,pledged,state,backers,country,usd pledged,usd_pledged_real,usd_goal_real
372295,378656,Documentary,Film & Video,USD,50000.0,25.0,0,1,US,25.0,25.0,50000.0
372296,378657,Narrative Film,Film & Video,USD,1500.0,155.0,0,5,US,155.0,155.0,1500.0
372297,378658,Narrative Film,Film & Video,USD,15000.0,20.0,0,1,US,20.0,20.0,15000.0
372298,378659,Technology,Technology,USD,15000.0,200.0,0,6,US,200.0,200.0,15000.0
372299,378660,Performance Art,Art,USD,2000.0,524.0,0,17,US,524.0,524.0,2000.0


In [22]:
numerical_df = df.loc[:, ['goal', 'pledged', 'backers', 'usd pledged', 'usd_pledged_real', 'usd_goal_real', 'state']]

In [23]:
numerical_df

Unnamed: 0,goal,pledged,backers,usd pledged,usd_pledged_real,usd_goal_real,state
0,1000.0,0.0,0,0.0,0.0,1533.95,0
1,30000.0,2421.0,15,100.0,2421.0,30000.00,0
2,45000.0,220.0,3,220.0,220.0,45000.00,0
3,5000.0,1.0,1,1.0,1.0,5000.00,0
4,19500.0,1283.0,14,1283.0,1283.0,19500.00,0
...,...,...,...,...,...,...,...
372295,50000.0,25.0,1,25.0,25.0,50000.00,0
372296,1500.0,155.0,5,155.0,155.0,1500.00,0
372297,15000.0,20.0,1,20.0,20.0,15000.00,0
372298,15000.0,200.0,6,200.0,200.0,15000.00,0


In [24]:
combined_df = pd.concat([encoded_df, numerical_df], axis=1)

In [46]:
combined_df.drop('usd pledged', axis=1, inplace=True)

In [47]:
X = combined_df.drop('state', axis=1)
X.head()

y = combined_df['state']
y.head()

combined_df.dtypes

category_3D Printing    float64
category_Academic       float64
category_Accessories    float64
category_Action         float64
category_Animals        float64
                         ...   
pledged                 float64
backers                   int64
usd_pledged_real        float64
usd_goal_real           float64
state                     int64
Length: 217, dtype: object

In [48]:
logistic = LogisticRegression()

In [49]:
X_test, X_train, y_test, y_train = train_test_split(X, y)

In [50]:
scaler = StandardScaler()
# Fit the scaler to the features training dataset
X_scaler = scaler.fit(X_train)
# Fit the scaler to the features training dataset
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [51]:
logistic.fit(X_train_scaled, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


LogisticRegression()

In [52]:
y_pred = logistic.predict(X_test_scaled)

In [55]:
report = classification_report(y_test, y_pred)

In [56]:
print(report)

              precision    recall  f1-score   support

           0       0.86      0.96      0.91    178686
           1       0.92      0.73      0.81    100539

    accuracy                           0.88    279225
   macro avg       0.89      0.85      0.86    279225
weighted avg       0.88      0.88      0.88    279225

