In [31]:
# Import the required libraries and dependencies
from pathlib import Path
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import tensorflow as tf
from tensorflow.keras.layers import Dense
from tensorflow.keras.models import Sequential
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import confusion_matrix

In [3]:
# Read the csv data into a dataframe
#If M1 Macbook Pro, use following to upload file to Google Colab
  #from google.colab import files
  #uploaded = files.upload()

df = pd.read_csv(
    Path('ks-projects-201801.csv')
)

# Review the dataframe
df.head()

Saving ks-projects-201801.csv to ks-projects-201801.csv


Unnamed: 0,ID,name,category,main_category,currency,deadline,goal,launched,pledged,state,backers,country,usd pledged,usd_pledged_real,usd_goal_real
0,1000002330,The Songs of Adelaide & Abullah,Poetry,Publishing,GBP,2015-10-09,1000.0,2015-08-11 12:12:28,0.0,failed,0,GB,0.0,0.0,1533.95
1,1000003930,Greeting From Earth: ZGAC Arts Capsule For ET,Narrative Film,Film & Video,USD,2017-11-01,30000.0,2017-09-02 04:43:57,2421.0,failed,15,US,100.0,2421.0,30000.0
2,1000004038,Where is Hank?,Narrative Film,Film & Video,USD,2013-02-26,45000.0,2013-01-12 00:20:50,220.0,failed,3,US,220.0,220.0,45000.0
3,1000007540,ToshiCapital Rekordz Needs Help to Complete Album,Music,Music,USD,2012-04-16,5000.0,2012-03-17 03:24:11,1.0,failed,1,US,1.0,1.0,5000.0
4,1000011046,Community Film Project: The Art of Neighborhoo...,Film & Video,Film & Video,USD,2015-08-29,19500.0,2015-07-04 08:35:03,1283.0,canceled,14,US,1283.0,1283.0,19500.0


In [4]:
# Review the dataframe for null values
df.isnull().sum()

ID                     0
name                   4
category               0
main_category          0
currency               0
deadline               0
goal                   0
launched               0
pledged                0
state                  0
backers                0
country                0
usd pledged         3797
usd_pledged_real       0
usd_goal_real          0
dtype: int64

In [5]:
# Drop nonessential columns from the dataframe
df.drop(columns=['name', 'ID', 'deadline', 'launched'], inplace=True)

# Review the dataframe
df.head()

Unnamed: 0,category,main_category,currency,goal,pledged,state,backers,country,usd pledged,usd_pledged_real,usd_goal_real
0,Poetry,Publishing,GBP,1000.0,0.0,failed,0,GB,0.0,0.0,1533.95
1,Narrative Film,Film & Video,USD,30000.0,2421.0,failed,15,US,100.0,2421.0,30000.0
2,Narrative Film,Film & Video,USD,45000.0,220.0,failed,3,US,220.0,220.0,45000.0
3,Music,Music,USD,5000.0,1.0,failed,1,US,1.0,1.0,5000.0
4,Film & Video,Film & Video,USD,19500.0,1283.0,canceled,14,US,1283.0,1283.0,19500.0


In [6]:
# Review the target variable values
df['state'].value_counts()

failed        197719
successful    133956
canceled       38779
undefined       3562
live            2799
suspended       1846
Name: state, dtype: int64

In [7]:
# Recategorize values in target variable column
df['state'].replace(['suspended', 'canceled'], 'failed', inplace=True)

# Drop nontargeted values in target variable column
df.drop(df.index[df['state'] == 'undefined'], inplace = True)
df.drop(df.index[df['state'] == 'live'], inplace = True)

# Review the altered target variable values
df['state'].value_counts()                              

                                


failed        238344
successful    133956
Name: state, dtype: int64

In [8]:
df.drop(df.index[df['state'] == 'live'], inplace = True)

In [9]:
# Create the OneHotEncoder instance
enc = OneHotEncoder(sparse=False)

In [10]:
# Define the categorical variables
categorical_variables = ['category', 'main_category', 'currency', 'country']

# Transform the categorical variables
encoded_data = enc.fit_transform(df[categorical_variables])

# Create a new dataframe with the encoded categorical variables
encoded_df = pd.DataFrame(encoded_data,columns=enc.get_feature_names_out(categorical_variables))

# Review the encoded dataframe
encoded_df.head()


Unnamed: 0,category_3D Printing,category_Academic,category_Accessories,category_Action,category_Animals,category_Animation,category_Anthologies,category_Apparel,category_Apps,category_Architecture,...,country_JP,country_LU,country_MX,"country_N,0""",country_NL,country_NO,country_NZ,country_SE,country_SG,country_US
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [11]:
# Change target variable values from string values to binary values
df['state'].replace('successful', 1 , inplace=True)
df['state'].replace('failed', 0, inplace=True)

# Reset the index values of the original dataframe
df.reset_index(inplace=True)

# Review the original dataframe
df.tail()

Unnamed: 0,index,category,main_category,currency,goal,pledged,state,backers,country,usd pledged,usd_pledged_real,usd_goal_real
372295,378656,Documentary,Film & Video,USD,50000.0,25.0,0,1,US,25.0,25.0,50000.0
372296,378657,Narrative Film,Film & Video,USD,1500.0,155.0,0,5,US,155.0,155.0,1500.0
372297,378658,Narrative Film,Film & Video,USD,15000.0,20.0,0,1,US,20.0,20.0,15000.0
372298,378659,Technology,Technology,USD,15000.0,200.0,0,6,US,200.0,200.0,15000.0
372299,378660,Performance Art,Art,USD,2000.0,524.0,0,17,US,524.0,524.0,2000.0


In [12]:
# Create a new dataframe with the numerical values 
numerical_df = df.loc[:, ['goal', 'pledged', 'backers', 'usd pledged', 'usd_pledged_real', 'usd_goal_real', 'state']]

# Review the numerical variables dataframe
numerical_df.head()

Unnamed: 0,goal,pledged,backers,usd pledged,usd_pledged_real,usd_goal_real,state
0,1000.0,0.0,0,0.0,0.0,1533.95,0
1,30000.0,2421.0,15,100.0,2421.0,30000.0,0
2,45000.0,220.0,3,220.0,220.0,45000.0,0
3,5000.0,1.0,1,1.0,1.0,5000.0,0
4,19500.0,1283.0,14,1283.0,1283.0,19500.0,0


In [13]:
# Concatenate the encoded and numerical dataframes into a combined dataframe
combined_df = pd.concat([encoded_df, numerical_df], axis=1)

# Replace NaN values in 'usd pledged' with values in 'usd_pledged_real'
combined_df['usd pledged'] = combined_df['usd pledged'].fillna(combined_df['usd_pledged_real'])

# Review the combined dataframe for null values
combined_df.isnull().sum()

category_3D Printing    0
category_Academic       0
category_Accessories    0
category_Action         0
category_Animals        0
                       ..
backers                 0
usd pledged             0
usd_pledged_real        0
usd_goal_real           0
state                   0
Length: 218, dtype: int64

In [14]:
# Define features set X by selecting all columns but DEFAULT
X = combined_df.drop('state', axis=1)
X.head()

# Define target set by selecting the 'state' column
y = combined_df['state']
y.head()

# Review the datatypes
combined_df.dtypes

category_3D Printing    float64
category_Academic       float64
category_Accessories    float64
category_Action         float64
category_Animals        float64
                         ...   
backers                   int64
usd pledged             float64
usd_pledged_real        float64
usd_goal_real           float64
state                     int64
Length: 218, dtype: object

In [15]:
# Declare a logistic regression model
logistic = LogisticRegression(random_state=1)

In [16]:
# Create training and testing datasets using train_test_split
X_test, X_train, y_test, y_train = train_test_split(X, y)

In [17]:
# Create the StandardScaler instance
scaler = StandardScaler()

# Fit the scaler to the features training dataset
X_scaler = scaler.fit(X_train)

# Scale both the training and testing data from the features dataset
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

# Fit the model with the scaled data
logistic.fit(X_train_scaled, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


LogisticRegression(random_state=1)

In [18]:
# Make prediction using the testing data
y_pred = logistic.predict(X_test_scaled)

In [19]:
# Generate a classification report
report = classification_report(y_test, y_pred)

#Print the classification report
print(report)

              precision    recall  f1-score   support

           0       0.88      0.96      0.92    178800
           1       0.92      0.76      0.83    100425

    accuracy                           0.89    279225
   macro avg       0.90      0.86      0.87    279225
weighted avg       0.89      0.89      0.89    279225



In [20]:
# Define the the number of inputs (features) to the model
number_input_features = 217

# Define the number of neurons in the output layer
number_output_neurons = 1

In [21]:
# Define the number of hidden nodes for the first hidden layer
hidden_nodes_layer1 =  109

# Define the number of hidden nodes for the second hidden layer
hidden_nodes_layer2 =  54

In [22]:
# Create the Sequential model instance
nn = Sequential()

# Add the first hidden layer
nn.add(Dense(units = hidden_nodes_layer1, input_dim = number_input_features, activation = 'relu'))

# Add the second hidden layer
nn.add(Dense(units = hidden_nodes_layer2, activation = 'relu'))

# Add the output layer to the model specifying the number of output neurons and activation function
nn.add(Dense(units = 1, activation = 'linear'))

In [None]:
# Compile the Sequential model
nn.compile(loss = 'binary_crossentropy',optimizer = 'adam', metrics = ['accuracy'])

# Fit the model using 250 epochs and the training data
model = nn.fit(x = X_train_scaled, y = y_train, epochs = 80)

Epoch 1/80
Epoch 2/80
Epoch 3/80
Epoch 4/80
Epoch 5/80
Epoch 6/80
Epoch 7/80
Epoch 8/80
Epoch 9/80
Epoch 10/80
Epoch 11/80
Epoch 12/80
Epoch 13/80
Epoch 14/80
Epoch 15/80
Epoch 16/80
Epoch 17/80
Epoch 18/80
Epoch 19/80
Epoch 20/80
Epoch 21/80
Epoch 22/80
Epoch 23/80
Epoch 24/80
Epoch 25/80
Epoch 26/80
Epoch 27/80
Epoch 28/80
Epoch 29/80
Epoch 30/80
Epoch 31/80
Epoch 32/80
Epoch 33/80
Epoch 34/80
Epoch 35/80
Epoch 36/80
Epoch 37/80
Epoch 38/80
Epoch 39/80
Epoch 40/80
Epoch 41/80
Epoch 42/80
Epoch 43/80
Epoch 44/80
Epoch 45/80
 669/2909 [=====>........................] - ETA: 4s - loss: 0.2958 - accuracy: 0.9058

In [None]:
# Evaluate the model loss and accuracy metrics using the evaluate method and the test data
model_loss, model_accuracy = nn.evaluate(X_test_scaled, y_test)

# Display the model loss and accuracy results
print(f"Loss: {model_loss}, Accuracy: {model_accuracy}")

In [26]:
clf = GradientBoostingClassifier(n_estimators=400, learning_rate=0.8,max_depth=1, random_state=1).fit(X_train_scaled, y_train)

In [27]:
y_pred_4 = clf.predict(X_test_scaled)

In [29]:
print(classification_report(y_test, y_pred_4))

              precision    recall  f1-score   support

           0       1.00      0.99      0.99    178800
           1       0.98      1.00      0.99    100425

    accuracy                           0.99    279225
   macro avg       0.99      0.99      0.99    279225
weighted avg       0.99      0.99      0.99    279225



In [30]:
from google.colab import files
uploaded = files.upload()

Saving ks-projects-201612.csv to ks-projects-201612.csv


In [32]:
print(confusion_matrix(y_test, y_pred_4))

[[176632   2168]
 [   212 100213]]


In [33]:
X.columns

Index(['category_3D Printing', 'category_Academic', 'category_Accessories',
       'category_Action', 'category_Animals', 'category_Animation',
       'category_Anthologies', 'category_Apparel', 'category_Apps',
       'category_Architecture',
       ...
       'country_NZ', 'country_SE', 'country_SG', 'country_US', 'goal',
       'pledged', 'backers', 'usd pledged', 'usd_pledged_real',
       'usd_goal_real'],
      dtype='object', length=217)

In [35]:
feature_df = pd.DataFrame([clf.feature_importances_, X.columns])

In [39]:
feature_df = feature_df.transpose()

In [40]:
feature_df

Unnamed: 0,0,1
0,0.0,category_3D Printing
1,0.0,category_Academic
2,0.0,category_Accessories
3,0.000004,category_Action
4,0.0,category_Animals
...,...,...
212,0.02834,pledged
213,0.603835,backers
214,0.000184,usd pledged
215,0.162379,usd_pledged_real


In [42]:
feature_df.sort_values([0], ascending=False)

Unnamed: 0,0,1
213,0.603835,backers
216,0.163874,usd_goal_real
215,0.162379,usd_pledged_real
211,0.037579,goal
212,0.02834,pledged
...,...,...
80,0.0,category_Latin
81,0.0,category_Letterpress
82,0.0,category_Literary Journals
83,0.0,category_Literary Spaces
