In [1]:
# Import the required libraries and dependencies
from pathlib import Path
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import tensorflow as tf
from tensorflow.keras.layers import Dense
from tensorflow.keras.models import Sequential

In [2]:
# Read the csv data into a dataframe
df = pd.read_csv(
    Path('ks-projects-201801.csv')
)

# Review the dataframe
df.head()

Unnamed: 0,ID,name,category,main_category,currency,deadline,goal,launched,pledged,state,backers,country,usd pledged,usd_pledged_real,usd_goal_real
0,1000002330,The Songs of Adelaide & Abullah,Poetry,Publishing,GBP,2015-10-09,1000.0,2015-08-11 12:12:28,0.0,failed,0,GB,0.0,0.0,1533.95
1,1000003930,Greeting From Earth: ZGAC Arts Capsule For ET,Narrative Film,Film & Video,USD,2017-11-01,30000.0,2017-09-02 04:43:57,2421.0,failed,15,US,100.0,2421.0,30000.0
2,1000004038,Where is Hank?,Narrative Film,Film & Video,USD,2013-02-26,45000.0,2013-01-12 00:20:50,220.0,failed,3,US,220.0,220.0,45000.0
3,1000007540,ToshiCapital Rekordz Needs Help to Complete Album,Music,Music,USD,2012-04-16,5000.0,2012-03-17 03:24:11,1.0,failed,1,US,1.0,1.0,5000.0
4,1000011046,Community Film Project: The Art of Neighborhoo...,Film & Video,Film & Video,USD,2015-08-29,19500.0,2015-07-04 08:35:03,1283.0,canceled,14,US,1283.0,1283.0,19500.0


In [3]:
# Review the dataframe for null values
df.isnull().sum()

ID                     0
name                   4
category               0
main_category          0
currency               0
deadline               0
goal                   0
launched               0
pledged                0
state                  0
backers                0
country                0
usd pledged         3797
usd_pledged_real       0
usd_goal_real          0
dtype: int64

In [4]:
# Drop nonessential columns from the dataframe
df.drop(columns=['name', 'ID', 'deadline', 'launched'], inplace=True)

# Review the dataframe
df.head()

Unnamed: 0,category,main_category,currency,goal,pledged,state,backers,country,usd pledged,usd_pledged_real,usd_goal_real
0,Poetry,Publishing,GBP,1000.0,0.0,failed,0,GB,0.0,0.0,1533.95
1,Narrative Film,Film & Video,USD,30000.0,2421.0,failed,15,US,100.0,2421.0,30000.0
2,Narrative Film,Film & Video,USD,45000.0,220.0,failed,3,US,220.0,220.0,45000.0
3,Music,Music,USD,5000.0,1.0,failed,1,US,1.0,1.0,5000.0
4,Film & Video,Film & Video,USD,19500.0,1283.0,canceled,14,US,1283.0,1283.0,19500.0


In [5]:
# Review the target variable values
df['state'].value_counts()

failed        197719
successful    133956
canceled       38779
undefined       3562
live            2799
suspended       1846
Name: state, dtype: int64

In [6]:
# Recategorize values in target variable column
df['state'].replace(['suspended', 'canceled'], 'failed', inplace=True)

# Drop nontargeted values in target variable column
df.drop(df.index[df['state'] == 'undefined'], inplace = True)
df.drop(df.index[df['state'] == 'live'], inplace = True)

# Review the altered target variable values
df['state'].value_counts()                              

                                


failed        238344
successful    133956
Name: state, dtype: int64

In [7]:
df.drop(df.index[df['state'] == 'live'], inplace = True)

In [8]:
# Create the OneHotEncoder instance
enc = OneHotEncoder(sparse=False)

In [9]:
# Define the categorical variables
categorical_variables = ['category', 'main_category', 'currency', 'country']

# Transform the categorical variables
encoded_data = enc.fit_transform(df[categorical_variables])

# Create a new dataframe with the encoded categorical variables
encoded_df = pd.DataFrame(encoded_data,columns=enc.get_feature_names_out(categorical_variables))

# Review the encoded dataframe
encoded_df.head()


Unnamed: 0,category_3D Printing,category_Academic,category_Accessories,category_Action,category_Animals,category_Animation,category_Anthologies,category_Apparel,category_Apps,category_Architecture,...,country_JP,country_LU,country_MX,"country_N,0""",country_NL,country_NO,country_NZ,country_SE,country_SG,country_US
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [10]:
# Change target variable values from string values to binary values
df['state'].replace('successful', 1 , inplace=True)
df['state'].replace('failed', 0, inplace=True)

# Reset the index values of the original dataframe
df.reset_index(inplace=True)

# Review the original dataframe
df.tail()

Unnamed: 0,index,category,main_category,currency,goal,pledged,state,backers,country,usd pledged,usd_pledged_real,usd_goal_real
372295,378656,Documentary,Film & Video,USD,50000.0,25.0,0,1,US,25.0,25.0,50000.0
372296,378657,Narrative Film,Film & Video,USD,1500.0,155.0,0,5,US,155.0,155.0,1500.0
372297,378658,Narrative Film,Film & Video,USD,15000.0,20.0,0,1,US,20.0,20.0,15000.0
372298,378659,Technology,Technology,USD,15000.0,200.0,0,6,US,200.0,200.0,15000.0
372299,378660,Performance Art,Art,USD,2000.0,524.0,0,17,US,524.0,524.0,2000.0


In [11]:
# Create a new dataframe with the numerical values 
numerical_df = df.loc[:, ['goal', 'pledged', 'backers', 'usd pledged', 'usd_pledged_real', 'usd_goal_real', 'state']]

# Review the numerical variables dataframe
numerical_df.head()

Unnamed: 0,goal,pledged,backers,usd pledged,usd_pledged_real,usd_goal_real,state
0,1000.0,0.0,0,0.0,0.0,1533.95,0
1,30000.0,2421.0,15,100.0,2421.0,30000.0,0
2,45000.0,220.0,3,220.0,220.0,45000.0,0
3,5000.0,1.0,1,1.0,1.0,5000.0,0
4,19500.0,1283.0,14,1283.0,1283.0,19500.0,0


In [12]:
# Concatenate the encoded and numerical dataframes into a combined dataframe
combined_df = pd.concat([encoded_df, numerical_df], axis=1)

# Replace NaN values in 'usd pledged' with values in 'usd_pledged_real'
combined_df['usd pledged'] = combined_df['usd pledged'].fillna(combined_df['usd_pledged_real'])

# Review the combined dataframe for null values
combined_df.isnull().sum()

category_3D Printing    0
category_Academic       0
category_Accessories    0
category_Action         0
category_Animals        0
                       ..
backers                 0
usd pledged             0
usd_pledged_real        0
usd_goal_real           0
state                   0
Length: 218, dtype: int64

In [13]:
# Define features set X by selecting all columns but DEFAULT
X = combined_df.drop('state', axis=1)
X.head()

# Define target set by selecting the 'state' column
y = combined_df['state']
y.head()

# Review the datatypes
combined_df.dtypes

category_3D Printing    float64
category_Academic       float64
category_Accessories    float64
category_Action         float64
category_Animals        float64
                         ...   
backers                   int64
usd pledged             float64
usd_pledged_real        float64
usd_goal_real           float64
state                     int64
Length: 218, dtype: object

In [14]:
# Declare a logistic regression model
logistic = LogisticRegression(random_state=1)

In [15]:
# Create training and testing datasets using train_test_split
X_test, X_train, y_test, y_train = train_test_split(X, y)

In [16]:
# Create the StandardScaler instance
scaler = StandardScaler()

# Fit the scaler to the features training dataset
X_scaler = scaler.fit(X_train)

# Scale both the training and testing data from the features dataset
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

# Fit the model with the scaled data
logistic.fit(X_train_scaled, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


LogisticRegression(random_state=1)

In [17]:
# Make prediction using the testing data
y_pred = logistic.predict(X_test_scaled)

In [18]:
# Generate a classification report
report = classification_report(y_test, y_pred)

#Print the classification report
print(report)

              precision    recall  f1-score   support

           0       0.87      0.96      0.91    178470
           1       0.92      0.75      0.82    100755

    accuracy                           0.89    279225
   macro avg       0.90      0.86      0.87    279225
weighted avg       0.89      0.89      0.88    279225



In [19]:
# Define the the number of inputs (features) to the model
number_input_features = 217

# Define the number of neurons in the output layer
number_output_neurons = 1

In [20]:
# Define the number of hidden nodes for the first hidden layer
hidden_nodes_layer1 =  109

# Define the number of hidden nodes for the second hidden layer
hidden_nodes_layer2 =  54

In [21]:
# Create the Sequential model instance
nn = Sequential()

# Add the first hidden layer
nn.add(Dense(units = hidden_nodes_layer1, input_dim = number_input_features, activation = 'relu'))

# Add the second hidden layer
nn.add(Dense(units = hidden_nodes_layer2, activation = 'relu'))

# Add the output layer to the model specifying the number of output neurons and activation function
nn.add(Dense(units = 1, activation = 'linear'))

In [22]:
# Compile the Sequential model
nn.compile(loss = 'binary_crossentropy',optimizer = 'adam', metrics = ['accuracy'])

# Fit the model using 250 epochs and the training data
model = nn.fit(x = X_train_scaled, y = y_train, verbose = 0, epochs = 250)

In [23]:
# Evaluate the model loss and accuracy metrics using the evaluate method and the test data
model_loss, model_accuracy = nn.evaluate(X_test_scaled, y_test)

# Display the model loss and accuracy results
print(f"Loss: {model_loss}, Accuracy: {model_accuracy}")

Loss: 0.23915575444698334, Accuracy: 0.9275960326194763
