In [1]:
import numpy as np
import pandas as pd

### Loading the dataset
df = pd.read_csv('project.csv')
df.head()

Unnamed: 0,ID,name,category,main_category,currency,deadline,goal,launched,pledged,state,backers,country,usd pledged,usd_pledged_real,usd_goal_real
0,1000002330,The Songs of Adelaide & Abullah,Poetry,Publishing,GBP,2015-10-09,1000.0,2015-08-11 12:12:28,0.0,failed,0,GB,0.0,0.0,1533.95
1,1000003930,Greeting From Earth: ZGAC Arts Capsule For ET,Narrative Film,Film & Video,USD,2017-11-01,30000.0,2017-09-02 04:43:57,2421.0,failed,15,US,100.0,2421.0,30000.0
2,1000004038,Where is Hank?,Narrative Film,Film & Video,USD,2013-02-26,45000.0,2013-01-12 00:20:50,220.0,failed,3,US,220.0,220.0,45000.0
3,1000007540,ToshiCapital Rekordz Needs Help to Complete Album,Music,Music,USD,2012-04-16,5000.0,2012-03-17 03:24:11,1.0,failed,1,US,1.0,1.0,5000.0
4,1000011046,Community Film Project: The Art of Neighborhoo...,Film & Video,Film & Video,USD,2015-08-29,19500.0,2015-07-04 08:35:03,1283.0,canceled,14,US,1283.0,1283.0,19500.0


In [2]:
df.shape

(378661, 15)

### Printing the number of unique values in each column

In [3]:
print(df.nunique())

ID                  378661
name                375764
category               159
main_category           15
currency                14
deadline              3164
goal                  8353
launched            378089
pledged              62130
state                    6
backers               3963
country                 23
usd pledged          95455
usd_pledged_real    106065
usd_goal_real        50339
dtype: int64


In [4]:
# Share of every project state, as a percentage of all rows (2 decimals).
Success_dist = df["state"].value_counts().div(len(df["state"])).mul(100).round(2)

print("Success_dist in %: ")
print(Success_dist)

Success_dist in %: 
failed        52.22
successful    35.38
canceled      10.24
undefined      0.94
live           0.74
suspended      0.49
Name: state, dtype: float64


In [5]:
def convert_state(df, excluded_states=('undefined', 'live', 'suspended')):
    """Return only the rows of ``df`` whose ``state`` is not excluded.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame that contains a ``state`` column.
    excluded_states : iterable of str, optional
        State labels to filter out. The default removes the 'undefined',
        'live' and 'suspended' rows.

    Returns
    -------
    pandas.DataFrame
        The remaining rows with their original index labels; the input
        frame is not modified.
    """
    # One boolean mask instead of three successive filtered copies.
    keep = ~df['state'].isin(list(excluded_states))
    return df[keep]
df = convert_state(df)
df.head()

Unnamed: 0,ID,name,category,main_category,currency,deadline,goal,launched,pledged,state,backers,country,usd pledged,usd_pledged_real,usd_goal_real
0,1000002330,The Songs of Adelaide & Abullah,Poetry,Publishing,GBP,2015-10-09,1000.0,2015-08-11 12:12:28,0.0,failed,0,GB,0.0,0.0,1533.95
1,1000003930,Greeting From Earth: ZGAC Arts Capsule For ET,Narrative Film,Film & Video,USD,2017-11-01,30000.0,2017-09-02 04:43:57,2421.0,failed,15,US,100.0,2421.0,30000.0
2,1000004038,Where is Hank?,Narrative Film,Film & Video,USD,2013-02-26,45000.0,2013-01-12 00:20:50,220.0,failed,3,US,220.0,220.0,45000.0
3,1000007540,ToshiCapital Rekordz Needs Help to Complete Album,Music,Music,USD,2012-04-16,5000.0,2012-03-17 03:24:11,1.0,failed,1,US,1.0,1.0,5000.0
4,1000011046,Community Film Project: The Art of Neighborhoo...,Film & Video,Film & Video,USD,2015-08-29,19500.0,2015-07-04 08:35:03,1283.0,canceled,14,US,1283.0,1283.0,19500.0


In [6]:
# Count the missing (NaN) values in each column
df.isnull().sum()

ID                    0
name                  3
category              0
main_category         0
currency              0
deadline              0
goal                  0
launched              0
pledged               0
state                 0
backers               0
country               0
usd pledged         232
usd_pledged_real      0
usd_goal_real         0
dtype: int64

### Dropping unnecessary columns: ID and name (since they do not contribute to a project's failure or success), category (since we already have the main category), currency (it carries the same information as the country, so we keep only country), and usd pledged (since it has 232 NaNs)

In [7]:
# Remove the identifier, free-text and redundant columns with one drop call.
df = df.drop(['name', 'ID', 'category', 'usd pledged', 'currency'], axis=1)
df.head()

Unnamed: 0,main_category,deadline,goal,launched,pledged,state,backers,country,usd_pledged_real,usd_goal_real
0,Publishing,2015-10-09,1000.0,2015-08-11 12:12:28,0.0,failed,0,GB,0.0,1533.95
1,Film & Video,2017-11-01,30000.0,2017-09-02 04:43:57,2421.0,failed,15,US,2421.0,30000.0
2,Film & Video,2013-02-26,45000.0,2013-01-12 00:20:50,220.0,failed,3,US,220.0,45000.0
3,Music,2012-04-16,5000.0,2012-03-17 03:24:11,1.0,failed,1,US,1.0,5000.0
4,Film & Video,2015-08-29,19500.0,2015-07-04 08:35:03,1283.0,canceled,14,US,1283.0,19500.0


### Removing rows that contain 'NaN' values

In [8]:
df = df.dropna(axis=0)
df.shape

(370454, 10)

### Removing rows with the invalid value N,0" in the country column

In [9]:
df = df[df.country!='N,0"']
df.shape

(370222, 10)

### Calculating the duration of the project

In [10]:
import calendar
import datetime

# 'deadline' and 'launched' are strings that start with 'YYYY-MM', so the year
# is characters 0-3 and the month is characters 5-6; each slice is cast to int.
df['deadline_year'] = df['deadline'].str.slice(0, 4).astype(int)
df['deadline_month'] = df['deadline'].str.slice(5, 7).astype(int)
df['launched_year'] = df['launched'].str.slice(0, 4).astype(int)
df['launched_month'] = df['launched'].str.slice(5, 7).astype(int)

df['deadline_year']

0         2015
1         2017
2         2013
3         2012
4         2015
5         2016
6         2014
7         2016
8         2014
9         2014
10        2013
11        2013
12        2014
13        2016
14        2017
15        2015
16        2014
17        2015
18        2012
19        2012
20        2013
21        2017
22        2014
23        2015
24        2014
25        2011
26        2016
27        2014
28        2016
29        2017
          ... 
378631    2014
378632    2014
378633    2014
378634    2011
378635    2015
378636    2014
378637    2017
378638    2014
378639    2015
378640    2014
378641    2015
378642    2017
378643    2014
378644    2015
378645    2013
378646    2013
378647    2012
378648    2016
378649    2017
378650    2016
378651    2014
378652    2015
378653    2012
378654    2017
378655    2014
378656    2014
378657    2011
378658    2010
378659    2016
378660    2011
Name: deadline_year, Length: 370222, dtype: int64

In [11]:
# Project length in calendar months: 12 * (year difference) + (month difference).
df['duration_proj(in months)'] = (
    (df['deadline_year'] - df['launched_year']) * 12
    + (df['deadline_month'] - df['launched_month'])
)

# The helper date parts and the raw date strings are no longer needed.
df = df.drop(
    ['deadline_year', 'deadline_month', 'launched_year', 'launched_month',
     'launched', 'deadline'],
    axis=1,
)

In [12]:
df.head()

Unnamed: 0,main_category,goal,pledged,state,backers,country,usd_pledged_real,usd_goal_real,duration_proj(in months)
0,Publishing,1000.0,0.0,failed,0,GB,0.0,1533.95,2
1,Film & Video,30000.0,2421.0,failed,15,US,2421.0,30000.0,2
2,Film & Video,45000.0,220.0,failed,3,US,220.0,45000.0,1
3,Music,5000.0,1.0,failed,1,US,1.0,5000.0,1
4,Film & Video,19500.0,1283.0,canceled,14,US,1283.0,19500.0,1


In [13]:
print(df.nunique())

main_category                   15
goal                          8233
pledged                      61556
state                            3
backers                       3940
country                         22
usd_pledged_real            104287
usd_goal_real                49309
duration_proj(in months)        10
dtype: int64


### Applying one hot encoding to categorical columns main_category and country

In [14]:
# Build one indicator column per distinct main_category value and place them in
# front of the existing columns; join aligns the rows on the index.
onehot = pd.get_dummies(df['main_category'])
df = onehot.join(df)
df.shape

(370222, 24)

In [15]:
# Build one indicator column per distinct country value and place them in
# front of the existing columns; join aligns the rows on the index.
onehot = pd.get_dummies(df['country'])
df = onehot.join(df)
df.shape

(370222, 46)

In [16]:
from sklearn.preprocessing import LabelEncoder

# Turn the 'state' strings into integer class labels (one flat array).
labelencoder_y = LabelEncoder()
y = labelencoder_y.fit_transform(df['state'].values)
print(y)

[1 1 1 ... 1 1 1]


### Applying label encoding to the class label - 'state'

In [17]:
df.head()

Unnamed: 0,AT,AU,BE,CA,CH,DE,DK,ES,FR,GB,...,Theater,main_category,goal,pledged,state,backers,country,usd_pledged_real,usd_goal_real,duration_proj(in months)
0,0,0,0,0,0,0,0,0,0,1,...,0,Publishing,1000.0,0.0,failed,0,GB,0.0,1533.95,2
1,0,0,0,0,0,0,0,0,0,0,...,0,Film & Video,30000.0,2421.0,failed,15,US,2421.0,30000.0,2
2,0,0,0,0,0,0,0,0,0,0,...,0,Film & Video,45000.0,220.0,failed,3,US,220.0,45000.0,1
3,0,0,0,0,0,0,0,0,0,0,...,0,Music,5000.0,1.0,failed,1,US,1.0,5000.0,1
4,0,0,0,0,0,0,0,0,0,0,...,0,Film & Video,19500.0,1283.0,canceled,14,US,1283.0,19500.0,1


In [19]:
list(df.columns.values)

['AT',
 'AU',
 'BE',
 'CA',
 'CH',
 'DE',
 'DK',
 'ES',
 'FR',
 'GB',
 'HK',
 'IE',
 'IT',
 'JP',
 'LU',
 'MX',
 'NL',
 'NO',
 'NZ',
 'SE',
 'SG',
 'US',
 'Art',
 'Comics',
 'Crafts',
 'Dance',
 'Design',
 'Fashion',
 'Film & Video',
 'Food',
 'Games',
 'Journalism',
 'Music',
 'Photography',
 'Publishing',
 'Technology',
 'Theater',
 'main_category',
 'goal',
 'pledged',
 'state',
 'backers',
 'country',
 'usd_pledged_real',
 'usd_goal_real',
 'duration_proj(in months)']

### Selecting the feature columns: removing main_category and country (already one hot encoded) and the class label 'state' (label encoded above)

In [18]:
# Feature columns = every column except the target ('state') and the raw
# categorical columns that have already been one-hot encoded. Deriving the list
# from the frame keeps it in sync with whatever country / category indicator
# columns get_dummies produced, instead of hard-coding all of their names.
# NOTE(review): 'pledged', 'backers' and 'usd_pledged_real' look like outcomes
# of the campaign rather than information available at launch; keeping them as
# features probably leaks the target — confirm and consider dropping them.
col = [name for name in df.columns
       if name not in ('main_category', 'country', 'state')]
X = df[col]
df.head()

Unnamed: 0,AT,AU,BE,CA,CH,DE,DK,ES,FR,GB,...,Theater,main_category,goal,pledged,state,backers,country,usd_pledged_real,usd_goal_real,duration_proj(in months)
0,0,0,0,0,0,0,0,0,0,1,...,0,Publishing,1000.0,0.0,failed,0,GB,0.0,1533.95,2
1,0,0,0,0,0,0,0,0,0,0,...,0,Film & Video,30000.0,2421.0,failed,15,US,2421.0,30000.0,2
2,0,0,0,0,0,0,0,0,0,0,...,0,Film & Video,45000.0,220.0,failed,3,US,220.0,45000.0,1
3,0,0,0,0,0,0,0,0,0,0,...,0,Music,5000.0,1.0,failed,1,US,1.0,5000.0,1
4,0,0,0,0,0,0,0,0,0,0,...,0,Film & Video,19500.0,1283.0,canceled,14,US,1283.0,19500.0,1


In [19]:
X.shape

(370222, 43)

In [20]:
y.shape

(370222,)

In [21]:
!pip3 install imblearn
!pip install scipy
from imblearn.over_sampling import ADASYN
ad = ADASYN()
X, y = ad.fit_sample(X, y)
print(X)

[[0.00000e+00 0.00000e+00 0.00000e+00 ... 0.00000e+00 1.53395e+03
  2.00000e+00]
 [0.00000e+00 0.00000e+00 0.00000e+00 ... 2.42100e+03 3.00000e+04
  2.00000e+00]
 [0.00000e+00 0.00000e+00 0.00000e+00 ... 2.20000e+02 4.50000e+04
  1.00000e+00]
 ...
 [0.00000e+00 0.00000e+00 0.00000e+00 ... 5.05000e+02 5.00000e+02
  1.00000e+00]
 [0.00000e+00 0.00000e+00 0.00000e+00 ... 5.05000e+02 5.00000e+02
  1.00000e+00]
 [0.00000e+00 0.00000e+00 0.00000e+00 ... 5.05000e+02 5.00000e+02
  1.00000e+00]]


In [22]:
X.shape

(592067, 43)

In [23]:
y.shape

(592067,)

In [24]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for testing; random_state=0 makes the split repeatable.
X_train, X_test, y_train, y_test= train_test_split(X,y,test_size=0.2,random_state=0)
print(y_train)

[2 0 1 ... 1 0 2]


### Standardizing the features (zero mean, unit variance)

In [25]:
# NOTE(review): math is imported here but not used in this cell.
import math 
from sklearn.preprocessing import StandardScaler 
# Fit the scaler on the training split only, then apply the same
# transformation to the test split.
sc_X = StandardScaler() 
X_train = sc_X.fit_transform(X_train) 
X_test = sc_X.transform(X_test)

In [27]:
# Applying PCA: keep 38 principal components. The projection is fitted on the
# training split and then applied unchanged to the test split.
from sklearn.decomposition import PCA
pcaObj= PCA(n_components=38)
X_train= pcaObj.fit_transform(X_train)
X_test= pcaObj.transform(X_test)
# Fraction of the total variance explained by each retained component
components_variance= pcaObj.explained_variance_ratio_

In [28]:
components_variance

array([0.06106671, 0.04502402, 0.0443149 , 0.02818982, 0.02690523,
       0.02676104, 0.0264464 , 0.02588661, 0.02575477, 0.02544136,
       0.02537637, 0.02493325, 0.02481024, 0.02441966, 0.02408297,
       0.0239591 , 0.02384446, 0.02368212, 0.0235307 , 0.02346488,
       0.02333421, 0.02327425, 0.02324707, 0.02322357, 0.02314931,
       0.02310744, 0.02305875, 0.02301711, 0.02299599, 0.02292913,
       0.022867  , 0.02283005, 0.02275528, 0.02269946, 0.02256696,
       0.02245234, 0.02212664, 0.02160231])

In [None]:
# Kernel PCA with an RBF kernel, keeping 8 components.
# NOTE(review): this overwrites X_train / X_test, which at this point already
# hold the 38 PCA components produced by the previous PCA cell.
# NOTE(review): the cell has no execution count, so it appears never to have
# been run; kernel PCA on a training set with several hundred thousand rows is
# likely infeasible in memory — confirm before enabling it.
from sklearn.decomposition import KernelPCA
kernelPCAObj= KernelPCA(n_components=8, kernel='rbf')
X_train= kernelPCAObj.fit_transform(X_train)
X_test= kernelPCAObj.transform(X_test)

### Fitting Logistic Regression to Training Set

In [None]:
from sklearn.linear_model import LogisticRegression

# Train a seeded logistic regression on the training split (fit returns the
# fitted estimator itself) ...
classifierObj = LogisticRegression(random_state=0).fit(X_train, y_train)

# ... and predict the class of every test sample.
y_pred = classifierObj.predict(X_test)

### Evaluating the predictions using a Confusion Matrix

In [None]:
from sklearn.metrics import confusion_matrix
# Confusion matrix of true vs. predicted classes; it is stored in cm but not
# displayed by this cell.
cm = confusion_matrix(y_test, y_pred)
#Model accuracy -> fraction of test samples the model classified correctly
print("Accuracy of Logistic Regression model is: ")
classifierObj.score(X_test, y_test)

### kNN

In [None]:
#Fitting Classifier to Training Set. Create a classifier object here and call it classifierObj
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix

# 5 nearest neighbours with the Minkowski metric and p=2 (Euclidean distance)
classifierObj= KNeighborsClassifier(n_neighbors=5, p=2, metric='minkowski')
classifierObj.fit(X_train, y_train)

#Making predictions on the Test Set
y_pred= classifierObj.predict(X_test)

#Evaluating the predictions using a Confusion Matrix
# (the matrix was imported but never computed here, unlike the other model cells)
cm = confusion_matrix(y_test, y_pred)

#Model accuracy -> fraction of test samples the model classified correctly
print("Accuracy of kNN model is: ")
classifierObj.score(X_test, y_test)

### Random Forest

In [29]:
#Creating a pipeline: standardize -> PCA (38 components) -> random forest
# NOTE(review): X_train / X_test were already standardized and reduced by PCA in
# the earlier cells, so this pipeline scales and projects the data a second time.
# NOTE(review): the variable is called pipe_lr although the final step is a
# RandomForestClassifier.
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA 
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import make_pipeline
pipe_lr= make_pipeline(StandardScaler(), PCA(n_components=38), RandomForestClassifier())
pipe_lr.fit(X_train, y_train)
y_pred= pipe_lr.predict(X_test)
# Mean accuracy of the whole pipeline on the held-out test split
print('Test Accuracy: %.3f' % pipe_lr.score(X_test, y_test))



Test Accuracy: 0.822


In [31]:
#Fitting Classifier to Training Set. Create a classifier object here and call it classifierObj
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix

# Entropy (information gain) as the split criterion. random_state is fixed so
# the forest, and therefore the reported accuracy, is reproducible on re-run,
# in line with the seeded split and logistic regression.
classifierObj = RandomForestClassifier(criterion='entropy', random_state=0)
classifierObj.fit(X_train,y_train)

#Making predictions on the Test Set
y_pred = classifierObj.predict(X_test)

#Evaluating the predictions using a Confusion Matrix
cm = confusion_matrix(y_test, y_pred)

#Model accuracy -> fraction of test samples the model classified correctly
print("Accuracy of Random Forest model is: ")
classifierObj.score(X_test, y_test)



Accuracy of Random Forest model is: 


0.8345972604590673

In [None]:
#Fitting Classifier to Training Set. Create a classifier object here and call it classifierObj
# NOTE(review): this cell repeats the Gaussian Naive Bayes cell that follows the
# "Naive Bayes" heading; one of the two can probably be removed.
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import confusion_matrix

classifierObj= GaussianNB()
classifierObj.fit(X_train, y_train)

#Making predictions on the Test Set
y_pred = classifierObj.predict(X_test)

#Evaluating the predictions using a Confusion Matrix
cm = confusion_matrix(y_test, y_pred)

#Model accuracy -> fraction of test samples the model classified correctly
# (message spelling fixed: "Baeyes" -> "Bayes")
print("Accuracy of Naive Bayes model is: ")
classifierObj.score(X_test, y_test)

### Naive Bayes

In [None]:
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import confusion_matrix

# Gaussian Naive Bayes fitted on the training split
classifierObj= GaussianNB()
classifierObj.fit(X_train, y_train)

#Making predictions on the Test Set
y_pred = classifierObj.predict(X_test)

#Evaluating the predictions using a Confusion Matrix
cm = confusion_matrix(y_test, y_pred)

#Model accuracy -> fraction of test samples the model classified correctly
# (message spelling fixed: "Baeyes" -> "Bayes")
print("Accuracy of Naive Bayes model is: ")
classifierObj.score(X_test, y_test)

### SVM - RBF

In [None]:
# Support vector classifier with the RBF kernel (SVC's default kernel, spelled
# out here), trained on the training split and stored in classifierObj.
from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix

classifierObj = SVC(kernel='rbf').fit(X_train, y_train)

# Predicted classes for the held-out test split
y_pred = classifierObj.predict(X_test)

# Confusion matrix of true vs. predicted classes
cm = confusion_matrix(y_test, y_pred)

# Accuracy = fraction of test samples classified correctly
print("Accuracy of SVM(RBF) model is: ")
classifierObj.score(X_test, y_test)

In [None]:
# Support vector classifier with a degree-3 polynomial kernel, trained on the
# training split and stored in classifierObj.
from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix

classifierObj = SVC(kernel='poly', degree=3).fit(X_train, y_train)

# Predicted classes for the held-out test split
y_pred = classifierObj.predict(X_test)

# Confusion matrix of true vs. predicted classes
cm = confusion_matrix(y_test, y_pred)

# Accuracy = fraction of test samples classified correctly
print("Accuracy of SVM(POLY) model is: ")
classifierObj.score(X_test, y_test)