### Loading the dataset

In [1]:
import numpy as np
import pandas as pd

df = pd.read_csv('input/ks-projects-201801.csv')
df.head()

Unnamed: 0,ID,name,category,main_category,currency,deadline,goal,launched,pledged,state,backers,country,usd pledged,usd_pledged_real,usd_goal_real
0,1000002330,The Songs of Adelaide & Abullah,Poetry,Publishing,GBP,2015-10-09,1000.0,2015-08-11 12:12:28,0.0,failed,0,GB,0.0,0.0,1533.95
1,1000003930,Greeting From Earth: ZGAC Arts Capsule For ET,Narrative Film,Film & Video,USD,2017-11-01,30000.0,2017-09-02 04:43:57,2421.0,failed,15,US,100.0,2421.0,30000.0
2,1000004038,Where is Hank?,Narrative Film,Film & Video,USD,2013-02-26,45000.0,2013-01-12 00:20:50,220.0,failed,3,US,220.0,220.0,45000.0
3,1000007540,ToshiCapital Rekordz Needs Help to Complete Album,Music,Music,USD,2012-04-16,5000.0,2012-03-17 03:24:11,1.0,failed,1,US,1.0,1.0,5000.0
4,1000011046,Community Film Project: The Art of Neighborhoo...,Film & Video,Film & Video,USD,2015-08-29,19500.0,2015-07-04 08:35:03,1283.0,canceled,14,US,1283.0,1283.0,19500.0


In [2]:
df.shape

(378661, 15)

### Printing unique values in each columns of our dataset

In [4]:
print(df.nunique())

ID                  378661
name                375764
category               159
main_category           15
currency                14
deadline              3164
goal                  8353
launched            378089
pledged              62130
state                    6
backers               3963
country                 23
usd pledged          95455
usd_pledged_real    106065
usd_goal_real        50339
dtype: int64


### Printing the percentage of each categories in our class label "state"

In [5]:
Success_dist = df["state"].value_counts() / len(df["state"]) * 100

print("Success_dist in %: ")
print(Success_dist)

Success_dist in %: 
failed        52.215306
successful    35.376234
canceled      10.241086
undefined      0.940683
live           0.739184
suspended      0.487507
Name: state, dtype: float64


### Dropping all the rows where state labels are undefined, live and suspended since there percentage in the datacet is really less. Also considering the canceled projects as failed project.

In [3]:
def convert_state(df):
    df.state.value_counts()
    df = df[df.state!='undefined']
    df = df[df.state!='live']
    df = df[df.state!='suspended']
    df['state']=df['state'].replace({'canceled':'failed'})
    return df
df = convert_state(df)
df.head()

Unnamed: 0,ID,name,category,main_category,currency,deadline,goal,launched,pledged,state,backers,country,usd pledged,usd_pledged_real,usd_goal_real
0,1000002330,The Songs of Adelaide & Abullah,Poetry,Publishing,GBP,2015-10-09,1000.0,2015-08-11 12:12:28,0.0,failed,0,GB,0.0,0.0,1533.95
1,1000003930,Greeting From Earth: ZGAC Arts Capsule For ET,Narrative Film,Film & Video,USD,2017-11-01,30000.0,2017-09-02 04:43:57,2421.0,failed,15,US,100.0,2421.0,30000.0
2,1000004038,Where is Hank?,Narrative Film,Film & Video,USD,2013-02-26,45000.0,2013-01-12 00:20:50,220.0,failed,3,US,220.0,220.0,45000.0
3,1000007540,ToshiCapital Rekordz Needs Help to Complete Album,Music,Music,USD,2012-04-16,5000.0,2012-03-17 03:24:11,1.0,failed,1,US,1.0,1.0,5000.0
4,1000011046,Community Film Project: The Art of Neighborhoo...,Film & Video,Film & Video,USD,2015-08-29,19500.0,2015-07-04 08:35:03,1283.0,failed,14,US,1283.0,1283.0,19500.0


### Checking for NAN's in each columns


In [7]:
df.isnull().sum()

ID                    0
name                  3
category              0
main_category         0
currency              0
deadline              0
goal                  0
launched              0
pledged               0
state                 0
backers               0
country               0
usd pledged         232
usd_pledged_real      0
usd_goal_real         0
dtype: int64

### Dropping unnecessary columns ID,Name(since it does not contribute to a project's failure or success), category(since we already have the main category), currency(same contribution as the country, so keeping only country), usd_pledged(since it has 232 na's and usd_pledged_real have the similar data as usd_pledged).

In [4]:
df = df.drop('name',axis=1)
df = df.drop('ID',axis=1)
df = df.drop('category',axis=1)
df = df.drop(['usd pledged'], axis=1)
df = df.drop(['currency'], axis=1)
df.head()

Unnamed: 0,main_category,deadline,goal,launched,pledged,state,backers,country,usd_pledged_real,usd_goal_real
0,Publishing,2015-10-09,1000.0,2015-08-11 12:12:28,0.0,failed,0,GB,0.0,1533.95
1,Film & Video,2017-11-01,30000.0,2017-09-02 04:43:57,2421.0,failed,15,US,2421.0,30000.0
2,Film & Video,2013-02-26,45000.0,2013-01-12 00:20:50,220.0,failed,3,US,220.0,45000.0
3,Music,2012-04-16,5000.0,2012-03-17 03:24:11,1.0,failed,1,US,1.0,5000.0
4,Film & Video,2015-08-29,19500.0,2015-07-04 08:35:03,1283.0,failed,14,US,1283.0,19500.0


### More data cleaning - Removing rows with 'NaN' value and Removing rows with N,0" in country column

In [9]:
df = df.dropna(axis=0)
df = df[df.country!='N,0"']
df.shape

(370222, 10)

### Calculating the duration of each project

In [10]:
import calendar
import datetime
df['deadline_year'] = df['deadline'].str[0:4]
df['deadline_month']= df['deadline'].str[5:7]
df['launched_year'] = df['launched'].str[0:4]
df['launched_month'] = df['launched'].str[5:7]
df['deadline_year'] = df['deadline_year'].astype(int)
df['deadline_month'] = df['deadline_month'].astype(int)
df['launched_year'] = df['launched_year'].astype(int)
df['launched_month'] = df['launched_month'].astype(int)
df['duration_proj(in months)'] = (df['deadline_year']-df['launched_year'])*12 + (df['deadline_month']-df['launched_month'])
df = df.drop('deadline_year',axis=1)
df = df.drop('deadline_month',axis=1)
df = df.drop('launched_year',axis=1)
df = df.drop('launched_month',axis=1)
df = df.drop('launched',axis=1)
df = df.drop('deadline',axis=1)
df.head()

Unnamed: 0,main_category,goal,pledged,state,backers,country,usd_pledged_real,usd_goal_real,duration_proj(in months)
0,Publishing,1000.0,0.0,failed,0,GB,0.0,1533.95,2
1,Film & Video,30000.0,2421.0,failed,15,US,2421.0,30000.0,2
2,Film & Video,45000.0,220.0,failed,3,US,220.0,45000.0,1
3,Music,5000.0,1.0,failed,1,US,1.0,5000.0,1
4,Film & Video,19500.0,1283.0,failed,14,US,1283.0,19500.0,1


### Applying one hot encoding to categorical columns main_category and country

In [11]:
onehot = pd.get_dummies(df['main_category'])
df = onehot.join(df)
df.shape

(370222, 24)

In [12]:
onehot = pd.get_dummies(df['country'])
df = onehot.join(df)
df.shape

(370222, 46)

### Applying label encoding the class label - 'state'

In [13]:
from sklearn.preprocessing import LabelEncoder
y = df[['state']]
labelencoder_y = LabelEncoder()
y = labelencoder_y.fit_transform(y.values.ravel())
print(y)

[0 0 0 ... 0 0 0]


In [14]:
df.head()

Unnamed: 0,AT,AU,BE,CA,CH,DE,DK,ES,FR,GB,...,Theater,main_category,goal,pledged,state,backers,country,usd_pledged_real,usd_goal_real,duration_proj(in months)
0,0,0,0,0,0,0,0,0,0,1,...,0,Publishing,1000.0,0.0,failed,0,GB,0.0,1533.95,2
1,0,0,0,0,0,0,0,0,0,0,...,0,Film & Video,30000.0,2421.0,failed,15,US,2421.0,30000.0,2
2,0,0,0,0,0,0,0,0,0,0,...,0,Film & Video,45000.0,220.0,failed,3,US,220.0,45000.0,1
3,0,0,0,0,0,0,0,0,0,0,...,0,Music,5000.0,1.0,failed,1,US,1.0,5000.0,1
4,0,0,0,0,0,0,0,0,0,0,...,0,Film & Video,19500.0,1283.0,failed,14,US,1283.0,19500.0,1


### Getting the list of all columns

In [15]:
list(df.columns.values)

['AT',
 'AU',
 'BE',
 'CA',
 'CH',
 'DE',
 'DK',
 'ES',
 'FR',
 'GB',
 'HK',
 'IE',
 'IT',
 'JP',
 'LU',
 'MX',
 'NL',
 'NO',
 'NZ',
 'SE',
 'SG',
 'US',
 'Art',
 'Comics',
 'Crafts',
 'Dance',
 'Design',
 'Fashion',
 'Film & Video',
 'Food',
 'Games',
 'Journalism',
 'Music',
 'Photography',
 'Publishing',
 'Technology',
 'Theater',
 'main_category',
 'goal',
 'pledged',
 'state',
 'backers',
 'country',
 'usd_pledged_real',
 'usd_goal_real',
 'duration_proj(in months)']

### Using these columns minus 'main_category', 'state', 'country' as X

In [16]:
col = ['AT',
 'AU',
 'BE',
 'CA',
 'CH',
 'DE',
 'DK',
 'ES',
 'FR',
 'GB',
 'HK',
 'IE',
 'IT',
 'JP',
 'LU',
 'MX',
 'NL',
 'NO',
 'NZ',
 'SE',
 'SG',
 'US',
 'Art',
 'Comics',
 'Crafts',
 'Dance',
 'Design',
 'Fashion',
 'Film & Video',
 'Food',
 'Games',
 'Journalism',
 'Music',
 'Photography',
 'Publishing',
 'Technology',
 'Theater',
 'goal',
 'pledged',
 'backers',
 'usd_pledged_real',
 'usd_goal_real',
 'duration_proj(in months)']
X = df[col]
df.head()

Unnamed: 0,AT,AU,BE,CA,CH,DE,DK,ES,FR,GB,...,Theater,main_category,goal,pledged,state,backers,country,usd_pledged_real,usd_goal_real,duration_proj(in months)
0,0,0,0,0,0,0,0,0,0,1,...,0,Publishing,1000.0,0.0,failed,0,GB,0.0,1533.95,2
1,0,0,0,0,0,0,0,0,0,0,...,0,Film & Video,30000.0,2421.0,failed,15,US,2421.0,30000.0,2
2,0,0,0,0,0,0,0,0,0,0,...,0,Film & Video,45000.0,220.0,failed,3,US,220.0,45000.0,1
3,0,0,0,0,0,0,0,0,0,0,...,0,Music,5000.0,1.0,failed,1,US,1.0,5000.0,1
4,0,0,0,0,0,0,0,0,0,0,...,0,Film & Video,19500.0,1283.0,failed,14,US,1283.0,19500.0,1


In [17]:
X.shape

(370222, 43)

In [18]:
y.shape

(370222,)

### Balancing our data using ADASYN

In [19]:
!pip install imblearn
!pip install scipy
from imblearn.over_sampling import ADASYN
ad = ADASYN()
X, y = ad.fit_sample(X, y)
print(X)

[[0.00000e+00 0.00000e+00 0.00000e+00 ... 0.00000e+00 1.53395e+03
  2.00000e+00]
 [0.00000e+00 0.00000e+00 0.00000e+00 ... 2.42100e+03 3.00000e+04
  2.00000e+00]
 [0.00000e+00 0.00000e+00 0.00000e+00 ... 2.20000e+02 4.50000e+04
  1.00000e+00]
 ...
 [0.00000e+00 0.00000e+00 0.00000e+00 ... 5.05000e+02 5.00000e+02
  1.00000e+00]
 [0.00000e+00 0.00000e+00 0.00000e+00 ... 5.05000e+02 5.00000e+02
  1.00000e+00]
 [0.00000e+00 0.00000e+00 0.00000e+00 ... 5.05000e+02 5.00000e+02
  1.00000e+00]]


In [20]:
X.shape

(473552, 43)

In [21]:
y.shape

(473552,)

In [22]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test= train_test_split(X,y,test_size=0.2,random_state=0)
print(y_train)

[1 1 1 ... 1 1 1]


### Normalizing the features

In [23]:
import math 
from sklearn.preprocessing import StandardScaler 
sc_X = StandardScaler() 
X_train = sc_X.fit_transform(X_train) 
X_test = sc_X.transform(X_test)

### Applying PCA //Instead check for Kernel PCA

#kernel PCA
from sklearn.decomposition import KernelPCA
kernelPCAObj= KernelPCA(n_components=43, kernel='rbf')
X_train= kernelPCAObj.fit_transform(X_train)
X_test= kernelPCAObj.transform(X_test)
components_variance= pcaObj.explained_variance_ratio_
print(components_variance)

In [24]:
#Applying PCA
from sklearn.decomposition import PCA
pcaObj= PCA(n_components=43)
X_train= pcaObj.fit_transform(X_train)
X_test= pcaObj.transform(X_test)
components_variance= pcaObj.explained_variance_ratio_
print(components_variance)

[6.09199838e-02 4.52050339e-02 4.40098610e-02 2.82683406e-02
 2.67273374e-02 2.65221773e-02 2.63220024e-02 2.57615654e-02
 2.55663064e-02 2.54765008e-02 2.54034592e-02 2.50203118e-02
 2.46812565e-02 2.43366615e-02 2.41214519e-02 2.39939072e-02
 2.37773553e-02 2.36525961e-02 2.35328838e-02 2.34659283e-02
 2.33507613e-02 2.33135155e-02 2.32882218e-02 2.32638463e-02
 2.32543867e-02 2.32351491e-02 2.31841473e-02 2.31065225e-02
 2.30935793e-02 2.30365677e-02 2.30308253e-02 2.30085407e-02
 2.29219852e-02 2.28826846e-02 2.26504354e-02 2.25395443e-02
 2.20272975e-02 2.12469281e-02 7.89840166e-03 1.49632240e-03
 1.40541668e-03 5.76536320e-27 6.48420319e-28]


In [25]:
#Fitting Classifier to Training Set. Create a classifier object here and call it classifierObj
from sklearn.ensemble import RandomForestClassifier
classifierObj = RandomForestClassifier(criterion='entropy')
classifierObj.fit(X_train,y_train)

#Making predictions on the Test Set
y_pred = classifierObj.predict(X_test)

#Evaluating the predictions using a Confusion Matrix
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, y_pred)

#Model accuracy -> model got correct results
print("Accuracy of Random Forest model is: ")
classifierObj.score(X_test, y_test)



Accuracy of Random Forest model is: 


0.9831170613761865

In [26]:
#K-Fold Cross Validation
from sklearn.model_selection import cross_val_score
modelAccuracies= cross_val_score(estimator=classifierObj, X=X_train, y=y_train, cv=10)
print(modelAccuracies.mean())
print(modelAccuracies.std())

0.9830060626154872
0.00043533166856599083
