### Loading the dataset

In [None]:
%reset -f
import numpy as np
import pandas as pd

# Read the raw Kickstarter export and preview its first rows.
csv_path = 'input/ks-projects-201801.csv'
df = pd.read_csv(csv_path)
df.head()

In [None]:
df.shape

### Printing the number of unique values in each column of our dataset

In [None]:
print(df.nunique())

### Printing the percentage of each category in our class label "state"

In [None]:
# Share of every outcome label, as a percentage of all rows.
state_counts = df["state"].value_counts()
Success_dist = state_counts / len(df["state"]) * 100

print("Success_dist in %: ", Success_dist, sep="\n")

### Dropping all rows whose state label is undefined, live, or suspended, since their percentage in the dataset is very small. Canceled projects are also treated as failed projects.

In [None]:
def convert_state(df):
    """Keep only finished projects and reduce the outcome to two classes.

    Rows whose ``state`` is 'undefined', 'live' or 'suspended' are removed,
    and 'canceled' is relabelled as 'failed'.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing a ``state`` column. It is not modified.

    Returns
    -------
    pandas.DataFrame
        A new frame with the remaining rows; original index labels are kept.
    """
    excluded_states = ['undefined', 'live', 'suspended']
    # .copy() detaches the filtered rows from the input, so the assignment
    # below does not raise SettingWithCopyWarning or touch the caller's frame.
    kept = df.loc[~df['state'].isin(excluded_states)].copy()
    kept['state'] = kept['state'].replace({'canceled': 'failed'})
    return kept
df = convert_state(df)
df.head()

### Checking for NaNs in each column


In [None]:
df.isnull().sum()

### Dropping unnecessary columns: ID and name (since they do not contribute to a project's failure or success), category (since we already have the main category), currency (it carries the same information as the country, so only country is kept), and usd pledged (since it has 232 NaNs and usd_pledged_real holds similar data).

In [None]:
# Remove the identifier, free-text and redundant columns in a single pass.
unused_columns = ['name', 'ID', 'category', 'usd pledged', 'currency']
df = df.drop(columns=unused_columns)
df.head()

### More data cleaning - removing rows with NaN values and rows where the country column is `N,0"`

In [None]:
# Discard rows with any missing value, then rows carrying the malformed country code.
df = df.dropna(axis='index')
df = df.loc[df['country'] != 'N,0"']
df.shape

### Calculating the duration of each project

In [None]:
import calendar
import datetime
# Year and month are taken from characters 0-3 and 5-6 of each date string
# (assumes the strings start with 'YYYY-MM' — TODO confirm against the CSV).
deadline_year = df['deadline'].str[0:4].astype(int)
deadline_month = df['deadline'].str[5:7].astype(int)
launched_year = df['launched'].str[0:4].astype(int)
launched_month = df['launched'].str[5:7].astype(int)

# Difference in calendar months between launch and deadline; the day of the month is ignored.
duration_months = (deadline_year - launched_year) * 12 + (deadline_month - launched_month)

# assign() returns a new frame instead of writing into the filtered one
# (which triggers SettingWithCopyWarning); the raw date strings are dropped
# in the same step, so no temporary columns ever enter df.
df = (
    df.assign(**{'duration_proj(in months)': duration_months})
      .drop(columns=['launched', 'deadline'])
)
df.head()

### Applying one hot encoding to categorical columns main_category and country

In [None]:
# Indicator columns for each main category, placed in front of the existing columns.
onehot = pd.get_dummies(df.main_category)
df = onehot.join(df, how='left')
df.shape

In [None]:
# Indicator columns for each country, placed in front of the existing columns.
onehot = pd.get_dummies(df.country)
df = onehot.join(df, how='left')
df.shape

### Applying label encoding to the class label - 'state'

In [None]:
from sklearn.preprocessing import LabelEncoder
# Encode the textual outcome label as integer class ids.
labelencoder_y = LabelEncoder()
y = labelencoder_y.fit_transform(df['state'].to_numpy())
print(y)

In [None]:
df.head()

### Getting the list of all columns

In [None]:
list(df.columns.values)

### Using these columns minus 'main_category', 'state', 'country' as X

In [None]:
# Indicator columns produced by one-hot encoding 'country'.
country_cols = ['AT', 'AU', 'BE', 'CA', 'CH', 'DE', 'DK', 'ES', 'FR', 'GB', 'HK',
                'IE', 'IT', 'JP', 'LU', 'MX', 'NL', 'NO', 'NZ', 'SE', 'SG', 'US']
# Indicator columns produced by one-hot encoding 'main_category'.
category_cols = ['Art', 'Comics', 'Crafts', 'Dance', 'Design', 'Fashion',
                 'Film & Video', 'Food', 'Games', 'Journalism', 'Music',
                 'Photography', 'Publishing', 'Technology', 'Theater']
# Numeric columns carried over from the cleaned data.
# NOTE(review): 'pledged', 'backers' and 'usd_pledged_real' look like values only
# known once a campaign has run — confirm they are intended as model inputs.
numeric_cols = ['goal', 'pledged', 'backers', 'usd_pledged_real', 'usd_goal_real',
                'duration_proj(in months)']

# Same 43 names, in the same order, as the columns of df minus
# 'main_category', 'state' and 'country'.
col = country_cols + category_cols + numeric_cols
X = df[col]
df.head()

In [None]:
X.shape

In [None]:
y.shape

### Balancing our data using ADASYN

In [None]:
!pip install imblearn
!pip install scipy
from imblearn.over_sampling import ADASYN
ad = ADASYN()
X, y = ad.fit_sample(X, y)
print(X)

In [None]:
X.shape

In [None]:
y.shape

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=0
)
print(y_train)

### Normalizing the features

In [None]:
import math 
from sklearn.preprocessing import StandardScaler 
# Fit the scaler on the training split only, then apply the same
# scaling to both splits.
sc_X = StandardScaler()
sc_X.fit(X_train)
X_train = sc_X.transform(X_train)
X_test = sc_X.transform(X_test)

In [None]:
X_train.shape[1]

In [None]:
from sklearn import decomposition

# Fit a 38-component truncated SVD (ARPACK solver) on the training split.
svd = decomposition.TruncatedSVD(n_components=38, algorithm='arpack').fit(X_train)
components_variance = svd.explained_variance_ratio_
# Total share of variance retained by the 38 components.
print(components_variance.sum())

# Project both splits with the decomposition fitted on the training data.
X_train, X_test = svd.transform(X_train), svd.transform(X_test)

print(components_variance)

In [None]:
#kernel PCA 
from sklearn.decomposition import KernelPCA 
# NOTE(review): an RBF KernelPCA works on a samples-by-samples kernel matrix;
# confirm this is feasible for the size of X_train.
kernelPCAObj = KernelPCA(n_components=43, kernel='rbf')
X_train = kernelPCAObj.fit_transform(X_train)
X_test = kernelPCAObj.transform(X_test)
# KernelPCA has no explained_variance_ratio_, and pcaObj is only created in a
# later cell, so the ratios are derived from this object's own eigenvalues.
# 'eigenvalues_' is the current attribute name; older scikit-learn releases
# expose the same values as 'lambdas_'.
kernel_eigenvalues = getattr(kernelPCAObj, 'eigenvalues_', None)
if kernel_eigenvalues is None:
    kernel_eigenvalues = kernelPCAObj.lambdas_
# Share of each retained component relative to the retained components only.
components_variance = kernel_eigenvalues / kernel_eigenvalues.sum()
print(components_variance)

### Applying PCA (note: consider Kernel PCA instead)

```python
#kernel PCA
from sklearn.decomposition import KernelPCA
kernelPCAObj= KernelPCA(n_components=43, kernel='rbf')
X_train= kernelPCAObj.fit_transform(X_train)
X_test= kernelPCAObj.transform(X_test)
components_variance= pcaObj.explained_variance_ratio_
print(components_variance)
```

In [None]:
#Applying PCA
from sklearn.decomposition import PCA
# PCA cannot return more components than the smaller dimension of its input,
# so the requested 43 is capped by the current shape of X_train instead of
# raising when earlier steps left fewer features.
pcaObj = PCA(n_components=min(43, *X_train.shape))
X_train = pcaObj.fit_transform(X_train)
X_test = pcaObj.transform(X_test)
components_variance = pcaObj.explained_variance_ratio_
print(components_variance)

In [None]:
#Fitting Classifier to Training Set. Create a classifier object here and call it classifierObj
from sklearn.ensemble import RandomForestClassifier
# Fixed seed so the forest, and therefore the reported scores, are repeatable.
classifierObj = RandomForestClassifier(criterion='entropy', random_state=0)
classifierObj.fit(X_train,y_train)

#Making predictions on the Test Set
y_pred = classifierObj.predict(X_test)

#Evaluating the predictions using a Confusion Matrix
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, y_pred)
# Previously computed but never shown. Rows are true classes, columns are predicted classes.
print("Confusion matrix: ")
print(cm)

#Model accuracy -> model got correct results
print("Accuracy of Random Forest model is: ")
classifierObj.score(X_test, y_test)

In [None]:
#K-Fold Cross Validation
from sklearn.model_selection import cross_val_score
# 10-fold cross-validation of a fresh random forest on the training split;
# the mean and spread of the fold accuracies are printed on separate lines.
classifierKFoldObj = RandomForestClassifier(criterion='entropy')
modelAccuracies = cross_val_score(classifierKFoldObj, X_train, y_train, cv=10)
print(modelAccuracies.mean(), modelAccuracies.std(), sep='\n')