In [62]:
# Import Libraries
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
import numpy as np
import pandas as pd
import collections
import operator

from datetime import datetime
from sklearn.metrics import accuracy_score
from sklearn import preprocessing
from sklearn import svm 
from sklearn import tree
from sklearn import neural_network
from sklearn import linear_model
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split
from sklearn.ensemble import BaggingClassifier

# import random
# from sklearn.feature_extraction.text import TfidfVectorizer
# from sklearn.metrics import accuracy_score
# from sklearn.model_selection import train_test_split
# from sklearn.model_selection import KFold

In [63]:
### Load Dataset

# Full Data is saved to 'proj'
proj = pd.read_csv('ks-projects-201801.csv')


In [64]:
### Remove invalid data rows

# Remove live, cancelled, and invalid projects
# Slicing just successful and failed projects
proj = proj[(proj['state'] == 'failed') | (proj['state'] == 'successful')]

# Remove outlier Project goals
outlier_cost = 10000
proj = proj[proj['usd_goal_real'] < outlier_cost]


In [65]:
### Determine the time durations

#Create array to hold durations
duration = np.zeros((proj.shape[0],),dtype=datetime)
j = 0

# Iterate and save to duration
for i in proj.index:
    deadline = datetime.strptime(proj['deadline'][i], '%Y-%m-%d')
    launched = datetime.strptime(proj['launched'][i], '%Y-%m-%d %X')
    duration[j] = (deadline - launched).total_seconds()/3600/24
    j += 1


In [66]:
# Save imporant classifier input to data
# we dont care about the other columns
data = proj[['state', 'usd_goal_real', 'main_category', 'country']]
data.head()

Unnamed: 0,state,usd_goal_real,main_category,country
0,failed,1533.95,Publishing,GB
3,failed,5000.0,Music,US
6,successful,1000.0,Food,US
10,failed,2406.39,Publishing,CA
12,failed,5000.0,Crafts,US


In [67]:
# Modifing value of dependent variable from categorical to numerical
data.loc[data['state'] == 'failed', 'state'] = 0
data.loc[data['state'] == 'successful', 'state'] = 1
  
data.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


Unnamed: 0,state,usd_goal_real,main_category,country
0,0,1533.95,Publishing,GB
3,0,5000.0,Music,US
6,1,1000.0,Food,US
10,0,2406.39,Publishing,CA
12,0,5000.0,Crafts,US


In [68]:
# Scaling goal amount since it behaves differently in each category
for cat in data['main_category'].unique():
    scaler = preprocessing.StandardScaler()
    new_values = scaler.fit_transform(data[data['main_category'] == cat][['usd_goal_real']])
    data.loc[data['main_category'] == cat, 'usd_goal_real'] = new_values.transpose()[0]

data.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


Unnamed: 0,state,usd_goal_real,main_category,country
0,0,-0.754804,Publishing,GB
3,0,0.778541,Music,US
6,1,-0.935894,Food,US
10,0,-0.400649,Publishing,CA
12,0,1.178978,Crafts,US


In [69]:
def Preprocess(data, duration, uniqueScale=False):
    
    ### unique Scale will scale the duration based on its category
    if uniqueScale:    
        # Add the scaled durations to the data set
        data['duration'] = pd.Series(duration, index=data.index)
        # Scale each duration based on its category
        for cat in data['main_category'].unique():
            scaler = preprocessing.StandardScaler()
            new_values = scaler.fit_transform(data[data['main_category'] == cat][['duration']])
            data.loc[data['main_category'] == cat, 'duration'] = new_values.transpose()[0]
        # Get dummy Categories    
        dummy_cat = pd.get_dummies(data['main_category'])
        dummy_country = pd.get_dummies(data['country'])
        # Add dummy categories
        data = pd.concat([data, dummy_cat, dummy_country], axis=1)
        #data = pd.concat([data, dummy_cat], axis=1)
        data = data.drop(['main_category', 'country'], axis=1)
    
    ### else scale based on 
    else:
        # Use one hot encoding (binary dimensions)
        data = pd.get_dummies(data)
        # Normalize the time durations
        scaler = preprocessing.StandardScaler()
        scaled_duration = scaler.fit_transform(duration.reshape(duration.shape[0],1)).reshape((duration.shape[0],))
        # Add the scaled durations to the data set
        data['duration'] = pd.Series(scaled_duration, index=data.index)
    
    return data

data = Preprocess(data, duration, False)

data.head()



Unnamed: 0,state,usd_goal_real,main_category_Art,main_category_Comics,main_category_Crafts,main_category_Dance,main_category_Design,main_category_Fashion,main_category_Film & Video,main_category_Food,...,country_LU,country_MX,"country_N,0""",country_NL,country_NO,country_NZ,country_SE,country_SG,country_US,duration
0,0,-0.754804,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,2.048351
3,0,0.778541,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,-0.173779
6,1,-0.935894,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,1,-0.998705
10,0,-0.400649,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,-0.222037
12,0,1.178978,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,-0.218384


In [70]:
### Train Test Split

# Decrease this variable to work with smaller data sets
tsz = 300000

# Define input and output data
X = data.drop('state', axis=1)
Y = data['state']

# Generate training and testing data
Xtrain, Xtest, Ytrain, Ytest = train_test_split(X[:tsz], Y[:tsz], test_size=0.3, random_state=1)

# View the size of the data
print(Xtrain.shape)
print(Xtest.shape)

(146155, 40)
(62639, 40)


In [None]:
### Create Classifier

def Classifier(X,Y, ensemble=False, C=1.0):
    #clf = tree.DecisionTreeClassifier(random_state=0)
    #clf = svm.SVC(C = C, probability=True, kernel='rbf', verbose=True, degree=3)
    clf = linear_model.LogisticRegression(solver='liblinear', n_jobs=-1)
    #clf = neural_network.MLPClassifier(hidden_layer_sizes=(5), early_stopping=False, verbose=True, activation="relu")
    if not ensemble:
        clf.fit(X,Y)
        return clf
    model = BaggingClassifier(base_estimator=clf, n_jobs=-1)
    model.fit(X,Y)
    return model

# Logistic Regression
def LRClassifier(X, Y, ensemble=False):
    clf = linear_model.LogisticRegression(solver='sag', n_jobs=-1, verbose=True)
    if not ensemble:
        clf.fit(X,Y)
        return clf
    model = BaggingClassifier(base_estimator=clf, n_jobs=-1)
    model.fit(X,Y)
    return model

def SVMClassifier(X, Y, ensemble=False):
    clf = svm.SVC(C = 1.0, probability=True, kernel='rbf', verbose=True, degree=3)
    if not ensemble:
        clf.fit(X,Y)
        return clf
    model = BaggingClassifier(base_estimator=clf, n_jobs=-1)
    model.fit(X,Y)
    return model
    

clf = SVMClassifier(Xtrain, Ytrain, True)
print('Classifier has been fitted.')

In [None]:
print("Model's accuracy is {0}%".format(round(clf.score(Xtest, Ytest)*100, 2)))

LR No-Bagging with unique standardization and no country results in an accuracy of 60.7%
LR No-Bagging with unique standardization results in an accuracy of 61.27%
LR Bagging with unique standardization results in an accuracy of 61.23%
LR No-Bagging with normal standardization results in an accuracy of 61.24%
LR Bagging with normal standardization results in an accuracy of 61.25%


SVM 'rbf' kernel no bagging with unique standardization and no country is an accuracy of 60.3%
SVM 'rbf' kernel bagging with unique standardization is an accuracy of 60.3%
SVM 'rbf' kernel bagging with Normal standardization is an accuracy of 60.3%