In [17]:
import pandas as pd
import numpy as np
import dateutil.parser as time_parser
from sklearn.neural_network import MLPClassifier
from multiprocessing import Pool
from sklearn.metrics import f1_score

def parse(data):
    data["deadline"] = [dateutil.parser.parse(x).timestamp() for x in data["deadline"]]
    return data

In [3]:
url = "../datasets/kickstarter/ks-projects-201801.csv"
used_columns = ["category", "main_category", "currency", "deadline", "state", "backers", "country", "usd_pledged_real", "usd_goal_real"]

data = pd.read_csv(url, usecols=used_columns)[0:10000]
data[100:105]

Unnamed: 0,category,main_category,currency,deadline,state,backers,country,usd_pledged_real,usd_goal_real
100,Webseries,Film & Video,USD,2014-05-11,successful,8,US,756.0,500.0
101,Electronic Music,Music,USD,2012-10-26,successful,14,US,274.05,200.0
102,Radio & Podcasts,Publishing,USD,2016-12-28,successful,19,US,1055.0,1000.0
103,Product Design,Design,USD,2016-11-20,failed,4,US,156.0,500.0
104,Apps,Technology,CAD,2015-05-25,canceled,19,CA,9110.34,40183.24


In [4]:
for col in ["state", "category", "main_category", "currency"]:
    print(col + ": ")
    print(((data[col]).unique()))

print(max(data["backers"]))
print(pd.DataFrame(list(filter(lambda x: x > 0, data["backers"]))).mean())

state: 
['failed' 'canceled' 'successful' 'live' 'undefined' 'suspended']
category: 
['Poetry' 'Narrative Film' 'Music' 'Film & Video' 'Restaurants' 'Food'
 'Drinks' 'Product Design' 'Documentary' 'Nonfiction' 'Indie Rock'
 'Crafts' 'Games' 'Tabletop Games' 'Design' 'Comic Books' 'Art Books'
 'Fashion' 'Childrenswear' 'Theater' 'Comics' 'DIY' 'Webseries'
 'Animation' 'Food Trucks' 'Public Art' 'Illustration' 'Photography' 'Pop'
 'People' 'Art' 'Family' 'Fiction' 'Accessories' 'Rock' 'Hardware'
 'Software' 'Weaving' 'Gadgets' 'Web' 'Jazz' 'Ready-to-wear' 'Festivals'
 'Video Games' 'Anthologies' 'Publishing' 'Shorts' 'Electronic Music'
 'Radio & Podcasts' 'Apps' 'Cookbooks' 'Apparel' 'Metal' 'Comedy'
 'Hip-Hop' 'Periodicals' 'Dance' 'Technology' 'Painting' 'World Music'
 'Photobooks' 'Drama' 'Architecture' 'Young Adult' 'Latin' 'Mobile Games'
 'Flight' 'Fine Art' 'Action' 'Playing Cards' 'Makerspaces' 'Punk'
 'Thrillers' "Children's Books" 'Audio' 'Performance Art' 'Ceramics'
 'Vegan' 'G

In [5]:
print(max(data["pledged"]))
print(data["pledged"].mean())
print(pd.DataFrame(list(filter(lambda x: x > 0, data["pledged"]))).mean())

KeyError: 'pledged'

In [11]:
def cleansing(data):
    for col in data.select_dtypes(include=['int16', 'int32', 'int64', 'float16', 'float32', 'float64']):
        data[col] = pd.to_numeric(data[col])
        data[col] = data[col].replace(np.nan, data[col].mean())
    for col in data.select_dtypes(exclude=["number"]).columns:
        data[col] = data[col].astype('category')    
        data[col] = data[col].cat.codes
    return data
data = cleansing(data)

In [12]:
X = data.loc[:, data.columns != "usd_goal_real"]
Y = data["usd_goal_real"]
amount = int(len(X) * 0.8)
Xtrain = X[0:amount]
Ytrain = Y[0:amount].astype('int')
Xtest = X[amount::]
Ytest = Y[amount::].astype('int')
Xtrain

Unnamed: 0,category,main_category,currency,deadline,state,backers,country,usd_pledged_real
0,106,12,5,1802,1,0,9,0.00
1,91,6,13,2533,1,15,22,2421.00
2,91,6,13,869,1,3,22,220.00
3,88,10,13,569,1,1,22,1.00
4,54,6,13,1761,0,14,22,1283.00
5,120,7,13,1976,3,224,22,52375.00
6,57,7,13,1517,3,16,22,1205.00
7,40,7,13,1961,1,40,22,453.00
8,111,4,13,1311,0,58,22,8233.00
9,38,6,13,1384,0,43,22,6240.57


In [13]:
clfs = [MLPClassifier(solver='lbfgs', alpha=1e-2,
                     hidden_layer_sizes=(layer), random_state=1) for layer in [1, 5, 10, 15, 30, 50, 100, 300, 500]]
for clf in clfs:
    clf.fit(Xtrain, Ytrain)

In [None]:
[(1/len(Xtest))*(sum(clf.predict(Xtest)- Ytest)**2.)**0.5 for clf in clfs]

In [33]:
clfs = []
for alpha in 10.** -np.arange(1, 7):
    for layer in [1, 5, 10, 15]:
        clfs = clfs + [MLPClassifier(solver='adam', alpha=alpha,
                     hidden_layer_sizes=(layer), random_state=1)]

In [34]:
def multicore_train(clf):
    print("Training " + str(clf.get_params()["alpha"]) + " " + str(clf.get_params()["hidden_layer_sizes"]))
    clf.fit(Xtrain, Ytrain)
    
for clf in clfs:
    multicore_train(clf)

Training 0.1 1
Training 0.1 5
Training 0.1 10
Training 0.1 15
Training 0.01 1
Training 0.01 5
Training 0.01 10
Training 0.01 15
Training 0.001 1
Training 0.001 5
Training 0.001 10
Training 0.001 15
Training 0.0001 1
Training 0.0001 5
Training 0.0001 10
Training 0.0001 15
Training 1e-05 1
Training 1e-05 5
Training 1e-05 10
Training 1e-05 15
Training 1e-06 1
Training 1e-06 5
Training 1e-06 10
Training 1e-06 15


In [35]:
for clf in clfs:
    yPred = clf.predict(Xtest)
    print(pd.DataFrame(yPred)[0].unique())
    #f1 = f1_score(Ytest, yPred, average="micro")
    #print("F1 " + str(f1))

[5000]
[  8000   5000  18000  40000   6500  20000  10000   1000 250000  12500
   3500   3928   4000   7000   5262   1500   3829      1  15000   1004
  38286]
[ 11867   5000  26375  10000  18000   6500   1200  10652   8500   3500
   5480  19663   3000  25000  10618   2500 150000  19000   2000   1000
 650000   5500    444  36500   7000  30000   1363  50000  11000    500
  10891   2069   4500  20000  16000  32068    642   2700   5707    751
   7200   8520  21609  24051   1064  47392   6238   5350   4142  12500
 122081    666  29135  31000]
[ 10000   1000  20000  90000 120000   5000  32978   4000 200000   2000
  27000   5500 240280  75000 125000  13000   3000 100000  15000  56000
  36000   9500   2959  35430   7500  17000   2600   8000]
[5000]
[ 3000  5000  6000 10000 20000 15305  2500 25000  4500  1000 15000 60000
 40000   500  8000]
[ 15000   1000   5000    259 200000  14650  20877   2000   3500   1064
   1750  25000    500  25778   2300  21609   1500   2500  13000  19000
   2875  24887 

In [36]:
[(1/len(Xtest))*(sum(clf.predict(Xtest)- Ytest)**2.)**0.5 for clf in clfs]

[22661.829,
 3784.71,
 14411.5745,
 10283.238,
 22661.829,
 19097.774,
 4537.647,
 16276.4535,
 22661.829,
 8945.041000000001,
 16743.1915,
 173.781,
 22661.829,
 17322.929,
 27436.239,
 15114.014000000001,
 22661.829,
 39761.6955,
 14561.526,
 12014.4645,
 22661.829,
 35819.425,
 16868.4675,
 15525.391]

In [40]:
(1/len(Xtest))*(sum(clfs[11].predict(Xtest)- Ytest)**2.)**0.5

173.781