In [1]:
import pandas as pd
import tensorflow as tf
import io
import requests

# read data and fix errors
the data read has some minor errors like string 'false' instead of boolean False
let's fix them

In [2]:

#read training data set

def read_train_data():
    return pd.read_csv('./train.csv')
# normalize column values
def fix_errors(data):
    data['disable_communication'] = data['disable_communication'].replace(['false'],False)
    data['disable_communication'] = data['disable_communication'].replace(['true'],True)
    print('replaced \'false\' with False and \'true\' with True') 
    return data


In [3]:
#exchange rates from yahoo

def read_exchange_rates_json():
    url = 'https://finance.yahoo.com/webservice/v1/symbols/allcurrencies/quote?format=json'
    try:
        return pd.read_json(io.StringIO(requests.get(url).content.decode('utf-8')))['list']['resources']
    except:
        print('couldn\'t fetch data from yahoo, loading local data')
        return pd.read_json('./currencyrates.json')['list']['resources']
        

# make dictionary of required field
def get_exchange_rates_dataframe(data):
    rates = []
    for rate in data:
        dt = rate['resource']['fields']
        name = dt['name']
        if(name[0:3]!='USD'):
            continue
            
        if(len(name) < 6):
            rates = rates + [{'to':name,'price' : float(dt['price'])  }]
            continue
            
        _from,_to = name.split('/')
        rates = rates + [{'to':_to,'price' : float(dt['price'])  }]
    
    return pd.DataFrame.from_records(rates)

def get_price(code,data):
    return data[data.to == code ].price
# data frame from dictionary list

# data normalization
the goal price is represented using many currencies. we convert all the currrency to US Dollars by grabbing exchange rate from Yahoo and diving all currencies  by it's exhange rate.

In [313]:
def normalize_price(data,ex_data):
    for index,dt in data.iterrows():
        price = float(dt['goal']) / get_price(dt['currency'],ex_data) 
        data.set_value(index,'goal',price)
    print('price normalization complete')
    return data

#modification to project name and description
def get_ratio(name):
    sc = 1
    ab = 1
    for c in name:
        if c.isalpha():
            ab += 1
        else:
            sc += 1
    return sc/ab
            

def normalize_name_and_desc(data):
    for index,dt in data.iterrows():
        name_ratio = get_ratio(str(dt['name']))
        desc_ratio = get_ratio(str(dt['desc']))
        data.set_value(index,'name',name_ratio)
        data.set_value(index,'desc',desc_ratio)
    print('name and desc normalization complete')
    return data
    
def normalize_deadline(data):
    for index,dt in data.iterrows():
        deadline = dt['deadline'] - dt['launched_at']
        data.set_value(index,'deadline',deadline)
    print('deadline normalization complete')
    return data
    
def normalize(data,ex_data):
    data = normalize_deadline(data)
    data = normalize_name_and_desc(data)
    data = normalize_price(data,ex_data)
    return data
    
    


In [314]:
'''
work flow : 
 1. Read training data
 2. fix_errors
 3. Read exchange rates json
 4. Get dataframe of exchange rates
 5. Normalize
 
'''
train_data = read_train_data()
print('read train data')
train_data = fix_errors(train_data)
print('errors fixed')
ex_rates = read_exchange_rates_json()
print('read exchange rates json')
ex_rates = get_exchange_rates_dataframe(ex_rates)
print('dataframe is generated from json data')
train_data = normalize(train_data,ex_rates)



read train data
replaced 'false' with False and 'true' with True
errors fixed
read exchange rates json
dataframe is generated from json data
deadline normalization complete
name and desc normalization complete
price normalization complete


In [315]:
import numpy as np

In [418]:
dt = train_data.loc[:,['disable_communication','goal','deadline','name','desc']].as_matrix().astype(float)


In [420]:
#linear regression

col = 2
day = 60 * 60 * 60 * 24
# converts seconds into days
for i in range(len(dt)):
    dt[i][col] = dt[i][col]/day


In [421]:
yt = train_data.loc[:,['final_status']].as_matrix().astype(float)

In [422]:
import tensorflow.contrib.learn as skflow
from sklearn import datasets, metrics


In [423]:
#training data
#x
inputY = np.array([yt.T[0],(yt.T == 0.).astype(float)[0]]).T
#y
inputX = dt


In [424]:

inputY

array([[ 1.,  0.],
       [ 0.,  1.],
       [ 0.,  1.],
       ..., 
       [ 1.,  0.],
       [ 0.,  1.],
       [ 0.,  1.]])

In [425]:
inputX

array([[  0.00000000e+00,   2.00000000e+01,   2.72114281e-08,
          1.66666667e-01,   2.95238095e-01],
       [  0.00000000e+00,   3.00000000e+02,   5.40825998e-08,
          1.81818182e-01,   3.23809524e-01],
       [  0.00000000e+00,   3.00000000e+01,   3.21190230e-08,
          2.50000000e-01,   2.83168317e+00],
       ..., 
       [  0.00000000e+00,   3.20000000e+02,   9.64506173e-08,
          1.87500000e-01,   2.42424242e-01],
       [  0.00000000e+00,   3.50000000e+04,   9.95336508e-08,
          2.30769231e-01,   2.56756757e-01],
       [  0.00000000e+00,   2.50000000e+04,   1.44675926e-07,
          1.72413793e-01,   2.14285714e-01]])

In [507]:
learning_rate = 0.000000001
epoch = 500000
display_interval = 10000
sample_size = length

In [508]:
x_ = tf.placeholder(tf.float32,[None,5],name="x")

weights = tf.Variable(tf.zeros([5,2]))

biases = tf.Variable(tf.zeros([2]))

#predict

predictions = tf.add(tf.matmul(x_,weights),biases)

#activation function

predicted_answer = tf.nn.softmax(predictions)

y_ = tf.placeholder(tf.float32,[None,2],name="y_")




In [509]:
cost_function = tf.reduce_sum(tf.pow(y_ - predicted_answer,2))/(2*sample_size)

#gradient descent

optimizer = tf.train.GradientDescentOptimizer(learning_rate).minimize(cost_function)



In [510]:
init = tf.global_variables_initializer()
session = tf.Session()
session.run(init)

In [None]:
for i in range(epoch):
    session.run(optimizer,feed_dict = {x_:inputX,y_:inputY})
    if i % display_interval == 0:
        values = session.run(cost_function,feed_dict = {x_:inputX,y_:inputY})
        print( "Training step:", '%04d' % (i), "cost=", "{:.9f}".format(values))

Training step: 0000 cost= 0.230077952


In [368]:
values = session.run(cost_function,feed_dict = {x_:inputX,y_:inputY})
print( "Training step:", '%04d' % (i), "cost=", "{:.9f}".format(values))

Training step: 261948 cost= 0.111111172


In [376]:
print( "Optimization Finished!")
training_cost = session.run(cost_function, feed_dict={x_: inputX, y_: inputY})
print("Training cost=", training_cost, "W=", session.run(weights), "b=", session.run(biases), '\n')

Optimization Finished!
Training cost= 0.111111 W= [[ -8.80084990e-05   8.80084990e-05]
 [ -4.76963323e-04   4.77408670e-04]
 [ -3.28088063e-03   3.28088086e-03]
 [  3.97891849e-02  -3.97891738e-02]
 [ -3.77297425e-03   3.77297471e-03]
 [ -2.77799810e-03   2.77799810e-03]] b= [-0.00594101  0.00594101] 



In [385]:
guess = session.run(predictions, feed_dict={x_: inputX })


In [386]:
count = 0
for e in guess:
    if e[0] > 0:
        count += 1
        

In [387]:
acount = 0

for e in guess:
    if e[0] > 0:
        acount += 1


In [388]:
print(count,acount)

37956 37956


In [407]:
test = pd.read_csv('./test.csv')

In [412]:
def norm_pr(data,ex_data):
    count = 0
    for index,dt in data.iterrows():
        count += 1
        price = float(dt['goal']) / np.array(get_price(dt['currency'],ex_data))[0]
     
        data.set_value(index,'goal',price)
    print('price normalization complete',count)
    return data
test = norm_pr(test,ex_rates)

price normalization complete 63465


In [415]:
test = fix_errors(test)
print('errors fixed',test.size/12)


replaced 'false' with False and 'true' with True
errors fixed 63465.0


In [416]:
test = normalize_deadline(test)
test = normalize_name_and_desc(test)

deadline normalization complete
name and desc normalization complete


In [417]:
test

Unnamed: 0,project_id,name,desc,goal,keywords,disable_communication,country,currency,deadline,state_changed_at,created_at,launched_at
0,kkst917493670,0.222222,0.260417,7000.000000,brathair,False,US,USD,3459600,1449619185,1446002581,1446159585
1,kkst1664901914,0.125,0.222222,35000.000000,the-screenwriter,False,US,USD,3024000,1453435620,1450297323,1450411620
2,kkst925125077,0.184211,0.242718,49500.000000,the-hornets-nest-the-fairmont-heights-story,False,US,USD,3108572,1451780700,1448581356,1448672128
3,kkst1427645275,0.2,0.195652,40000.000000,brothers-season-2-groundbreaking-transgender-male,False,US,USD,2592000,1445021530,1440966830,1442429518
4,kkst1714249266,0.176471,0.218182,20000.000000,blackdom-the-movie,False,US,USD,3733950,1462068844,1455765276,1458334890
5,kkst994744324,0.22,0.23301,10000.000000,heros-battle-the-movie,False,US,USD,2592000,1461777994,1458171626,1459185994
6,kkst366471810,0.413793,0.209524,1250.635260,limbo-film-project-213-lives-of-julia,False,IT,EUR,2592000,1450087423,1446994619,1447495423
7,kkst1686645245,0.333333,0.333333,486.587192,traffic-a-short-film,False,GB,GBP,5176616,1456516801,1450964837,1451340184
8,kkst1009612119,0.125,0.212121,6000.000000,modern-gangsters,False,US,USD,2587376,1444337941,1441745957,1441750564
9,kkst774947236,0.234043,0.244681,8000.000000,kiss-me-goodbye-a-new-voice-in-indie-filmmaking,False,US,USD,3024000,1444144223,1438886415,1441120222
