#### Import Libraries ####

In [1]:
%pylab inline
import pandas as pd
import datetime
import matplotlib.pyplot as plt
from sklearn import svm
from sklearn import cross_validation
from sklearn import preprocessing
from sklearn import grid_search 
import math

Populating the interactive namespace from numpy and matplotlib


#### Read Data ####


In [2]:
train_data = pd.read_csv('data/train.csv',parse_dates=[0])
test_data = pd.read_csv('data/test.csv',parse_dates=[0])

####Measure Error as Kaggle Does####

In [3]:
def error(predictions, actual):
    err = 0
    for p, a in zip(predictions, actual):
        err += (np.log1p(p)-np.log1p(a))**2
    J = (err/len(predictions))**.5
    if math.isnan(J):
        print("debug")
        print(len(predictions))
        print(len(actual))
        print(predictions)
        print(actual)
    return J

#### Generate indexes for training/test sets ####

In [4]:
kfold_indexes = cross_validation.KFold(len(train_data),5,shuffle=True) 

#### Create different features ####

In [5]:
hours = ['h' + str(i) for i in range(0,24)]
months = ['jan', 'feb', 'mar', 'apr', 'may', 'june', 'july', 'aug', 'sept', 'oct', 'nov', 'dec']    
days = ['mon', 'tue', 'wed', 'thur', 'fri', 'sat', 'sun']

set_2 = months + days + hours + ['holiday', 'workingday', 'weather_norm', 'humidity_norm', 'temp_norm', 'windspeed_norm', 'year_norm']
set_1 = ['season','holiday','workingday','weather','humidity','temp','windspeed','hour','month','year', 'day_of_week']


for data_set in [train_data, test_data]:

    data_set['day'] = [e.day for e in data_set['datetime']]
    data_set['hour'] = [e.hour for e in data_set['datetime']]
    data_set['month'] = [e.month for e in data_set['datetime']]
    data_set['year'] = [e.year - 2011 for e in data_set['datetime']]
    data_set['day_of_week'] = [e.weekday() for e in data_set['datetime']]

    def norm(fld_name):
        min_val = min(data_set[fld_name])
        max_val = max(data_set[fld_name])
        diff = max_val - min_val
        data_set[fld_name + '_norm'] = [(e-min_val)/diff for e in data_set[fld_name]]

    for val in set_1:
        norm(val)



    for i, month in enumerate(months):
        data_set[month] = [1 if k == i else 0 for k in data_set['month']]


    for i, day in enumerate(days):
        data_set[day] = [1 if k == i else 0 for k in data_set['day_of_week']]


    for i, hour in enumerate(hours):
        data_set[hour] = [1 if k == i else 0 for k in data_set['hour']]
    

# Might not make sense since test data won't have these features
# start_i = 0
# end_i = -1
# prev_date = train_data['datetime'][0]
# temps = 0

# train_data.insert(train_data.shape[1], 'daily_temp', 0)
# for i, row in train_data.iterrows():
#     date = row['datetime']
#     day, year = date.dayofyear, date.year
#     if year == prev_date.year and day == prev_date.dayofyear:
#         temps += row['temp']
#         end_i = i
#     else:
#         temp = temps / (end_i+1-start_i)
#         train_data.ix[start_i:end_i,'daily_temp'] = temp
            
#         start_i = end_i
#         temps = 0
        
    
#     prev_date = date
    

In [6]:
#### Create different feature sets ####
set_1_norm = [e + '_norm' for e in set_1]
feature_sets = [
    set_2
]

train_data[:10]


Unnamed: 0,datetime,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,casual,...,h14,h15,h16,h17,h18,h19,h20,h21,h22,h23
0,2011-01-01 00:00:00,1,0,0,1,9.84,14.395,81,0.0,3,...,0,0,0,0,0,0,0,0,0,0
1,2011-01-01 01:00:00,1,0,0,1,9.02,13.635,80,0.0,8,...,0,0,0,0,0,0,0,0,0,0
2,2011-01-01 02:00:00,1,0,0,1,9.02,13.635,80,0.0,5,...,0,0,0,0,0,0,0,0,0,0
3,2011-01-01 03:00:00,1,0,0,1,9.84,14.395,75,0.0,3,...,0,0,0,0,0,0,0,0,0,0
4,2011-01-01 04:00:00,1,0,0,1,9.84,14.395,75,0.0,0,...,0,0,0,0,0,0,0,0,0,0
5,2011-01-01 05:00:00,1,0,0,2,9.84,12.88,75,6.0032,0,...,0,0,0,0,0,0,0,0,0,0
6,2011-01-01 06:00:00,1,0,0,1,9.02,13.635,80,0.0,2,...,0,0,0,0,0,0,0,0,0,0
7,2011-01-01 07:00:00,1,0,0,1,8.2,12.88,86,0.0,1,...,0,0,0,0,0,0,0,0,0,0
8,2011-01-01 08:00:00,1,0,0,1,9.84,14.395,75,0.0,1,...,0,0,0,0,0,0,0,0,0,0
9,2011-01-01 09:00:00,1,0,0,1,13.12,17.425,76,0.0,8,...,0,0,0,0,0,0,0,0,0,0


####Actual Machine Learning Part####

In [18]:
grid_params = [
        {'kernel': ['rbf'], 
         'C': [10000, 100000], 
         'gamma': [0]
        }
]

model = svm.SVR(cache_size=500, C=10000, kernel='rbf')

for i, feature_set in enumerate(feature_sets):
    for j, (train_indexes, test_indexes) in enumerate(kfold_indexes):
        #model = grid_search.GridSearchCV(model, grid_params, n_jobs=4)
        
        fit_data = train_data[feature_set].iloc[train_indexes]
        fit_answers = train_data['count'].iloc[train_indexes]
        check_data = train_data[feature_set].iloc[test_indexes]
        check_answers = train_data['count'].iloc[test_indexes]
        
        model.fit(fit_data, fit_answers)

        # svr.fit(fit_data, fit_answers)
#         print("Best params")
#         print(gs.best_params_)
        
        predictions = model.predict(check_data)
        predictions = [round(max(0, e)) for e in predictions]
        error_val = error(predictions, check_answers)
        print('Feature Set {}: KFold Set {}: {}'.format(i, j, error_val))
    

Feature Set 0: KFold Set 0: 0.6088609332591111
Feature Set 0: KFold Set 1: 0.6791199379685994
Feature Set 0: KFold Set 2: 0.664732494208971
Feature Set 0: KFold Set 3: 0.7014256226037582
Feature Set 0: KFold Set 4: 0.6355201173721106


In [12]:
feature_set = feature_sets[0]
model = svm.SVR(cache_size=500, C=10000, kernel='rbf')
model.fit(train_data[feature_set], train_data['count'])

predictions = model.predict(test_data[feature_set])
predictions = [int(round(max(0, e)))  for e in predictions]

In [15]:
predictions = [int(round(max(0, e)))  for e in predictions]
tdf = pd.DataFrame(data=predictions, index=test_data['datetime'], columns=['count'])
tdf.to_csv('data/svr_results.csv')