In [1]:
%matplotlib inline
%reload_ext autoreload
%autoreload 2

In [2]:
import pandas as pd 
import numpy as np 
from tqdm import tqdm
import math
import gc

In [3]:
ROOT = "/Users/kkalyan/github/airQo/"

In [4]:
train = pd.read_csv(ROOT+ "data/Train.csv")
test = pd.read_csv(ROOT+ "data/Test.csv")

In [5]:
def replace_nan(x):
    if x==" ":
        return np.nan
    else :
        return float(x)
features=["temp","precip","rel_humidity","wind_dir","wind_spd","atmos_press"]
for feature in features : 
    train[feature]=train[feature].apply(lambda x: [ replace_nan(X) for X in x.replace("nan"," ").split(",")])
    test[feature]=test[feature].apply(lambda x: [ replace_nan(X)  for X in x.replace("nan"," ").split(",")])   

In [6]:
train.shape

(15539, 9)

In [7]:
test.shape

(5035, 8)

In [8]:
def aggregate_features(x,col_name):
    x["max_"+col_name]=x[col_name].apply(np.max)
    x["min_"+col_name]=x[col_name].apply(np.min)
    x["mean_"+col_name]=x[col_name].apply(np.mean)
    x["std_"+col_name]=x[col_name].apply(np.std)
    x["var_"+col_name]=x[col_name].apply(np.var)
    x["median_"+col_name]=x[col_name].apply(np.median)
    x["ptp_"+col_name]=x[col_name].apply(np.ptp)
    return x  
def remove_nan_values(x):
    return [e for e in x if not math.isnan(e)]

In [9]:
data=pd.concat([train,test],sort=False).reset_index(drop=True)
data.columns.tolist()

['ID',
 'location',
 'temp',
 'precip',
 'rel_humidity',
 'wind_dir',
 'wind_spd',
 'atmos_press',
 'target']

In [10]:
data.shape

(20574, 9)

In [11]:
for col_name in tqdm(features):
    data[col_name]=data[col_name].apply(remove_nan_values)

100%|██████████| 6/6 [00:02<00:00,  2.32it/s]


In [12]:
for col_name in tqdm(features):
    data=aggregate_features(data,col_name)

100%|██████████| 6/6 [00:18<00:00,  3.03s/it]


In [13]:
data.drop(features,1,inplace=True)

In [14]:
train=data[data.target.notnull()].reset_index(drop=True)
test=data[data.target.isna()].reset_index(drop=True)

In [15]:
del data  
gc.collect()

66

In [16]:
from sklearn.metrics import mean_squared_error
def metric(y,x):
    return np.sqrt(mean_squared_error(x,y))

In [17]:
from sklearn import svm

In [19]:
train.columns

Index(['ID', 'location', 'target', 'max_temp', 'min_temp', 'mean_temp',
       'std_temp', 'var_temp', 'median_temp', 'ptp_temp', 'max_precip',
       'min_precip', 'mean_precip', 'std_precip', 'var_precip',
       'median_precip', 'ptp_precip', 'max_rel_humidity', 'min_rel_humidity',
       'mean_rel_humidity', 'std_rel_humidity', 'var_rel_humidity',
       'median_rel_humidity', 'ptp_rel_humidity', 'max_wind_dir',
       'min_wind_dir', 'mean_wind_dir', 'std_wind_dir', 'var_wind_dir',
       'median_wind_dir', 'ptp_wind_dir', 'max_wind_spd', 'min_wind_spd',
       'mean_wind_spd', 'std_wind_spd', 'var_wind_spd', 'median_wind_spd',
       'ptp_wind_spd', 'max_atmos_press', 'min_atmos_press',
       'mean_atmos_press', 'std_atmos_press', 'var_atmos_press',
       'median_atmos_press', 'ptp_atmos_press'],
      dtype='object')

In [80]:
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVR
clf = make_pipeline(StandardScaler(), SVR(C=1.0, epsilon=0.2))

In [81]:
X = train.drop(["ID", "target", "location"], axis=1)
y = train.target

In [82]:
clf.fit(X, y)

Pipeline(steps=[('standardscaler', StandardScaler()),
                ('svr', SVR(epsilon=0.2))])

In [83]:
X_test = test.drop(["location", "target", "ID"], axis=1)

In [84]:
X_test

Unnamed: 0,max_temp,min_temp,mean_temp,std_temp,var_temp,median_temp,ptp_temp,max_precip,min_precip,mean_precip,...,var_wind_spd,median_wind_spd,ptp_wind_spd,max_atmos_press,min_atmos_press,mean_atmos_press,std_atmos_press,var_atmos_press,median_atmos_press,ptp_atmos_press
0,26.858333,17.050000,21.389532,2.731833,7.462909,20.908333,9.808333,5.728,0.0,0.204165,...,0.154279,0.767500,1.744167,88.020833,87.474167,87.748506,0.132764,0.017626,87.752500,0.546667
1,26.991667,17.275000,21.302824,2.289268,5.240748,20.716667,9.716667,15.831,0.0,0.208719,...,0.208605,0.771667,1.889167,88.181667,87.572500,87.936963,0.113756,0.012940,87.928333,0.609167
2,29.850000,17.108333,22.144421,2.943047,8.661528,21.458333,12.741667,1.327,0.0,0.040190,...,0.324600,0.567500,2.890000,90.785000,90.115000,90.521357,0.136316,0.018582,90.536667,0.670000
3,27.325000,18.633333,22.361639,2.480003,6.150415,21.616667,8.691667,3.604,0.0,0.035686,...,0.185370,0.685000,1.575000,88.078333,87.560000,87.754848,0.125583,0.015771,87.726667,0.518333
4,31.091667,16.908333,23.560243,4.136978,17.114585,22.825000,14.183333,0.000,0.0,0.000000,...,0.193861,0.609091,1.761667,90.798333,90.105833,90.475030,0.161193,0.025983,90.487273,0.692500
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5030,26.325000,17.125000,21.185537,2.379269,5.660923,20.725000,9.200000,7.731,0.0,0.202587,...,0.123049,0.675000,1.576667,88.085833,87.389167,87.699704,0.153148,0.023454,87.695000,0.696667
5031,32.550000,16.183333,22.931926,4.467736,19.960669,21.212500,16.366667,11.391,0.0,0.249281,...,0.479860,0.879167,3.022500,90.949167,90.199167,90.640585,0.166249,0.027639,90.660833,0.750000
5032,30.925000,16.241667,23.625069,3.973509,15.788770,22.408333,14.683333,0.034,0.0,0.002810,...,0.856878,1.750000,3.704167,88.662500,88.067500,88.375510,0.133175,0.017736,88.390833,0.595000
5033,28.041667,15.583333,21.579282,3.224815,10.399431,21.125000,12.458333,11.625,0.0,0.168190,...,0.178018,0.631667,1.780000,88.630833,88.064167,88.389022,0.141884,0.020131,88.401667,0.566667


In [85]:
preds = clf.predict(X_test)

In [86]:
test.target = preds

In [87]:
test

Unnamed: 0,ID,location,target,max_temp,min_temp,mean_temp,std_temp,var_temp,median_temp,ptp_temp,...,var_wind_spd,median_wind_spd,ptp_wind_spd,max_atmos_press,min_atmos_press,mean_atmos_press,std_atmos_press,var_atmos_press,median_atmos_press,ptp_atmos_press
0,ID_test_0,C,58.385420,26.858333,17.050000,21.389532,2.731833,7.462909,20.908333,9.808333,...,0.154279,0.767500,1.744167,88.020833,87.474167,87.748506,0.132764,0.017626,87.752500,0.546667
1,ID_test_1,C,56.430370,26.991667,17.275000,21.302824,2.289268,5.240748,20.716667,9.716667,...,0.208605,0.771667,1.889167,88.181667,87.572500,87.936963,0.113756,0.012940,87.928333,0.609167
2,ID_test_10,D,45.388643,29.850000,17.108333,22.144421,2.943047,8.661528,21.458333,12.741667,...,0.324600,0.567500,2.890000,90.785000,90.115000,90.521357,0.136316,0.018582,90.536667,0.670000
3,ID_test_100,C,53.032688,27.325000,18.633333,22.361639,2.480003,6.150415,21.616667,8.691667,...,0.185370,0.685000,1.575000,88.078333,87.560000,87.754848,0.125583,0.015771,87.726667,0.518333
4,ID_test_1000,D,59.327341,31.091667,16.908333,23.560243,4.136978,17.114585,22.825000,14.183333,...,0.193861,0.609091,1.761667,90.798333,90.105833,90.475030,0.161193,0.025983,90.487273,0.692500
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5030,ID_test_995,C,61.720349,26.325000,17.125000,21.185537,2.379269,5.660923,20.725000,9.200000,...,0.123049,0.675000,1.576667,88.085833,87.389167,87.699704,0.153148,0.023454,87.695000,0.696667
5031,ID_test_996,D,44.559000,32.550000,16.183333,22.931926,4.467736,19.960669,21.212500,16.366667,...,0.479860,0.879167,3.022500,90.949167,90.199167,90.640585,0.166249,0.027639,90.660833,0.750000
5032,ID_test_997,E,38.940972,30.925000,16.241667,23.625069,3.973509,15.788770,22.408333,14.683333,...,0.856878,1.750000,3.704167,88.662500,88.067500,88.375510,0.133175,0.017736,88.390833,0.595000
5033,ID_test_998,A,59.124683,28.041667,15.583333,21.579282,3.224815,10.399431,21.125000,12.458333,...,0.178018,0.631667,1.780000,88.630833,88.064167,88.389022,0.141884,0.020131,88.401667,0.566667


In [88]:
SUB = "/Users/kkalyan/github/airQo/submissions/"

In [89]:
submit = pd.read_csv(ROOT+ "data/sample_sub.csv")

In [90]:
submit

Unnamed: 0,ID,target
0,ID_test_0,0
1,ID_test_1,0
2,ID_test_2,0
3,ID_test_3,0
4,ID_test_4,0
...,...,...
5030,ID_test_5030,0
5031,ID_test_5031,0
5032,ID_test_5032,0
5033,ID_test_5033,0


In [91]:
test

Unnamed: 0,ID,location,target,max_temp,min_temp,mean_temp,std_temp,var_temp,median_temp,ptp_temp,...,var_wind_spd,median_wind_spd,ptp_wind_spd,max_atmos_press,min_atmos_press,mean_atmos_press,std_atmos_press,var_atmos_press,median_atmos_press,ptp_atmos_press
0,ID_test_0,C,58.385420,26.858333,17.050000,21.389532,2.731833,7.462909,20.908333,9.808333,...,0.154279,0.767500,1.744167,88.020833,87.474167,87.748506,0.132764,0.017626,87.752500,0.546667
1,ID_test_1,C,56.430370,26.991667,17.275000,21.302824,2.289268,5.240748,20.716667,9.716667,...,0.208605,0.771667,1.889167,88.181667,87.572500,87.936963,0.113756,0.012940,87.928333,0.609167
2,ID_test_10,D,45.388643,29.850000,17.108333,22.144421,2.943047,8.661528,21.458333,12.741667,...,0.324600,0.567500,2.890000,90.785000,90.115000,90.521357,0.136316,0.018582,90.536667,0.670000
3,ID_test_100,C,53.032688,27.325000,18.633333,22.361639,2.480003,6.150415,21.616667,8.691667,...,0.185370,0.685000,1.575000,88.078333,87.560000,87.754848,0.125583,0.015771,87.726667,0.518333
4,ID_test_1000,D,59.327341,31.091667,16.908333,23.560243,4.136978,17.114585,22.825000,14.183333,...,0.193861,0.609091,1.761667,90.798333,90.105833,90.475030,0.161193,0.025983,90.487273,0.692500
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5030,ID_test_995,C,61.720349,26.325000,17.125000,21.185537,2.379269,5.660923,20.725000,9.200000,...,0.123049,0.675000,1.576667,88.085833,87.389167,87.699704,0.153148,0.023454,87.695000,0.696667
5031,ID_test_996,D,44.559000,32.550000,16.183333,22.931926,4.467736,19.960669,21.212500,16.366667,...,0.479860,0.879167,3.022500,90.949167,90.199167,90.640585,0.166249,0.027639,90.660833,0.750000
5032,ID_test_997,E,38.940972,30.925000,16.241667,23.625069,3.973509,15.788770,22.408333,14.683333,...,0.856878,1.750000,3.704167,88.662500,88.067500,88.375510,0.133175,0.017736,88.390833,0.595000
5033,ID_test_998,A,59.124683,28.041667,15.583333,21.579282,3.224815,10.399431,21.125000,12.458333,...,0.178018,0.631667,1.780000,88.630833,88.064167,88.389022,0.141884,0.020131,88.401667,0.566667


In [92]:
submit

Unnamed: 0,ID,target
0,ID_test_0,0
1,ID_test_1,0
2,ID_test_2,0
3,ID_test_3,0
4,ID_test_4,0
...,...,...
5030,ID_test_5030,0
5031,ID_test_5031,0
5032,ID_test_5032,0
5033,ID_test_5033,0


In [93]:
alld = test.join(submit, lsuffix='t', rsuffix='s')

In [94]:
alld.filter(["IDs", "targett"] )

Unnamed: 0,IDs,targett
0,ID_test_0,58.385420
1,ID_test_1,56.430370
2,ID_test_2,45.388643
3,ID_test_3,53.032688
4,ID_test_4,59.327341
...,...,...
5030,ID_test_5030,61.720349
5031,ID_test_5031,44.559000
5032,ID_test_5032,38.940972
5033,ID_test_5033,59.124683


In [95]:
from datetime import datetime

In [96]:
exp_time = str(datetime.now().replace(second=0, microsecond=0)).replace(" ", "_")
exp_name = "svm"
csv_file_name = "{}_{}".format(exp_time, exp_name)

In [97]:
alld.to_csv(SUB+csv_file_name+'.csv', index=False)