In [13]:
import pandas as pd
import xgboost as xgb
import numpy as np
import matplotlib.pyplot as plt
import gc

In [2]:
dataset_train_validate = pd.read_csv("train.csv")
dataset_test = pd.read_csv("test.csv")

# train: 6300 rest for validation
# split after preprocessing
# train = dataset_train_validate[]

In [30]:
# preprocessing 
def preprocessing(df):   
    # convert all cost to USD
    # array(['CHF', 'EUR', 'GBP', 'JPY', 'USD'], dtype=object)
    
    # CHF to USD
    df['sold'][df['currency']=='CHF'] = df['sold']*1.01
    # EUR to USD
    df['sold'][df['currency']=='EUR'] = df['sold']*1.19
    # GBP to USD
    df['sold'][df['currency']=='GBP'] = df['sold']*1.34
    # JPY to USD
    df['sold'][df['currency']=='JPY'] = df['sold']*0.01
    
    # remove desk_id, office_id, pf_id
    # think about indicator_code, hedge_value, status
    df = df.drop(["portfolio_id","desk_id", "office_id","indicator_code","hedge_value","status", "currency"], axis=1) 
    
    # vectorise type and pf_category
    vec = 1
    for i in np.unique(df['type']):
        df['type'][df['type'] == i] = vec
        vec = vec + 1
        
    vec = 1
    for i in np.unique(df['pf_category']):
        df['pf_category'][df['pf_category'] == i] = vec
        vec = vec + 1
        
    vec = 26
    for i in np.unique(df['country_code']):
        df['country_code'][df['country_code'] == i] = vec
        vec = vec - 1
    
    # how to change datatypes in df
    df['type'] = pd.to_numeric(df['type'],errors='coerce')
    df['country_code'] = pd.to_numeric(df['country_code'],errors='coerce')
    df['pf_category'] = pd.to_numeric(df['pf_category'],errors='coerce')
    
    # mean sd normalization for sold col...
    mean = np.mean(df['sold'])
    sd = np.sqrt(np.var(df['sold']))
    df['sold'] = (df['sold']-mean)/sd
    
    # normalization for bought col...
    mean = np.mean(df['bought'])
    sd = np.sqrt(np.var(df['bought']))
    df['bought'] = (df['bought']-mean)/sd
    
    # normalization for libor_rate
    mean = np.mean(df['libor_rate'])
    sd = np.sqrt(np.var(df['libor_rate']))
    df['libor_rate'] = (df['libor_rate']-mean)/sd
    return df

In [31]:
dataset_train_validate

Unnamed: 0,portfolio_id,desk_id,office_id,pf_category,start_date,sold,country_code,euribor_rate,currency,libor_rate,bought,creation_date,indicator_code,sell_date,type,hedge_value,status,return
0,PF00001002,DSK00001001,OFF00001002,2,20040720,0.028846,24,0.02074,USD,0.915120,0.041083,20040720,,20040812,2,,,0.02496
1,PF00001003,DSK00001002,OFF00001001,1,20040709,2.187320,23,0.02074,GBP,2.929944,0.384550,20040723,,20040812,3,,,0.05496
2,PF00001005,DSK00001004,OFF00001001,1,20040723,-0.245460,24,0.02074,USD,0.915120,-0.236135,20040723,,20040817,1,,,0.02496
3,PF00001006,DSK00001005,OFF00001001,1,20040609,0.309749,24,0.02074,USD,0.915120,0.324886,20040723,,20040713,1,,,0.02496
4,PF00001007,DSK00001005,OFF00001002,2,20040609,0.186688,24,0.02074,USD,0.915120,0.200526,20040723,,20040713,2,,,0.02496
5,PF00001008,DSK00001006,OFF00001001,1,20040707,-0.285090,24,0.02074,USD,0.915120,-0.276232,20040726,,20040810,1,,,0.02490
6,PF00001010,DSK00001009,OFF00001001,1,20040706,-0.224349,24,0.02074,USD,0.915120,-0.214850,20040726,,20040809,1,,,0.02493
7,PF00001011,DSK00001009,OFF00001002,2,20040706,0.152865,24,0.02074,USD,0.915120,0.166347,20040726,,20040809,2,,,0.02493
8,PF00001012,DSK00001010,OFF00001001,1,20040419,-0.109885,24,0.02074,USD,0.915120,-0.100579,20040726,,20040720,1,,,0.02460
9,PF00001016,DSK00001014,OFF00001001,1,20040414,0.554009,24,0.02074,USD,0.915120,0.568249,20040727,,20040713,1,,,0.02466


In [32]:
processed_data = preprocessing(dataset_train_validate)
processed_data

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  import sys
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  # This is added back by InteractiveShellApp.init_path()
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  del sys.path[0]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.

Unnamed: 0,pf_category,start_date,sold,country_code,euribor_rate,libor_rate,bought,creation_date,sell_date,type,return
0,2,20040720,0.049512,24,0.02074,0.915120,0.041083,20040720,20040812,2,0.02496
1,1,20040709,2.685939,25,0.02074,2.929944,0.384550,20040723,20040812,3,0.05496
2,1,20040723,-0.199678,24,0.02074,0.915120,-0.236135,20040723,20040817,1,0.02496
3,1,20040609,0.304693,24,0.02074,0.915120,0.324886,20040723,20040713,1,0.02496
4,2,20040609,0.192901,24,0.02074,0.915120,0.200526,20040723,20040713,2,0.02496
5,1,20040707,-0.235679,24,0.02074,0.915120,-0.276232,20040726,20040810,1,0.02490
6,1,20040706,-0.180500,24,0.02074,0.915120,-0.214850,20040726,20040809,1,0.02493
7,2,20040706,0.162175,24,0.02074,0.915120,0.166347,20040726,20040809,2,0.02493
8,1,20040419,-0.076517,24,0.02074,0.915120,-0.100579,20040726,20040720,1,0.02460
9,1,20040414,0.526588,24,0.02074,0.915120,0.568249,20040727,20040713,1,0.02466


In [33]:
train_set = processed_data[:6300]
test_set = processed_data[6300:]

In [19]:
plt.hist(processed_data['euribor_rate'])
plt.show()

In [23]:
plt.plot(processed_data['bought'])
plt.show()

In [52]:
y_train = train_set['return'].values
x_train = train_set.drop(['return'], axis=1)

y_valid = test_set['return'].values
x_valid = test_set.drop(['return'], axis=1)

print x_train.shape, y_train.shape,x_valid.shape, y_valid.shape

(6300, 10) (6300,) (3066, 10) (3066,)


In [53]:
d_train = xgb.DMatrix(x_train, label=y_train)
d_valid = xgb.DMatrix(x_valid, label=y_valid)

del x_train, x_valid; gc.collect()

30

In [54]:
print('Training ...')

params = {}
params['eta'] = 0.02
params['objective'] = 'reg:linear'
params['eval_metric'] = 'mae'
params['max_depth'] = 4
params['silent'] = 1

watchlist = [(d_train, 'train'), (d_valid, 'valid')]
clf = xgb.train(params, d_train, 10000, watchlist, early_stopping_rounds=100, verbose_eval=10)

del d_train, d_valid

Training ...
[0]	train-mae:0.47181	valid-mae:0.485401
Multiple eval metrics have been passed: 'valid-mae' will be used for early stopping.

Will train until valid-mae hasn't improved in 100 rounds.
[10]	train-mae:0.385577	valid-mae:0.396995
[20]	train-mae:0.315119	valid-mae:0.324718
[30]	train-mae:0.257551	valid-mae:0.265484
[40]	train-mae:0.210509	valid-mae:0.217048
[50]	train-mae:0.172071	valid-mae:0.177478
[60]	train-mae:0.140675	valid-mae:0.145156
[70]	train-mae:0.115022	valid-mae:0.118641
[80]	train-mae:0.094054	valid-mae:0.096965
[90]	train-mae:0.076928	valid-mae:0.079248
[100]	train-mae:0.062938	valid-mae:0.064746
[110]	train-mae:0.051515	valid-mae:0.052851
[120]	train-mae:0.042185	valid-mae:0.04309
[130]	train-mae:0.034562	valid-mae:0.035155
[140]	train-mae:0.028333	valid-mae:0.028608
[150]	train-mae:0.023262	valid-mae:0.023259
[160]	train-mae:0.019136	valid-mae:0.018953
[170]	train-mae:0.015781	valid-mae:0.01542
[180]	train-mae:0.013041	valid-mae:0.012537
[190]	train-mae:0.010

In [55]:
print('Building test set ...')

submission = {}

submission['portfolio_id'] = dataset_test['portfolio_id'].values
x_test = preprocessing(dataset_test)

d_test = xgb.DMatrix(x_test)
p_test = clf.predict(d_test)

submission['return'] = p_test

print submission

Building test set ...


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  import sys
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  # This is added back by InteractiveShellApp.init_path()
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  del sys.path[0]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.

{'return': array([ 0.02626783,  0.02371112,  0.02733836, ...,  0.00593674,
        0.00254333,  0.00254333], dtype=float32), 'portfolio_id': array(['PF00001001', 'PF00001004', 'PF00001009', ..., 'PF00014123',
       'PF00014127', 'PF00014147'], dtype=object)}


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [56]:
# df construction
sub_df = pd.DataFrame(data=submission, index=np.arange(len(submission['return'])))

In [57]:
print('Writing csv ...')
sub_df.to_csv('submission.csv', index=False, float_format='%.4f')

Writing csv ...
