Baseline Notebook for Time Series projects.

Source: 
- [Time Series Forecasting by Rob Mulla](https://www.youtube.com/watch?v=z3ZnOW-S550)


# 0.0 Imports

In [1]:
# !pip install category_encoders

In [2]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

import xgboost as xgb
import lightgbm as lgbm
import catboost as cb

from sklearn.metrics         import mean_squared_error, r2_score, mean_absolute_error
from sklearn.model_selection import TimeSeriesSplit, KFold, StratifiedKFold, GroupKFold
from sklearn.impute          import SimpleImputer, KNNImputer

from sklearn.linear_model    import Lasso, Ridge, LinearRegression
from sklearn.preprocessing   import StandardScaler, MinMaxScaler

from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV


import category_encoders as ce


color_pal = sns.color_palette()
plt.style.use('fivethirtyeight')

# 1.0 Data Reading

In [4]:
df = pd.read_csv('data/train.csv')

In [5]:
train = pd.read_csv('data/train.csv')
test = pd.read_csv('data/test.csv')
df_sub = test[['ID']].copy()

In [6]:
ss = StandardScaler()
mm = MinMaxScaler()

def create_features(df, train=True):
    """
    Adds features to training or test set.   
    """
    
    # drop features not in test set
    df = df.drop(['preco'], axis=1)    
    

    #    np.full((train.shape[0], ), 2022)
    
    # MILEAGE & YEAR GAP
    df['ano_modelo']     = df['ano_modelo'].astype(int)
    
#     df['yearGapFab'] = (np.ones(df.shape[0])*2022 - df['ano_de_fabricacao']) + 1
#     df['yearGapMod'] = (np.ones(df.shape[0])*2022 - df['ano_modelo'])        + 1
    
    df['yearGapFab'] = 2024 - df['ano_de_fabricacao']
    df['yearGapMod'] = 2024 - df['ano_modelo']    
    
    
    df['mileageFab'] = df['odometro'] / df['yearGapFab']
    df['mileageMod'] = df['odometro'] / df['yearGapMod']    
    df['mileage'] = df['odometro'] / (df['yearGapFab']*0.5 + df['yearGapMod']*0.5)
    
    df['mileageYearFab'] = df['odometro'] / df['ano_de_fabricacao']
    df['mileageYearMod'] = df['odometro'] / df['ano_modelo']
    
    
    df['ano_modelo']        = ss.fit_transform(df[['ano_modelo']].values)
    df['ano_de_fabricacao'] = ss.fit_transform(df[['ano_de_fabricacao']].values)
    

    # FEATURES SCALING
    
#     featagg1 = df.groupby("ano_de_fabricacao")['odometro'].agg(['mean', 'std', 'min', 'max', 'last'])
#     df['featagg1'] = df['ano_de_fabricacao'].map(featagg1)
    
#     featagg2 = df.groupby("ano_de_fabricacao")['cambio'].agg(['count', 'nunique'])
#     df['featagg2'] = df['ano_de_fabricacao'].map(featagg2)

    
    
    df['odometro']       = mm.fit_transform(df[['odometro']].values)
   
    df['mileageFab'] = mm.fit_transform(df[['mileageFab']].values) 
    df['mileageMod'] = mm.fit_transform(df[['mileageMod']].values) 
    
    df['mileage'] = mm.fit_transform(df[['mileage']].values)    
    
    df['mileageYearFab'] = mm.fit_transform(df[['mileageYearFab']].values)
    df['mileageYearMod'] = mm.fit_transform(df[['mileageYearMod']].values)




    df['4x4'] =    df['versao'].str.extract(r'(4X4)')
    df['is_V6'] =  df['versao'].str.extract(r'(V6)')
    df['is_V8'] =  df['versao'].str.extract(r'(V8)')
    df['is_V10'] = df['versao'].str.extract(r'(V10)')

    df['4x4'] =      df['4x4'].apply(lambda x: 1 if x == '4X4'else 0)
    df['is_V6'] =    df['is_V6'].apply(lambda x:1 if x == 'V6'else 0)
    df['is_V8'] =    df['is_V8'].apply(lambda x:1 if x == 'V8'else 0)
    df['is_V10'] =   df['is_V10'].apply(lambda x:1 if x == 'V10' else 0)
    df['is_basic'] = df.apply(lambda x: 1 if  x['is_V6'] == 0 and 
                                                x['is_V8'] == 0 and 
                                                x['is_V10'] == 0 else 0 ,axis=1)
    
    
    df['is_turbo']     = df['versao'].str.extract(r'\W*(TURBO)\W*')
    df['is_flex']      = df['versao'].str.extract(r'\W*(FLEX)\W*') 
    df['is_gasolina']  = df['versao'].str.extract(r'\W*(GASOLINA)\W*')
    df['is_hybrid']    = df['versao'].str.extract(r'\W*(HÍ)\W*')
    df['is_hybrid_2']  = df['versao'].str.extract(r'\W*(HY)\W*')
    df['is_diesel']    = df['versao'].str.extract(r'\W*(DIESEL)\W*')
    # df2['is_tsi'] = x.str.extract(r'\W*(TSI)\W*')

    # df2['is_turbo'] = df2.apply(lambda x:1 if x['is_turbo'] == 'TURBO' or x['is_tsi'] == 'TSI' else 0,axis=1)
    
    df['is_turbo'] = df['is_turbo'].apply(lambda x:1 if x == 'TURBO' else 0)

    df['is_flex'] = df['is_flex'].apply(lambda x: 1 if x == 'FLEX' else 0)
    
    df['is_gasolina'] = df['is_gasolina'].apply(lambda x: 1 if x == 'GASOLINA' else 0)
    
    df['is_diesel'] = df['is_diesel'].apply(lambda x: 1 if x == 'DIESEL' else 0)
    
    df['is_hybrid_final'] = df.apply(lambda x: 1 if x['is_hybrid'] == 'HÍ' or x['is_hybrid_2'] == 'HY' else 0,axis=1)
    
    df['is_flex_final'] = df.apply(lambda x: 1 if x['is_flex'] == 'FLEX' else
                                              1 if x['is_gasolina'] == 0 and x['is_diesel'] == 0 and x['is_hybrid_final'] == 0 else 0,axis=1)

    
    df['check'] = df['is_flex_final'] + df['is_gasolina'] + df['is_hybrid_final'] + df['is_diesel'] + df['is_turbo']
    
    
    
    # MARCA // BRAND - FREQUENCY ENCODING
    
    fe_marca = df.groupby('marca').size()/len(df)
    df['marca_freq'] = df['marca'].map(fe_marca)
    
    df['num_fotos'] = df['num_fotos'].fillna(0)
    df['num_fotos'] = df['num_fotos'].astype(int)
       
        
    df['blindado'] = df['blindado'].apply(lambda x: 1 if x == 'S' else 0)
        
    
    # REGION
    estados  ={
                'AC':'Norte',
                'AL':'Nordeste',
                'AP':'Norte',
                'AM':'Norte',
                'BA':'Nordeste',
                'CE':'Nordeste',
                'DF':'Centro-Oeste',
                'ES':'Sudeste',
                'GO':'Centro-Oeste',
                'MA':'Nordeste',
                'MT':'Centro-Oeste',
                'MS':'Centro-Oeste',
                'MG':'Sudeste',
                'PA':'Norte',
                'PB':'Nordeste',
                'PR':'Sul',
                'PE':'Nordeste',
                'PI':'Nordeste',
                'RJ':'Sudeste',
                'RN':'Nordeste',
                'RS':'Sul',
                'RO':'Norte',
                'RR':'Norte',
                'SC':'Sul',
                'SP':'Sudeste',
                'SE':'Nordeste',
                'TO':'Norte'}
    
    
    df['sigla'] = df['estado_vendedor'].apply(lambda x: x.split(' ')[-1].replace('(','').replace(')',''))
    df['regiao'] = df['sigla'].map(estados)
    
    # CAMBIO     
    df['cambio'] = df['cambio'].apply(lambda x: 'Manual' if x == 'Manual' else 
                                            'CVT'    if x == 'CVT'    else
                                            'Automático')
    df['cambio'] = df['cambio'].apply(lambda x: 'Automatico' if x == 'Automático' else x)
    df['tipo'] = df['tipo'].apply(lambda x: 'Conversivel' if x == 'Conversível' else 
                                            'Cupe'        if x == 'Cupê'        else
                                            'Seda'        if x == 'Sedã'        else
                                             'Utilitario esportivo' if x == 'Utilitário esportivo' else x)
    
    df['tipo_anuncio'] = df['tipo_anuncio'].apply(lambda x: 'Acessorios e servicos para autos' if x == 'Acessórios e serviços para autos' else
                                                            'Concessionaria'                  if x == 'Concessionária' else
                                                            'Pessoa Fisica'                   if x == 'Pessoa Física' else x)
    

    # MOTOR PT 1
    
    df['motor'] = df['versao'].str[:3]
    
    df['motor'] = df['motor'].apply(lambda x: 0 if x == 'REX' or x == 'B12' or x == 'EV ' or x == '4S ' or x == 'BEV' or x == '90 ' or x == 'ELÉ' or x == 'TUR' or x == 'Z.E' or x == 'P8 ' else x)
    df['eletric'] = df['motor'].apply(lambda x: 1 if x == 0 else 0)
    
    
    # MOTOR PT 2
    
    df['motor'] = df['motor'].astype(float)

    

    
    # ATTR 
    df['attr_veiculo_aceita_troca'] =                                    df['attr_veiculo_aceita_troca'].apply(lambda x: 1 if x == 'Aceita troca' else 0)
    df['attr_veiculo_unico_dono'] =                                    df['attr_veiculo_único_dono'].apply(lambda x: 1 if x == 'Único dono' else 0)
    df['attr_veiculo_todas_as_revisoes_feitas_pela_concessionaria'] =   df['attr_veiculo_todas_as_revisões_feitas_pela_concessionária'].apply(lambda x: 1 if x == 'Todas as revisões feitas pela concessionária' else 0)
    df['attr_veiculo_ipva_pago'] =                                      df['attr_veiculo_ipva_pago'].apply(lambda x: 1 if x == 'IPVA pago' else 0)
    df['attr_veiculo_licenciado'] =                                     df['attr_veiculo_licenciado'].apply(lambda x: 1 if x == 'Licenciado' else 0)
    df['attr_veiculo_garantia_de_fabrica'] =                           df['attr_veiculo_garantia_de_fábrica'].apply(lambda x: 1 if x == 'Garantia de fábrica' else 0)
    df['attr_veiculo_todas_as_revisoes_feitas_pela_agenda_do_carro'] =  df['attr_veiculo_todas_as_revisões_feitas_pela_agenda_do_carro'].apply(lambda x: 1 if x == 'Todas as revisões feitas pela agenda do carro' else 0)
 

    df.drop(columns=['attr_veiculo_único_dono','attr_veiculo_todas_as_revisões_feitas_pela_concessionária','attr_veiculo_garantia_de_fábrica','attr_veiculo_todas_as_revisões_feitas_pela_agenda_do_carro'], axis=1, inplace=True)



    df = pd.get_dummies(df,prefix=['cambio'], columns=['cambio'])
    df = pd.get_dummies(df,prefix=['tipo'],   columns=['tipo'])
    df = pd.get_dummies(df,prefix=['cor'],    columns=['cor'])
    df = pd.get_dummies(df,prefix=['tipo_vendedor'],columns=['tipo_vendedor'])
    df = pd.get_dummies(df,prefix=['tipo_anuncio'],columns=['tipo_anuncio'])
    df = pd.get_dummies(df,prefix=['regiao'],      columns=['regiao'])
#     df = pd.get_dummies(df,prefix=['motor'],      columns=['motor'], drop_first=False)
    
    
#     # Binary Encoder for 'modelo'
#     biencoder = ce.BinaryEncoder(cols = ['modelo'] )
#     dfbin = biencoder.fit_transform(df['modelo'])
#     df = pd.concat([df, dfbin], axis=1)

#     target_region_code = df.groupby('modelo')['preco'].mean()
#     df['modelo_target'] = df['modelo'].map(target_region_code)
    
    return df

In [7]:
%%time

train = pd.read_csv('data/train.csv')
test = pd.read_csv('data/test.csv')
df_sub = test[['ID']].copy()

train['isTrain'] = True
test['isTrain'] = False
tt = pd.concat([train, test]).reset_index(drop=True).copy()
tt = create_features(tt)
train = tt.query('isTrain').reset_index(drop=True).copy()
test = tt.query('isTrain == False').reset_index(drop=True).copy()

CPU times: user 3.58 s, sys: 63.6 ms, total: 3.65 s
Wall time: 3.64 s


# 3.0 Feature Engineering

In [9]:
train = pd.concat([train, df['preco']], axis=1)

In [11]:
target_region_code = train.groupby('modelo')['preco'].mean()
train['modelo_target'] = train['modelo'].map(target_region_code)

test['modelo_target'] = test['modelo'].map(target_region_code)

# 4.0 Modeling

In [14]:
print(train.shape)
print(test.shape)

(39446, 85)
(39446, 84)


## 4.1 Train Using KFold CV

In [2]:
FEATURES = [
#             'ID', 
            'num_fotos', 
#             'marca', # FREQ ENCODER
#             'modelo', 
#             'versao', DROP
            'ano_de_fabricacao',
            'ano_modelo', 
            'odometro', 
#             'cambio', ONE HOT
            'num_portas', 
#             'tipo',  ONE HOT
            'blindado',
#             'cor',   ONE HOT
#             'tipo_vendedor',  ONE HOT
#             'cidade_vendedor', REGIAO
#             'estado_vendedor', REGIAO
#             'tipo_anuncio',  ONE HOT
            'entrega_delivery', 
            'troca', 
            'elegivel_revisao',
            'attr_veiculo_aceita_troca', 
            'attr_veiculo_unico_dono',
            'attr_veiculo_todas_as_revisoes_feitas_pela_concessionaria',
            'attr_veiculo_ipva_pago', 
            'attr_veiculo_licenciado',
            'attr_veiculo_garantia_de_fabrica',
            'attr_veiculo_todas_as_revisoes_feitas_pela_agenda_do_carro',
#             'attr_veiculo_alienado', 
#             'preco',
#             'sigla', 
              'motor', 
              'eletric',
#               'yearGapFab', 'yearGapMod', 
#             'mileage',
#             'mileageFab', 'mileageMod',
#               'mileageYearFab', 'mileageYearMod',
    
            '4x4', 'is_V6', 'is_V8', 'is_V10', 'is_basic',
    
              'is_turbo', 
#               'is_flex', 
              'is_gasolina', 
#               'is_hybrid', 'is_hybrid_2',
              'is_diesel', 
              'is_hybrid_final', 
              'is_flex_final', 
#               'check',
              'marca_freq', # SUBSTITUI MARCA!
              'cambio_Automatico', 
              'cambio_CVT', 'cambio_Manual',
              'tipo_Conversivel',
              'tipo_Cupe',
              'tipo_Hatchback', 'tipo_Minivan',
              'tipo_Perua/SW', 'tipo_Picape', 
              'tipo_Seda',
              'tipo_Utilitario esportivo',
              'cor_Azul', 'cor_Branco', 'cor_Cinza',
              'cor_Dourado', 'cor_Prata', 'cor_Preto', 
              'cor_Verde', 
              'cor_Vermelho',
              'tipo_vendedor_PF', 'tipo_vendedor_PJ',
              'tipo_anuncio_Acessorios e servicos para autos',
              'tipo_anuncio_Concessionaria', 
              'tipo_anuncio_Loja',
              'tipo_anuncio_Pessoa Fisica',
#               'regiao_Centro-Oeste', 'regiao_Nordeste',
#               'regiao_Norte', 'regiao_Sudeste', 'regiao_Sul',
#                'motor_1.0',
#              'motor_1.1-1.4', 'motor_1.5-1.8', 'motor_1.9-2.0', 'motor_2.1-2.8',
#             'motor_2.8-3.2', 'motor_Eletrico', 'motor_above-3.2',
#               'modelo_0', 'modelo_1',
#               'modelo_2', 'modelo_3', 'modelo_4', 'modelo_5', 'modelo_6', 'modelo_7',
#               'modelo_8',
              'modelo_target',
#               'marca_target',
#               'featagg1', 'featagg2', 'featagg3'
           ]

X = train.copy()
TARGET = 'preco'
FOLDS = 5

NameError: name 'train' is not defined

In [21]:
maes = []
preds_test = []
feature_importance = []
oof_pred = []; oof_tar = []; oof_names = []; oof_folds = [] 

kf = KFold(n_splits=FOLDS, shuffle=True, random_state=42)
for fold, (tr, val) in enumerate(kf.split(X[FEATURES], X[TARGET])):
    print('Fold:', fold)
    

    
    # split train/val
    X_train = X.loc[tr, FEATURES]
    y_train = X.loc[tr, TARGET]
    X_val = X.loc[val, FEATURES]
    y_val = X.loc[val, TARGET]
    
    

    # LGBM - PARAMETERS OBTAINED FROM OPTUNA TUNING
    lgb = lgbm.LGBMRegressor(
        learning_rate=0.01585496083489399,
        objective='l1',
        n_estimators=5000,
        reg_alpha=0.3803917808316236,
        reg_lambda=1.879660691225598,
        max_depth=9,
        subsample=0.6792710794478111,
        num_leaves=512,
        colsample_bytree=0.5661872952051302  
    
    )
    lgb.fit(X_train, y_train)
                                    
    # predict
    preds = lgb.predict(X_val)   
    
    
    # metrics
    mae = mean_absolute_error(y_val, preds)
    maes.append(mae)
    print('MAE:', mae)
    print('R2:', r2_score(y_val, preds))
    print()
    
    # predict on test set
    preds_test = lgb.predict(test[X_train.columns])
    df_sub[f'fold_{fold}'] = preds_test    
    
    
print('-' * 20)
print('Mean:', np.mean(maes))
print('Median:', np.median(maes))
print('Std:', np.std(maes))


Fold: 0
MAE: 25544.24467882406
R2: 0.7453177818517753

Fold: 1
MAE: 24979.785259715383
R2: 0.7525005303347178

Fold: 2
MAE: 25383.37203149063
R2: 0.7439699159161834

Fold: 3
MAE: 24560.282720262698
R2: 0.7564185536809318

Fold: 4
MAE: 24785.411395433497
R2: 0.7789890056504281

--------------------
Mean: 25050.61921714525
Std: 366.10737503353084


## 4.2 Training other models - XGB

In [None]:
# param = { 
# #         'n_estimators':[1500, 1700, 2500, 3000, 3500],
#         'eta':[0.01,0.03],
# #         'max_depth':[3, 5, 9],
# #         'subsample':[0.1, 0.5, 0.7],
# #         'colsample_bytree':[0.3, 0.7, 0.9],
# #         'min_child_weight':[3, 8, 15]
        }

In [None]:
maes = []
preds_test = []
feature_importance = []
oof_pred = []; oof_tar = []; oof_names = []; oof_folds = [] 

kf = KFold(n_splits=FOLDS, shuffle=True, random_state=42)
for fold, (tr, val) in enumerate(kf.split(X[FEATURES], X[TARGET])):
    print('Fold:', fold)
    
    # split train/val
    X_train = train.loc[tr, FEATURES]
    y_train = train.loc[tr, TARGET]
    X_val = train.loc[val, FEATURES]
    y_val = train.loc[val, TARGET]
    

    # LGBM
    model_xgb = xgb.XGBRegressor(
#         objective='l1',
        n_estimators=5000,
        learning_rate=0.015,
#         eta=hp['eta']
        max_depth=9,
#         num_leaves=512
    )
    model_xgb.fit(X_train, y_train)
                                    
    # predict
    preds = model_xgb.predict(X_val)
    
    
    # metrics
    mae = mean_absolute_error(y_val, preds)
    maes.append(mae)
    print('MAE:', mae)
    print('R2:', r2_score(y_val, preds))
    print()
    
    # predict on test set
    preds_test = model_xgb.predict(test[X_train.columns])
    df_sub[f'fold_{fold}'] = preds_test
    
#     feature_importance.append(lgb.feature_importance_())
    
    
    
print('-' * 20)
print('Mean:', np.mean(maes))
print('Median:', np.median(maes))
print('Std:', np.std(maes))

## 4.3 Training other models - CatBoost

In [None]:
maes = []
preds_test = []
feature_importance = []
oof_pred = []; oof_tar = []; oof_names = []; oof_folds = [] 

kf = KFold(n_splits=FOLDS, shuffle=True, random_state=42)
for fold, (tr, val) in enumerate(kf.split(X[FEATURES], X[TARGET])):
    print('Fold:', fold)
    
    # split train/val
    X_train = train.loc[tr, FEATURES]
    y_train = train.loc[tr, TARGET]
    X_val = train.loc[val, FEATURES]
    y_val = train.loc[val, TARGET]
    

    # LGBM
    catboost = cb.CatBoostRegressor(
        iterations=5000,
#         cat_features=categorical
    )
    catboost.fit(X_train, y_train)
                                    
    # predict
    preds = catboost.predict(X_val)
    
    
    # metrics
    mae = mean_absolute_error(y_val, preds)
    maes.append(mae)
    print('MAE:', mae)
    print('R2:', r2_score(y_val, preds))
    print()
    
    # predict on test set
    preds_test = catboost.predict(test[X_train.columns])
    df_sub[f'fold_{fold}'] = preds_test
    
#     feature_importance.append(lgb.feature_importance_())
    
    
    
print('-' * 20)
print('Mean:', np.mean(maes))
print('Median:', np.median(maes))
print('Std:', np.std(maes))

# 5.0 GridSearch and Feature Importances

In [None]:
lgbm.plot_importance(lgb, max_num_features = 10, height=.9);

In [None]:
# %%time

# lgb2 = lgbm.LGBMRegressor(random_state=42)
# gs = GridSearchCV (lgb2, param_grid=params, cv=5, scoring='neg_mean_absolute_error', verbose=True)
# gs.fit(X[FEATURES], y)

# maet = mean_absolute_error(y_train, gs.predict(X_train))
# mae = mean_absolute_error(y_test, gs.predict(X_test))

# print("best parameters: ",gs.best_params_)
# print("best score: ", gs.best_score_)


# print(f'MAE train set: {maet}, MAE test set: {mae}')

In [None]:
# print("""
# best parameters:  {'learning_rate': 0.05, 'max_depth': 12, 'n_estimators': 1000, 'reg_alpha': 10, 'reg_lambda': 0.8, 'subsample': 0.5}
# best score:  -26237.85915732369
# MAE train set: 22383.14791489919, MAE test set: 22625.48454911989
# CPU times: user 1d 8min 43s, sys: 2min 44s, total: 1d 11min 28s
# Wall time: 2h 7min 50s
# """)

In [None]:
# gs.cv_results_

In [None]:
# gs.cv_results_['mean_test_score'].mean()

# 6.0 Submission

In [23]:
# Save model
lgb.save_model('model_lgb.json')

In [26]:
df_sub.head()

Unnamed: 0,ID,fold_0,fold_1,fold_2,fold_3,fold_4
0,24813264385557040124808779273028388499,67155.225966,67186.741261,66247.209158,67708.655823,66167.898408
1,295636316453795508942188530111300065666,105809.113493,102890.851615,103005.792747,107047.383118,101236.83443
2,101258309166227950735244624080888109884,88734.928559,87630.126377,87929.481485,88052.632837,90828.595065
3,28348734455782469411126661985772047409,73562.068977,76831.34721,72480.254374,72616.502702,75857.882834
4,193163160502972147671913739170248305797,99902.168138,100443.789464,97705.923833,98960.887857,96152.968403


In [34]:
pred_cols = [x for x in df_sub.columns if x.startswith('fold_')]

# take median because metric is MAE
df_sub['preco'] = df_sub[pred_cols].median(axis=1)

In [35]:
df_sub.head()

Unnamed: 0,ID,fold_0,fold_1,fold_2,fold_3,fold_4,preco
0,24813264385557040124808779273028388499,67155.225966,67186.741261,66247.209158,67708.655823,66167.898408,67155.225966
1,295636316453795508942188530111300065666,105809.113493,102890.851615,103005.792747,107047.383118,101236.83443,103005.792747
2,101258309166227950735244624080888109884,88734.928559,87630.126377,87929.481485,88052.632837,90828.595065,88052.632837
3,28348734455782469411126661985772047409,73562.068977,76831.34721,72480.254374,72616.502702,75857.882834,73562.068977
4,193163160502972147671913739170248305797,99902.168138,100443.789464,97705.923833,98960.887857,96152.968403,98960.887857


In [36]:
sub = pd.read_csv('data/sample_submission.csv')

In [37]:
sub['preco'] = df_sub['preco']

In [38]:
sub.head()

Unnamed: 0,ID,preco
0,24813264385557040124808779273028388499,67155.225966
1,295636316453795508942188530111300065666,103005.792747
2,101258309166227950735244624080888109884,88052.632837
3,28348734455782469411126661985772047409,73562.068977
4,193163160502972147671913739170248305797,98960.887857


In [40]:
sub.to_csv('submission.csv', index=False)