## Air Quality Improved

In [1]:
# Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import catboost as catt
import xgboost as xgb
import lightgbm as lgb
from tqdm.notebook import tqdm as tqdm_notebook

from sklearn.model_selection import train_test_split, KFold,GroupKFold,StratifiedKFold

from sklearn.metrics import mean_squared_error, mean_absolute_error

pd.options.display.float_format = '{:.5f}'.format
pd.options.display.max_rows = 2200
pd.options.display.max_columns = 2200

%matplotlib inline
import warnings
warnings.filterwarnings('ignore')

import optuna

from geopy.geocoders import Nominatim
from geopy.point import Point
import requests
from io import StringIO 

  from pandas import MultiIndex, Int64Index


In [2]:
# Load files
path= './'
train = pd.read_csv(path + 'Train.csv', parse_dates = ['Date'])
test = pd.read_csv(path + 'Test.csv', parse_dates = ['Date'])
sub= pd.read_csv(path+ 'SampleSubmission.csv')


train.shape, test.shape

((30557, 82), (16136, 77))

In [3]:
train = train.sort_values(by=['Date','Place_ID']).reset_index(drop=True)
test = test.sort_values(by=['Date','Place_ID']).reset_index(drop=True)

In [4]:
cat_columns = [col for col in train.select_dtypes('object').columns]

num_columns = [col for col in train.select_dtypes(['int', 'float']).columns if col not in ['pm2_5']]

In [5]:
test = test[[col for col in test.columns if 'L3_CH4' not in col]]
train = train[[col for col in train.columns if 'L3_CH4' not in col]]




In [6]:
#CONCATENATING BOTH THE TEST AND TRAIN FOR EASY DATA MANIPULATON
ntrain = train.shape[0]
ntest = test.shape[0]
data = pd.concat((train, test)).reset_index(drop=True)

In [7]:
nan_cols = [c for c in data.columns if c not in ['Place_ID X Date', 'Date', 'Place_ID', 'target', 'target_min',
       'target_max', 'target_variance', 'target_count']]
nan_cols = data[nan_cols].columns[data[nan_cols].isnull().any()].tolist()

for col in nan_cols:
    
    while data[col].isnull().sum()>0:
     
        data[col].fillna(data[["Place_ID", col]].groupby(["Place_ID"]).shift(periods=0).fillna(method='ffill', limit=1).fillna(method='bfill', limit=1)[col], inplace=True) 

data.isnull().sum()[data.isnull().sum()>0]

target             16136
target_min         16136
target_max         16136
target_variance    16136
target_count       16136
dtype: int64

In [8]:
data['target'].loc[data['target'] >300]  = 300
data['target'].loc[data['target'] <12.5]  = 12.5

In [9]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()

le_columns = [col for col in data.select_dtypes('object').columns if col not in ['Place_ID X Date','']]

for col in le_columns:
   data[col] = le.fit_transform(data[col])

In [10]:
# Convert datetime variables to datetime objects
data["Date"] = pd.to_datetime(data.Date)


In [11]:
lag_cols =[ col for col in data.columns if col not in ['Place_ID X Date', 'Date', 'Place_ID', 'target', 'target_min',
       'target_max', 'target_variance', 'target_count']]

for col in lag_cols:
    for i in tqdm_notebook(range(1, 8)):
        data[f'prev_{col}_{i}'] = data.groupby('Place_ID')[col].shift(i)
        data[f'next_{col}_{i}'] = data.groupby('Place_ID')[col].shift(-i)
#         data[f'prevchange_{col}_{i}'] = data[col] - data[f'prev_{col}_{i}']
        

  0%|          | 0/7 [00:00<?, ?it/s]

  0%|          | 0/7 [00:00<?, ?it/s]

  0%|          | 0/7 [00:00<?, ?it/s]

  0%|          | 0/7 [00:00<?, ?it/s]

  0%|          | 0/7 [00:00<?, ?it/s]

  0%|          | 0/7 [00:00<?, ?it/s]

  0%|          | 0/7 [00:00<?, ?it/s]

  0%|          | 0/7 [00:00<?, ?it/s]

  0%|          | 0/7 [00:00<?, ?it/s]

  0%|          | 0/7 [00:00<?, ?it/s]

  0%|          | 0/7 [00:00<?, ?it/s]

  0%|          | 0/7 [00:00<?, ?it/s]

  0%|          | 0/7 [00:00<?, ?it/s]

  0%|          | 0/7 [00:00<?, ?it/s]

  0%|          | 0/7 [00:00<?, ?it/s]

  0%|          | 0/7 [00:00<?, ?it/s]

  0%|          | 0/7 [00:00<?, ?it/s]

  0%|          | 0/7 [00:00<?, ?it/s]

  0%|          | 0/7 [00:00<?, ?it/s]

  0%|          | 0/7 [00:00<?, ?it/s]

  0%|          | 0/7 [00:00<?, ?it/s]

  0%|          | 0/7 [00:00<?, ?it/s]

  0%|          | 0/7 [00:00<?, ?it/s]

  0%|          | 0/7 [00:00<?, ?it/s]

  0%|          | 0/7 [00:00<?, ?it/s]

  0%|          | 0/7 [00:00<?, ?it/s]

  0%|          | 0/7 [00:00<?, ?it/s]

  0%|          | 0/7 [00:00<?, ?it/s]

  0%|          | 0/7 [00:00<?, ?it/s]

  0%|          | 0/7 [00:00<?, ?it/s]

  0%|          | 0/7 [00:00<?, ?it/s]

  0%|          | 0/7 [00:00<?, ?it/s]

  0%|          | 0/7 [00:00<?, ?it/s]

  0%|          | 0/7 [00:00<?, ?it/s]

  0%|          | 0/7 [00:00<?, ?it/s]

  0%|          | 0/7 [00:00<?, ?it/s]

  0%|          | 0/7 [00:00<?, ?it/s]

  0%|          | 0/7 [00:00<?, ?it/s]

  0%|          | 0/7 [00:00<?, ?it/s]

  0%|          | 0/7 [00:00<?, ?it/s]

  0%|          | 0/7 [00:00<?, ?it/s]

  0%|          | 0/7 [00:00<?, ?it/s]

  0%|          | 0/7 [00:00<?, ?it/s]

  0%|          | 0/7 [00:00<?, ?it/s]

  0%|          | 0/7 [00:00<?, ?it/s]

  0%|          | 0/7 [00:00<?, ?it/s]

  0%|          | 0/7 [00:00<?, ?it/s]

  0%|          | 0/7 [00:00<?, ?it/s]

  0%|          | 0/7 [00:00<?, ?it/s]

  0%|          | 0/7 [00:00<?, ?it/s]

  0%|          | 0/7 [00:00<?, ?it/s]

  0%|          | 0/7 [00:00<?, ?it/s]

  0%|          | 0/7 [00:00<?, ?it/s]

  0%|          | 0/7 [00:00<?, ?it/s]

  0%|          | 0/7 [00:00<?, ?it/s]

  0%|          | 0/7 [00:00<?, ?it/s]

  0%|          | 0/7 [00:00<?, ?it/s]

  0%|          | 0/7 [00:00<?, ?it/s]

  0%|          | 0/7 [00:00<?, ?it/s]

  0%|          | 0/7 [00:00<?, ?it/s]

  0%|          | 0/7 [00:00<?, ?it/s]

  0%|          | 0/7 [00:00<?, ?it/s]

  0%|          | 0/7 [00:00<?, ?it/s]

  0%|          | 0/7 [00:00<?, ?it/s]

  0%|          | 0/7 [00:00<?, ?it/s]

  0%|          | 0/7 [00:00<?, ?it/s]

  0%|          | 0/7 [00:00<?, ?it/s]

In [12]:
# TARGET_COL = 'target'
# for i in range(1, 8):
#     data[f'prev_target_{i}'] = data.sort_values(by='Date')[TARGET_COL].fillna(method='ffill').shift(i).sort_index()
#     data[f'next_target_{i}'] = data.sort_values(by='Date')[TARGET_COL].fillna(method='bfill').shift(-i).sort_index()

In [13]:
# data.head()

In [14]:
# for i in tqdm_notebook(range(1, 8)):
#     data[f'forward_mean_{i}'] = data.sort_values(by='Date')['target'].shift(i).expanding().mean().fillna(method='ffill').sort_index()
#     data[f'backward_mean_{i}'] = data.sort_values(by='Date')['target'].shift(-i).expanding().mean().fillna(method='bfill').sort_index()

In [15]:
for col in lag_cols:
    data[f'gb_feature_date{col}'] =data.groupby(['Date'])[col].transform('mean')


        

In [16]:
# Extract day, month year and hour from the Datetime column
# day
data['Datetime_day'] = data.Date.dt.day

# month
data['Datetime_month'] = data.Date.dt.month
 



data['weekday'] =data.Date.dt.weekday

data['month_day'] = data['Datetime_month'].astype(str) + '_' + data['Datetime_day'].astype(str)
data['month_wkday'] = data['Datetime_month'].astype(str) + '_' + data['weekday'].astype(str)



In [17]:

le_columns = ['month_day','month_wkday','Date']

for col in le_columns:
   data[col] = le.fit_transform(data[col])

In [18]:
cat_cols = ['Place_ID','Date'
           
           ]
## Count of unique features
for i in cat_cols:
    data['count_'+i] = data[i].map(data[i].value_counts())
# cat_cols

In [19]:
# Separate train and test data from the combined dataframe
# data = data.drop("ID", axis = 1)
train = data[:ntrain]
test = data[ntrain:]
train_df= train.copy()
test_df= test.copy()




# Check the shapes of the split dataset
train.shape, test.shape

((30557, 1087), (16136, 1087))

In [20]:
# train_df.corr()['target'].sort_values()[train_df.corr()['target'].sort_values() < 0]

In [21]:
# train_df.corr()['target'].sort_values()[train_df.corr()['target'].sort_values() > 0]

In [22]:

main_cols = train_df.columns.difference(['Place_ID X Date','target', 'target_min',
       'target_max', 'target_variance', 'target_count','Place_ID'])
    

In [23]:

# X = train_df[main_cols]
# y = train_df.target

# model = catt.CatBoostRegressor(random_state = 42,)

# import random
# random.seed(123)

# train_features, valid_features, train_y, valid_y = train_test_split(X, y, test_size = 0.18, stratify = X['Datetime_month'],random_state = 47)
# model.fit(train_features, train_y,eval_set = [(train_features, train_y),(valid_features,valid_y)], early_stopping_rounds = 500, 
#           verbose = 200)

In [24]:
# pd.options.display.max_rows = 2200
# #define a function to create variable importance dataframe
# def get_catt_varimp(model, train_columns, max_vars=350):
    
#     # Scikit-learn API LGBMClassifier or LGBMRegressor was fitted, 
#     # so using feature_importances_ property
#     feature_importances_perc = (model.feature_importances_ / sum(model.feature_importances_)) * 100
#     cv_varimp_df = pd.DataFrame([train_columns, feature_importances_perc]).T

#     cv_varimp_df.columns = ['feature_name', 'varimp (%)']

#     cv_varimp_df.sort_values(by='varimp (%)', ascending=False, inplace=True)

#     cv_varimp_df = cv_varimp_df.iloc[0:max_vars]   

#     return cv_varimp_df
# #
# get_catt_varimp(model, X.columns)

In [25]:
X = train_df[main_cols]
y = train_df['target']
X_test = test_df[main_cols]
X.shape,X_test.shape

((30557, 1080), (16136, 1080))

In [26]:


def objective(trial):
    fold_pred=[]
    oof_pred = []


    param = {}
    param['objective'] = "rmse"

    param["learning_rate"] = trial.suggest_float("learning_rate", 1e-2, 0.25, log=True)
#     param['reg_lambda'] = trial.suggest_loguniform("reg_lambda", 1e-8, 100.0)
#     param['reg_alpha'] = trial.suggest_loguniform("reg_alpha", 1e-8, 100.0)
    param['subsample'] = trial.suggest_float("subsample", 0.1, 1.0)
    param['colsample_bytree'] = trial.suggest_float("colsample_bytree", 0.1, 1.0)
    param['max_depth'] = trial.suggest_int("max_depth", 5, 20)
    # param['scale_pos_weight'] = trial.suggest_int('scale_pos_weight', 3, 10)




    fold=StratifiedKFold(n_splits=10)#15#5#10
    i=1
    for train_index, test_index in fold.split(X,X['Datetime_month']):
        
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = (y.iloc[train_index]), y.iloc[test_index]
        # m2 = CatBoostRegressor(**param)
        m2 = lgb.LGBMRegressor(**param, n_estimators=2000)

        m2.fit(X_train,y_train,eval_set=[(X_train,y_train),(X_test, y_test)], early_stopping_rounds=200, verbose = False)#erly100
        preds=m2.predict(X_test)
        oof_pred.append(mean_squared_error(y_test,(preds), squared=False))

    

    return np.mean(oof_pred)



In [27]:
# study = optuna.create_study(direction="minimize")
# study.optimize(objective, n_trials=100)

In [28]:
# print("Number of finished trials: {}".format(len(study.trials)))

# print("Best trial:")
# trial = study.best_trial

# print("  Value: {}".format(trial.value))

# print("  Params: ")
# for key, value in trial.params.items():
#     print("    {}: {}".format(key, value))

In [29]:
# trial.params   #best params

In [30]:


fold_pred=[]
oof_pred = []

param = {'learning_rate': 0.03531634926079207,
 'subsample': 0.36275907211034,
 'colsample_bytree': 0.2822120227878022,
 'max_depth': 16}

fold= StratifiedKFold(n_splits=10)#15#5#10
i=1
for train_index, test_index in fold.split(X,X['Datetime_month']):     
  
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = (y.iloc[train_index]), y.iloc[test_index]

    model = lgb.LGBMRegressor(**param,objective='rmse', n_estimators=2000)

    model.fit(X_train,y_train,eval_set=[(X_train,y_train),(X_test, y_test)], early_stopping_rounds=200, verbose = False)#erly100

    preds= model.predict(X_test)
    print("err: ",(mean_squared_error(y_test,(preds), squared=False)))  #Reverse transformation
    oof_pred.append(mean_squared_error(y_test,(preds),squared=False))
    p2 = model.predict(test_df[X.columns])
    fold_pred.append((p2))
    

print(np.mean(oof_pred))



err:  28.961173616846743
err:  24.44802691064127
err:  24.225940715505054
err:  24.444868819364874
err:  24.628316969410015
err:  23.838539013760037
err:  23.398411964797937
err:  23.983564636947595
err:  26.127627404679252
err:  23.033734668370894
24.70902047203237


31.602061259577983

In [31]:
sub_df = test_df[['Place_ID X Date']].copy()
sub_df['target'] = np.mean(fold_pred, axis = 0)
sub_df.head()

Unnamed: 0,Place_ID X Date,target
30557,0OS9LVX X 2020-01-02,43.59879
30558,0Q2LTOG X 2020-01-02,102.72293
30559,15WCXN1 X 2020-01-02,190.52692
30560,19090SM X 2020-01-02,106.60923
30561,1BD5TVT X 2020-01-02,77.51863


In [32]:
sub_df.to_csv('lgb_iamtired.csv', index=False)


## CatBoost Modelling

In [34]:
fold_pred=[]
oof_pred = []

params = {'max_depth': 5, 'learning_rate': 0.054311625828468046, 'l2_leaf_reg': 0.023393133390917083,'iterations':10000}





fold= StratifiedKFold(n_splits=10)#15#5#10
i=1
for train_index, test_index in fold.split(X,X['Datetime_month']):     
  
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = (y.iloc[train_index]), y.iloc[test_index]

    model = catt.CatBoostRegressor(**params,eval_metric='RMSE',random_state=42)
    model.fit(X_train,y_train,eval_set=[(X_test, y_test)], early_stopping_rounds=250, verbose = False)#erly100

    preds= model.predict(X_test)
    print("err: ",(mean_squared_error(y_test,(preds), squared=False)))  #Reverse transformation
    oof_pred.append(mean_squared_error(y_test,(preds),squared=False))
    p2 = model.predict(test_df[X.columns])
    fold_pred.append((p2))
    

print(np.mean(oof_pred))

Custom logger is already specified. Specify more than one logger at same time is not thread safe.

err:  28.245909043275752
err:  24.445605647169092
err:  24.41352497865531
err:  24.215332960827027
err:  23.96650828404815
err:  23.794847616032087
err:  22.998438738073865
err:  23.38874159606381
err:  25.82744366436217
err:  22.971144355403514
24.426749688391077


25.65513541690664 - 25.642662237542975 - 24.70872401160831

In [35]:
sub_df = test_df[['Place_ID X Date']].copy()
sub_df['target'] = np.mean(fold_pred, axis = 0)
sub_df.head()

Unnamed: 0,Place_ID X Date,target
30557,0OS9LVX X 2020-01-02,47.84196
30558,0Q2LTOG X 2020-01-02,97.07852
30559,15WCXN1 X 2020-01-02,210.33255
30560,19090SM X 2020-01-02,115.99422
30561,1BD5TVT X 2020-01-02,79.06202


In [36]:
sub_df.to_csv('cat_iamtired.csv', index=False)


## Ensembling

In [37]:
first = pd.read_csv('lgb_iamtired.csv')
second = pd.read_csv('cat_iamtired.csv')
ens = first.copy()

In [38]:
ens['target'] = first['target']*0.6 + second['target']*0.4
ens.to_csv('ensemble_1.csv', index=False)

In [39]:
ens['target'] = second['target']*0.6 + first['target']*0.4
ens.to_csv('ensemble_2.csv', index=False)