# Importing libraries

In [None]:
# Import the libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import lightgbm as lgb
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_squared_error

%matplotlib inline

# Loading data

In [None]:
# Read the training and test CSV files from the relative ../input directory.
train = pd.read_csv('../input/urban-air-pollution-challenge-by-zindiweekendz/Train (9).csv')
test = pd.read_csv('../input/urban-air-pollution-challenge-by-zindiweekendz/Test (4).csv')

# Feature engineering

In [None]:
# Print the column names, dtypes and non-null counts of the training frame.
train.info()

In [None]:
# Summary statistics of the training columns.
train.describe()

In [None]:
# Summary statistics of the test columns, for comparison with the training frame.
test.describe()

In [None]:
# Parse the 'Date' column of both frames into pandas datetimes.
train['Date'] = pd.to_datetime(train['Date'])
test['Date'] = pd.to_datetime(test['Date'])

In [None]:
# Derive calendar features from the parsed 'Date' column of both frames.
#
# `Series.dt.week` was deprecated in pandas 1.1 and removed in pandas 2.0;
# `Series.dt.isocalendar().week` returns the same ISO week number. It comes
# back as a nullable UInt32 column, so it is cast to a plain int64 to match
# the dtype of the other calendar features.
# NOTE(review): the cast assumes 'Date' has no missing values - confirm.
for frame in (train, test):
    frame['year'] = frame['Date'].dt.year
    frame['month'] = frame['Date'].dt.month
    frame['week'] = frame['Date'].dt.isocalendar().week.astype('int64')
    frame['day'] = frame['Date'].dt.day

In [None]:
# Cumulative day count attached to each month, plus the fallback used for any
# other month value.
#
# The original nested lambda tested `x == 1` twice, so the third branch
# (28+31+31) was unreachable and March rows silently received the fallback
# value. The third key is now 3 so March gets its own value.
#
# NOTE(review): the values are cumulative days through the END of each month
# (with a 28-day February), so `total_days` is offset from the true
# day-of-year. The constants are kept as written to preserve the existing
# feature definition - confirm intent.
days_through_month = {1: 31, 2: 28 + 31, 3: 28 + 31 + 31}
days_fallback = 28 + 30 + 31 + 31

for frame in (train, test):
    frame['total_days_month'] = (
        frame['month']
        .map(days_through_month)
        .fillna(days_fallback)  # months not listed above use the fallback
        .astype('int64')
    )
    frame['total_days'] = frame['total_days_month'] + frame['day']

In [None]:
# The raw 'Date' column is removed from both frames at this point.
train = train.drop(columns=['Date'])
test = test.drop(columns=['Date'])

In [None]:
# Pairwise correlations between the numeric columns of the training frame.
# Non-numeric columns are excluded explicitly: since pandas 2.0
# `DataFrame.corr()` no longer drops them silently and raises instead.
# (Place_ID is label-encoded later, so it is presumably still a string here.)
corrmat = train.select_dtypes(include='number').corr()

# Names of the columns whose correlation with the target exceeds 0.2.
corrmat[corrmat['target'] > 0.2].target.index

In [None]:
def radian_conv(degree):
    """
    Convert an angle given in degrees to radians.

    Accepts anything ``np.radians`` accepts (scalar, array, pandas Series)
    and returns the converted value(s).
    """
    in_radians = np.radians(degree)
    return in_radians
# Angle columns (NO2, O3 and CO products) that are converted from degrees to
# radians in place, first for the training frame and then for the test frame.
angle_columns = [
    'L3_NO2_sensor_azimuth_angle',
    'L3_NO2_sensor_zenith_angle',
    'L3_NO2_solar_azimuth_angle',
    'L3_NO2_solar_zenith_angle',
    'L3_O3_sensor_azimuth_angle',
    'L3_O3_sensor_zenith_angle',
    'L3_O3_solar_azimuth_angle',
    'L3_O3_solar_zenith_angle',
    'L3_CO_sensor_azimuth_angle',
    'L3_CO_sensor_zenith_angle',
    'L3_CO_solar_azimuth_angle',
    'L3_CO_solar_zenith_angle',
]

for frame in (train, test):
    for angle_column in angle_columns:
        frame[angle_column] = radian_conv(frame[angle_column])

In [None]:
# Stack the training rows on top of the test rows so that the encoding and lag
# features below are computed over both at once. The original row indices are
# kept (no ignore_index), so index labels repeat between the two parts.
data=pd.concat([train,test],axis=0)
data

In [None]:
# Replace the Place_ID values with integer codes fitted on the combined frame.
label_encoder =LabelEncoder() 
data['Place_ID']= label_encoder.fit_transform(data['Place_ID']) 

In [None]:
# 1-based running row number over the combined frame; lag_feature() uses it
# as the position key when shifting values.
# The length is derived from the frame instead of the hard-coded 46693 rows,
# so the cell keeps working when the input files change size.
data['num'] = np.arange(1, len(data) + 1)

In [None]:
def lag_feature(df, lags, col):
    """
    Append lagged copies of ``col`` to ``df``.

    For every ``k`` in ``lags`` a column ``<col>_lag_<k>`` is added. It holds
    the value of ``col`` from the row whose ``num`` is ``k`` smaller and whose
    ``Place_ID`` is the same; rows with no such match get NaN.

    Parameters
    ----------
    df : DataFrame containing ``Place_ID``, ``num`` and ``col``.
    lags : iterable of int offsets, measured in ``num`` units.
    col : name of the column to lag.

    Returns
    -------
    A new DataFrame with one extra column per lag, in the order of ``lags``.
    """
    keys = ['Place_ID', 'num']
    source = df[keys + [col]]
    for lag in lags:
        lagged = source.rename(columns={col: col + '_lag_' + str(lag)})
        # Moving `num` forward by `lag` aligns each value with the row that
        # comes `lag` positions later.
        lagged = lagged.assign(num=lagged['num'] + lag)
        df = df.merge(lagged, on=keys, how='left')
    return df

In [None]:
# Lag offsets to generate for each source feature. The insertion order is the
# order in which the lag columns are appended to `data`.
lag_plan = {
    'precipitable_water_entire_atmosphere': [1, 2, 3, 4, 5, 6],
    'temperature_2m_above_ground': [1, 2, 3, 4, 5, 6],
    'L3_NO2_NO2_column_number_density': [1, 2, 3, 30],
    'L3_O3_O3_column_number_density': [1, 2, 3, 4, 5, 30],
    'L3_HCHO_tropospheric_HCHO_column_number_density': [1, 2, 3, 4, 5, 6, 30],
    'L3_CO_CO_column_number_density': [1, 2, 3, 4, 5, 30],
    'L3_CLOUD_surface_albedo': [1, 2, 3, 4, 5, 30],
    'L3_CLOUD_cloud_optical_depth': [1, 2, 3, 4, 5, 30],
    'relative_humidity_2m_above_ground': [1, 2, 3],
    'L3_AER_AI_sensor_altitude': [10, 20, 30],
    'L3_NO2_tropopause_pressure': [1, 2, 3, 4, 30],
}

for source_feature, lag_steps in lag_plan.items():
    data = lag_feature(data, lag_steps, source_feature)

In [None]:
# Split the combined frame back into its training and test parts.
# The boundary is taken from the original training frame instead of the
# hard-coded 30557, and the row count is checked first so that an unexpected
# change in size during feature engineering fails loudly.
n_train = len(train)
assert len(data) == n_train + len(test), "row count changed during feature engineering"

# .copy() gives independent frames, so later column assignments do not hit
# pandas' chained-assignment warnings.
train = data.iloc[:n_train].copy()
test = data.iloc[n_train:].copy()

# Modeling

In [None]:
# Feature columns fed to the model, in a fixed order: two raw weather
# features, the lag features (named '<feature>_lag_<k>' as produced by
# lag_feature), then the remaining raw, satellite and calendar features.
columns = [
    'precipitable_water_entire_atmosphere',
    'relative_humidity_2m_above_ground',

    # Lag features, expanded from (feature, lags) pairs in this exact order.
    *[
        feature + '_lag_' + str(lag)
        for feature, lags in (
            ('precipitable_water_entire_atmosphere', (1, 2, 3, 4, 5, 6)),
            ('temperature_2m_above_ground', (1, 2, 3, 4, 5, 6)),
            ('L3_NO2_NO2_column_number_density', (1, 2, 3, 30)),
            ('L3_O3_O3_column_number_density', (1, 2, 3, 4, 5, 30)),
            ('L3_HCHO_tropospheric_HCHO_column_number_density', (1, 2, 3, 4, 5, 6, 30)),
            ('L3_CO_CO_column_number_density', (1, 2, 3, 4, 5, 30)),
            ('L3_CLOUD_surface_albedo', (1, 2, 3, 4, 5, 30)),
            ('L3_CLOUD_cloud_optical_depth', (1, 2, 3, 4, 5, 30)),
            ('relative_humidity_2m_above_ground', (1, 2, 3)),
            ('L3_AER_AI_sensor_altitude', (10, 20, 30)),
            ('L3_NO2_tropopause_pressure', (1, 2, 3, 4, 30)),
        )
        for lag in lags
    ],

    # Weather
    'specific_humidity_2m_above_ground',
    'temperature_2m_above_ground',
    'u_component_of_wind_10m_above_ground',
    'v_component_of_wind_10m_above_ground',

    # NO2
    'L3_NO2_NO2_column_number_density',
    'L3_NO2_NO2_slant_column_number_density',
    'L3_NO2_absorbing_aerosol_index',
    'L3_NO2_cloud_fraction',
    'L3_NO2_sensor_altitude',
    'L3_NO2_sensor_azimuth_angle',
    'L3_NO2_sensor_zenith_angle',
    'L3_NO2_solar_azimuth_angle',
    'L3_NO2_solar_zenith_angle',
    'L3_NO2_stratospheric_NO2_column_number_density',
    'L3_NO2_tropopause_pressure',
    'L3_NO2_tropospheric_NO2_column_number_density',

    # O3
    'L3_O3_O3_column_number_density',
    'L3_O3_O3_effective_temperature',
    'L3_O3_cloud_fraction',
    'L3_O3_sensor_azimuth_angle',
    'L3_O3_sensor_zenith_angle',
    'L3_O3_solar_azimuth_angle',
    'L3_O3_solar_zenith_angle',

    # CO
    'L3_CO_CO_column_number_density',
    'L3_CO_H2O_column_number_density',
    'L3_CO_cloud_height',
    'L3_CO_sensor_altitude',
    'L3_CO_sensor_azimuth_angle',
    'L3_CO_sensor_zenith_angle',
    'L3_CO_solar_azimuth_angle',
    'L3_CO_solar_zenith_angle',

    # HCHO
    'L3_HCHO_HCHO_slant_column_number_density',
    'L3_HCHO_cloud_fraction',
    'L3_HCHO_sensor_azimuth_angle',
    'L3_HCHO_sensor_zenith_angle',
    'L3_HCHO_solar_azimuth_angle',
    'L3_HCHO_solar_zenith_angle',
    'L3_HCHO_tropospheric_HCHO_column_number_density',
    'L3_HCHO_tropospheric_HCHO_column_number_density_amf',

    # CLOUD
    'L3_CLOUD_cloud_base_height',
    'L3_CLOUD_cloud_base_pressure',
    'L3_CLOUD_cloud_fraction',
    'L3_CLOUD_cloud_optical_depth',
    'L3_CLOUD_cloud_top_height',
    'L3_CLOUD_cloud_top_pressure',
    'L3_CLOUD_sensor_azimuth_angle',
    'L3_CLOUD_sensor_zenith_angle',
    'L3_CLOUD_solar_azimuth_angle',
    'L3_CLOUD_solar_zenith_angle',
    'L3_CLOUD_surface_albedo',

    # AER_AI
    'L3_AER_AI_absorbing_aerosol_index',
    'L3_AER_AI_sensor_altitude',
    'L3_AER_AI_sensor_azimuth_angle',
    'L3_AER_AI_sensor_zenith_angle',
    'L3_AER_AI_solar_azimuth_angle',
    'L3_AER_AI_solar_zenith_angle',

    # SO2
    'L3_SO2_SO2_column_number_density',
    'L3_SO2_SO2_column_number_density_amf',
    'L3_SO2_SO2_slant_column_number_density',
    'L3_SO2_absorbing_aerosol_index',
    'L3_SO2_cloud_fraction',
    'L3_SO2_sensor_azimuth_angle',
    'L3_SO2_sensor_zenith_angle',
    'L3_SO2_solar_azimuth_angle',
    'L3_SO2_solar_zenith_angle',

    # CH4
    'L3_CH4_sensor_zenith_angle',

    # Calendar
    'year',
    'month',
    'week',
    'day',
    'total_days_month',
    'total_days',
]

In [None]:
# LightGBM hyper-parameters. The boosting-round limit is passed to lgb.train()
# below rather than here: the original had `num_iterations` in params AND a
# different `num_boost_round` argument, and LightGBM uses the params entry
# (with a warning), so 30000 is the limit that was actually in effect.
params = {
    'objective': 'regression',
    'learning_rate': 0.02,
    'max_bins': 50,
    'max_depth': 7,
    'num_leaves': 70,
    'feature_fraction': 0.64,
    'bagging_fraction': 0.8,
    'bagging_freq': 1,
    'boosting_type': 'gbdt',
    'metric': 'rmse',
    'min_data_in_leaf': 5,
    'reg_lambda': 100,
}
max_boost_rounds = 30000
categorical_features = ['year', 'month', 'day', 'week']

# Hold out a fixed share of places for validation. The original validated on
# the training rows themselves, so early stopping could never trigger and the
# logged RMSE was training error. Whole places are held out (not random rows)
# because the lag features link consecutive rows of the same place.
valid_fraction = 0.2
split_seed = 42
place_ids = np.sort(train['Place_ID'].unique())
n_valid_places = max(1, int(round(len(place_ids) * valid_fraction)))
valid_places = np.random.RandomState(split_seed).choice(
    place_ids, size=n_valid_places, replace=False
)
is_valid = train['Place_ID'].isin(valid_places)

train_set = lgb.Dataset(
    train.loc[~is_valid, columns],
    train.loc[~is_valid, 'target'],
    categorical_feature=categorical_features,
)
# reference=train_set makes the validation data reuse the training set's binning.
valid_set = lgb.Dataset(
    train.loc[is_valid, columns],
    train.loc[is_valid, 'target'],
    categorical_feature=categorical_features,
    reference=train_set,
)

# Early stopping and periodic logging are passed as callbacks; the
# `early_stopping_rounds` / `verbose_eval` keyword arguments (and the Dataset
# `silent` argument) no longer exist in LightGBM 4.
model = lgb.train(
    params,
    train_set=train_set,
    num_boost_round=max_boost_rounds,
    valid_sets=[valid_set],
    callbacks=[
        lgb.early_stopping(stopping_rounds=500),
        lgb.log_evaluation(period=500),
    ],
)

In [None]:
# Predict the target for the test rows from the same feature columns used for training.
prediction = model.predict(test[columns])
prediction

### Plotting feature importance

In [None]:
# Plot LightGBM's feature importance for the trained model.
# The figure is tall (20x40), presumably to fit the long feature list.
lgb.plot_importance(model, figsize=(20,40))
plt.show()

### Create Submission

In [None]:
# Submission
from IPython.display import FileLink


def create_submission(submission_file, submission_name):
    """
    Write ``submission_file`` to ``<submission_name>.csv`` (without the index)
    and return an IPython ``FileLink`` pointing at the written file.
    """
    csv_name = submission_name + ".csv"
    submission_file.to_csv(csv_name, index=False)
    return FileLink(csv_name)


# One row per test record: its identifier and the predicted target.
submit = pd.DataFrame({'Place_ID X Date': test['Place_ID X Date'], 'target': prediction})
create_submission(submit, 'submit')