### Download the data

In [None]:
!gdown --id dataset_id --output train.csv
!gdown --id dataset_id --output test.csv

### Install and import packages

In [None]:
%pip install -U scikit-learn
%pip install xgboost

In [3]:
import os
import math
import csv

import calendar
from datetime import datetime

import numpy as np
import pandas as pd

from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.utils import shuffle

from scipy.stats import uniform, randint

from xgboost.sklearn import XGBClassifier

### Configuration

In [13]:
# Notebook-wide settings read by feature_select(), the data-loading cell and
# the feature-importance cell.
config = {
    # When truthy, feature_select() returns every column and ignores 'feature_selected'.
    'all_feature' : False,
    # CSV paths passed to pd.read_csv() when the data is loaded.
    'train_data' : "./train.csv",
    'test_data' : "./test.csv",
    # Positional column indices kept by feature_select() when 'all_feature' is falsy.
    # They index the feature matrix built after preprocessing (label column removed).
    'feature_selected' : [0, 2, 3, 4, 6, 8, 9, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31]
}

In [5]:
def feature_select(x_train, x_test):
  """Restrict both feature matrices to the columns chosen in ``config``.

  Args:
    x_train: 2-D array of training features.
    x_test: 2-D array of test features with the same column layout.

  Returns:
    ``(x_train, x_test)`` untouched when ``config['all_feature']`` is truthy,
    otherwise both arrays indexed with ``config['feature_selected']`` along
    the column axis.
  """
  if config['all_feature']:
    return x_train, x_test

  columns = config['feature_selected']
  return x_train[:, columns], x_test[:, columns]

def date_info_processing(data):
  """Replace each row's date string with its fractional position in the year.

  The first element of every row is parsed as ``'%Y-%m-%d'`` and overwritten,
  in place, with ``day_of_year / days_in_that_year`` (366 for leap years,
  365 otherwise).

  Args:
    data: indexable collection of rows whose element 0 is a date string.

  Returns:
    The same ``data`` object, after modification.
  """
  for row_idx in range(len(data)):
    parsed = datetime.strptime(data[row_idx][0], '%Y-%m-%d')
    days_in_year = 366 if calendar.isleap(parsed.year) else 365
    data[row_idx][0] = float(parsed.timetuple().tm_yday / days_in_year)

  return data

def drop_features(data, features_to_drop):
  """Remove the named columns from ``data`` in place.

  Args:
    data: DataFrame to modify.
    features_to_drop: column label or list of labels to remove.

  Returns:
    The same ``data`` object, without the dropped columns.
  """
  # `columns=` already targets the column axis, so no separate axis argument is needed.
  data.drop(columns=features_to_drop, inplace=True)
  return data


### Data Preprocessing Functions

In [14]:
def array_threshold(tar: np.ndarray):
  """Zero out, in place, every entry whose magnitude is below 1e-5.

  Args:
    tar: array-like supporting ``abs()`` and boolean-mask assignment.

  Returns:
    The same ``tar`` object after the small entries were set to 0.
  """
  tar[abs(tar) < 0.00001] = 0
  return tar

def date_processing(data: pd.DataFrame):
  """Replace the 'Attribute1' date column with month-based features.

  'Attribute1' is parsed with ``pd.to_datetime`` and removed; in its place
  three columns are inserted, in this order: 'Month' (the month number),
  'MonthSin' and 'MonthCos' (sine and cosine of ``2 * pi * month / 12``,
  passed through ``array_threshold``).

  Args:
    data: DataFrame containing an 'Attribute1' column; modified in place.

  Returns:
    The same ``data`` object.
  """
  insert_at = data.columns.get_loc('Attribute1')
  months = pd.to_datetime(data['Attribute1']).dt.month

  data.drop('Attribute1', axis=1, inplace=True)

  # Encode the month as a point on a circle.
  angle = 2 * np.pi * months / 12
  derived = [
      ('Month', months.values.tolist()),
      ('MonthSin', array_threshold(np.sin(angle))),
      ('MonthCos', array_threshold(np.cos(angle))),
  ]
  for offset, (name, values) in enumerate(derived):
    data.insert(loc=insert_at + offset, column=name, value=values)

  return data

def get_season(month):
  """Map a month number to a season code.

  Returns 0 for months 3-5, 1 for 6-8, 2 for 9-11, and 3 for any other
  value (which covers months 12, 1 and 2).
  """
  # (first month, last month, season code), bounds inclusive
  season_bounds = ((3, 5, 0), (6, 8, 1), (9, 11, 2))
  for first, last, code in season_bounds:
    if first <= month <= last:
      return code
  return 3

def season_mapping(data: pd.DataFrame):
  """Insert a 'Season' column directly after 'Attribute20'.

  The season code of each row is obtained by applying ``get_season`` to the
  row's 'Month' value.

  Args:
    data: DataFrame with 'Attribute20' and 'Month' columns; modified in place.

  Returns:
    The same ``data`` object.
  """
  insert_at = data.columns.get_loc('Attribute20') + 1
  season_codes = data['Month'].apply(get_season)
  data.insert(loc=insert_at, column='Season', value=season_codes)
  return data

def binary_processing(data: pd.DataFrame, is_test):
  """Encode the 'Yes'/'No' columns as 1/0.

  'Attribute20' is always encoded; 'Attribute21' is encoded only when
  ``is_test`` is falsy. Values other than 'Yes' and 'No' are left unchanged.

  Args:
    data: DataFrame to modify in place.
    is_test: truthy to skip 'Attribute21'.

  Returns:
    The same ``data`` object.
  """
  def encode(value):
    if value == 'Yes':
      return 1
    if value == 'No':
      return 0
    return value

  target_columns = ['Attribute20'] if is_test else ['Attribute20', 'Attribute21']
  for column in target_columns:
    data[column] = data[column].apply(encode)

  return data

def direction_processing(data: pd.DataFrame):
  """Replace the compass-direction columns with unit-vector components.

  Each of the 16 compass labels is assigned an angle, starting at 0 degrees
  for "N" and advancing 22.5 degrees per label in the order listed below. A
  label with angle ``a`` becomes the vector ``(-cos(a), -sin(a))``; components
  whose magnitude is not above 0.2 are stored as exactly 0, which removes the
  floating-point residue at the cardinal points (the smallest genuine
  component, sin(22.5 deg), is about 0.38).

  'Attribute8' and 'Attribute9' are removed and the columns 'Attribute8_x',
  'Attribute8_y', 'Attribute9_x', 'Attribute9_y' are inserted, in that order,
  starting at the position 'Attribute8' occupied. Missing directions become
  NaN in both components.

  Args:
    data: DataFrame with 'Attribute8' and 'Attribute9' columns; modified in
      place. Any index is accepted because rows are processed positionally.

  Returns:
    The same ``data`` object.

  Raises:
    KeyError: if a non-missing value is not one of the 16 compass labels.
  """
  compass_points = ["N", "NNE", "NE", "ENE", "E", "ESE", "SE", "SSE",
                    "S", "SSW", "SW", "WSW", "W", "WNW", "NW", "NNW"]
  step = 22.5  # degrees between neighbouring compass points (360 / 16)

  unit_vectors = {}
  for position, point in enumerate(compass_points):
    angle = math.radians(position * step)
    x_val = -math.cos(angle)
    y_val = -math.sin(angle)
    unit_vectors[point] = (x_val if abs(x_val) > 0.2 else 0,
                           y_val if abs(y_val) > 0.2 else 0)

  def to_components(column):
    # Iterate over the values, not index labels, so a filtered or re-indexed
    # frame works; pd.isna() recognises every missing-value representation.
    xs, ys = [], []
    for direction in data[column].tolist():
      if pd.isna(direction):
        xs.append(np.nan)
        ys.append(np.nan)
      else:
        x_val, y_val = unit_vectors[direction]
        xs.append(x_val)
        ys.append(y_val)
    return xs, ys

  new_x1_feature, new_y1_feature = to_components('Attribute8')
  new_x2_feature, new_y2_feature = to_components('Attribute9')

  tar_pos = data.columns.get_loc('Attribute8')
  data.drop(['Attribute8', 'Attribute9'], axis=1, inplace=True)
  data.insert(loc=tar_pos, column='Attribute8_x', value=new_x1_feature)
  data.insert(loc=tar_pos + 1, column='Attribute8_y', value=new_y1_feature)
  data.insert(loc=tar_pos + 2, column='Attribute9_x', value=new_x2_feature)
  data.insert(loc=tar_pos + 3, column='Attribute9_y', value=new_y2_feature)

  return data

def relative_humidity_to_absolute_humidity(relative_humidity, temperature, atmospheric_pressure):
  """Convert relative humidity into the humidity measure used by this notebook.

  Steps:
    1. saturation vapor pressure = 6.11 * 10 ** (7.5 * T / (237.7 + T))
    2. actual vapor pressure = (relative_humidity / 100) * saturation
    3. result = actual * 1000 / (0.622 * atmospheric_pressure)

  Works element-wise on anything supporting arithmetic (scalars, Series).

  NOTE(review): the constants suggest temperature in degrees Celsius and
  pressures in hPa, and the 0.622 factor sits in the denominator whereas the
  usual mixing-ratio approximation multiplies by it -- confirm both.
  """
  exponent = (7.5 * temperature) / (237.7 + temperature)
  saturation = 6.11 * 10**exponent
  vapor = (relative_humidity / 100) * saturation
  return (vapor * 1000) / (0.622 * atmospheric_pressure)

def mositure_difference(data: pd.DataFrame):
  """Return the PM-minus-AM difference of the converted humidity.

  Two readings are converted with ``relative_humidity_to_absolute_humidity``:
  the first from 'Attribute12' (humidity), 'Attribute18' (temperature) and
  'Attribute14' (pressure); the second from 'Attribute13', 'Attribute19' and
  'Attribute15'. The function returns second minus first and does not modify
  ``data``.
  """
  humidity_am = relative_humidity_to_absolute_humidity(
      data['Attribute12'], data['Attribute18'], data['Attribute14'])
  humidity_pm = relative_humidity_to_absolute_humidity(
      data['Attribute13'], data['Attribute19'], data['Attribute15'])

  return humidity_pm - humidity_am

def wind_combination(data: pd.DataFrame):
  """Scale the wind-direction components by the matching speed column.

  'Attribute8_x' / 'Attribute8_y' are multiplied by 'Attribute10', and
  'Attribute9_x' / 'Attribute9_y' by 'Attribute11'. The frame is modified
  in place and returned.
  """
  speed_for = {'Attribute8': 'Attribute10', 'Attribute9': 'Attribute11'}
  for direction, speed in speed_for.items():
    for axis_name in ('x', 'y'):
      component = f'{direction}_{axis_name}'
      data[component] = data[speed] * data[component]

  return data

def feature_difference(data: pd.DataFrame, interaction_features):
  """Append difference features right after the 'Attribute20' column.

  Args:
    data: DataFrame to extend in place; must contain 'Attribute20'.
    interaction_features: iterable of ``(feature1, feature2, new_name)``
      tuples. The new column is ``data[feature1] - data[feature2]``, except
      when ``new_name`` is 'Mositure_Difference', in which case the column
      comes from ``mositure_difference(data)``.

  Returns:
    The same ``data`` object with the new columns inserted in the order given.
  """
  # Compute every new column before inserting any of them.
  derived = {}
  for minuend, subtrahend, name in interaction_features:
    if name == 'Mositure_Difference':
      derived[name] = mositure_difference(data)
    else:
      derived[name] = data[minuend] - data[subtrahend]

  anchor = data.columns.get_loc('Attribute20')
  for offset, (name, values) in enumerate(derived.items(), start=1):
    data.insert(loc=anchor + offset, column=name, value=values)

  return data

def feature_interaction(data: pd.DataFrame, interaction_features):
  """Append product features right after the 'Attribute20' column.

  Args:
    data: DataFrame to extend in place; must contain 'Attribute20'.
    interaction_features: iterable of ``(feature1, feature2, new_name)``
      tuples; each new column is ``data[feature1] * data[feature2]``.

  Returns:
    The same ``data`` object with the new columns inserted in the order given.
  """
  # Compute every product before inserting any of them.
  products = {}
  for left, right, name in interaction_features:
    products[name] = data[left] * data[right]

  anchor = data.columns.get_loc('Attribute20')
  for offset, (name, values) in enumerate(products.items(), start=1):
    data.insert(loc=anchor + offset, column=name, value=values)

  return data

### Feature Relation

In [16]:
# Pairs consumed by feature_difference(): each tuple is (feature1, feature2, new_column)
# and the new column holds data[feature1] - data[feature2]. The 'Mositure_Difference'
# entry is the exception: feature_difference() routes it to mositure_difference()
# instead of subtracting the two listed columns directly.
feature_diff = [
    ('Attribute4', 'Attribute3', 'Day_Temperature_Difference'),
    ('Attribute19', 'Attribute18', 'Time_Temperature_Difference'),
    ('Attribute11', 'Attribute10', 'Wind_Speed_Difference'),
    ('Attribute13', 'Attribute12', 'Mositure_Difference'),
    ('Attribute15', 'Attribute14', 'Pressure_Difference'),
    ('Attribute17', 'Attribute16', 'Cloud_Difference')
]

# Pairs consumed by feature_interaction(): each tuple is (feature1, feature2, new_column)
# and the new column holds data[feature1] * data[feature2]. 'Attribute9_x' only exists
# once direction_processing() has run.
feature_multi = [
    ('Attribute13', 'Attribute19', 'Humidity_Temperature_Interact'),
    ('Attribute5', 'Attribute11', 'Rainfall_Wind_Speed_Interact'),
    ('Attribute5', 'Attribute9_x', 'Rainfall_Wind_Strength_X'),
    ('Attribute6', 'Attribute7', 'Evaporation_Sun')
]

# Columns removed by drop_features() once the derived features have been built.
feature_drop = [
    'Attribute10', 'Attribute11'
]

### Load Data & Preprocessing

In [None]:
# Seed shared by the down-sampling and shuffling steps below, so that a fresh
# Restart & Run All rebuilds exactly the same training set.
SEED = 0

train_data = pd.read_csv(config['train_data'])
test_data = pd.read_csv(config['test_data'])

# Drop training rows that have no wind direction. dropna() returns a new frame
# rather than modifying the caller's, so the result must be assigned. The index
# is reset so the frame keeps a contiguous 0..n-1 index for the steps that follow.
train_data = train_data.dropna(subset=['Attribute8', 'Attribute9']).reset_index(drop=True)

# Date processing
train_data = date_processing(train_data)
test_data = date_processing(test_data)

# Season mapping
train_data = season_mapping(train_data)
test_data = season_mapping(test_data)

# Binary processing
train_data = binary_processing(train_data, False)
test_data = binary_processing(test_data, True)

# Direction processing
train_data = direction_processing(train_data)
test_data = direction_processing(test_data)

# Interpolate the training data, then drop rows that still contain missing values.
# The test set is left as is so that every test row keeps its position in the output.
train_data.interpolate(method='linear', inplace=True)
train_data.dropna(axis=0, how='any', inplace=True)

# Multiply the wind vector by the wind speed
train_data = wind_combination(train_data)
test_data = wind_combination(test_data)

# Create new features from the difference between feature values
train_data = feature_difference(train_data, feature_diff)
test_data = feature_difference(test_data, feature_diff)

# Create new features by multiplying feature values
train_data = feature_interaction(train_data, feature_multi)
test_data = feature_interaction(test_data, feature_multi)

# Drop the features that are no longer needed
train_data = drop_features(train_data, feature_drop)
test_data = drop_features(test_data, feature_drop)


# Balance the data: down-sample the 0 class to the size of the 1 class.
# sample() raises if class 0 has fewer rows than class 1.
label_groups = train_data.groupby('Attribute21')
no = label_groups.get_group(0)
yes = label_groups.get_group(1)
no = no.sample(len(yes), random_state=SEED)
train_data = pd.concat([yes, no], axis = 0)
train_data = shuffle(train_data, random_state=SEED)


# Fetch the feature names after the series of operations above
feature_names = train_data.columns.tolist()

# Turn the data into NumPy arrays
train_data = train_data.values
test_data = test_data.values


# Split train_data into features and label (the label is the last column)
x_train = train_data[:,:-1]
y_train = train_data[:,-1]

x_train, x_test = feature_select(x_train, test_data)

x_train = x_train.astype(float)
x_test = x_test.astype(float)

# Keep the labels as a 1-D integer vector, the shape the estimators expect.
y_train = y_train.astype(int)

## Find the best hyperparameters

In [None]:
# Candidate values the randomized search samples from
param_dist = {
    'learning_rate': [0.01, 0.1, 0.2],
    'n_estimators': [250, 300, 350],
    'max_depth': [5, 7, 9, 11],
    'min_child_weight': [1, 3, 5],
    'subsample': [0.6, 0.65, 0.7],
    'colsample_bytree': [0.5, 0.6, 0.7, 0.8],
    'gamma': [0.05, 0.1, 0.15],
    'scale_pos_weight': [1, 2, 3]
}

# 200 sampled configurations, each scored by 4-fold cross-validated accuracy
random_search = RandomizedSearchCV(
    XGBClassifier(),
    param_distributions=param_dist,
    n_iter=200,
    cv=4,
    random_state=0,
    scoring='accuracy',
    verbose=1,
    n_jobs=-1,
)
random_search.fit(x_train, y_train)

best_params = random_search.best_params_
print(best_params)

best_model = random_search.best_estimator_

# feature_importance is reused by the feature-importance cell further down
feature_importance = best_model.feature_importances_
predictions = best_model.predict(x_test)

# Store the result; row ids are '0.0', '1.0', ... in prediction order.
# A dedicated name avoids shadowing the builtin `id`.
submission_ids = [f'{i}.0' for i in range(len(predictions))]

result = pd.DataFrame(predictions, index=submission_ids, columns=['ans'])
result.index.name = 'id'

result.to_csv('result.csv')

### Store the feature importance in a CSV file to help select features

In [23]:
# Store the feature importance in a CSV file.
# Pair every column name with its position, then drop the last entry (the label column).
data_feature_names = list(enumerate(feature_names))
data_feature_names = data_feature_names[:-1]

# Same truthiness test as feature_select(), so the names kept here are exactly
# the columns the model was trained on.
selected_idx = config['feature_selected']
if not config['all_feature']:
  data_feature_names = [data_feature_names[i] for i in selected_idx]

indices, names = zip(*data_feature_names)

# feature_importance comes from the hyperparameter-search cell; fail with a clear
# message if it was produced with a different feature selection.
if len(names) != len(feature_importance):
  raise ValueError(
      f'{len(names)} feature names but {len(feature_importance)} importances; '
      're-run the hyperparameter search with the current config')

feature_df = pd.DataFrame({'Index': indices, 'FeatureName': names, 'Importance': feature_importance})

# Least important features first
feature_df = feature_df.sort_values(by='Importance')
csv_file_path = 'feature_importance.csv'

feature_df.to_csv(csv_file_path, index=False)

### Utilize the discovered hyperparameters to enhance the predictive model

In [24]:
# Hyperparameters hard-coded for the final model, with a fixed seed
params = {'subsample': 0.7, 'scale_pos_weight': 1, 'n_estimators': 250, 'min_child_weight': 5,
      'max_depth': 9, 'learning_rate': 0.1, 'gamma': 0.05, 'colsample_bytree': 0.7,  'random_state': 0}

model = XGBClassifier(**params)

model.fit(x_train, y_train)

predictions = model.predict(x_test)

# Row ids are '0.0', '1.0', ... in prediction order.
# A dedicated name avoids shadowing the builtin `id`.
submission_ids = [f'{i}.0' for i in range(len(predictions))]

result = pd.DataFrame(predictions, index=submission_ids, columns=['ans'])
result.index.name = 'id'

# Overwrites the result.csv written by the hyperparameter-search cell
result.to_csv('result.csv')