### CodeIT Suisse Submission Notebook

In [0]:
# NumPy for numerical computing
import numpy as np

# Pandas for DataFrames
import pandas as pd

# Ignore Warnings
import warnings
warnings.filterwarnings("ignore")

# Function for splitting training and test set
from sklearn.model_selection import train_test_split

# Libraries to perform hyperparameter tuning
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.preprocessing import StandardScaler

# Import classes for ML Models
from sklearn.linear_model import Ridge
import xgboost as xgb


# Evaluation Metrics
#from sklearn.metrics import mean_squared_error as mse
#from sklearn.metrics import r2_score
#from sklearn.metrics import mean_absolute_error as mae

# To save the final model on disk
# Newer scikit-learn releases no longer ship `sklearn.externals.joblib`, so try
# the standalone package first and fall back to the original import.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib  ## Reference http://scikit-learn.org/stable/modules/model_persistence.html

In [0]:
df = pd.read_csv('.../train.csv')

In [0]:
X_tt_test = pd.read_csv('.../test.csv')

In [0]:
X_t_test = pd.read_csv('.../test.csv')

In [0]:
# Create separate object for target variable
y = df.bestSoldierPerc
# Create separate object for input features
X = df.drop(columns = ['soldierId','shipId','attackId','bestSoldierPerc'], axis=1)

In [0]:
s = StandardScaler()
X = s.fit_transform(X)

In [0]:
X_t_test = X_t_test.drop(columns = ['soldierId','shipId','attackId','Unnamed: 0', 'index'], axis=1)

In [0]:
X_t_test = s.transform(X_t_test)

In [0]:
# Split X and y into train and test sets: 80-20 (Time based splitting)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle = False)

In [0]:
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

## Ridge Regression

In [0]:
tuned_params = {'alpha': [0.0001, 0.001, 0.01, 0.1, 1, 10, 100, 1000, 10000, 100000]}
model = GridSearchCV(Ridge(), tuned_params, scoring = 'neg_mean_absolute_error', cv=10, n_jobs=-1)
model.fit(X_train, y_train)

In [0]:
model.best_estimator_

In [0]:
## Predict Train results
y_train_pred = model.predict(X_train)

In [0]:
## Predict Test results
y_pred = model.predict(X_test)

In [0]:
y_test_pred = model.predict(X_t_test)

## XGBoost

In [0]:
xgb_model = xgb.XGBRegressor(objective="reg:linear", booster = "gbtree",eta = 0.02,
                            max_depth           = 12, #changed from default of 8
                            subsample           = 0.9, # 0.7
                            colsample_bytree    = 0.7,
                            learning_rate = 0.01) # 0.7)

In [0]:
xgb_model.fit(X, y)

In [0]:
y_test_pred = xgb_model.predict(X_t_test)

In [0]:
# Persist the XGBoost model fitted in this section. The original dumped
# `model`, which at this point of a top-to-bottom run is still the Ridge grid
# search from the previous section.
joblib.dump(xgb_model, 'model_pickle.sav')

# `files` was never imported anywhere in the notebook. It is presumably Colab's
# download helper, so import it here and skip the browser download when the
# notebook is not running on Colab.
try:
    from google.colab import files
except ImportError:
    files = None
if files is not None:
    files.download('model_pickle.sav')

## XGBoost RandomizedSearchCV

In [0]:
from scipy.stats import uniform, randint

In [0]:
xgb_model = xgb.XGBRegressor(objective="reg:linear")

params = {
    "colsample_bytree": uniform(0.7, 0.3),
    "gamma": uniform(0, 0.5),
    "learning_rate": uniform(0.03, 0.3), # default 0.1 
    "max_depth": randint(2, 6), # default 3
    "n_estimators": randint(100, 150), # default 100
    "subsample": uniform(0.6, 0.4)
}

model = RandomizedSearchCV(xgb_model, param_distributions=params, n_iter=10, cv=3, verbose=1, n_jobs=-1, return_train_score=True)

In [0]:
model.fit(X,y)

In [0]:
y_test_pred = model.predict(X_t_test)

## LGBM

In [0]:
from scipy.stats import randint as sp_randint
from scipy.stats import uniform as sp_uniform

# Search space handed to the LightGBM random search: scipy distributions are
# sampled from, plain lists are picked from.
param_test = dict(
    num_leaves=sp_randint(6, 50),
    min_child_samples=sp_randint(100, 500),
    min_child_weight=[1e-5, 1e-3, 1e-2, 1e-1, 1, 10.0, 100.0, 1000.0, 10000.0],
    subsample=sp_uniform(0.2, 0.8),
    colsample_bytree=sp_uniform(0.4, 0.6),
    reg_alpha=[0, 0.1, 1, 2, 5, 7, 10, 50, 100],
    reg_lambda=[0, 0.1, 1, 5, 10, 20, 50, 100],
)

In [0]:
import lightgbm as lgb
clf = lgb.LGBMRegressor(max_depth=-1, silent=True, metric='neg_mean_absolute_error', n_jobs=-1, n_estimators=5000)
gs = RandomizedSearchCV(
    estimator=clf, param_distributions=param_test, 
    n_iter=10,
    scoring='neg_mean_absolute_error',
    cv=3,
    refit=True,
    verbose=True)

In [0]:
gs.fit(X,y)

In [0]:
joblib.dump(gs, 'gbm_model_pickle.sav')

In [0]:
y_test_pred = gs.predict(X_t_test)

## Final Submission

In [0]:
df_final = pd.DataFrame()

In [0]:
df_final['soldierId'] = X_tt_test['soldierId'].astype(int)

In [0]:
df_final['bestSoldierPerc'] = pd.DataFrame(y_test_pred)

In [0]:
df_final.head()

In [0]:
df_final.to_csv('submission1.csv',index = False)