In [2]:
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.pipeline import make_pipeline
from cuml.preprocessing import OneHotEncoder
import cudf
import cuml
import cupy as cp
import numpy as np

In [4]:
import pandas as pd
# Read the feature table into X and the label table into y, in that order.
X, y = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../input/nepal-earthquake-complete/complete_x.csv",
        "../input/nepal-earthquake-complete/complete_y.csv",
    )
)

In [5]:
# Cast every feature column to float32.
X = X.astype("float32")

# Prepare the labels:
#   * building_id is moved into the index *before* the float cast, so the
#     identifier keeps its original dtype instead of being rounded to float32;
#   * the remaining label column(s) are cast to float32 and shifted down by one
#     (presumably turning 1-based grades into 0-based class ids -- confirm
#     against the label file).
# The guard makes the cell safe to re-run: once building_id is already the
# index, y is left untouched instead of raising KeyError and being
# decremented a second time.
if "building_id" in y.columns:
    y = y.set_index("building_id").astype("float32") - 1
y.head()

In [6]:
from sklearn.model_selection import train_test_split

# Two-stage stratified split with a fixed seed:
#   1. hold out 20% of all rows as the test set;
#   2. hold out 20% of what remains as the validation set.
split_options = {"test_size": 0.2, "random_state": 7}

X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify=y, **split_options
)
X_train, X_valid, y_train, y_valid = train_test_split(
    X_train, y_train, stratify=y_train, **split_options
)

In [None]:
# Sweep max_depth alone (5..14) with every other random-forest setting fixed,
# recording accuracy on the training and validation splits for each depth.
train_accuracy, validation_accuracy, tree_depth = [], [], []
for depth in range(5, 15):
    model = cuml.ensemble.RandomForestClassifier(
        max_depth=depth,
        max_samples=0.8,
        max_features=0.6,
        random_state=7,
    )
    model.fit(X_train, y_train)

    train_accuracy.append(model.score(X_train, y_train))
    validation_accuracy.append(model.score(X_valid, y_valid))
    tree_depth.append(depth)

# One row per depth; the depth itself is the (named) index.
score_df = pd.DataFrame(
    {
        "train_accuracy": train_accuracy,
        "validation_accuracy": validation_accuracy,
    },
    index=pd.Index(tree_depth, name="Tree Depth"),
)

import matplotlib.pyplot as plt

# Train vs. validation accuracy as a function of tree depth.
score_df.plot()
plt.xlabel("Tree Depth")
plt.ylabel("Score")
plt.title("Tree Depth vs Score");

In [None]:
#Grid Search took a long time to train
#So, we had to use Randomized search instead
from sklearn.model_selection import GridSearchCV
import warnings
# NOTE: this silences every warning for the rest of the session, not only
# the ones raised by this cell.
warnings.filterwarnings("ignore")

# Exhaustive grid: 12 * 7 * 4 * 3 = 1008 parameter combinations, each of which
# is fitted once per cross-validation fold.
params = {"n_estimators": [10*k for k in range(3, 15)],
         "max_depth": [k for k in range(5, 12)],
         "max_samples": [0.1*k for k in range(7,11)],
         "max_features": [0.1*k for k in range(6,9)]}


clf = cuml.ensemble.RandomForestClassifier(random_state = 7)

grid = GridSearchCV(clf, params)
# Search on the training split only. Fitting on the full X, y would let the
# validation and test rows take part in model selection, so the scores
# reported on those splits later would no longer be honest estimates.
grid.fit(X_train, y_train)

In [None]:
from sklearn.model_selection import RandomizedSearchCV
# Randomized alternative to the exhaustive grid: draw 100 settings from the
# same candidate lists and score each with 3-fold CV on the training split.
clf = cuml.ensemble.RandomForestClassifier(random_state=7)
params = {
    "n_estimators": list(range(30, 150, 10)),
    "max_depth": list(range(5, 12)),
    "max_samples": [0.1 * k for k in range(7, 11)],
    "max_features": [0.1 * k for k in range(6, 9)],
}

search_options = dict(n_iter=100, cv=3, verbose=2, random_state=42, n_jobs=-1)
rf_random = RandomizedSearchCV(
    estimator=clf,
    param_distributions=params,
    **search_options,
)
rf_random.fit(X_train, y_train)

In [None]:
# Parameter combination that scored best in the randomized search over the
# random-forest settings.
rf_random.best_params_

In [None]:
# Refit a random forest with one fixed set of hyper-parameters and report
# accuracy on the training and validation splits.
rf_settings = dict(
    n_estimators=120,
    max_depth=11,
    max_samples=0.8,
    max_features=0.8,
    random_state=7,
)
model = cuml.ensemble.RandomForestClassifier(**rf_settings)
model.fit(X_train, y_train)

train_accuracy = model.score(X_train, y_train)
validation_accuracy = model.score(X_valid, y_valid)

for label, value in (
    ("The training accuracy is: ", train_accuracy),
    ("The validation accuracy is: ", validation_accuracy),
):
    print(label, value)

That is not a good score, so we will try different algorithms.

In [None]:
#KNN
# KNeighborsClassifier has no random_state parameter, so none is passed:
# an unsupported keyword is at best silently dropped and at worst rejected.
knn = cuml.neighbors.KNeighborsClassifier()
knn_params = { 'n_neighbors' : [5,7,9,11,13,15],
               'weights' : ['uniform'],
               'metric' : ['minkowski','euclidean','manhattan']}

# The grid holds 6 * 1 * 3 = 18 combinations, fewer than n_iter = 30.
# The search object keeps the name rf_random because later cells read it.
rf_random = RandomizedSearchCV(estimator = knn, 
                               param_distributions = knn_params, 
                               n_iter = 30, cv = 3, verbose=2, 
                               random_state=42, n_jobs = -1)
rf_random.fit(X_train, y_train)

In [None]:
# Parameter combination that scored best in the first KNN search.
rf_random.best_params_

The best value of the "n_neighbors" parameter is 15, which is the largest value in the grid. The optimum may therefore lie beyond it, so we extend the search to larger values.

In [None]:
#KNN
# Second pass: larger neighbourhood sizes, Manhattan metric only.
# KNeighborsClassifier has no random_state parameter, so none is passed:
# an unsupported keyword is at best silently dropped and at worst rejected.
knn = cuml.neighbors.KNeighborsClassifier()
# NOTE(review): 21 is absent from the candidate list -- confirm whether that
# is intentional before relying on the result.
knn_params = { 'n_neighbors' : [13,15, 17, 19, 23, 25, 27, 29],
               'weights' : ['uniform'],
               'metric' : ['manhattan']}

# n_iter equals the number of candidates (8), so every value is evaluated.
rf_random = RandomizedSearchCV(estimator = knn, 
                               param_distributions = knn_params, 
                               n_iter = 8, cv = 3, verbose=2, 
                               random_state=42, n_jobs = -1)
rf_random.fit(X_train, y_train)
rf_random.best_params_

In [None]:
# Cross-validate the single chosen KNN configuration with 10 folds, reusing
# the `knn` estimator from the previous cell, then score the refitted model
# on the training and validation splits.
knn_params = dict(
    n_neighbors=[27],
    weights=['uniform'],
    metric=['manhattan'],
)

rf_random = RandomizedSearchCV(
    estimator=knn,
    param_distributions=knn_params,
    cv=10,
    verbose=2,
    random_state=42,
    n_jobs=-1,
)
rf_random.fit(X_train, y_train)

train_accuracy, validation_accuracy = (
    rf_random.score(features, labels)
    for features, labels in ((X_train, y_train), (X_valid, y_valid))
)
print("The training accuracy is: ", train_accuracy)
print("The validation accuracy is: ", validation_accuracy)

In [None]:
#Retraining the model using best parameters
# KNeighborsClassifier has no random_state parameter, so none is passed:
# an unsupported keyword is at best silently dropped and at worst rejected.
# Both `model` and `knn` are bound to the same estimator, as before.
model = knn = cuml.neighbors.KNeighborsClassifier(n_neighbors = 27,
                                                  weights= "uniform",
                                                  metric = "manhattan")
model.fit(X_train, y_train)
train_accuracy = model.score(X_train, y_train)
validation_accuracy = model.score(X_valid, y_valid)
print("The training accuracy is: ", train_accuracy)
print("The validation accuracy is: ", validation_accuracy)

So, KNN does a little better.

Now we will use Support Vector Classifier.

In [None]:
#Support vector classifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.preprocessing import StandardScaler
svc = cuml.svm.SVC(random_state = 7)
# Every list holds a single value, so the search space is one configuration.
svc_params = { "multiclass_strategy": ["ovo"],
         "degree" : [1],
         "kernel": ["linear"],
         "C": [0.1],
         "gamma": [0.1]}

rf_random = RandomizedSearchCV(estimator = svc, 
                               param_distributions = svc_params, 
                               n_iter = 100, cv = 3, verbose=2, 
                               random_state=42, n_jobs = -1)
# Keep the fitted scaler instead of discarding it: the validation (and test)
# features must be transformed with the statistics learned on the training
# split, otherwise the fitted model cannot be evaluated on them.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_valid_scaled = scaler.transform(X_valid)
rf_random.fit(X_train_scaled, y_train)
rf_random.best_params_

In [17]:
import xgboost as xgb

# Wrap the training and validation splits in XGBoost's DMatrix container.
dtrain = xgb.DMatrix(data = X_train, label = y_train)
dvalid = xgb.DMatrix(data = X_valid, label = y_valid)

# Booster settings; no eval_metric is given, so XGBoost uses the objective's
# default evaluation metric.
params = {
            'objective': 'multi:softmax', 
            'tree_method': 'gpu_hist', 
            'max_depth': 8,
            'subsample':0.88,
            'colsample_bytree': 0.5,
            'gamma':1.5,
            'min_child_weight':8,
            'lambda':70,
            'eta':0.03,
            'num_class': 5
            
    }

# The last entry of the watchlist ('eval') is the one early stopping monitors.
watchlist = [(dtrain, 'train'), (dvalid, 'eval')]

#Training model
# maximize=False: the default metrics for a multi-class objective (error rate /
# log-loss) are lower-is-better. With maximize=True early stopping treated
# every improvement as a regression, halted after the first rounds and
# recorded the *worst* iteration as the best one.
bst = xgb.train(params, dtrain=dtrain,
                num_boost_round=10,evals=watchlist,
                early_stopping_rounds=1, maximize=False,
                verbose_eval=1)
print('best ntree_limit:', bst.best_ntree_limit)
print('best score:', bst.best_score)

In [14]:
# `bst` comes from xgb.train and is therefore a Booster, which has no
# sklearn-style .score() method. Compute validation accuracy from its
# predictions instead: with the 'multi:softmax' objective the booster
# predicts class labels directly, so they can be compared with y_valid.
valid_pred = bst.predict(dvalid)
float(np.mean(valid_pred == np.asarray(y_valid).ravel()))

In [42]:
from sklearn.model_selection import RandomizedSearchCV
import xgboost as xgb
# Not used by the sklearn-style search below; kept so the name stays defined.
watchlist = [(dtrain, 'train'), (dvalid, 'eval')]
# Candidate settings for XGBClassifier (one value each, so the search space is
# a single configuration). Parameter names follow the scikit-learn wrapper:
#   * n_estimators replaces num_boost_round, which is an argument of
#     xgb.train and not a model parameter -- passed to the wrapper it does
#     not control the number of boosting rounds, so the intended 200 rounds
#     were never applied;
#   * learning_rate / reg_lambda are the wrapper's names for eta / lambda.
xgb_params = {
            'objective': ['multi:softmax'], 
            'tree_method': ['gpu_hist'], 
            'max_depth': [10],
            'subsample': [0.9],
            'colsample_bytree': [0.6],
            'gamma':[0.1],
            'min_child_weight':[2],
            'reg_lambda':[30],
            'learning_rate':[0.3],
            'n_estimators' : [200],
            'num_class': [5]
    }
model_xgb = xgb.XGBClassifier(random_state = 7 )

# The search object keeps the name rf_random because later cells read it.
rf_random = RandomizedSearchCV(estimator = model_xgb, 
                               param_distributions = xgb_params, 
                               n_iter = 5, cv = 3, verbose=50, 
                               random_state=42, n_jobs = -1)
rf_random.fit(X_train, y_train)


In [40]:
# Parameter combination that scored best in the XGBoost search.
rf_random.best_params_

In [43]:
# Score the refitted best estimator of the search on the training and
# validation splits and report both numbers.
train_accuracy, validation_accuracy = (
    rf_random.score(features, labels)
    for features, labels in ((X_train, y_train), (X_valid, y_valid))
)
print("The training accuracy is: ", train_accuracy)
print("The validation accuracy is: ", validation_accuracy)