In [266]:
import pandas as pd
from numpy import mean
from numpy import std
from matplotlib import pyplot as plt

from sklearn import linear_model
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold

from xgboost import XGBClassifier

import time
import tools


In [267]:
def evaluate_model(model, X, y, scoring, cv, n_jobs, n_splits=10, n_repeats=3, random_state=1, error_score='raise'):
    """Cross-validate ``model`` on ``(X, y)`` and summarise the fold scores.

    Parameters
    ----------
    model : estimator handed to ``cross_val_score``.
    X, y : features and targets handed to ``cross_val_score``.
    scoring : scoring identifier handed to ``cross_val_score``.
    cv : cross-validation splitter to use. Pass ``None`` to have a
        ``RepeatedStratifiedKFold`` built from ``n_splits``, ``n_repeats``
        and ``random_state``.
    n_jobs : parallelism setting handed to ``cross_val_score``.
    n_splits, n_repeats, random_state : configuration of the default
        splitter; only used when ``cv`` is ``None``.
    error_score : handed to ``cross_val_score``; the default ``'raise'``
        asks it to propagate fitting errors.

    Returns
    -------
    tuple
        ``(mean, std)`` of the per-fold scores.
    """
    # Honour the splitter supplied by the caller. Previously it was always
    # overwritten, so the ``cv`` argument had no effect.
    if cv is None:
        cv = RepeatedStratifiedKFold(n_splits=n_splits, n_repeats=n_repeats, random_state=random_state)

    n_scores = cross_val_score(model, X, y, scoring=scoring, cv=cv, n_jobs=n_jobs, error_score=error_score)

    return mean(n_scores), std(n_scores)

def evaluate_model_complex(model, X, y, scoring, cv, n_jobs, n_splits=10, n_repeats=3, random_state=1):
    """Cross-validate ``model`` on ``(X, y)`` and return the raw fold scores.

    Parameters
    ----------
    model : estimator handed to ``cross_val_score``.
    X, y : features and targets handed to ``cross_val_score``.
    scoring : scoring identifier handed to ``cross_val_score``.
    cv : cross-validation splitter to use. Pass ``None`` to have a
        ``RepeatedStratifiedKFold`` built from ``n_splits``, ``n_repeats``
        and ``random_state``.
    n_jobs : parallelism setting handed to ``cross_val_score``.
    n_splits, n_repeats, random_state : configuration of the default
        splitter; only used when ``cv`` is ``None``.

    Returns
    -------
    The per-fold scores exactly as returned by ``cross_val_score``.
    """
    # Honour the splitter supplied by the caller. Previously it was always
    # overwritten, so the ``cv`` argument had no effect.
    if cv is None:
        cv = RepeatedStratifiedKFold(n_splits=n_splits, n_repeats=n_repeats, random_state=random_state)

    return cross_val_score(model, X, y, scoring=scoring, cv=cv, n_jobs=n_jobs)

In [268]:
# Load the training data from the shared drive, falling back to the local copy
# when that file cannot be opened.
try:
    raw = pd.read_csv("W:/Shared With Me/Gibraltar/OTHER/AI Crowd Competition/Data/training.csv")
except OSError:
    # Only I/O failures (missing file, unavailable drive) trigger the fallback.
    # A bare ``except`` would also swallow KeyboardInterrupt and hide genuine
    # parsing errors behind a second read.
    raw = pd.read_csv("C:/Users/Dev Work/Documents/aicrowd/insurancepricing/python/training.csv")

In [269]:
# Work on a copy so `raw` stays untouched and this cell can be re-run safely.
# `tools.clean` is project-local; its exact behaviour is not visible here.
data = tools.clean(raw.copy())
data = data[data['year'] == 1]

# Binary target: 1 when the policy has a positive claim amount, otherwise 0.
# Computed vectorised; kept as a plain list of ints as before.
y = (data['claim_amount'] > 0).astype(int).tolist()

# Columns left out of the feature matrix (target, identifier and selected
# policy / vehicle / driver fields).
exclude = [
    'claim_amount', 'id_policy', 'pol_coverage', 'pol_payd', 'pol_pay_freq',
    'pol_usage', 'vh_fuel', 'vh_make_model', 'vh_type', 'drv_sex1',
    'drv_drv2', 'drv_age2', 'drv_lic2', 'drv_sex2',
]
x = data[data.columns.difference(exclude)]

cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)

# Seed the split so results are reproducible across kernel restarts, and
# stratify on the target so both halves keep the same claim rate (consistent
# with the stratified CV splitter above).
x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=1, stratify=y)

In [272]:
# Hyperparameter grid explored by the search below.
n_trees = [10, 50, 100, 500, 1000, 2000, 5000]   # candidate n_estimators values
max_depth = list(range(1, 11))                   # tree depths 1..10 inclusive
learning_rate = [0.2, 0.4, 0.6, 0.8, 1]          # candidate eta values

In [273]:
# Warning: this cell will take a very, very long time to run without a decent PC.
# I'd recommend not running it on laptops...
#
# Exhaustive grid search over n_trees x max_depth x learning_rate. The outcome
# is stored as a nested dict:
#     trees[n_estimators][max_depth][eta] -> {"scores": ..., "model": ...}

# Top-level container for the results. It was never initialised before, so the
# `trees[n] = ...` assignment below raised NameError on a fresh kernel.
trees = {}

names = list()

start = time.time()

for n in n_trees:
    depths = {}
    for i in max_depth:
        results = {}
        for r in learning_rate:
            xg = XGBClassifier(n_estimators=n, max_depth=i, eta=r, eval_metric='logloss')
            xg.fit(x_train, y_train)

            # NOTE(review): the scores come from cross-validation on the test
            # split. scikit-learn's cross-validation clones and refits the
            # estimator per fold, so the fit above presumably does not
            # influence `scores` - confirm this is the intended evaluation.
            scores = evaluate_model_complex(xg, x_test, y_test, 'accuracy', cv, -1)

            # A fresh estimator is built every iteration, so it can be stored
            # directly; the scikit-learn style wrapper has no `.copy()` method.
            results[r] = {"scores": scores, "model": xg}

        # `results` / `depths` are new dicts on every pass, so no copy is needed.
        depths[i] = results
    trees[n] = depths

end = time.time()

print(f"Time Taken: {end-start}")

KeyboardInterrupt: 

In [None]:
# Flatten the nested trees -> depth -> rate results into one record per
# hyperparameter combination, keeping only the mean of its fold scores.
rows = []

for tree, by_depth in trees.items():
    for depth, by_rate in by_depth.items():
        for rate, entry in by_rate.items():
            record = (tree, depth, rate, mean(entry['scores']))
            rows.append(record)

graph_data = pd.DataFrame(rows, columns=['trees', 'depth', 'rate', 'mean'])


In [None]:
# Scatter of mean CV score over the (number of trees, max depth) grid.
fig = plt.figure()
ax = fig.add_subplot(111)

# Call the setter. Assigning `ax.set_xscale = 'log'` only replaced the method
# with a string and left the axis linear. The former `ax.set_zscale = 'log'`
# assignment is dropped: this is a 2D axes, so it had no effect.
ax.set_xscale('log')

points = ax.scatter(
    graph_data['trees'].astype(float),
    graph_data['depth'].astype(float),
    c=graph_data['mean'],
)

# Label everything so the figure can be read on its own.
ax.set_xlabel('Number of trees (log scale)')
ax.set_ylabel('Max depth')
ax.set_title('Mean cross-validated score by trees and depth')
fig.colorbar(points, ax=ax, label='Mean score');

In [276]:
# Debug check: display `n`, the tree-count loop variable left behind by the
# grid-search cell, to see how far the search had progressed when it stopped.
n

5000

In [277]:
# Debug check: display `i`, the max-depth loop variable left behind by the
# grid-search cell, to see how far the search had progressed when it stopped.
i

8