In [None]:
import json

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import tensorflow as tf

pd.options.mode.chained_assignment = None

In [None]:
#package imports
from package.process_data import *
from package.model_hundler import *
from package.miscellaneous import *

# 1. Base Model Generator

## 1.1. Build History Data

In [None]:
# Load the raw claims and production tables (tab-separated files).
claims_df = pd.read_csv("./data/claims.tsv", sep="\t")
prod_df = pd.read_csv("./data/production.tsv", sep="\t")
sub_path = "sub1"

# Drop non-matured data: only model years up to 2017 are kept.
claims_df = claims_df.loc[claims_df["mdl_yr"] <= 2017]
prod_df = prod_df.loc[prod_df["mdl_yr"] <= 2017]

# Parameters shared by every processing and modelling step below.
params = {
    "x_col": "milge",
    "x_max": 70_000,
    "x_step": 1_000,
    "y_col": "tis_wsd",
    "key": "vin_cd",
    "y_max": 70,
    "y_step": 1,
    "y_k": 15,      # number of tis to be considered as input to cluster
    "x_k": 10_000,  # number of milge to be considered as input to cluster
}

# TODO: The data used is already constructed and normalized.
# Build the history data from claims and production records,
# grouped by vehicle line, model year and causal part.
print("Built History Data")
history_data = build_data(
    clm_list=claims_df,
    prod_list=prod_df,
    cols_group=["veh_line_cd", "mdl_yr", "prt_num_causl_base_cd"],
    params=params,
)

# Outlier removal, applied per "filter_" group.
# TODO: add insert noise function
print("Remove Outliers")
history_data = sel_dist_window(
    df=history_data,
    f_col="filter_",
    z_max=[1.3, 80],  # z_max: base and top
    params=params,
)

print("cut top of the data: assuming that we don't know what happened after x_c and y_c")
history_data = cut_flattening_filters(
    df=history_data,
    params=params,
    filter_c="filter_",
    dz_per=0.25,
)

# Backup of the processed history data.
history_data.to_csv("./data/sel_acc_history_data.csv", index=False)

# Optional reload from disk.
# NOTE(review): this path differs from the backup written just above
# ("sel_acc_history_data.csv") - confirm which file is intended.
#history_data = pd.read_csv("./data/acc_history_data.csv")


## 1.2. Check dataset

In [None]:
# Visual sanity check: plot a handful of "filter_" groups.
history_df_g = history_data.groupby(by="filter_")
indices = list(history_df_g.indices)

# NOTE(review): the original loop took its length from `indices[-10:]` but
# indexed from the start of `indices`, i.e. it plots the FIRST (up to) 10
# groups. That behavior is kept here; switch to `indices[-10:]` if the last
# 10 groups were actually intended.
for group_key in indices[:10]:
    plot_df(
        data_df=history_df_g.get_group(group_key),
        x_col=params['x_col'],
        y_col=params['y_col'],
        z_col="z",
        title=group_key,
        w=10,
        h=6,
    )

## 1.3. Process data: build cluster dataset and normalization

In [None]:
print("Build encoder")
# Build the per-cluster database used for normalization and encoding.
# NOTE(review): `uri` is presumably where the database is persisted (the test
# section loads the same path with `load_db`) - confirm in `build_database`.
database = build_database(df = history_data,
                        filter_c_name = "filter_",
                        params = params,
                        uri = "./models/"+sub_path+"/database.json")

print("Norm and Encode by Cluster")
history_gnorm_df = norm_encode_bycluster(df = history_data,
                                        database = database,
                                        params = params,
                                        filter_c_name = "filter_")

# Export the parameters next to the model so the test section can reload them.
export_params(uri="./models/"+sub_path+"/base_model_params.json", params=params)

#-----     Add noise -----------------------------------------
# Augment the training set with a noisy copy of every row. A dedicated, seeded
# generator is used so that Restart & Run All produces the same augmented data.
NOISE_SEED = 42
noise_rng = np.random.default_rng(NOISE_SEED)

history_gnorm_df_copy = history_gnorm_df.copy()
per = 0.025 #percentage of variation
# NOTE(review): the noise is drawn with mean == std == `per`, so it shifts "z"
# upwards by `per` on average instead of being zero-centred - confirm intent.
history_gnorm_df_copy.loc[:,"z"] = history_gnorm_df_copy["z"] + noise_rng.normal(per, per, len(history_gnorm_df_copy))
# Final training frame = noisy rows followed by the original (clean) rows.
history_gnorm_df_copy = pd.concat([history_gnorm_df_copy, history_gnorm_df])

## 1.4. Fit base model

In [None]:
# Number of encoded cluster columns, derived from how many entries the
# database holds. `len()` counts the entries directly; the previous
# `np.shape(list(...))[0]` built an ndarray of every cluster array just to
# read its first dimension, which fails on ragged data in recent NumPy.
k_columns = nbits(len(database['data_arr']))

# Model inputs: x, y, z_max and the encoded columns (labelled 0..k_columns-1).
columns = [params['x_col'], params['y_col'], 'z_max'] + list(range(k_columns))

X = history_gnorm_df_copy[columns].values
y = history_gnorm_df_copy['z'].values

# Input width = 3 (x, y, z_max) + k_columns encoded columns.
base_model = regression_model(k_columns + 3)

# NOTE(review): no validation data is passed, so whatever `patience` monitors
# inside `fit_model` cannot be a validation metric - confirm in `fit_model`.
history = fit_model(base_model,
                    X = X,
                    y = y,
                    batch_size = 128,
                    verbose = 1,
                    validation_data = None,
                    uri_model = "./models/"+sub_path+"/base.h5",
                    patience = 25,
                    epochs = 20_000)

## 1.5. Test Base Model

In [None]:
# Reload every artifact from disk, so this check exercises the saved files
# rather than the in-memory objects from the training cells.
sub_path = "sub1"
model = tf.keras.models.load_model("./models/"+sub_path+"/base.h5")

# Use a context manager so the parameter file handle is closed deterministically.
with open("./models/"+sub_path+"/base_model_params.json", "r") as params_file:
    params = json.load(params_file)

database = load_db("./models/"+sub_path+"/database.json")

# NOTE(review): the meaning of the last argument (2) is not visible here -
# presumably it selects which group/cluster to predict; confirm in
# `predict_regression`.
pred_df = predict_regression(model, database, params, 2)
plot_df(pred_df, x_col=params['x_col'], y_col=params['y_col'], z_col='z_pred', title='Group ', w = 10, h=6)