In [177]:
import time
import datetime
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, KFold
from sklearn.dummy import DummyRegressor
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import GridSearchCV
SEED = 26042001

In [4]:
df = pd.read_csv('NaN_OneHot/processed_data.csv')
tf_idf_ds = pd.read_csv('data/processed_data/tf-idf.ivan.csv')
df_united = df.drop(columns=['NARRATIVE']).join(tf_idf_ds).drop(columns=['Unnamed: 0', 'case_idx.1', 'case_idx'])

In [6]:
df_united['TAVG'] = df_united['TAVG'].fillna(df_united['TAVG'].mean())

In [12]:
df_united.nunique().sort_values(ascending=False)[500:].head(25) # 500 pierwszych kolumn to są PC z PCA kolumny NARRATIVE.

case_lon                                             7694
case_lat                                             7681
inst_age_in_days                                     4985
TAVG                                                 3903
case_date                                            3807
accident_pressure_as_%_mop_psig                      3663
UNINTENTIONAL_RELEASE_BBLS                           2997
EST_COST_PROP_DAMAGE                                 2480
EST_COST_EMERGENCY                                   2021
EST_COST_GAS_RELEASED                                1212
EST_COST_ENVIRONMENTAL                               1018
MOP_PSIG                                              981
INTENTIONAL_RELEASE_BBLS                              837
EST_COST_OTHER                                        513
EST_COST_OPER_PAID                                    490
NUM_PUB_EVACUATED                                     105
CAUSE_DETAILS                                          54
INJURE        

In [62]:
df_united.loc[:, df_united.dtypes == np.dtype('O')]

Unnamed: 0,case_date,CAUSE_DETAILS,RELEASE_TYPE
0,2022-12-31,EXTERNAL CORROSION,LEAK
1,2022-12-30,THREADED CONNECTION/COUPLING FAILURE,LEAK
2,2022-12-26,NON-THREADED CONNECTION FAILURE,LEAK
3,2022-12-26,OTHER EQUIPMENT FAILURE,LEAK
4,2022-12-25,"FAILURE OF EQUIPMENT BODY (EXCEPT PUMP), TANK ...",LEAK
...,...,...,...
8129,2022-08-17,EXCAVATION DAMAGE BY THIRD PARTY,OTHER
8130,2022-08-09,EQUIPMENT NOT INSTALLED PROPERLY,LEAK
8131,2022-06-06,INTERNAL CORROSION,RUPTURE
8132,2022-05-25,"DESIGN-, CONSTRUCTION-, INSTALLATION-, OR FABR...",OTHER


In [72]:
df_united = df_united.join(pd.get_dummies(df_united.loc[:, ['RELEASE_TYPE', 'CAUSE_DETAILS']], dtype=int))

In [74]:
df_united = df_united.drop(columns=['RELEASE_TYPE', 'CAUSE_DETAILS'])

In [85]:
df_united.loc[:, df_united.dtypes == np.dtype('O')]

Unnamed: 0,case_date
0,2022-12-31
1,2022-12-30
2,2022-12-26
3,2022-12-26
4,2022-12-25
...,...
8129,2022-08-17
8130,2022-08-09
8131,2022-06-06
8132,2022-05-25


In [94]:
df_united['case_date'] = df_united['case_date'].apply(lambda date: time.mktime(datetime.datetime.strptime(date, "%Y-%m-%d").timetuple()) / 86400)

In [95]:
df_united['case_date']

0       19356.958333
1       19355.958333
2       19351.958333
3       19351.958333
4       19350.958333
            ...     
8129    19220.916667
8130    19212.916667
8131    19148.916667
8132    19136.916667
8133    19128.916667
Name: case_date, Length: 8134, dtype: float64

dodajemy roczną cykliczność naszym danym 

In [96]:
df_united['case_date_sin'] = np.sin(2*np.pi * df_united['case_date'] / 365)

In [106]:
df_united.dtypes.unique()

array([dtype('float64'), dtype('int64')], dtype=object)

In [130]:
df_united.apply(lambda col: (col == np.inf).sum()).sort_values(ascending=False)

case_lat                  0
narrative_tfidf-PC-312    0
narrative_tfidf-PC-314    0
narrative_tfidf-PC-315    0
narrative_tfidf-PC-316    0
                         ..
narrative_tfidf-PC-80     0
narrative_tfidf-PC-81     0
narrative_tfidf-PC-82     0
narrative_tfidf-PC-83     0
case_date_sin             0
Length: 722, dtype: int64

In [125]:
df_united[df_united['accident_pressure_as_%_mop_psig'] == np.inf] = 5.56553

In [129]:
df_united[df_united['accident_pressure_as_%_mop_psig'] == 5.56553] = df_united['accident_pressure_as_%_mop_psig'].max()

**DANE ZOSTAŁY PRZETWARZONE!**

Będziemy przywidywać `EST_COST_PROP_DAMAGE`, czyli oszacowany koszt szkód majątkowych. Oczywiście to jest zadanie typu regressii. </br> 
Najpierw rozdzielimy nasze dane i weźmiemy stosunek train/test/valid = 0.6/0.2/0.2.   

In [171]:
X, y = df_united.drop(columns=['inst_age_in_days']), df_united['inst_age_in_days']
X_train, X_test_valid_tmp, y_train, y_test_valid_tmp = train_test_split(X, y, train_size = 0.6, random_state=SEED)
X_valid, X_test, y_valid, y_test = train_test_split(X_test_valid_tmp, y_test_valid_tmp, train_size = 0.5, random_state=SEED)

Określijmy sobie poziom odniesienia jako "zawsze przywidujemy średnią" oraz załóżmy RMSE oraz R2 jako metrykę dla naszego regressora. <br/>

In [172]:
dummy = DummyRegressor(strategy='mean')
dummy.fit(X_train, y_train)
y_dummy_pred = dummy.predict(X_valid)
print(f'R2 of a dummy regressor is {r2_score(y_valid, y_dummy_pred)}')
print(f'RMSE of a dummy regressor is {
      mean_squared_error(y_valid, y_dummy_pred) ** 0.5}')

R2 of a dummy regressor is -0.0024992206825686925
RMSE of a dummy regressor is 8330.20018741882


Zastosujemy regresję linijową jako high-bias/low-variance model. 

In [173]:
linear_regressor = LinearRegression()
lin_reg = linear_regressor.fit(X_train, y_train)
y_lin_reg_pred = lin_reg.predict(X_valid)
print(f'R2 of a linear regressor is {r2_score(y_valid, y_lin_reg_pred)}')
print(f'RMSE of a linear regressor is {
      mean_squared_error(y_valid, y_lin_reg_pred) ** 0.5}')

R2 of a linear regressor is -69750433.97875652
RMSE of a linear regressor is 69484330.38397329


Zastosujemy regresję typu low bias/high variance poprzez zastosowanie drzew decyzyjnych.

In [174]:
regression_tree = DecisionTreeRegressor()
regression_tree.fit(X_train, y_train)
y_tree_pred = regression_tree.predict(X_valid)
print(f'R2 of a tree regressor is {r2_score(y_valid, y_tree_pred)}')
print(f'RMSE of a dummy regressor is {
      mean_squared_error(y_valid, y_tree_pred) ** 0.5}')

R2 of a tree regressor is -0.048315353133611794
RMSE of a dummy regressor is 8518.42667846944


To już nie aż tak źle. Sprobujmy znaleźć hyperparametry.

In [178]:
regression_tree = DecisionTreeRegressor()
params = {'min_samples_leaf': np.arange()}
GridSearchCV(regression_tree, {})
regression_tree.fit(X_train, y_train)
y_tree_pred = regression_tree.predict(X_valid)
print(f'R2 of a tree regressor is {r2_score(y_valid, y_tree_pred)}')
print(f'RMSE of a dummy regressor is {
      mean_squared_error(y_valid, y_tree_pred) ** 0.5}')

SyntaxError: unmatched '}' (830145986.py, line 2)

In [175]:
from catboost import CatBoostRegressor
cb_regression = CatBoostRegressor(loss_function='RMSE')
cb_regression.fit(X_train, y_train, verbose=100)

Learning rate set to 0.052596
0:	learn: 8275.7432486	total: 33.6ms	remaining: 33.6s
100:	learn: 5806.7930273	total: 1.56s	remaining: 13.9s
200:	learn: 5110.9191694	total: 3.1s	remaining: 12.3s
300:	learn: 4352.1037486	total: 4.77s	remaining: 11.1s
400:	learn: 3724.6075441	total: 6.3s	remaining: 9.41s
500:	learn: 3197.7115968	total: 7.88s	remaining: 7.85s
600:	learn: 2769.1010924	total: 9.49s	remaining: 6.3s
700:	learn: 2399.0714305	total: 11.1s	remaining: 4.75s
800:	learn: 2078.0217492	total: 12.8s	remaining: 3.17s
900:	learn: 1811.6126600	total: 14.3s	remaining: 1.57s
999:	learn: 1585.7002663	total: 15.8s	remaining: 0us


<catboost.core.CatBoostRegressor at 0x198c71220>

In [176]:
y_cb_pred = cb_regression.predict(X_valid)
print(f'R2 of a catboost regressor is {r2_score(y_valid, y_cb_pred)}')
print(f'RMSE of a catboost regressor is {
      mean_squared_error(y_valid, y_cb_pred) ** 0.5}')

R2 of a catboost regressor is 0.47620939767972326
RMSE of a catboost regressor is 6021.327755362033
