In [None]:
import time
import datetime
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, KFold
from sklearn.dummy import DummyRegressor
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor, plot_tree
from sklearn.ensemble import RandomForestRegressor, StackingRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVR
# Fixed random seed for the notebook. It is not referenced in the cells visible here;
# presumably it is passed as `random_state` to the splitters/models further down — TODO confirm.
SEED = 260420010

In [None]:
def _with_tfidf(frame, tfidf):
    """Return ``frame`` inner-joined with the tf-idf table.

    The values of ``frame['Unnamed: 0']`` are matched against the *index* of ``tfidf``;
    clashing column names receive the ``_left`` / ``_right`` suffixes.

    NOTE(review): ``tfidf`` is read without ``index_col``, so its index is the plain row
    position — this assumes 'Unnamed: 0' holds that same row position; TODO confirm.
    NOTE(review): ``how='inner'`` silently drops rows without a match; consider comparing
    the row counts before and after the join.
    """
    return frame.join(tfidf, on='Unnamed: 0', how='inner', lsuffix='_left', rsuffix='_right')


# Narrative tf-idf features shared by all three splits.
tf_idf_ds = pd.read_csv('./tf-idf.ivan.csv')

# Pre-processed train / validation / test splits.
df_train = pd.read_csv('NaN_OneHot/processed_data_train.csv')
df_valid = pd.read_csv('NaN_OneHot/processed_data_valid.csv')
df_test = pd.read_csv('NaN_OneHot/processed_data_test.csv')

# Attach the tf-idf features to every split.
df_train_united = _with_tfidf(df_train, tf_idf_ds)
df_valid_united = _with_tfidf(df_valid, tf_idf_ds)
df_test_united = _with_tfidf(df_test, tf_idf_ds)

In [None]:
# Remove the join-key columns ('Unnamed: 0', 'Unnamed: 0_left') together with
# 'case_idx' and 'NARRATIVE' from every split.
_dropped_columns = ['Unnamed: 0', 'Unnamed: 0_left', 'case_idx', 'NARRATIVE']

df_train_united, df_test_united, df_valid_united = (
    frame.drop(columns=_dropped_columns)
    for frame in (df_train_united, df_test_united, df_valid_united)
)

In [None]:
# Number of distinct values per column, sorted from highest to lowest, showing the
# entries ranked 501-525.
# NOTE(review): the original note states that the first 500 columns are the principal
# components obtained by PCA of the NARRATIVE column — TODO confirm.
# The same inspection can be repeated for df_test_united and df_valid_united.
df_train_united.nunique().sort_values(ascending=False).iloc[500:525]

narrative_tfidf-PC-278                                                                                                                   4856
case_lon                                                                                                                                 4703
case_lat                                                                                                                                 4695
inst_age_in_days                                                                                                                         3175
TAVG                                                                                                                                     3082
case_date                                                                                                                                2987
accident_pressure_as_%_mop_psig                                                                                                          2388
UNINTE

In [None]:
# Show only the training-frame columns whose dtype is `object`, i.e. the ones that
# are not yet stored as numbers.
df_train_united.loc[:, df_train_united.dtypes == np.dtype('O')]

Unnamed: 0,case_date,INSTALLATION_YEAR
0,2021-02-20,2019-01-01
1,2011-01-04,1968-01-01
2,2017-12-13,2012-01-01
3,2011-03-03,
4,2013-06-14,1960-01-01
...,...,...
4875,2014-03-30,2006-01-01
4876,2019-04-20,2011-01-01
4877,2012-03-21,1957-01-01
4878,2010-07-30,2010-01-01


In [None]:
def _to_epoch_days(dates):
    """Convert a Series of ``YYYY-MM-DD`` strings to float days since 1970-01-01.

    The subtraction is done on naive timestamps, so the result does not depend on the
    machine's local time zone or on daylight-saving transitions. ``time.mktime`` treats
    the parsed date as local time, which made the previous values machine-dependent and
    shifted them by a DST-dependent fraction of a day.
    """
    parsed = pd.to_datetime(dates, format="%Y-%m-%d")
    return (parsed - pd.Timestamp("1970-01-01")) / pd.Timedelta(days=1)


for _frame in (df_train_united, df_valid_united, df_test_united):
    # Skip frames that are already numeric so the cell can be re-run without raising.
    if not pd.api.types.is_numeric_dtype(_frame['case_date']):
        _frame['case_date'] = _to_epoch_days(_frame['case_date'])

In [None]:
# Display the converted `case_date` column of the training frame.
df_train_united.loc[:, 'case_date']

0       18677.958333
1       14977.958333
2       17512.958333
3       15035.958333
4       15869.916667
            ...     
4875    16158.958333
4876    18005.916667
4877    15419.958333
4878    14819.916667
4879    16073.958333
Name: case_date, Length: 4880, dtype: float64

Dodajemy do danych cechę odzwierciedlającą roczną cykliczność: sinus z `case_date` (liczonej w dniach) o okresie 365 dni.

In [None]:
# Encode the position of `case_date` (in days) within a 365-day cycle as a sine feature.
# NOTE(review): a sine alone maps two different days of the cycle to the same value;
# a matching cosine feature would make the encoding unambiguous — consider adding one.
for _split_frame in (df_train_united, df_valid_united, df_test_united):
    _day_angle = 2*np.pi * _split_frame['case_date'] / 365
    _split_frame['case_date_sin'] = np.sin(_day_angle)

In [None]:
# Count, per column of the test frame, the cells equal to +inf; largest counts first.
df_test_united.eq(np.inf).sum().sort_values(ascending=False)

case_lat                  0
narrative_tfidf-PC-253    0
narrative_tfidf-PC-255    0
narrative_tfidf-PC-256    0
narrative_tfidf-PC-257    0
                         ..
narrative_tfidf-PC-19     0
narrative_tfidf-PC-20     0
narrative_tfidf-PC-21     0
narrative_tfidf-PC-22     0
case_date_sin             0
Length: 724, dtype: int64

In [None]:
# Count the missing values in every column of the validation frame, worst columns first.
(
    df_valid_united
    .isna()
    .sum()
    .sort_values(ascending=False)
)

INSTALLATION_YEAR         426
ACCIDENT_PSIG             161
case_lat                    0
narrative_tfidf-PC-263      0
narrative_tfidf-PC-255      0
                         ... 
narrative_tfidf-PC-21       0
narrative_tfidf-PC-22       0
narrative_tfidf-PC-23       0
narrative_tfidf-PC-24       0
case_date_sin               0
Length: 724, dtype: int64

In [None]:
# Replace +inf in the pressure-ratio column with the column's maximum, touching ONLY
# that column. Assigning through `frame[boolean_mask] = value` writes the value into
# every column of the matching rows, which would wipe out all other features of those
# rows; `.loc[mask, column]` restricts the write to the single column.
PRESSURE_RATIO_COL = 'accident_pressure_as_%_mop_psig'

# NOTE(review): 5.56553 is an unexplained placeholder — presumably the largest finite
# value observed in this column; TODO confirm and document its origin.
INF_PLACEHOLDER = 5.56553

for _frame in (df_train_united, df_valid_united, df_test_united):
    # Step 1: swap +inf for the placeholder so the maximum computed below is not +inf.
    _frame.loc[_frame[PRESSURE_RATIO_COL] == np.inf, PRESSURE_RATIO_COL] = INF_PLACEHOLDER

    # Step 2: every cell equal to the placeholder receives the column maximum
    # (which is computed with the placeholder already in place).
    # NOTE(review): each split uses its own maximum; consider reusing the training
    # maximum for the validation/test frames.
    _column_max = _frame[PRESSURE_RATIO_COL].max()
    _frame.loc[_frame[PRESSURE_RATIO_COL] == INF_PLACEHOLDER, PRESSURE_RATIO_COL] = _column_max