In [30]:
import time
import datetime

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

SEED = 260420010

##### NIŻEJ MUSIMY ZDEFINIOWAĆ W ZMIENNĄ `classification_target` TĘ **KATEGORYCZNE** KOLUMNE, KTÓRĄ BĘDZIEMY PRZYWIYWAĆ

In [31]:
classification_target = 'CAUSE'

In [32]:
df = pd.read_csv('../data/2_merged/merged_data.csv')
df = df.sample(frac=1, replace=False, random_state=SEED)
df['INSTALLATION_YEAR'] = df['INSTALLATION_YEAR'].apply(lambda s: int(s.split('-')[0]) if type(s) is str else s)

In [33]:
train_df = df.head(int(df.shape[0] * 0.6))
valid_df = df.tail(int(df.shape[0] * 0.2))
test_df = df.iloc[int(df.shape[0] * 0.6)+1: int(df.shape[0] * 0.8)]

In [34]:
nan_columns = df.columns[df.isna().any()].tolist()

if len(nan_columns) == 0:
        print("Nie ma wartości NaN w żadnej kolumnie.")
else:
    print("Kolumny zawierające wartości NaN:")
    print(nan_columns)

    # Obliczanie procentowego udziału wartości NaN w każdej kolumnie
    for col in nan_columns:
        nan_percentage = (df[col].isna().sum() / len(df)) * 100
        print(f"Procent NaN w kolumnie {col}: {nan_percentage:.2f}%")

Kolumny zawierające wartości NaN:
['INSTALLATION_YEAR', 'COMMODITY_RELEASED_TYPE', 'INTENTIONAL_RELEASE_BBLS', 'UNINTENTIONAL_RELEASE_BBLS', 'ON_OFF_SHORE', 'EXPLODE_IND', 'NUM_PUB_EVACUATED', 'FEDERAL', 'LOCATION_TYPE', 'CROSSING', 'ITEM_INVOLVED', 'MATERIAL_INVOLVED', 'EST_COST_OPER_PAID', 'EST_COST_GAS_RELEASED', 'EST_COST_PROP_DAMAGE', 'EST_COST_EMERGENCY', 'EST_COST_ENVIRONMENTAL', 'EST_COST_OTHER', 'NARRATIVE', 'SYSTEM_PART_INVOLVED', 'INCIDENT_AREA_TYPE', 'PIPE_FACILITY_TYPE', 'inst_age_in_days', 'RELEASE_TYPE', 'COULD_BE_HCA', 'ACCIDENT_PSIG', 'MOP_PSIG', 'PIPELINE_FUNCTION', 'SCADA_IN_PLACE_IND', 'INVESTIGATION_STATUS', 'EMPLOYEE_DRUG_TEST_IND', 'CONTRACTOR_DRUG_TEST_IND']
Procent NaN w kolumnie INSTALLATION_YEAR: 28.09%
Procent NaN w kolumnie COMMODITY_RELEASED_TYPE: 1.19%
Procent NaN w kolumnie INTENTIONAL_RELEASE_BBLS: 58.66%
Procent NaN w kolumnie UNINTENTIONAL_RELEASE_BBLS: 0.92%
Procent NaN w kolumnie ON_OFF_SHORE: 16.65%
Procent NaN w kolumnie EXPLODE_IND: 12.79%
Procen

## Uzupełnienie zmiennych

In [35]:
base_on = ['EST_COST_GAS_RELEASED', 'EST_COST_ENVIRONMENTAL', 'accident_pressure_as_%_mop_psig']

In [36]:
categorial_values = ['COMMODITY_RELEASED_TYPE', 'ON_OFF_SHORE', 'IGNITE_IND', 'EXPLODE_IND', 'FEDERAL', 'LOCATION_TYPE', 'CROSSING', 'MATERIAL_INVOLVED', 'INCIDENT_AREA_TYPE', 'PIPE_FACILITY_TYPE', 
                     'RELEASE_TYPE', 'COULD_BE_HCA', 'PIPELINE_FUNCTION', 'SCADA_IN_PLACE_IND', 'INVESTIGATION_STATUS', 'EMPLOYEE_DRUG_TEST_IND', 'CONTRACTOR_DRUG_TEST_IND', 'CAUSE']
categorial_values.remove(classification_target)

In [37]:
mean_values = ['UNINTENTIONAL_RELEASE_BBLS', 'MOP_PSIG', 'inst_age_in_days', 'EST_COST_PROP_DAMAGE', 'EST_COST_EMERGENCY', 'INSTALLATION_YEAR', 'ACCIDENT_PSIG']
zero_values = ['INTENTIONAL_RELEASE_BBLS', 'NUM_PUB_EVACUATED', 'EST_COST_OPER_PAID', 'EST_COST_OTHER']

In [38]:
without_NaN = ['ITEM_INVOLVED', 'SYSTEM_PART_INVOLVED']

In [39]:
delated = ['CAUSE_DETAILS', 'NARRATIVE', 'zone', 'TAVG']

In [40]:
def process_data_numeric(input_df: pd.DataFrame):
    for col_name in categorial_values:
        mode_value = input_df[col_name].mode()[0]
        input_df[col_name] = input_df[col_name].fillna(mode_value)

    for col_name in mean_values:
        mean_value = input_df[col_name].mean()
        input_df[col_name] = input_df[col_name].fillna(mean_value)

    for col_name in zero_values:
        input_df[col_name] = input_df[col_name].fillna(0)

    for col_name in base_on:
        mean_value = input_df[col_name].mean()
        input_df[col_name] = input_df[col_name].fillna(mean_value)
    return input_df

# One Hot Encode

In [41]:
one_hot_col = ["data_source", "COMMODITY_RELEASED_TYPE", "ON_OFF_SHORE", "LOCATION_TYPE", 
               "ITEM_INVOLVED", "MATERIAL_INVOLVED", "SYSTEM_PART_INVOLVED", 
               "INCIDENT_AREA_TYPE", "PIPE_FACILITY_TYPE", "PIPELINE_FUNCTION", "INVESTIGATION_STATUS",
               "zone", 'RELEASE_TYPE', 'CAUSE_DETAILS', 'CAUSE']
one_hot_col.remove(classification_target)

In [42]:
test = 0

for col in one_hot_col:
    unique_features = df[col].nunique()
    test += unique_features
    print(f"Ilość unikalnych cech w kolumnie '{col}': {unique_features}")
print(test - len(one_hot_col))

Ilość unikalnych cech w kolumnie 'data_source': 5
Ilość unikalnych cech w kolumnie 'COMMODITY_RELEASED_TYPE': 10
Ilość unikalnych cech w kolumnie 'ON_OFF_SHORE': 2
Ilość unikalnych cech w kolumnie 'LOCATION_TYPE': 7
Ilość unikalnych cech w kolumnie 'ITEM_INVOLVED': 31
Ilość unikalnych cech w kolumnie 'MATERIAL_INVOLVED': 10
Ilość unikalnych cech w kolumnie 'SYSTEM_PART_INVOLVED': 21
Ilość unikalnych cech w kolumnie 'INCIDENT_AREA_TYPE': 5
Ilość unikalnych cech w kolumnie 'PIPE_FACILITY_TYPE': 8
Ilość unikalnych cech w kolumnie 'PIPELINE_FUNCTION': 11
Ilość unikalnych cech w kolumnie 'INVESTIGATION_STATUS': 5
Ilość unikalnych cech w kolumnie 'zone': 8
Ilość unikalnych cech w kolumnie 'RELEASE_TYPE': 5
Ilość unikalnych cech w kolumnie 'CAUSE_DETAILS': 54
168


In [43]:
def one_hot_encode(input_dataframe: pd.DataFrame):
    for col in one_hot_col:
        one_hot_encoded = pd.get_dummies(input_dataframe[col])
        if input_dataframe[col].isna().any():
            input_dataframe[col + "_nan"] = input_dataframe[col].isna().astype(int)  # Zmiana typu danych na int
            one_hot_encoded.columns = [col + "_" + str(val) for val in one_hot_encoded.columns]
        else:
            one_hot_encoded.columns = [col + "_" + str(val) for val in one_hot_encoded.columns]

        # Konwersja kolumn one-hot encoded na int
        one_hot_encoded = one_hot_encoded.astype(int)

        input_dataframe = pd.concat([input_dataframe, one_hot_encoded], axis=1)
        input_dataframe.drop(col, axis=1, inplace=True)
    print("DataFrame po zakodowaniu one-hot:")
    return input_dataframe


In [44]:
train_df_classification = process_data_numeric(train_df)
valid_df_classification = process_data_numeric(valid_df)
test_df_classification = process_data_numeric(test_df)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  input_df[col_name] = input_df[col_name].fillna(mode_value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  input_df[col_name] = input_df[col_name].fillna(mode_value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  input_df[col_name] = input_df[col_name].fillna(mean_value)
A value is trying to be set 

In [45]:
train_df_classification['ds_type'] = 'train'
valid_df_classification['ds_type'] = 'valid'
test_df_classification['ds_type'] = 'test'
df_united_before_one_hot_classification = pd.concat([train_df_classification, valid_df_classification, test_df_classification],axis=0)
df_united_after_one_hot_classification = one_hot_encode(df_united_before_one_hot_classification)

DataFrame po zakodowaniu one-hot:


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_df_classification['ds_type'] = 'train'
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  valid_df_classification['ds_type'] = 'valid'
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_df_classification['ds_type'] = 'test'


In [46]:
train_df_classification = df_united_after_one_hot_classification.loc[df_united_after_one_hot_classification['ds_type'] == 'train']
valid_df_classification = df_united_after_one_hot_classification.loc[df_united_after_one_hot_classification['ds_type'] == 'valid']
test_df_classification = df_united_after_one_hot_classification.loc[df_united_after_one_hot_classification['ds_type'] == 'test']

In [47]:
nan_columns = train_df_classification.columns[train_df_classification.isna().any()].tolist()

if len(nan_columns) == 0:
        print("Nie ma wartości NaN w żadnej kolumnie.")
else:
    print("Kolumny zawierające wartości NaN:")
    print(nan_columns)

    # Obliczanie procentowego udziału wartości NaN w każdej kolumnie
    for col in nan_columns:
        nan_percentage = (train_df_classification[col].isna().sum() / len(train_df_classification)) * 100
        print(f"Procent NaN w kolumnie {col}: {nan_percentage:.2f}%")

Kolumny zawierające wartości NaN:
['NARRATIVE']
Procent NaN w kolumnie NARRATIVE: 0.47%


In [48]:
train_df_classification.to_csv('../data/4_fill_nans->one_hot->merge/processed_data_train_classification.csv')
valid_df_classification.to_csv('../data/4_fill_nans->one_hot->merge/processed_data_valid_classification.csv')
test_df_classification.to_csv('../data/4_fill_nans->one_hot->merge/processed_data_test_classification.csv')
df_united_after_one_hot_classification.to_csv('../data/4_fill_nans->one_hot->merge/processed_data_cause_classification.csv', index=True, index_label='case_idx')

##### REGRESSION DATA

In [49]:
categorial_values.append(classification_target)
one_hot_col.append(classification_target)
train_df_regression = process_data_numeric(train_df)
valid_df_regression = process_data_numeric(valid_df)
test_df_regression = process_data_numeric(test_df)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  input_df[col_name] = input_df[col_name].fillna(mode_value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  input_df[col_name] = input_df[col_name].fillna(mode_value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  input_df[col_name] = input_df[col_name].fillna(mean_value)
A value is trying to be set 

In [50]:
train_df_regression['ds_type'] = 'train'
valid_df_regression['ds_type'] = 'valid'
test_df_regression['ds_type'] = 'test'
df_united_before_one_hot_regression = pd.concat([train_df_regression, valid_df_regression, test_df_regression],axis=0)
df_united_after_one_hot_regression = one_hot_encode(df_united_before_one_hot_regression)

DataFrame po zakodowaniu one-hot:


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_df_regression['ds_type'] = 'train'
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  valid_df_regression['ds_type'] = 'valid'
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_df_regression['ds_type'] = 'test'


In [51]:
train_df_regression = df_united_after_one_hot_regression.loc[df_united_after_one_hot_regression['ds_type'] == 'train']
valid_df_regression = df_united_after_one_hot_regression.loc[df_united_after_one_hot_regression['ds_type'] == 'valid']
test_df_regression = df_united_after_one_hot_regression.loc[df_united_after_one_hot_regression['ds_type'] == 'test']

In [52]:
nan_columns = train_df_regression.columns[train_df_regression.isna().any()].tolist()

if len(nan_columns) == 0:
        print("Nie ma wartości NaN w żadnej kolumnie.")
else:
    print("Kolumny zawierające wartości NaN:")
    print(nan_columns)

    # Obliczanie procentowego udziału wartości NaN w każdej kolumnie
    for col in nan_columns:
        nan_percentage = (train_df_regression[col].isna().sum() / len(train_df_regression)) * 100
        print(f"Procent NaN w kolumnie {col}: {nan_percentage:.2f}%")

Kolumny zawierające wartości NaN:
['NARRATIVE']
Procent NaN w kolumnie NARRATIVE: 0.47%


In [53]:
train_df_regression.to_csv('../data/4_fill_nans->one_hot->merge/processed_data_train_regression.csv', index=True, index_label='case_idx')
valid_df_regression.to_csv('../data/4_fill_nans->one_hot->merge/processed_data_valid_regression.csv', index=True, index_label='case_idx')
test_df_regression.to_csv('../data/4_fill_nans->one_hot->merge/processed_data_test_regression.csv', index=True, index_label='case_idx')
df_united_after_one_hot_regression.to_csv('../data/4_fill_nans->one_hot->merge/processed_data_cause_regression.csv', index=True, index_label='case_idx')

### FINISHING DATA PROCESSING

In [54]:
tf_idf_ds = pd.read_csv('../tf-idf.ivan.csv')
df_train_united_regression = train_df_regression.join(tf_idf_ds, how='inner', on='case_idx',lsuffix='_left', rsuffix='_right')
df_valid_united_regression = valid_df_regression.join(tf_idf_ds, how='inner', on='case_idx',lsuffix='_left', rsuffix='_right')
df_test_united_regression = test_df_regression.join(tf_idf_ds, how='inner', on='case_idx',lsuffix='_left', rsuffix='_right')
df_train_united_classification = train_df_classification.join(tf_idf_ds, how='inner', on='case_idx',lsuffix='_left', rsuffix='_right')
df_valid_united_classification = valid_df_classification.join(tf_idf_ds, how='inner', on='case_idx',lsuffix='_left', rsuffix='_right')
df_test_united_classification = test_df_classification.join(tf_idf_ds, how='inner', on='case_idx',lsuffix='_left', rsuffix='_right')

In [55]:
df_train_united_regression = df_train_united_regression.drop(columns=['case_idx', 'NARRATIVE'])
df_test_united_regression = df_test_united_regression.drop(columns=['case_idx', 'NARRATIVE'])
df_valid_united_regression = df_valid_united_regression.drop(columns=['case_idx', 'NARRATIVE'])

df_train_united_classification = df_train_united_classification.drop(columns=['case_idx', 'NARRATIVE'])
df_test_united_classification = df_test_united_classification.drop(columns=['case_idx', 'NARRATIVE'])
df_valid_united_classification = df_valid_united_classification.drop(columns=['case_idx', 'NARRATIVE'])

In [56]:
def process_date(df: pd.DataFrame):
    df['case_date'] = df['case_date'].apply(lambda date: time.mktime(datetime.datetime.strptime(date, "%Y-%m-%d").timetuple()) / 86400)
    
    # dodajemy roczną cykliczność naszym danym 
    df['case_date_sin'] = np.sin(2*np.pi * df['case_date'] / 365)
    return df

In [57]:
df_train_united_regression = process_date(df_train_united_regression)
df_valid_united_regression = process_date(df_valid_united_regression)
df_test_united_regression = process_date(df_test_united_regression)

df_train_united_classification = process_date(df_train_united_classification)
df_valid_united_classification = process_date(df_valid_united_classification)
df_test_united_classification = process_date(df_test_united_classification)

df_united_regression_all = pd.concat([df_train_united_regression, df_valid_united_regression, df_test_united_regression],axis=0)
df_united_classification_all = pd.concat([df_train_united_classification, df_valid_united_classification, df_test_united_classification],axis=0)

### Zrobione. Przezapiszmy dane

In [58]:
df_train_united_regression.drop(columns=['ds_type']).to_csv('../data/4_fill_nans->one_hot->merge/processed_data_train_regression.csv', index=True, index_label='case_idx')
df_valid_united_regression.drop(columns=['ds_type']).to_csv('../data/4_fill_nans->one_hot->merge/processed_data_valid_regression.csv', index=True, index_label='case_idx')
df_test_united_regression.drop(columns=['ds_type']).to_csv('../data/4_fill_nans->one_hot->merge/processed_data_test_regression.csv', index=True, index_label='case_idx')
df_united_regression_all.drop(columns=['ds_type']).to_csv('../data/4_fill_nans->one_hot->merge/processed_data_cause_regression.csv', index=True, index_label='case_idx')

df_train_united_classification.drop(columns=['ds_type']).to_csv('../data/4_fill_nans->one_hot->merge/processed_data_train_classification.csv')
df_valid_united_classification.drop(columns=['ds_type']).to_csv('../data/4_fill_nans->one_hot->merge/processed_data_valid_classification.csv')
df_test_united_classification.drop(columns=['ds_type']).to_csv('../data/4_fill_nans->one_hot->merge/processed_data_test_classification.csv')
df_united_classification_all.drop(columns=['ds_type']).to_csv('../data/4_fill_nans->one_hot->merge/processed_data_cause_classification.csv', index=True, index_label='case_idx')