In [None]:
import pandas as pd
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 100)
import numpy as np
import missingno

from IPython.display import display

import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import plotly.express as px

from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from category_encoders import BinaryEncoder

# 1. Wrangling for models

## 1.1 Model v1
X -> time series from tca_max (tca=7) to tca=2 for each event. 
(3d array: events, timestep, features)

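y -> last recorded risk value of each event, i.e. the risk at tca_min (tca~0) within the tca=2 to tca_min range, assuming rows are ordered by decreasing time_to_tca.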
(1d array)

### Input

In [None]:
#Raw training data, reloaded from disk so this section starts from an unprocessed frame
df = pd.read_csv("train_data.csv")
#Number of time steps kept per event (axis 1 of the X array built below)
timestep = 17 #from 1 to 23 (17 with the current NaN strategy)
#Value X is pre-filled with; positions not overwritten by event rows keep it (padding)
fill_X = -1
#Scalers are created unfitted here and fitted in the Data Processing cell
X_scaler = MinMaxScaler()
y_scaler = MinMaxScaler()

### Data Processing

In [None]:
#Removing the empty c_rcs_estimate column first, then every row that still contains a NaN
df = df.drop(columns="c_rcs_estimate").dropna(how="any")

#Filtering events with len=1 or min_tca > 2 or max_tca < 2
def conditions(event):
    """Tell whether an event has observations on both sides of time_to_tca = 2.

    event: dataframe holding the rows of one event (needs a "time_to_tca" column).
    Returns a truthy value only when the event has more than one row, its smallest
    time_to_tca is strictly below 2.0 and its largest is strictly above 2.0.
    """
    tca = event["time_to_tca"].values
    reaches_below_two = tca.min() < 2.0
    starts_above_two = tca.max() > 2.0
    has_several_rows = tca.shape[0] > 1
    return reaches_below_two & starts_above_two & has_several_rows
#Keeping only the events accepted by conditions()
df = df.groupby('event_id').filter(conditions)

#OHE for c_object_type (5 categories) -> 5 new features
df = pd.get_dummies(df)

#Binary encoder for mission_id (19 categories) -> 5 new features
encoder = BinaryEncoder(cols=['mission_id'], drop_invariant=True)
df = encoder.fit_transform(df)

#Getting y as a column vector: last risk row of each event (groupby orders events by event_id)
y = df.groupby(["event_id"])["risk"].apply(lambda x: x.iloc[-1]).values.reshape(-1, 1)

#Scaling y
_ = y_scaler.fit(df["risk"].values.reshape(-1, 1)) #using the whole risk feature to scale the target 'y'
y = y_scaler.transform(y)

#Getting X as df (keeping only rows with tca > 2)
#.copy() detaches the slice so the column added below does not raise SettingWithCopyWarning
df = df.loc[df["time_to_tca"]>2].copy()

#Adding feature 'event_length' for counting how many instances each event has
#'size' broadcasts each event's row count to its rows; 'value_counts' is not an
#accepted transform name in recent pandas releases
df["event_length"] = df.groupby('event_id')['event_id'].transform('size')

#Scaling X (every column, event_id included)
df = pd.DataFrame(X_scaler.fit_transform(df), columns=df.columns)

#Transforming X into a 3D-array
events = df["event_id"].nunique() #rows
features = len(df.columns) #columns

#(events, timestep, features) array pre-filled with fill_X
#dtype is forced to float so that an integer fill_X does not produce an integer array
X = np.full((events,timestep,features), fill_X, dtype=float)

#Index of the next free slot along the first axis of X
i = 0
def df_to_3darray(event):
    """Copy the rows of one event into slot ``i`` of the global array ``X``.

    event: dataframe with the rows of a single event; it must have as many
    columns as ``X`` has features.
    Events with at most ``timestep`` rows are right-aligned, so the leading
    positions keep the value X was pre-filled with. Longer events keep only
    their last ``timestep`` rows.
    Side effects: writes into the global ``X`` and increments the global ``i``.
    Returns the event unchanged.
    """
    global X, i
    rows = event.values
    n_rows = rows.shape[0]
    if timestep >= n_rows:
        #Short event: fill only the last n_rows time steps
        X[i, -n_rows:, :] = rows
    else:
        #Long event: keep the most recent `timestep` rows
        X[i, :, :] = rows[-timestep:, :]
    #index to iterate over X array
    i = i + 1
    #dataframe remains intact, while X array has been filled.
    return event

#Iterating the groups explicitly: the function works through side effects, and
#groupby.apply gives no guarantee of exactly one call per group with every column
for _, event_rows in df.groupby("event_id"):
    df_to_3darray(event_rows)

#Dropping event_id to remove noise
#The column is located by name instead of assuming it is the first feature
event_id_position = df.columns.get_loc("event_id")
X = np.delete(X, event_id_position, axis=2)

#TODO: Padding with specific values column-wise instead of the single fill_X value.
#TODO: Separating time dependent and independent feature in 2 X arrays

print(X.shape, y.shape)

### Output

In [None]:
#Cell result: the wrangled X and y, both fitted scalers and the scaled dataframe
X, y, X_scaler, y_scaler, df

## 1.2 Model v2
X -> Overlapped windows of size timestep_X

y -> Overlapped windows of size timestep_y corresponding to the values after window timestep_X

### Input

In [None]:
#Raw training data, reloaded from disk so this section starts from an unprocessed frame
df = pd.read_csv("train_data.csv")
#Rows of each window that go to X
timestep_X = 3 #from 1 to 17 -> lower values give more data
#Rows of each window, right after the X rows, that go to y
timestep_y = 1 #timestep to predict
#Value used to pad events that are shorter than one window
fill_X = -1
#Scalers are created unfitted here and fitted in the Data Processing cell
X_scaler = MinMaxScaler()
y_scaler = MinMaxScaler()

### Data Processing

In [None]:
#Removing the empty c_rcs_estimate column first, then every row that still contains a NaN
df = df.drop(columns="c_rcs_estimate").dropna(how="any")

#Filtering events with len=1 or min_tca > 2 or max_tca < 2
#(a single-row event can never satisfy both bounds, so len=1 is rejected implicitly)
def conditions(event):
    """Tell whether an event has observations on both sides of time_to_tca = 2.

    event: dataframe holding the rows of one event (needs a "time_to_tca" column).
    Returns a truthy value only when the smallest time_to_tca is strictly below
    2.0 and the largest is strictly above 2.0.
    """
    tca = event["time_to_tca"].values
    lowest, highest = tca.min(), tca.max()
    return (lowest < 2.0) & (highest > 2.0)

#Keeping only the events accepted by conditions()
df = df.groupby('event_id').filter(conditions)

#OHE for c_object_type (5 categories) -> 5 new features
df = pd.get_dummies(df)

#Binary encoder for mission_id (19 categories) -> 5 new features
encoder = BinaryEncoder(cols=['mission_id'], drop_invariant=True)
df = encoder.fit_transform(df)

#Adding feature 'event_length' for counting how many instances each event has
#'size' broadcasts each event's row count to its rows; 'value_counts' is not an
#accepted transform name in recent pandas releases
df["event_length"] = df.groupby('event_id')['event_id'].transform('size')

#Scaling data
_ = y_scaler.fit(df["risk"].values.reshape(-1, 1)) #for later use for scaling prediction
df = pd.DataFrame(X_scaler.fit_transform(df), columns=df.columns)

#Transforming dataframe into a 3D-array with overlapping windows
data = []
#Length of one window: the X rows followed by the y rows
timestep = timestep_X + timestep_y
def df_to_3darray(event):
    """Append every overlapping window of ``timestep`` rows of one event to ``data``.

    event: dataframe with the rows of a single event.
    An event with fewer than ``timestep`` rows is left-padded with ``fill_X`` up
    to exactly one window. An event with n >= timestep rows yields
    n - timestep + 1 windows, the last one ending on the event's final row.
    (The previous ``range(n - timestep)`` never produced that final window, so
    the last observation of every event was lost.)
    Side effect: appends 2D arrays of shape (timestep, columns) to the global ``data``.
    Returns the (possibly padded) event as a numpy array.
    """
    global data
    event = event.values
    missing_rows = timestep - event.shape[0]
    if missing_rows > 0:
        #Padding goes first so the real rows stay at the end of the window
        padding = np.full((missing_rows, event.shape[1]), fill_X, dtype=float)
        event = np.vstack([padding, event])
    for start in range(event.shape[0] - timestep + 1):
        data.append(event[start:start+timestep, :])
    return event

#Iterating the groups explicitly: the function works through side effects, and
#groupby.apply gives no guarantee of exactly one call per group with every column
for _, event_rows in df.groupby("event_id"):
    df_to_3darray(event_rows)
data = np.array(data)

#Dividing X and y
#X takes the first timestep_X rows of every window (all columns);
#y takes the remaining rows, risk column only
target_feture = list(df.columns).index("risk")
X, y = data[:,:timestep_X,:], data[:,timestep_X:,target_feture]

#Dropping event_id to remove noise
#Located by name, like risk above, instead of assuming it is the first column
X = np.delete(X, df.columns.get_loc("event_id"), axis=2)


#TODO: Padding with specific values column-wise instead of the single fill_X value.
#TODO: Separating time dependent and independent feature in 2 X arrays

print(X.shape, y.shape)

### Output

In [None]:
#Cell result: the windowed X and y, both fitted scalers and the scaled dataframe
X, y, X_scaler, y_scaler, df

# 2. Wrangling for feature selection

## 2.1 Feature Selection v1
Description: Emulating Model v1 w/o time series (3D-array)

X -> Dataframe with data from tca_max (tca=7) to tca=2. 
(2d df: instances, features)

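y -> last recorded risk value of each event, i.e. the risk at tca_min (tca~0) within the tca=2 to tca_min range, assuming rows are ordered by decreasing time_to_tca; repeated for all instances of the event.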
(1d array)

### Input

In [None]:
#Raw training data, reloaded from disk so this section starts from an unprocessed frame
df = pd.read_csv("train_data.csv")
#Scalers are created unfitted here and fitted in the Data Processing cell
X_scaler = MinMaxScaler()
y_scaler = MinMaxScaler()

### Data Processing

In [None]:
#Removing the empty c_rcs_estimate column first, then every row that still contains a NaN
df = df.drop(columns="c_rcs_estimate").dropna(how="any")

#Filtering events with len=1 or min_tca > 2 or max_tca < 2
def conditions(event):
    """Tell whether an event has observations on both sides of time_to_tca = 2.

    event: dataframe holding the rows of one event (needs a "time_to_tca" column).
    Returns a truthy value only when the event has more than one row, its smallest
    time_to_tca is strictly below 2.0 and its largest is strictly above 2.0.
    """
    tca = event["time_to_tca"].values
    reaches_below_two = tca.min() < 2.0
    starts_above_two = tca.max() > 2.0
    has_several_rows = tca.shape[0] > 1
    return reaches_below_two & starts_above_two & has_several_rows

#Keeping only the events accepted by conditions()
df = df.groupby('event_id').filter(conditions)

#OHE for c_object_type (5 categories) -> 5 new features
df = pd.get_dummies(df)

#Binary encoder for mission_id (19 categories) -> 5 new features
encoder = BinaryEncoder(cols=['mission_id'], drop_invariant=True)
df = encoder.fit_transform(df)

#Creating target feature as last risk value of an event
#transform() already returns one value per row with the frame's index, so it is assigned directly
df["target-risk"] = df.groupby('event_id')['risk'].transform(lambda x: x.iloc[-1])

#Fitting the y scaler on the whole risk feature, i.e. BEFORE the tca slicing below.
#The targets come from rows with tca < 2, so fitting after the slicing (as was done
#before) left them out of the fitted range and disagreed with Model v1.
_ = y_scaler.fit(df["risk"].values.reshape(-1, 1))

#Slicing data (keeping only rows with tca > 2)
#.copy() detaches the slice so the column added below does not raise SettingWithCopyWarning
df = df.loc[df["time_to_tca"]>2].copy()

#Adding feature 'event_length' for counting how many instances each event has
#'size' broadcasts each event's row count to its rows; 'value_counts' is not an
#accepted transform name in recent pandas releases
df["event_length"] = df.groupby('event_id')['event_id'].transform('size')

#Getting and scaling y
y = y_scaler.transform(df["target-risk"].values.reshape(-1, 1))

#Getting and scaling X
X = df.drop(["event_id","target-risk"], axis=1)
X = pd.DataFrame(X_scaler.fit_transform(X), columns=X.columns)

print(X.shape, y.shape)

### Output

In [None]:
#Cell result: the scaled feature dataframe, the scaled target and both fitted scalers
X, y, X_scaler, y_scaler

## 2.2 Feature Selection v2
Description: Minimum processing for predicting risk instance-wise instead of event-wise.

X -> Raw dataframe processed. (2D - df)

y -> Target feature (1d - array)

### Input

In [None]:
#Raw training data, reloaded from disk so this section starts from an unprocessed frame
df = pd.read_csv("train_data.csv")
#Scalers are created unfitted here and fitted in the Data Processing cell
X_scaler = MinMaxScaler()
y_scaler = MinMaxScaler()

### Data Processing

In [None]:
#Removing the empty c_rcs_estimate column first, then every row that still contains a NaN
df = df.drop(columns="c_rcs_estimate").dropna(how="any")

#OHE for c_object_type (5 categories) -> 5 new features
df = pd.get_dummies(df)

#Binary encoder for mission_id (19 categories) -> 5 new features
encoder = BinaryEncoder(cols=['mission_id'], drop_invariant=True)
df = encoder.fit_transform(df)

#Adding feature 'event_length' for counting how many instances each event has
#'size' broadcasts each event's row count to its rows; 'value_counts' is not an
#accepted transform name in recent pandas releases
df["event_length"] = df.groupby('event_id')['event_id'].transform('size')

#Getting and scaling y (instance-wise target: the risk of every row)
y = y_scaler.fit_transform(df["risk"].values.reshape(-1, 1))

#Getting and Scaling X (everything except the event identifier and the target)
X = df.drop(["event_id","risk"], axis=1)
X = pd.DataFrame(X_scaler.fit_transform(X), columns=X.columns)

print(X.shape, y.shape)

### Output

In [None]:
#Cell result: the scaled feature dataframe, the scaled target and both fitted scalers
X, y, X_scaler, y_scaler

## 2.3 Feature Selection v3.1
THIS VERSION DOES NOT SUPPORT FILLING

Description: Time series analysis for each feature. Also valid for FFN

X ->  dataframe with shifted columns and rows sliced as model version 1

y ->  same as model version 1

### Input

In [None]:
#Raw training data, reloaded from disk so this section starts from an unprocessed frame
df = pd.read_csv("train_data.csv")
#Must be a list: the Data Processing cell iterates over it and appends 'event_id' to it
features = ["risk"] #Array!! Time features that you want to analyze
#Number of shifted copies (t-0 ... t-(timestep-1)) created for every feature
timestep = 17 #from 1 to 23 (17 with the current NaN strategy)
#Scalers are created unfitted here and fitted in the Data Processing cell
X_scaler = MinMaxScaler()
y_scaler = MinMaxScaler()

### Data Processing

In [None]:
#Removing the empty c_rcs_estimate column first, then every row that still contains a NaN
df = df.drop(columns="c_rcs_estimate").dropna(how="any")

#Filtering events with len=1 or min_tca > 2 or max_tca < 2
def conditions(event):
    """Tell whether an event has observations on both sides of time_to_tca = 2.

    event: dataframe holding the rows of one event (needs a "time_to_tca" column).
    Returns a truthy value only when the event has more than one row, its smallest
    time_to_tca is strictly below 2.0 and its largest is strictly above 2.0.
    """
    tca = event["time_to_tca"].values
    reaches_below_two = tca.min() < 2.0
    starts_above_two = tca.max() > 2.0
    has_several_rows = tca.shape[0] > 1
    return reaches_below_two & starts_above_two & has_several_rows

#Keeping only the events accepted by conditions()
df = df.groupby('event_id').filter(conditions)

#OHE for c_object_type (5 categories) -> 5 new features
df = pd.get_dummies(df)

#Binary encoder for mission_id (19 categories) -> 5 new features
encoder = BinaryEncoder(cols=['mission_id'], drop_invariant=True)
df = encoder.fit_transform(df)

#Getting y as a column vector: last risk row of each event (groupby orders events by event_id)
y = df.groupby(["event_id"])["risk"].apply(lambda x: x.iloc[-1]).values.reshape(-1, 1)

#Scaling y
_ = y_scaler.fit(df["risk"].values.reshape(-1, 1)) #using the whole risk feature to scale the target 'y'
y = y_scaler.transform(y)

#Getting X as df (keeping only rows with tca > 2)
#.copy() detaches the slice so the column added below does not raise SettingWithCopyWarning
df = df.loc[df["time_to_tca"]>2].copy()

#Adding feature 'event_length' for counting how many instances each event has
#'size' broadcasts each event's row count to its rows; 'value_counts' is not an
#accepted transform name in recent pandas releases
df["event_length"] = df.groupby('event_id')['event_id'].transform('size')

#Scaling X (every column, event_id included)
df = pd.DataFrame(X_scaler.fit_transform(df), columns=df.columns)

#Shifting features
#event_id is shifted as well, only to detect rows whose window spans two events.
#The membership test keeps the list from growing when the cell is run again.
if 'event_id' not in features:
    features.append('event_id')

#All shifted columns are built first and joined once (same order as before:
#feature by feature, from t-(timestep-1) down to t-0)
X = pd.concat(
    {feature+'_t-'+str(lag): df[feature].shift(lag)
     for feature in features
     for lag in range(timestep-1,-1,-1)},
    axis=1,
)

#Getting last row -> getting one row per event
#tail(1) keeps the grouping column in the result; sorting by it gives the same
#event order as y, which was built with a groupby on event_id
X = X.groupby("event_id_t-0").tail(1).sort_values("event_id_t-0")
X = X.reset_index(drop=True)

#Deleting rows with more than one event_id
#(a NaN produced by the shift never compares equal, so those rows are removed too)
event_features = [feature for feature in list(X.columns) if feature.startswith('event_id_t-')]
single_event = X[event_features].eq(X["event_id_t-0"], axis=0).all(axis=1)
X = X.loc[single_event]

#Dropping y values not included in X due to last condition
y = np.take(y, X.index)
y = y.reshape(-1, 1)

#Deleting event_id features
X = X.drop(event_features, axis=1)

print(X.shape, y.shape)

#TODO: not dropping y values bc of X but filling X with zeros

### Output

In [None]:
#Cell result: the shifted-feature dataframe, the matching scaled target and both fitted scalers
X, y, X_scaler, y_scaler

## 2.4 Feature Selection v3.2
THIS VERSION SUPPORTS FILLING

Description: Time series analysis for each feature. Also valid for FFN.

X ->  dataframe with shifted columns and rows sliced as model version 1

y ->  same as model version 1

### Input

In [None]:
#Raw training data, reloaded from disk so this section starts from an unprocessed frame
df = pd.read_csv("train_data.csv")
#features_ = ["risk"] #Array!! Time features that you want to analyze
#Number of time steps kept per event (each one becomes a block of "_t-k" columns)
timestep = 5 #from 1 to 23 (17 with the current NaN strategy)
#Value X is pre-filled with; positions not overwritten by event rows keep it (padding)
fill_X = 0
#Scalers are created unfitted here and fitted in the Data Processing cell
X_scaler = StandardScaler()
y_scaler = StandardScaler()

### Data Processing

In [None]:
#Removing the empty c_rcs_estimate column first, then every row that still contains a NaN
df = df.drop(columns="c_rcs_estimate").dropna(how="any")

#Filtering events with len=1 or min_tca > 2 or max_tca < 2
def conditions(event):
    """Tell whether an event has observations on both sides of time_to_tca = 2.

    event: dataframe holding the rows of one event (needs a "time_to_tca" column).
    Returns a truthy value only when the event has more than one row, its smallest
    time_to_tca is strictly below 2.0 and its largest is strictly above 2.0.
    """
    tca = event["time_to_tca"].values
    reaches_below_two = tca.min() < 2.0
    starts_above_two = tca.max() > 2.0
    has_several_rows = tca.shape[0] > 1
    return reaches_below_two & starts_above_two & has_several_rows

#Keeping only the events accepted by conditions()
df = df.groupby('event_id').filter(conditions)

#OHE for c_object_type (5 categories) -> 5 new features
df = pd.get_dummies(df)

#Binary encoder for mission_id (19 categories) -> 5 new features
encoder = BinaryEncoder(cols=['mission_id'], drop_invariant=True)
df = encoder.fit_transform(df)

#Getting y as a column vector: last risk row of each event (groupby orders events by event_id)
#The unscaled values are kept in y_original
y_original = df.groupby(["event_id"])["risk"].apply(lambda x: x.iloc[-1]).values.reshape(-1, 1)

#Scaling y
_ = y_scaler.fit(df["risk"].values.reshape(-1, 1)) #using the whole risk feature to scale the target 'y'
y = y_scaler.transform(y_original)

#Getting X as df (keeping only rows with tca > 2)
#.copy() detaches the slice so the column added below does not raise SettingWithCopyWarning
df = df.loc[df["time_to_tca"]>2].copy()

#Adding feature 'event_length' for counting how many instances each event has
#'size' broadcasts each event's row count to its rows; 'value_counts' is not an
#accepted transform name in recent pandas releases
df["event_length"] = df.groupby('event_id')['event_id'].transform('size')

#Scaling X (every column, event_id included)
df = pd.DataFrame(X_scaler.fit_transform(df), columns=df.columns)

#Transforming X into a 3D-array
events = df["event_id"].nunique() #rows
features = len(df.columns) #columns
#(events, timestep, features) array pre-filled with fill_X
#dtype is forced to float so that an integer fill_X does not produce an integer array
X = np.full((events,timestep,features), fill_X, dtype=float)
#Index of the next free slot along the first axis of X
i = 0

def df_to_3darray(event):
    """Copy the rows of one event into slot ``i`` of the global array ``X``.

    event: dataframe with the rows of a single event; it must have as many
    columns as ``X`` has features.
    Events with at most ``timestep`` rows are right-aligned, so the leading
    positions keep the value X was pre-filled with. Longer events keep only
    their last ``timestep`` rows.
    Side effects: writes into the global ``X`` and increments the global ``i``.
    Returns the event unchanged.
    """
    global X, i
    rows = event.values
    n_rows = rows.shape[0]
    if timestep >= n_rows:
        #Short event: fill only the last n_rows time steps
        X[i, -n_rows:, :] = rows
    else:
        #Long event: keep the most recent `timestep` rows
        X[i, :, :] = rows[-timestep:, :]
    #index to iterate over X array
    i = i + 1
    #dataframe remains intact, while X array has been filled.
    return event

#Iterating the groups explicitly: the function works through side effects, and
#groupby.apply gives no guarantee of exactly one call per group with every column
for _, event_rows in df.groupby("event_id"):
    df_to_3darray(event_rows)

#Dropping event_id to remove noise
#The column is located by name instead of assuming it is the first feature,
#and the column names below are derived the same way so both always agree
X = np.delete(X, df.columns.get_loc("event_id"), axis=2)
original_columns = [column for column in df.columns if column != "event_id"]

#Reshaping again to 2D array but now events are filled
#Each row becomes the event's time steps laid side by side, oldest first
X = X.reshape(X.shape[0], timestep*X.shape[2])

#Naming shifted columns: same order as the reshape (time step first, then feature)
shifted_columns = [
    column+"_t-"+str(lag)
    for lag in range(timestep-1,-1,-1)
    for column in original_columns
]

#Creating df from reshape array and shifted column names
X = pd.DataFrame(X, columns=shifted_columns)

print(X.shape, y.shape)

### Output

In [None]:
#Cell result: the flattened feature dataframe, the scaled target and both fitted scalers
#(the scalers are named X_scaler / y_scaler in this notebook; scaler_X / scaler_y were never defined)
X, y, X_scaler, y_scaler