In [0]:
import pandas as pd
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 100)
import numpy as np
import missingno

from IPython.display import display

import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import plotly.express as px

from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from category_encoders import BinaryEncoder

ModuleNotFoundError: ignored

# 1. Wrangling for models
Docs: https://docs.google.com/document/d/1w7tmB_lGJ1YjbO7CiadMMcQ-diDAIrWJmFNgE7QW6IY/edit

## 1.1 Model v1
X -> time series from tca_max (tca=7) to tca=2 for each event. 
(3d array: events, timestep, features)

y -> risk value of the last record of each event (expected to be the one at tca_min, tca~0, within the range from tca=2 to tca_min).
(2d column array: events, 1)

### Input

In [0]:
#Length of every event's time series: from 1 to 23 (17 with the current NaN strategy)
timestep = 17

#One min-max scaler for the inputs and a separate one for the target
X_scaler, y_scaler = MinMaxScaler(), MinMaxScaler()

#Raw training data
df = pd.read_csv("train_data.csv")

### Data Processing

In [0]:
#Removing the empty c_rcs_estimate column first, then every row that still
#holds at least one NaN
df = df.drop(columns=["c_rcs_estimate"]).dropna(how="any")

#Event filter: drops events with a single row, with min_tca > 2 or with max_tca < 2
def conditions(event):
    """Tell groupby.filter whether an event must be kept.

    An event is kept when it has more than one row, its smallest
    time_to_tca is below 2.0 and its largest time_to_tca is above 2.0.
    """
    tca = event["time_to_tca"].to_numpy()
    reaches_below_two = tca.min() < 2.0
    starts_above_two = tca.max() > 2.0
    return reaches_below_two & starts_above_two & (tca.shape[0] > 1)

#Keeping only the events accepted by conditions()
df = df.groupby('event_id').filter(conditions)

#OHE for c_object_type (5 categories) -> 5 new features
df = pd.get_dummies(df)

#Binary encoder for mission_id (19 categories) -> 5 new features
encoder = BinaryEncoder(cols=['mission_id'], drop_invariant=True)
df = encoder.fit_transform(df)

#Getting y as an (events, 1) column: last risk value of each event.
#groupby sorts its keys by default, so y follows ascending event_id.
y = df.groupby("event_id")["risk"].apply(lambda risk: risk.iloc[-1]).values.reshape(-1, 1)

#Scaling y: the scaler is fitted on the whole risk feature (all rows, not only
#the per-event targets). No assignment to `_`, which IPython reserves for the
#last cell output.
y_scaler.fit(df["risk"].values.reshape(-1, 1))
y = y_scaler.transform(y)

#Getting X as df (keeping only rows with tca > 2). The slice is copied so the
#column added below is written to an independent frame instead of a slice of
#the unfiltered one (avoids SettingWithCopyWarning).
df = df.loc[df["time_to_tca"] > 2].copy()

#Adding feature 'event_length' for counting how many instances each event has.
#'count' is a supported transform kernel and equals the group size here because
#the grouping key is never missing inside a group; 'value_counts' is not a
#transform kernel and newer pandas releases reject it.
df["event_length"] = df.groupby('event_id')['event_id'].transform('count')

#Scaling X (every column, event_id included)
df = pd.DataFrame(X_scaler.fit_transform(df), columns=df.columns)

#Transforming X into a 3D-array (events, timestep, features)
events = df["event_id"].nunique() #rows
features = len(df.columns) #columns

X = np.zeros((events,timestep,features))
i = 0

def df_to_3darray(event):
    """Copy one event into slot ``i`` of the global array ``X``.

    The last ``timestep`` rows of ``event`` are written right-aligned along
    the time axis: shorter events keep leading rows of zeros, longer events
    lose their oldest rows. The global counter ``i`` is advanced by one so
    the next call fills the next slot.

    Parameters
    ----------
    event : pd.DataFrame
        All rows of a single event, with the same columns as ``df``.

    Returns
    -------
    pd.DataFrame
        ``event`` itself, untouched.
    """
    global X, i
    #At most the last `timestep` rows of the event
    values = event.values[-timestep:]
    #Right-aligned write; the rows before the offset stay zero-padded
    X[i, timestep - values.shape[0]:, :] = values
    #index to iterate over X array
    i = i + 1
    return event

#Explicit iteration instead of groupby.apply: apply gives no guarantee that a
#side-effecting function runs exactly once per group (older pandas evaluated
#the first group twice, which would shift every event by one slot).
#Groups are visited in ascending event_id order.
for _event_id, event in df.groupby("event_id"):
    df_to_3darray(event)

#Every slot of X must have been filled exactly once
assert i == events, "number of processed events does not match X"

#Dropping event_id to remove noise (assumes event_id is the first column of df)
X = X[:,:,1:]

#TODO: Padding with specific values column-wise instead of zeros.
#TODO: Separating time dependent and independent feature in 2 X arrays

print(X.shape, y.shape)

(7311, 17, 106) (7311, 1)


### Output

In [0]:
#Outputs of Model v1: 3D input array, scaled target, both fitted scalers and the scaled dataframe
X, y, X_scaler, y_scaler, df

# 2. Wrangling for feature selection

## 2.1 Feature Selection v1
Description: Emulating Model v1 w/o time series (3D-array)

X -> Dataframe with data from tca_max (tca=7) to tca=2. 
(2d df: instances, features)

y -> risk value of the last record of each event (expected to be the one at tca_min, tca~0, within the range from tca=2 to tca_min), repeated for all instances of each event.
(2d column array: instances, 1)

### Input

In [0]:
#One min-max scaler for the inputs and a separate one for the target
X_scaler, y_scaler = MinMaxScaler(), MinMaxScaler()

#Raw training data
df = pd.read_csv("train_data.csv")

### Data Processing

In [0]:
#Removing the empty c_rcs_estimate column first, then every row that still
#holds at least one NaN
df = df.drop(columns=["c_rcs_estimate"]).dropna(how="any")

#Event filter: drops events with a single row, with min_tca > 2 or with max_tca < 2
def conditions(event):
    """Tell groupby.filter whether an event must be kept.

    An event is kept when it has more than one row, its smallest
    time_to_tca is below 2.0 and its largest time_to_tca is above 2.0.
    """
    tca = event["time_to_tca"].to_numpy()
    reaches_below_two = tca.min() < 2.0
    starts_above_two = tca.max() > 2.0
    return reaches_below_two & starts_above_two & (tca.shape[0] > 1)

#Keeping only the events accepted by conditions()
df = df.groupby('event_id').filter(conditions)

#OHE for c_object_type (5 categories) -> 5 new features
df = pd.get_dummies(df)

#Binary encoder for mission_id (19 categories) -> 5 new features
encoder = BinaryEncoder(cols=['mission_id'], drop_invariant=True)
df = encoder.fit_transform(df)

#Creating target feature as last risk value of an event (repeated on every row
#of that event). transform already returns one value per row, aligned on the
#index, so no reshape is needed for the column assignment.
df["target-risk"] = df.groupby('event_id')['risk'].transform(lambda risk: risk.iloc[-1])

#Fitting the target scaler on the whole risk feature BEFORE slicing, as Model v1
#does: the targets come from rows with tca < 2, so fitting only on the rows kept
#below could leave them outside the fitted range.
y_scaler.fit(df["risk"].values.reshape(-1, 1))

#Slicing data (keeping only rows with tca > 2). The slice is copied so the
#column added below is written to an independent frame.
df = df.loc[df["time_to_tca"] > 2].copy()

#Adding feature 'event_length' for counting how many instances each event has.
#'count' is a supported transform kernel ('value_counts' is not) and equals the
#group size because the grouping key is never missing inside a group.
df["event_length"] = df.groupby('event_id')['event_id'].transform('count')

#Getting and scaling y
y = y_scaler.transform(df["target-risk"].values.reshape(-1, 1))

#Getting and scaling X
X = df.drop(["event_id","target-risk"], axis=1)
X = pd.DataFrame(X_scaler.fit_transform(X), columns=X.columns)

print(X.shape, y.shape)

(85733, 106) (85733, 1)


### Output

In [0]:
#Outputs of Feature Selection v1: scaled feature dataframe, scaled target and both fitted scalers
X, y, X_scaler, y_scaler

## 2.2 Feature Selection v2
Description: Minimum processing for predicting risk instance-wise instead of event-wise.

X -> Raw dataframe processed. (2D - df)

y -> Target feature (1d - array)

### Input

In [0]:
#One min-max scaler for the inputs and a separate one for the target
X_scaler, y_scaler = MinMaxScaler(), MinMaxScaler()

#Raw training data
df = pd.read_csv("train_data.csv")

### Data Processing

In [0]:
#Dropping first the empty column and then rows with NaNs
df = df.drop("c_rcs_estimate", axis=1)
df = df.dropna(how='any')

#OHE for c_object_type (5 categories) -> 5 new features
df = pd.get_dummies(df)

#Binary encoder for mission_id (19 categories) -> 5 new features
encoder = BinaryEncoder(cols=['mission_id'], drop_invariant=True)
df = encoder.fit_transform(df)

#Adding feature 'event_length' for counting how many instances each event has.
#'count' is a supported transform kernel ('value_counts' is not, and newer
#pandas releases reject it); it equals the group size because the grouping key
#is never missing inside a group.
df["event_length"] = df.groupby('event_id')['event_id'].transform('count')

#Getting and scaling y: every row's own risk value, as an (instances, 1) column
y = df["risk"].values.reshape(-1, 1)
y = y_scaler.fit_transform(y)

#Getting and Scaling X
X = df.drop(["event_id","risk"], axis=1)
X = pd.DataFrame(X_scaler.fit_transform(X), columns=X.columns)

print(X.shape, y.shape)

(143294, 105) (143294, 1)


### Output

In [0]:
#Outputs of Feature Selection v2: scaled feature dataframe, scaled target and both fitted scalers
X, y, X_scaler, y_scaler

## 2.3 Feature Selection v3.1
THIS VERSION DOES NOT SUPPORT FILLING

Description: Time series analysis for each feature. Also valid for FFN

X ->  dataframe with shifted columns and rows sliced as model version 1

y ->  same as model version 1

### Input

In [0]:
#Array!! Time features that you want to analyze
features = ["risk"]

#Number of shifted steps per feature: from 1 to 23 (17 with the current NaN strategy)
timestep = 17

#One min-max scaler for the inputs and a separate one for the target
X_scaler, y_scaler = MinMaxScaler(), MinMaxScaler()

#Raw training data
df = pd.read_csv("train_data.csv")

### Data Processing

In [0]:
#Removing the empty c_rcs_estimate column first, then every row that still
#holds at least one NaN
df = df.drop(columns=["c_rcs_estimate"]).dropna(how="any")

#Event filter: drops events with a single row, with min_tca > 2 or with max_tca < 2
def conditions(event):
    """Tell groupby.filter whether an event must be kept.

    An event is kept when it has more than one row, its smallest
    time_to_tca is below 2.0 and its largest time_to_tca is above 2.0.
    """
    tca = event["time_to_tca"].to_numpy()
    reaches_below_two = tca.min() < 2.0
    starts_above_two = tca.max() > 2.0
    return reaches_below_two & starts_above_two & (tca.shape[0] > 1)

#Keeping only the events accepted by conditions()
df = df.groupby('event_id').filter(conditions)

#OHE for c_object_type (5 categories) -> 5 new features
df = pd.get_dummies(df)

#Binary encoder for mission_id (19 categories) -> 5 new features
encoder = BinaryEncoder(cols=['mission_id'], drop_invariant=True)
df = encoder.fit_transform(df)

#Getting y as an (events, 1) column: last risk value of each event.
#groupby sorts its keys by default, so y follows ascending event_id.
y = df.groupby("event_id")["risk"].apply(lambda risk: risk.iloc[-1]).values.reshape(-1, 1)

#Scaling y: the scaler is fitted on the whole risk feature (all rows, not only
#the per-event targets). No assignment to `_`, which IPython reserves for the
#last cell output.
y_scaler.fit(df["risk"].values.reshape(-1, 1))
y = y_scaler.transform(y)

#Getting X as df (keeping only rows with tca > 2). The slice is copied so the
#column added below is written to an independent frame instead of a slice of
#the unfiltered one (avoids SettingWithCopyWarning).
df = df.loc[df["time_to_tca"] > 2].copy()

#Adding feature 'event_length' for counting how many instances each event has.
#'count' is a supported transform kernel and equals the group size here because
#the grouping key is never missing inside a group; 'value_counts' is not a
#transform kernel and newer pandas releases reject it.
df["event_length"] = df.groupby('event_id')['event_id'].transform('count')

#Scaling X (every column, event_id included); this also resets the index to 0..n-1
df = pd.DataFrame(X_scaler.fit_transform(df), columns=df.columns)

#Shifting features. event_id is shifted as well so that windows spanning two
#events can be detected below. The list is built locally: the `features` list
#from the input cell is left untouched.
shift_features = list(features) + ['event_id']

#All shifted columns are collected first and the frame is built in one go
#(oldest step first, t-0 last), instead of inserting columns one by one.
shifted = {}
for feature in shift_features:
    for lag in range(timestep-1,-1,-1):
        shifted[feature+'_t-'+str(lag)] = df[feature].shift(lag)
X = pd.DataFrame(shifted)

#Getting last row -> getting one row per event, ordered by ascending event_id
#like y. drop_duplicates keeps the event_id_t-0 column in the result, which
#groupby.apply over the grouping column does not guarantee in newer pandas.
X = X.drop_duplicates(subset="event_id_t-0", keep="last").sort_values("event_id_t-0")
X = X.reset_index(drop=True)

#Deleting rows with more than one event_id: every shifted event_id must equal
#the current one (NaN from the shift never compares equal, so it is dropped too)
event_features = ['event_id_t-'+str(lag) for lag in range(timestep)]
same_event = X[event_features].eq(X["event_id_t-0"], axis=0).all(axis=1)
X = X.loc[same_event]

#Dropping y values not included in X due to last condition
#(X.index still holds each event's position in y)
y = np.take(y, X.index)
y = y.reshape(-1, 1)

#Deleting event_id features
X = X.drop(event_features, axis=1)

print(X.shape, y.shape)

#TODO: not dropping y values bc of X but filling X with zeros

(1, 85) (1, 1)


Unnamed: 0,relative_position_n_t-16,relative_position_n_t-15,relative_position_n_t-14,relative_position_n_t-13,relative_position_n_t-12,relative_position_n_t-11,relative_position_n_t-10,relative_position_n_t-9,relative_position_n_t-8,relative_position_n_t-7,relative_position_n_t-6,relative_position_n_t-5,relative_position_n_t-4,relative_position_n_t-3,relative_position_n_t-2,relative_position_n_t-1,relative_position_n_t-0,relative_velocity_r_t-16,relative_velocity_r_t-15,relative_velocity_r_t-14,relative_velocity_r_t-13,relative_velocity_r_t-12,relative_velocity_r_t-11,relative_velocity_r_t-10,relative_velocity_r_t-9,relative_velocity_r_t-8,relative_velocity_r_t-7,relative_velocity_r_t-6,relative_velocity_r_t-5,relative_velocity_r_t-4,relative_velocity_r_t-3,relative_velocity_r_t-2,relative_velocity_r_t-1,relative_velocity_r_t-0,relative_velocity_t_t-16,relative_velocity_t_t-15,relative_velocity_t_t-14,relative_velocity_t_t-13,relative_velocity_t_t-12,relative_velocity_t_t-11,relative_velocity_t_t-10,relative_velocity_t_t-9,relative_velocity_t_t-8,relative_velocity_t_t-7,relative_velocity_t_t-6,relative_velocity_t_t-5,relative_velocity_t_t-4,relative_velocity_t_t-3,relative_velocity_t_t-2,relative_velocity_t_t-1,relative_velocity_t_t-0,relative_velocity_n_t-16,relative_velocity_n_t-15,relative_velocity_n_t-14,relative_velocity_n_t-13,relative_velocity_n_t-12,relative_velocity_n_t-11,relative_velocity_n_t-10,relative_velocity_n_t-9,relative_velocity_n_t-8,relative_velocity_n_t-7,relative_velocity_n_t-6,relative_velocity_n_t-5,relative_velocity_n_t-4,relative_velocity_n_t-3,relative_velocity_n_t-2,relative_velocity_n_t-1,relative_velocity_n_t-0,t_time_lastob_start_t-16,t_time_lastob_start_t-15,t_time_lastob_start_t-14,t_time_lastob_start_t-13,t_time_lastob_start_t-12,t_time_lastob_start_t-11,t_time_lastob_start_t-10,t_time_lastob_start_t-9,t_time_lastob_start_t-8,t_time_lastob_start_t-7,t_time_lastob_start_t-6,t_time_lastob_start_t-5,t_time_lastob_start_t-4,t_time_las
tob_start_t-3,t_time_lastob_start_t-2,t_time_lastob_start_t-1,t_time_lastob_start_t-0
3927,0.488575,0.495217,0.493934,0.493933,0.494542,0.503581,0.499421,0.49669,0.497545,0.496896,0.497114,0.498001,0.495155,0.494288,0.487686,0.486254,0.485287,0.473191,0.472494,0.472643,0.472643,0.472568,0.471598,0.472046,0.472345,0.472245,0.47232,0.472295,0.472195,0.472494,0.472593,0.47329,0.473439,0.473564,0.92693,0.92693,0.926924,0.926924,0.926924,0.92693,0.92693,0.92693,0.926924,0.92693,0.92693,0.92693,0.926924,0.926924,0.926924,0.926924,0.926924,0.682228,0.682228,0.682228,0.682228,0.682228,0.682233,0.682233,0.682233,0.682233,0.682233,0.682233,0.682233,0.682228,0.682228,0.682228,0.682233,0.682233,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### Output

In [0]:
#Outputs of Feature Selection v3.1: shifted feature dataframe, matching scaled target and both fitted scalers
X, y, X_scaler, y_scaler

## 2.4 Feature Selection v3.2
THIS VERSION SUPPORTS FILLING

Description: Time series analysis for each feature. Also valid for FFN.

X ->  dataframe with shifted columns and rows sliced as model version 1

y ->  same as model version 1

### Input

In [0]:
#Array!! Time features that you want to analyze
features_ = ["risk"]

#Length of every event's time series: from 1 to 23 (17 with the current NaN strategy)
timestep = 10

#One min-max scaler for the inputs and a separate one for the target
X_scaler, y_scaler = MinMaxScaler(), MinMaxScaler()

#Raw training data
df = pd.read_csv("train_data.csv")

### Data Processing

In [0]:
#Removing the empty c_rcs_estimate column first, then every row that still
#holds at least one NaN
df = df.drop(columns=["c_rcs_estimate"]).dropna(how="any")

#Event filter: drops events with a single row, with min_tca > 2 or with max_tca < 2
def conditions(event):
    """Tell groupby.filter whether an event must be kept.

    An event is kept when it has more than one row, its smallest
    time_to_tca is below 2.0 and its largest time_to_tca is above 2.0.
    """
    tca = event["time_to_tca"].to_numpy()
    reaches_below_two = tca.min() < 2.0
    starts_above_two = tca.max() > 2.0
    return reaches_below_two & starts_above_two & (tca.shape[0] > 1)

#Keeping only the events accepted by conditions()
df = df.groupby('event_id').filter(conditions)

#OHE for c_object_type (5 categories) -> 5 new features
df = pd.get_dummies(df)

#Binary encoder for mission_id (19 categories) -> 5 new features
encoder = BinaryEncoder(cols=['mission_id'], drop_invariant=True)
df = encoder.fit_transform(df)

#Getting y as an (events, 1) column: last risk value of each event.
#groupby sorts its keys by default, so y follows ascending event_id.
y = df.groupby("event_id")["risk"].apply(lambda risk: risk.iloc[-1]).values.reshape(-1, 1)

#Scaling y: the scaler is fitted on the whole risk feature (all rows, not only
#the per-event targets). No assignment to `_`, which IPython reserves for the
#last cell output.
y_scaler.fit(df["risk"].values.reshape(-1, 1))
y = y_scaler.transform(y)

#Getting X as df (keeping only rows with tca > 2). The slice is copied so the
#column added below is written to an independent frame instead of a slice of
#the unfiltered one (avoids SettingWithCopyWarning).
df = df.loc[df["time_to_tca"] > 2].copy()

#Adding feature 'event_length' for counting how many instances each event has.
#'count' is a supported transform kernel and equals the group size here because
#the grouping key is never missing inside a group; 'value_counts' is not a
#transform kernel and newer pandas releases reject it.
df["event_length"] = df.groupby('event_id')['event_id'].transform('count')

#Scaling X (every column, event_id included)
df = pd.DataFrame(X_scaler.fit_transform(df), columns=df.columns)

#Selecting features: event_id first, then the requested ones. The selection is
#built locally so the `features_` list from the input cell is left untouched.
df = df[['event_id'] + list(features_)]

#Transforming X into a 3D-array (events, timestep, features)
events = df["event_id"].nunique() #rows
features = len(df.columns) #columns

X = np.zeros((events,timestep,features))
i = 0

def df_to_3darray(event):
    """Copy one event into slot ``i`` of the global array ``X``.

    The last ``timestep`` rows of ``event`` are written right-aligned along
    the time axis: shorter events keep leading rows of zeros, longer events
    lose their oldest rows. The global counter ``i`` is advanced by one so
    the next call fills the next slot.

    Parameters
    ----------
    event : pd.DataFrame
        All rows of a single event, with the same columns as ``df``.

    Returns
    -------
    pd.DataFrame
        ``event`` itself, untouched.
    """
    global X, i
    #At most the last `timestep` rows of the event
    values = event.values[-timestep:]
    #Right-aligned write; the rows before the offset stay zero-padded
    X[i, timestep - values.shape[0]:, :] = values
    #index to iterate over X array
    i = i + 1
    return event

#Explicit iteration instead of groupby.apply: apply gives no guarantee that a
#side-effecting function runs exactly once per group (older pandas evaluated
#the first group twice, which would shift every event by one slot).
#Groups are visited in ascending event_id order.
for _event_id, event in df.groupby("event_id"):
    df_to_3darray(event)

#Every slot of X must have been filled exactly once
assert i == events, "number of processed events does not match X"

#Dropping event_id (first column, see the selection above) to remove noise
X = X[:,:,1:]

#Reshaping again to 2D array but now events are filled
X = X.reshape(X.shape[0], timestep*X.shape[2])

#Naming shifted columns: oldest step first and, within a step, the features in
#their original order -> same layout as the reshape above
original_columns = list(df.columns)[1:] #Dropping event_id
shifted_columns = [
    column+"_t-"+str(lag)
    for lag in range(timestep-1,-1,-1)
    for column in original_columns
]

#Creating df from reshape array and shifted column names
X = pd.DataFrame(X, columns=shifted_columns)

print(X.shape, y.shape)

(7311, 10) (7311, 1)


### Output

In [0]:
#Outputs of Feature Selection v3.2: filled/shifted feature dataframe, scaled target and both fitted scalers
#(the scalers are named X_scaler / y_scaler in the input cell)
X, y, X_scaler, y_scaler