In [1]:
from tsfresh.examples import load_robot_execution_failures
import pandas as pd
import tsfresh
import matplotlib.pyplot as plt

from sklearn.preprocessing import Normalizer
from sklearn.cluster import KMeans
from sklearn.preprocessing import PolynomialFeatures
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LinearRegression
from sklearn.cluster import KMeans
from ML.feature_selection import FeatureSelection
from ML.model import Model, min_acc
from ML.process_data import Data
from utils.help_classes import ScalerEnum, ModelsEnum
from tqdm import tqdm
from imblearn.over_sampling import RandomOverSampler

from utils.help_classes import ModelsEnum
from sklearn.model_selection import train_test_split
import os
import datetime
from sklearn.metrics import make_scorer
from sklearn.metrics import accuracy_score

import pickle
import numpy as np
import category_encoders as ce
import plotly_express as px
from sklearn.decomposition import PCA
from imblearn.under_sampling import RandomUnderSampler
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LogisticRegression

from numpy.linalg import LinAlgError

In [2]:
# Load the training feature table and turn the 'time_stamp' column into
# datetimes so it can be used for the time-based split in the next cell.
features_train_path = os.path.join('data_csv', 'features_train.csv')
X = pd.read_csv(features_train_path)
X['time_stamp'] = pd.to_datetime(X['time_stamp'])

In [3]:
# Chronological hold-out: rows stamped before the 0.9 quantile of
# 'time_stamp' go to X_train, rows at or after it go to X_val.
split_time = X['time_stamp'].quantile(q=0.9)
is_before_split = X['time_stamp'] < split_time
is_at_or_after_split = X['time_stamp'] >= split_time

X_train = X.loc[is_before_split]
X_val = X.loc[is_at_or_after_split]

In [4]:
# Report the (rows, columns) shape of each split.
for split_name, split_frame in (('X_train', X_train), ('X_val', X_val)):
    print(split_name, split_frame.shape)

X_train (29466, 56)
X_val (3275, 56)


In [5]:
# Columns removed from the feature matrices: the target itself plus the
# 'time_stamp' and 'traj' columns.
dropped_columns = ['label', 'time_stamp', 'traj']

# Pull the targets out first, then strip the columns from both splits.
y_train, y_val = X_train['label'], X_val['label']
X_train = X_train.drop(columns=dropped_columns)
X_val = X_val.drop(columns=dropped_columns)

In [6]:
print('before sampling', X_train.shape)
# The sampler is run on a frame whose missing values have been replaced by
# the per-column training means.
X_train = X_train.fillna(X_train.mean())

# Under-sample every class except the minority one. The rows that are kept
# are drawn at random, so the seed is pinned: without it each run produces a
# different training set and the scores reported below cannot be reproduced.
sampler_seed = 42
sampling_strategy = "not minority"
rus = RandomUnderSampler(sampling_strategy=sampling_strategy,
                         random_state=sampler_seed)
X_train, y_train = rus.fit_resample(X_train, y_train)
print('after sampling', X_train.shape)

before sampling (29466, 53)
after sampling (6381, 53)


In [7]:
# Replace remaining missing values with per-column medians, each split using
# its own statistics.
# NOTE(review): X_train was already mean-imputed in the sampling cell, so the
# first fill is presumably a no-op unless a column is entirely NaN - confirm.
# NOTE(review): X_val is imputed with validation-set medians rather than
# training medians; confirm this is intended.
train_medians = X_train.median()
val_medians = X_val.median()

X_train = X_train.fillna(train_medians)
X_val = X_val.fillna(val_medians)

In [8]:
# Fit the min-max scaler on the training features only, then apply that same
# fitted scaler to the validation features. `scaler` is reused later in the
# notebook to transform the test features.
scaler = MinMaxScaler()
X_train = scaler.fit_transform(X_train)
X_val = scaler.transform(X_val)

In [22]:
# Per-class weights, keyed by class label; passed to Model(class_weight=...)
# in the next cell.
class_weight = {
    0: 0.3,
    1: 8,
    2: 2,
}

In [23]:
# Configure the project's Model wrapper with the Xgboost classifier option
# and the class weights defined above.
model_label = "time_series_3_" + str(ModelsEnum.Xgboost)
ml_model = Model(
    model_name=model_label,
    classifier_method=ModelsEnum.Xgboost,
    verbose=True,
    binary_task=False,
    write_csv=True,
    class_weight=class_weight,
)

# Cross-validated fit on the training split, scored with the project's
# `min_acc` metric wrapped as an sklearn scorer.
min_acc_scorer = make_scorer(min_acc)
trained_model, results_train = ml_model.cross_fold_validation(
    X_train,
    y_train,
    grid_search=False,
    cv=3,
    n_iter=30,
    scoring=min_acc_scorer,
    n_jobs=3,
)

# Evaluate on the held-out validation split.
results_dict = ml_model.test_set(X_val, y_val)

Fitting 3 folds for each of 30 candidates, totalling 90 fits
{'accuracy': [0.9490076335877863], 'f1_': [0.9336897877315735], 'jaccard': [0.9029633933759442], 'recall': [0.9490076335877863], 'min_acc': [0.9239130434782609], 'acc_0': [0.9548098434004474], 'acc_1': [0.9239130434782609], 'acc_2': [0.9858490566037735]}


Use subset (sliced data) of np.ndarray is not recommended because it will generate extra copies and increase memory consumption


## Preparing submission

In [24]:
# Load the test feature table used for the submission.
features_test_path = os.path.join('data_csv', 'features_test.csv')
X_test = pd.read_csv(features_test_path)

In [25]:
# Keep the 'traj' values aside before the column is dropped from X_test;
# they become the 'trajectory_ind' column of the submission file.
traj = X_test.loc[:, 'traj'].values

In [26]:
# Apply the same preprocessing steps as for the validation split: drop the
# non-feature columns, median-impute, then scale with the already-fitted
# `scaler`.
# NOTE(review): missing values are filled with the test set's own medians,
# not statistics taken from the training data - confirm this is intended.
test_dropped_columns = ['label', 'time_stamp', 'traj']
X_test = X_test.drop(columns=test_dropped_columns)

test_medians = X_test.median()
X_test = X_test.fillna(test_medians)

X_test = scaler.transform(X_test)

In [27]:
# Predict a label for every row of the preprocessed test features using the
# model returned by cross_fold_validation.
final_prediction = trained_model.predict(X_test)

Use subset (sliced data) of np.ndarray is not recommended because it will generate extra copies and increase memory consumption


In [28]:
# Assemble the submission table: one row per test trajectory with its
# predicted label, written to CSV without the index column.
# NOTE(review): the file name mentions "StackedModel" while the model trained
# above is configured with ModelsEnum.Xgboost - confirm the intended name.
submission_columns = {
    'trajectory_ind': traj,
    'label': final_prediction,
}
test_predictions = pd.DataFrame(submission_columns)
test_predictions.to_csv('submission_2_StackedModel.csv', index=False)