In [1]:
%%capture
# We need to alter the working directory since the scripts are written expecting that ./time_to_failure will be the working directory
%cd ..

In [2]:
import pandas as pd
import os
import json
from typing import List
from sklearn.model_selection import train_test_split
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

from data_processing import DataProcessor

pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)

In [None]:
def get_cols_with_missings(df: pd.DataFrame) -> List:
    """Returns a list of columns in a Pandas DataFrame with missing values

    Args:
        df (pd.DataFrame): input DataFrame

    Returns:
        List: list of columns with >=1 missing value
    """
    missing_share =  df.isnull().mean().reset_index()
    return missing_share[missing_share[0] > 0]['index'].tolist()

# Data Download

Data as uncompressed CSVs can be downloaded [here](https://stockholmuniversity.app.box.com/s/anmg5k93pux5p6decqzzwokp9vuzmdkh). As explained in the corresponding [white paper](https://arxiv.org/pdf/2401.15199) the data are subset into train, validation, and test splits, where only the training data contain complete information on a vehicle's entire opertional history. We therefore restrict to just the training data.

We downloaded the data locally and performed some simple merges and analyses locally.

# Read Data

In [3]:
parent_dir = '/Users/josephking/Downloads/scania_data'

train_operational = pd.read_csv(os.path.join(parent_dir, 'train_operational_readouts.csv'))
train_specifications = pd.read_csv(os.path.join(parent_dir, 'train_specifications.csv'))
train_tte = pd.read_csv(os.path.join(parent_dir, 'train_tte.csv'))

# Exploratory Data Analysis

### Operational Readouts

In [4]:
print(train_operational.shape)
train_operational.head()

(1122452, 107)


Unnamed: 0,vehicle_id,time_step,171_0,666_0,427_0,837_0,167_0,167_1,167_2,167_3,167_4,167_5,167_6,167_7,167_8,167_9,309_0,272_0,272_1,272_2,272_3,272_4,272_5,272_6,272_7,272_8,272_9,835_0,370_0,291_0,291_1,291_2,291_3,291_4,291_5,291_6,291_7,291_8,291_9,291_10,158_0,158_1,158_2,158_3,158_4,158_5,158_6,158_7,158_8,158_9,100_0,459_0,459_1,459_2,459_3,459_4,459_5,459_6,459_7,459_8,459_9,459_10,459_11,459_12,459_13,459_14,459_15,459_16,459_17,459_18,459_19,397_0,397_1,397_2,397_3,397_4,397_5,397_6,397_7,397_8,397_9,397_10,397_11,397_12,397_13,397_14,397_15,397_16,397_17,397_18,397_19,397_20,397_21,397_22,397_23,397_24,397_25,397_26,397_27,397_28,397_29,397_30,397_31,397_32,397_33,397_34,397_35
0,0,11.2,167985.0,10787.0,7413813.0,2296.0,4110.0,1296420.0,1628265.0,630345.0,1269525.0,4772940.0,2706706.0,222225.0,6240.0,0.0,70.0,1435083.0,857662.0,384579.0,668642.0,7239843.0,398490.0,3887.0,0.0,0.0,0.0,8036751.0,0.0,1227.0,555.0,463.0,925.0,468.0,225.0,535.0,516.0,492.0,729.0,66.0,97056.0,2690052.0,2945268.0,788437.0,687480.0,595164.0,491232.0,532932.0,809628.0,505693.0,858410.0,203.676778,111.9115,147.265389,200.479944,230.306278,277.722417,315.748806,372.164528,864.24625,920.881111,637.901639,744.618944,880.866889,1272.323972,1847.623667,940.785694,2.900083,0.208444,0.056417,0.058444,446956.0,411420.0,203024.0,26636.0,29156.0,7616.0,449537.0,233352.0,139920.0,12648.0,2813.0,224.0,53161.0,178881.0,138250.0,13328.0,3581.0,88.0,16361.0,131601.0,116541.0,13506.0,2856.0,48.0,6337.0,105412.0,95728.0,15609.0,1984.0,8.0,784.0,150228.0,261904.0,93172.0,17874.0,452.0
1,0,11.4,167985.0,10787.0,7413813.0,2296.0,4111.0,1302855.0,1628265.0,630345.0,1269526.0,4772940.0,2706706.0,222225.0,6240.0,0.0,70.0,1440661.0,857662.0,384579.0,668642.0,7239843.0,398490.0,3887.0,0.0,0.0,0.0,8040811.0,0.0,1230.0,558.0,463.0,925.0,469.0,226.0,535.0,516.0,493.0,729.0,66.0,97056.0,2693100.0,2947368.0,788437.0,687480.0,595164.0,491232.0,532932.0,809628.0,505693.0,860571.0,204.25675,112.92425,147.265389,201.479944,230.306278,277.722417,315.748806,372.164528,864.24625,920.881111,637.901639,745.618944,880.866889,1272.323972,1847.623667,940.785694,2.900083,0.208444,0.056417,0.058444,446964.0,411420.0,203027.0,26638.0,29157.0,7616.0,451193.0,233354.0,139920.0,12649.0,2813.0,224.0,53210.0,178883.0,138252.0,13328.0,3582.0,88.0,16368.0,131601.0,116542.0,13507.0,2856.0,48.0,6339.0,105413.0,95729.0,15610.0,1984.0,8.0,784.0,150228.0,261905.0,93172.0,17874.0,452.0
2,0,19.6,331635.0,14525.0,13683604.0,2600.0,,,,,,,,,,,70.0,1787736.0,1133132.0,598351.0,1167062.0,12314224.0,460240.0,3887.0,0.0,0.0,0.0,12777022.0,0.0,2136.0,954.0,850.0,1420.0,722.0,412.0,880.0,666.0,586.0,1143.0,162.0,181632.0,4249020.0,4630440.0,1539133.0,1421172.0,1039764.0,749472.0,740724.0,995796.0,574045.0,1379191.0,321.671972,157.3125,193.792833,263.577611,310.711861,366.14925,415.642472,484.391167,1146.111611,1286.536333,900.062917,1123.232556,1449.545611,2140.037472,5046.748278,1151.010139,3.320194,0.218806,0.056778,0.058444,756665.0,647348.0,286811.0,30967.0,31213.0,7745.0,633790.0,423395.0,271940.0,16190.0,3573.0,232.0,75038.0,352791.0,327992.0,17325.0,4451.0,92.0,24028.0,234737.0,216619.0,17000.0,3476.0,48.0,12055.0,167693.0,142900.0,19263.0,2441.0,12.0,1420.0,204832.0,313485.0,106464.0,19306.0,452.0
3,0,20.2,354975.0,15015.0,14540449.0,2616.0,,,,,,,,,,,70.0,1824409.0,1166074.0,634595.0,1233908.0,13275730.0,466753.0,3887.0,0.0,0.0,0.0,13612083.0,0.0,2218.0,1014.0,892.0,1471.0,749.0,425.0,901.0,702.0,589.0,1197.0,174.0,193728.0,4462548.0,4988028.0,1696022.0,1565484.0,1112544.0,789228.0,774588.0,1015104.0,576901.0,1428606.0,331.479028,162.731639,198.104472,269.712889,320.087333,377.478667,425.901361,495.749583,1173.882583,1323.460972,923.099361,1161.893139,1501.973944,2208.782833,5587.856667,1160.593833,3.336417,0.218806,0.056778,0.058444,812577.0,686860.0,302955.0,31927.0,31488.0,7749.0,651902.0,478279.0,292109.0,16755.0,3753.0,232.0,77118.0,394083.0,359060.0,17941.0,4573.0,92.0,25164.0,253706.0,232912.0,17583.0,3573.0,48.0,13199.0,176596.0,150565.0,19832.0,2522.0,12.0,1444.0,211688.0,318901.0,107745.0,19406.0,453.0
4,0,21.0,365550.0,15295.0,14966985.0,2720.0,,,,,,,,,,,70.0,1873614.0,1176071.0,639587.0,1266369.0,13758524.0,474047.0,3887.0,0.0,0.0,0.0,14041353.0,0.0,2272.0,1038.0,910.0,1519.0,773.0,440.0,946.0,727.0,599.0,1233.0,178.0,203568.0,4621885.0,5107368.0,1758350.0,1633717.0,1159656.0,820368.0,792996.0,1030644.0,587234.0,1460547.0,339.563444,166.705583,202.430194,275.669056,327.877944,385.460472,435.321167,506.110639,1201.040833,1351.811444,944.13325,1192.740333,1544.355778,2307.900278,5738.170167,1182.611833,3.36725,0.218806,0.056778,0.058444,843929.0,710424.0,314292.0,32585.0,31896.0,7773.0,665154.0,488271.0,300083.0,17265.0,3998.0,232.0,79222.0,411299.0,371612.0,18614.0,4681.0,92.0,25736.0,268834.0,242477.0,18108.0,3641.0,52.0,13400.0,183724.0,155913.0,20573.0,2562.0,12.0,1445.0,213956.0,323997.0,109514.0,19535.0,454.0


In [5]:
train_operational.describe()

Unnamed: 0,vehicle_id,time_step,171_0,666_0,427_0,837_0,167_0,167_1,167_2,167_3,167_4,167_5,167_6,167_7,167_8,167_9,309_0,272_0,272_1,272_2,272_3,272_4,272_5,272_6,272_7,272_8,272_9,835_0,370_0,291_0,291_1,291_2,291_3,291_4,291_5,291_6,291_7,291_8,291_9,291_10,158_0,158_1,158_2,158_3,158_4,158_5,158_6,158_7,158_8,158_9,100_0,459_0,459_1,459_2,459_3,459_4,459_5,459_6,459_7,459_8,459_9,459_10,459_11,459_12,459_13,459_14,459_15,459_16,459_17,459_18,459_19,397_0,397_1,397_2,397_3,397_4,397_5,397_6,397_7,397_8,397_9,397_10,397_11,397_12,397_13,397_14,397_15,397_16,397_17,397_18,397_19,397_20,397_21,397_22,397_23,397_24,397_25,397_26,397_27,397_28,397_29,397_30,397_31,397_32,397_33,397_34,397_35
count,1122452.0,1122452.0,1122452.0,1122412.0,1116047.0,1122413.0,1119590.0,1119590.0,1119590.0,1119590.0,1119590.0,1119590.0,1119590.0,1119590.0,1119590.0,1119590.0,1122418.0,1121927.0,1121927.0,1121927.0,1121927.0,1121927.0,1121927.0,1121927.0,1121927.0,1121927.0,1121927.0,1122413.0,1116054.0,1112824.0,1112824.0,1112824.0,1112824.0,1112824.0,1112824.0,1112824.0,1112824.0,1112824.0,1112824.0,1112824.0,1121804.0,1121804.0,1121804.0,1121804.0,1121804.0,1121804.0,1121804.0,1121804.0,1121804.0,1121804.0,1116051.0,1114099.0,1114099.0,1114099.0,1114099.0,1114099.0,1114099.0,1114099.0,1114099.0,1114099.0,1114099.0,1114099.0,1114099.0,1114099.0,1114099.0,1114099.0,1114099.0,1114099.0,1114099.0,1114099.0,1114099.0,1121842.0,1121842.0,1121842.0,1121842.0,1121842.0,1121842.0,1121842.0,1121842.0,1121842.0,1121842.0,1121842.0,1121842.0,1121842.0,1121842.0,1121842.0,1121842.0,1121842.0,1121842.0,1121842.0,1121842.0,1121842.0,1121842.0,1121842.0,1121842.0,1121842.0,1121842.0,1121842.0,1121842.0,1121842.0,1121842.0,1121842.0,1121842.0,1121842.0,1121842.0,1121842.0,1121842.0
mean,15635.01,142.6336,3432285.0,122051.9,129726900.0,44327.35,12314.84,6578556.0,16476730.0,13480330.0,39489360.0,129104300.0,16165050.0,1237457.0,113070.8,25135.15,11607.04,9095183.0,8871166.0,6888060.0,35456620.0,133526000.0,5968650.0,53038.56,2408.601,454.7189,269.8576,146131800.0,6571623.0,24582.36,62540.47,1036058.0,19637.23,6725.472,4608.888,8919.47,6189.571,5234.454,15064.62,19048.95,3089379.0,48286190.0,64597120.0,24841400.0,17062630.0,9259272.0,5636848.0,4753604.0,4354528.0,2592706.0,14255110.0,4258.535,1169.722,1173.479,1380.093,1625.984,1849.545,2204.264,2558.352,6355.295,8025.213,5032.235,6009.701,7153.963,10933.45,28168.74,42070.59,3864.298,12.20838,0.8265065,0.8791815,11446640.0,6736448.0,6528357.0,691255.5,637593.4,60564.88,7364856.0,4162901.0,7306592.0,983699.6,337844.5,1300131.0,1406516.0,2456044.0,7839299.0,477928.3,129476.9,2820.697,221802.1,1542952.0,2905751.0,279217.0,719699.6,1341793.0,132235.2,899845.5,1521964.0,153644.2,1367280.0,2570.485,18231.38,654211.3,1526108.0,482417.4,158348.9,2484.948
std,9802.149,95.29169,2766660.0,144144.4,100530800.0,79823.17,45239.0,5408065.0,14817070.0,14635250.0,44795420.0,103824400.0,21840200.0,2920862.0,524403.1,207442.3,68852.84,8015971.0,8921667.0,10695620.0,69657570.0,119269700.0,11426530.0,301352.6,35113.83,7235.416,4293.17,111319900.0,19032660.0,472430.9,5170522.0,109484900.0,24310.65,11634.33,16458.4,74863.3,147819.3,245042.1,735080.0,1836404.0,4000282.0,36661000.0,51915590.0,22644680.0,14980010.0,8393817.0,5297305.0,4684814.0,4957467.0,5270247.0,13346320.0,5997.624,1442.711,1096.282,1327.374,1619.487,1884.419,2229.143,2551.331,6175.327,7756.641,4837.041,5824.691,7091.003,13097.3,35884.03,44600.67,11459.67,626.6429,23.93927,60.77774,127038400.0,26998680.0,155264000.0,15862270.0,53200510.0,132318.8,210138500.0,107612900.0,20704470.0,106355800.0,43821740.0,206767300.0,94508240.0,16298050.0,159528900.0,26034810.0,9831869.0,126951.2,404072.6,26249590.0,2904026.0,13456370.0,107207300.0,213950500.0,591795.0,1073817.0,1503821.0,440630.5,212506700.0,253458.4,254468.1,930402.1,1865814.0,611943.8,339737.2,10591.74
min,0.0,0.0,15.0,0.0,605.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1710.0,0.3240278,0.1479444,0.1090278,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,12.0,16.0,0.0,0.0,0.0,0.0,1364.0,64.0,12.0,0.0,0.0,0.0,176.0,0.0,0.0,0.0,0.0,0.0,8.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,6726.0,66.4,1288320.0,29703.75,51809040.0,2848.0,432.0,2640738.0,6241352.0,4709285.0,12397520.0,48340800.0,2874335.0,36075.0,0.0,0.0,56.0,3449884.0,3191300.0,2222834.0,5620590.0,36071610.0,405302.0,0.0,0.0,0.0,0.0,60797900.0,0.0,5872.0,4541.0,4665.0,6817.0,2206.0,1403.0,2518.0,1417.0,851.0,3036.0,317.0,318131.2,20521450.0,26178260.0,8606923.0,6127723.0,3301326.0,1913846.0,1468621.0,1026072.0,18447.0,5096705.0,1213.985,435.7974,444.4453,502.2648,573.1522,638.5362,759.0035,887.9294,2255.27,2851.361,1761.602,2067.298,2398.233,3411.42,6738.877,7604.409,8.725569,0.1644722,0.0035,0.00375,3224877.0,2646978.0,1825747.0,146366.0,64835.25,4056.0,1981458.0,1193138.0,2027915.0,55367.0,12097.0,377.0,228421.2,739746.2,1860386.0,49435.0,12881.0,172.0,78535.0,468062.2,854455.2,41812.0,6477.0,132.0,44243.0,300937.0,453661.2,38658.0,5399.0,36.0,4855.25,151338.0,360702.2,105034.0,17439.0,64.0
50%,15783.0,127.8,2781472.0,76455.0,108090600.0,15755.0,3570.0,5276156.0,12755050.0,9788470.0,26255140.0,105788100.0,9535204.0,385592.5,7680.0,0.0,683.0,7026333.0,6607915.0,4592760.0,12024960.0,107275500.0,2387937.0,2692.0,0.0,0.0,0.0,123203100.0,0.0,12455.0,9485.0,9705.0,14064.0,4626.0,2992.0,5410.0,3070.0,1844.0,6352.0,1343.0,1511268.0,41495810.0,53268310.0,18595030.0,13185600.0,7084348.0,4205950.0,3447218.0,2824551.0,627528.0,10748040.0,2595.937,878.7439,900.6592,1025.319,1178.557,1319.399,1572.231,1839.786,4662.365,5928.28,3724.817,4428.59,5204.782,7526.488,16520.54,27230.49,41.9395,0.9408611,0.03994444,0.01452778,7889388.0,5366132.0,4130627.0,336980.0,170773.0,18192.0,4163424.0,2477272.0,5174216.0,118171.0,28574.0,1158.0,485176.5,1546888.0,4859318.0,106638.0,29774.0,579.0,156976.0,994038.0,2053424.0,93160.5,16729.0,435.0,97028.0,658813.5,1082700.0,88852.0,14374.0,132.0,10927.0,384858.0,947301.0,278923.0,54575.5,306.0
75%,23542.0,206.0,4949614.0,162098.0,185369000.0,52123.0,12996.0,9018105.0,22374550.0,17479850.0,49865760.0,186034100.0,20976230.0,1393205.0,76308.0,5855.0,5210.0,12352920.0,11844210.0,8250888.0,26355030.0,200735400.0,7007335.0,25854.0,0.0,0.0,0.0,207090400.0,1648879.0,24003.0,17619.0,17799.0,25145.0,8461.0,5640.0,10436.0,6042.0,3609.0,11222.0,2883.0,4355550.0,68170880.0,90423160.0,34277720.0,23827060.0,12811850.0,7801738.0,6623959.0,5920724.0,2592550.0,19448940.0,4934.977,1520.909,1573.247,1835.537,2153.427,2433.098,2902.92,3374.248,8445.228,10681.3,6751.956,8093.242,9594.513,14007.82,34544.42,62594.66,243.0237,3.532194,0.2114444,0.08216667,14857940.0,9198015.0,7749890.0,705460.2,386425.2,60660.0,7667040.0,4414777.0,10277740.0,238007.8,61497.0,3211.0,891255.2,2801641.0,9788551.0,217206.8,61815.0,1686.0,265549.8,1802558.0,4065984.0,191259.0,39773.75,1266.0,173130.0,1221320.0,2136832.0,180694.0,35345.0,468.0,20388.0,819442.0,2016070.0,618642.0,151184.8,1366.0
max,33643.0,507.4,30489600.0,5886459.0,1046455000.0,6524252.0,7526577.0,67236170.0,303243900.0,409433000.0,703208600.0,861604600.0,665291000.0,134558700.0,36800030.0,20688080.0,11186410.0,113043900.0,208307000.0,365073500.0,1099679000.0,1142990000.0,329054300.0,36738900.0,2293279.0,542479.0,393149.0,1022947000.0,282441800.0,50529460.0,553847600.0,11727370000.0,1258941.0,885359.0,1671618.0,7963772.0,15828040.0,26247910.0,78743130.0,196706600.0,52693080.0,509174400.0,549031900.0,235523500.0,170738600.0,144663600.0,94729780.0,100707200.0,74740860.0,115340600.0,280237600.0,178716.8,114565.0,23932.47,37603.5,52240.18,54393.73,46286.2,34825.5,75370.64,97590.06,59121.39,89633.8,103368.4,327482.5,548367.9,538884.0,170762.1,125184.8,4228.947,12986.19,20470900000.0,4287284000.0,24721780000.0,2559794000.0,8593610000.0,4705939.0,33908290000.0,17181540000.0,3162025000.0,17178890000.0,7061902000.0,33105160000.0,15144400000.0,2560500000.0,25460230000.0,4154567000.0,1569277000.0,16787840.0,26493470.0,4187476000.0,51353420.0,2147911000.0,17118800000.0,34163410000.0,75667380.0,84572680.0,34967720.0,50591650.0,34225870000.0,33558720.0,33587880.0,34027220.0,34751350.0,8870681.0,8707186.0,586958.0


In [6]:
pd.DataFrame(train_operational.nunique()).T

Unnamed: 0,vehicle_id,time_step,171_0,666_0,427_0,837_0,167_0,167_1,167_2,167_3,167_4,167_5,167_6,167_7,167_8,167_9,309_0,272_0,272_1,272_2,272_3,272_4,272_5,272_6,272_7,272_8,272_9,835_0,370_0,291_0,291_1,291_2,291_3,291_4,291_5,291_6,291_7,291_8,291_9,291_10,158_0,158_1,158_2,158_3,158_4,158_5,158_6,158_7,158_8,158_9,100_0,459_0,459_1,459_2,459_3,459_4,459_5,459_6,459_7,459_8,459_9,459_10,459_11,459_12,459_13,459_14,459_15,459_16,459_17,459_18,459_19,397_0,397_1,397_2,397_3,397_4,397_5,397_6,397_7,397_8,397_9,397_10,397_11,397_12,397_13,397_14,397_15,397_16,397_17,397_18,397_19,397_20,397_21,397_22,397_23,397_24,397_25,397_26,397_27,397_28,397_29,397_30,397_31,397_32,397_33,397_34,397_35
0,23550,2499,489765,334748,1102424,195376,60125,1012699,1053628,1029759,1061733,1069558,949921,558864,195480,78095,88522,1048989,1028813,995930,1054438,1069637,860299,145336,11989,6931,5580,1113460,322120,102100,77893,70518,88979,41597,31952,50480,34366,24848,41445,12252,891010,1104381,1105852,1078935,1063281,1033672,998015,980513,956297,699768,1088038,1096660,1091997,1089175,1086439,1083619,1080564,1078242,1076985,1076475,1075358,1073397,1072428,1071436,1070200,1069234,1062603,967585,540134,176709,87479,1089190,1073256,1060520,712166,547194,223257,1067078,1030577,1072353,450185,202569,39618,785524,982662,1065138,429656,201559,26643,445901,917353,1007391,391415,172182,22199,332871,840061,930447,368960,163267,15502,82755,731125,902735,657916,353455,36607


In [7]:
pd.DataFrame(train_operational.isnull().mean()).T

Unnamed: 0,vehicle_id,time_step,171_0,666_0,427_0,837_0,167_0,167_1,167_2,167_3,167_4,167_5,167_6,167_7,167_8,167_9,309_0,272_0,272_1,272_2,272_3,272_4,272_5,272_6,272_7,272_8,272_9,835_0,370_0,291_0,291_1,291_2,291_3,291_4,291_5,291_6,291_7,291_8,291_9,291_10,158_0,158_1,158_2,158_3,158_4,158_5,158_6,158_7,158_8,158_9,100_0,459_0,459_1,459_2,459_3,459_4,459_5,459_6,459_7,459_8,459_9,459_10,459_11,459_12,459_13,459_14,459_15,459_16,459_17,459_18,459_19,397_0,397_1,397_2,397_3,397_4,397_5,397_6,397_7,397_8,397_9,397_10,397_11,397_12,397_13,397_14,397_15,397_16,397_17,397_18,397_19,397_20,397_21,397_22,397_23,397_24,397_25,397_26,397_27,397_28,397_29,397_30,397_31,397_32,397_33,397_34,397_35
0,0.0,0.0,0.0,3.6e-05,0.005706,3.5e-05,0.00255,0.00255,0.00255,0.00255,0.00255,0.00255,0.00255,0.00255,0.00255,0.00255,3e-05,0.000468,0.000468,0.000468,0.000468,0.000468,0.000468,0.000468,0.000468,0.000468,0.000468,3.5e-05,0.0057,0.008578,0.008578,0.008578,0.008578,0.008578,0.008578,0.008578,0.008578,0.008578,0.008578,0.008578,0.000577,0.000577,0.000577,0.000577,0.000577,0.000577,0.000577,0.000577,0.000577,0.000577,0.005703,0.007442,0.007442,0.007442,0.007442,0.007442,0.007442,0.007442,0.007442,0.007442,0.007442,0.007442,0.007442,0.007442,0.007442,0.007442,0.007442,0.007442,0.007442,0.007442,0.007442,0.000543,0.000543,0.000543,0.000543,0.000543,0.000543,0.000543,0.000543,0.000543,0.000543,0.000543,0.000543,0.000543,0.000543,0.000543,0.000543,0.000543,0.000543,0.000543,0.000543,0.000543,0.000543,0.000543,0.000543,0.000543,0.000543,0.000543,0.000543,0.000543,0.000543,0.000543,0.000543,0.000543,0.000543,0.000543,0.000543


### Specifications

In [8]:
print(train_specifications.shape)
train_specifications.head()

(23550, 9)


Unnamed: 0,vehicle_id,Spec_0,Spec_1,Spec_2,Spec_3,Spec_4,Spec_5,Spec_6,Spec_7
0,0,Cat0,Cat0,Cat0,Cat0,Cat0,Cat0,Cat0,Cat0
1,2,Cat0,Cat1,Cat1,Cat0,Cat0,Cat0,Cat0,Cat1
2,3,Cat0,Cat1,Cat1,Cat1,Cat0,Cat0,Cat0,Cat1
3,4,Cat0,Cat0,Cat2,Cat1,Cat0,Cat0,Cat0,Cat1
4,5,Cat0,Cat2,Cat2,Cat0,Cat0,Cat0,Cat0,Cat1


In [9]:
train_specifications.nunique()

vehicle_id    23550
Spec_0            3
Spec_1           29
Spec_2           21
Spec_3            4
Spec_4            2
Spec_5            5
Spec_6           17
Spec_7            9
dtype: int64

In [10]:
train_specifications.isnull().mean()

vehicle_id    0.0
Spec_0        0.0
Spec_1        0.0
Spec_2        0.0
Spec_3        0.0
Spec_4        0.0
Spec_5        0.0
Spec_6        0.0
Spec_7        0.0
dtype: float64

### TTE

In [11]:
print(train_tte.shape)
train_tte.head()

(23550, 3)


Unnamed: 0,vehicle_id,length_of_study_time_step,in_study_repair
0,0,510.0,0
1,2,281.8,0
2,3,293.4,0
3,4,210.0,0
4,5,360.4,0


In [12]:
train_tte.describe()

Unnamed: 0,vehicle_id,length_of_study_time_step,in_study_repair
count,23550.0,23550.0,23550.0
mean,16851.663185,240.349019,0.096476
std,9714.30942,88.782437,0.295249
min,0.0,73.4,0.0
25%,8468.25,163.8,0.0
50%,16882.5,218.2,0.0
75%,25247.75,312.0,0.0
max,33643.0,510.0,1.0


In [13]:
train_tte.isnull().mean()

vehicle_id                   0.0
length_of_study_time_step    0.0
in_study_repair              0.0
dtype: float64

### TTE lifespan versus max observed lifespan in `train_operational`

As we see below, the last actual observation is typically about 5.6 timestamps later than the last observation in the `train_operational` dataset. There are no feature values at the time of actual last observation.

In [14]:
max_time_step = train_operational.groupby('vehicle_id')['time_step'].transform('max')
last_obs = train_operational[train_operational['time_step'] == max_time_step][['vehicle_id', 'time_step']]
last_obs = pd.merge(last_obs, train_tte, on='vehicle_id')

failed = last_obs[last_obs['in_study_repair']==1]
no_fail = failed = last_obs[last_obs['in_study_repair']==0]


In [15]:
(last_obs['length_of_study_time_step'] - last_obs['time_step']).describe()

count    23550.000000
mean         5.588365
std         11.877423
min          0.200000
25%          0.800000
50%          2.000000
75%          5.000000
max        324.600000
dtype: float64

In [16]:
(failed['length_of_study_time_step'] - failed['time_step']).describe()

count    21278.000000
mean         5.771510
std         11.875166
min          0.200000
25%          0.800000
50%          2.200000
75%          5.200000
max        324.600000
dtype: float64

In [17]:
(no_fail['length_of_study_time_step'] - no_fail['time_step']).describe()

count    21278.000000
mean         5.771510
std         11.875166
min          0.200000
25%          0.800000
50%          2.200000
75%          5.200000
max        324.600000
dtype: float64

# Merge

In [18]:
df = pd.merge(train_operational, train_specifications, on='vehicle_id')

### Merge using the last timestamp contained in `train_operational`, treat this as failure/censored timepoint

In [19]:
df = pd.merge(df, last_obs[['vehicle_id', 'time_step', 'in_study_repair']], on=['vehicle_id', 'time_step'], how='left')

### Fill previous panel observations with 0 for `in_study_repair`

In [20]:
df['in_study_repair'] = df['in_study_repair'].fillna(0).astype(int)

### Rearrange columns

In [21]:
key_cols = ['vehicle_id', 'time_step', 'in_study_repair']
rest_cols = sorted([i for i in df.columns if i not in key_cols])
new_col_ordeer = key_cols + rest_cols

In [22]:
df = df[new_col_ordeer]

In [23]:
df = df.sort_values(by=['vehicle_id', 'time_step']).reset_index(drop=True)

In [24]:
df

Unnamed: 0,vehicle_id,time_step,in_study_repair,100_0,158_0,158_1,158_2,158_3,158_4,158_5,158_6,158_7,158_8,158_9,167_0,167_1,167_2,167_3,167_4,167_5,167_6,167_7,167_8,167_9,171_0,272_0,272_1,272_2,272_3,272_4,272_5,272_6,272_7,272_8,272_9,291_0,291_1,291_10,291_2,291_3,291_4,291_5,291_6,291_7,291_8,291_9,309_0,370_0,397_0,397_1,397_10,397_11,397_12,397_13,397_14,397_15,397_16,397_17,397_18,397_19,397_2,397_20,397_21,397_22,397_23,397_24,397_25,397_26,397_27,397_28,397_29,397_3,397_30,397_31,397_32,397_33,397_34,397_35,397_4,397_5,397_6,397_7,397_8,397_9,427_0,459_0,459_1,459_10,459_11,459_12,459_13,459_14,459_15,459_16,459_17,459_18,459_19,459_2,459_3,459_4,459_5,459_6,459_7,459_8,459_9,666_0,835_0,837_0,Spec_0,Spec_1,Spec_2,Spec_3,Spec_4,Spec_5,Spec_6,Spec_7
0,0,11.2,0,858410.0,97056.0,2690052.0,2945268.0,788437.0,687480.0,595164.0,491232.0,532932.0,809628.0,505693.0,4110.0,1296420.0,1628265.0,630345.0,1269525.0,4772940.0,2706706.0,222225.0,6240.0,0.0,167985.0,1435083.0,857662.0,384579.0,668642.0,7239843.0,398490.0,3887.0,0.0,0.0,0.0,1227.0,555.0,66.0,463.0,925.0,468.0,225.0,535.0,516.0,492.0,729.0,70.0,0.0,446956.0,411420.0,2813.0,224.0,53161.0,178881.0,138250.0,13328.0,3581.0,88.0,16361.0,131601.0,203024.0,116541.0,13506.0,2856.0,48.0,6337.0,105412.0,95728.0,15609.0,1984.0,8.0,26636.0,784.0,150228.0,261904.0,93172.0,17874.0,452.0,29156.0,7616.0,449537.0,233352.0,139920.0,12648.0,7413813.0,203.676778,111.911500,637.901639,744.618944,880.866889,1272.323972,1847.623667,940.785694,2.900083,0.208444,0.056417,0.058444,147.265389,200.479944,230.306278,277.722417,315.748806,372.164528,864.246250,920.881111,10787.0,8036751.0,2296.0,Cat0,Cat0,Cat0,Cat0,Cat0,Cat0,Cat0,Cat0
1,0,11.4,0,860571.0,97056.0,2693100.0,2947368.0,788437.0,687480.0,595164.0,491232.0,532932.0,809628.0,505693.0,4111.0,1302855.0,1628265.0,630345.0,1269526.0,4772940.0,2706706.0,222225.0,6240.0,0.0,167985.0,1440661.0,857662.0,384579.0,668642.0,7239843.0,398490.0,3887.0,0.0,0.0,0.0,1230.0,558.0,66.0,463.0,925.0,469.0,226.0,535.0,516.0,493.0,729.0,70.0,0.0,446964.0,411420.0,2813.0,224.0,53210.0,178883.0,138252.0,13328.0,3582.0,88.0,16368.0,131601.0,203027.0,116542.0,13507.0,2856.0,48.0,6339.0,105413.0,95729.0,15610.0,1984.0,8.0,26638.0,784.0,150228.0,261905.0,93172.0,17874.0,452.0,29157.0,7616.0,451193.0,233354.0,139920.0,12649.0,7413813.0,204.256750,112.924250,637.901639,745.618944,880.866889,1272.323972,1847.623667,940.785694,2.900083,0.208444,0.056417,0.058444,147.265389,201.479944,230.306278,277.722417,315.748806,372.164528,864.246250,920.881111,10787.0,8040811.0,2296.0,Cat0,Cat0,Cat0,Cat0,Cat0,Cat0,Cat0,Cat0
2,0,19.6,0,1379191.0,181632.0,4249020.0,4630440.0,1539133.0,1421172.0,1039764.0,749472.0,740724.0,995796.0,574045.0,,,,,,,,,,,331635.0,1787736.0,1133132.0,598351.0,1167062.0,12314224.0,460240.0,3887.0,0.0,0.0,0.0,2136.0,954.0,162.0,850.0,1420.0,722.0,412.0,880.0,666.0,586.0,1143.0,70.0,0.0,756665.0,647348.0,3573.0,232.0,75038.0,352791.0,327992.0,17325.0,4451.0,92.0,24028.0,234737.0,286811.0,216619.0,17000.0,3476.0,48.0,12055.0,167693.0,142900.0,19263.0,2441.0,12.0,30967.0,1420.0,204832.0,313485.0,106464.0,19306.0,452.0,31213.0,7745.0,633790.0,423395.0,271940.0,16190.0,13683604.0,321.671972,157.312500,900.062917,1123.232556,1449.545611,2140.037472,5046.748278,1151.010139,3.320194,0.218806,0.056778,0.058444,193.792833,263.577611,310.711861,366.149250,415.642472,484.391167,1146.111611,1286.536333,14525.0,12777022.0,2600.0,Cat0,Cat0,Cat0,Cat0,Cat0,Cat0,Cat0,Cat0
3,0,20.2,0,1428606.0,193728.0,4462548.0,4988028.0,1696022.0,1565484.0,1112544.0,789228.0,774588.0,1015104.0,576901.0,,,,,,,,,,,354975.0,1824409.0,1166074.0,634595.0,1233908.0,13275730.0,466753.0,3887.0,0.0,0.0,0.0,2218.0,1014.0,174.0,892.0,1471.0,749.0,425.0,901.0,702.0,589.0,1197.0,70.0,0.0,812577.0,686860.0,3753.0,232.0,77118.0,394083.0,359060.0,17941.0,4573.0,92.0,25164.0,253706.0,302955.0,232912.0,17583.0,3573.0,48.0,13199.0,176596.0,150565.0,19832.0,2522.0,12.0,31927.0,1444.0,211688.0,318901.0,107745.0,19406.0,453.0,31488.0,7749.0,651902.0,478279.0,292109.0,16755.0,14540449.0,331.479028,162.731639,923.099361,1161.893139,1501.973944,2208.782833,5587.856667,1160.593833,3.336417,0.218806,0.056778,0.058444,198.104472,269.712889,320.087333,377.478667,425.901361,495.749583,1173.882583,1323.460972,15015.0,13612083.0,2616.0,Cat0,Cat0,Cat0,Cat0,Cat0,Cat0,Cat0,Cat0
4,0,21.0,0,1460547.0,203568.0,4621885.0,5107368.0,1758350.0,1633717.0,1159656.0,820368.0,792996.0,1030644.0,587234.0,,,,,,,,,,,365550.0,1873614.0,1176071.0,639587.0,1266369.0,13758524.0,474047.0,3887.0,0.0,0.0,0.0,2272.0,1038.0,178.0,910.0,1519.0,773.0,440.0,946.0,727.0,599.0,1233.0,70.0,0.0,843929.0,710424.0,3998.0,232.0,79222.0,411299.0,371612.0,18614.0,4681.0,92.0,25736.0,268834.0,314292.0,242477.0,18108.0,3641.0,52.0,13400.0,183724.0,155913.0,20573.0,2562.0,12.0,32585.0,1445.0,213956.0,323997.0,109514.0,19535.0,454.0,31896.0,7773.0,665154.0,488271.0,300083.0,17265.0,14966985.0,339.563444,166.705583,944.133250,1192.740333,1544.355778,2307.900278,5738.170167,1182.611833,3.367250,0.218806,0.056778,0.058444,202.430194,275.669056,327.877944,385.460472,435.321167,506.110639,1201.040833,1351.811444,15295.0,14041353.0,2720.0,Cat0,Cat0,Cat0,Cat0,Cat0,Cat0,Cat0,Cat0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1122447,33643,101.0,0,7511933.0,1135119.0,38280687.0,33976900.0,13523656.0,8283961.0,4051670.0,2515357.0,2825140.0,1849142.0,0.0,843.0,6408272.0,10964376.0,6092347.0,15525934.0,80766335.0,9791612.0,779406.0,51495.0,1115.0,2136810.0,8845257.0,5400374.0,2837147.0,6104766.0,82477919.0,9625552.0,20606.0,0.0,0.0,0.0,11257.0,8718.0,1311.0,6446.0,10870.0,4048.0,2575.0,3992.0,2212.0,1285.0,4829.0,1247.0,0.0,3442580.0,3929032.0,9165.0,721.0,716249.0,1579103.0,3575994.0,46532.0,13504.0,156.0,181206.0,816091.0,3180853.0,1360059.0,36453.0,5535.0,364.0,175361.0,485798.0,727848.0,34448.0,6540.0,36.0,196139.0,19500.0,612343.0,626033.0,100155.0,17033.0,24.0,149316.0,27601.0,5796832.0,2551698.0,5001321.0,59266.0,81068654.0,2098.679611,718.947278,2947.125917,4128.689861,3531.299889,6145.524278,28009.510667,9764.259083,21.436861,0.053333,0.001722,0.001917,676.951028,655.455611,677.717972,765.981389,885.586833,1018.333944,2920.054167,3972.587583,41412.0,84411912.0,10365.0,Cat0,Cat0,Cat2,Cat0,Cat0,Cat0,Cat1,Cat4
1122448,33643,107.0,0,8322863.0,1261239.0,41426931.0,36439732.0,14514208.0,8841361.0,4320470.0,2698033.0,3039016.0,1922498.0,0.0,843.0,6596477.0,12358026.0,6721702.0,16509424.0,86210645.0,10741158.0,878526.0,61755.0,1115.0,2283285.0,9156711.0,6089881.0,3345265.0,6660386.0,88265792.0,10450415.0,33516.0,0.0,0.0,0.0,11491.0,8937.0,1414.0,6719.0,11467.0,4240.0,2722.0,4187.0,2350.0,1408.0,5138.0,1443.0,0.0,3732020.0,4235480.0,9594.0,729.0,742374.0,1681964.0,3832003.0,49409.0,14264.0,156.0,189152.0,878123.0,3424633.0,1452988.0,38977.0,5899.0,369.0,184621.0,523496.0,780576.0,37488.0,6964.0,40.0,209403.0,20484.0,652688.0,670517.0,107367.0,18901.0,24.0,158645.0,28809.0,6329888.0,2715863.0,5358369.0,62579.0,86639835.0,2287.780167,759.823694,3174.656806,4432.690306,3837.950583,6499.862083,29766.340194,10552.246111,22.551750,0.057750,0.001722,0.001917,716.823833,694.721389,721.229861,818.648306,957.363861,1105.365944,3149.444361,4288.340500,44660.0,90774342.0,12317.0,Cat0,Cat0,Cat2,Cat0,Cat0,Cat0,Cat1,Cat4
1122449,33643,113.8,0,8940438.0,1396767.0,44324163.0,39773705.0,15642460.0,9466513.0,4598306.0,2880793.0,3279485.0,2009654.0,0.0,843.0,6669542.0,12830421.0,7415497.0,17665594.0,92761955.0,11979828.0,999681.0,71100.0,4715.0,2462865.0,9390153.0,6322764.0,3648321.0,7385461.0,95478959.0,11384738.0,42292.0,0.0,0.0,0.0,11881.0,9288.0,1534.0,7152.0,12277.0,4603.0,2915.0,4565.0,2503.0,1522.0,5496.0,1569.0,0.0,4017868.0,4613876.0,10246.0,746.0,772018.0,1800240.0,4118391.0,52490.0,14892.0,171.0,198940.0,944447.0,3739396.0,1558764.0,41257.0,6368.0,369.0,195682.0,559928.0,837953.0,40050.0,7524.0,44.0,221316.0,21688.0,698824.0,722453.0,115851.0,21237.0,28.0,171602.0,30077.0,6750433.0,2928980.0,5843417.0,66759.0,93439419.0,2430.931333,811.060000,3425.208667,4769.487028,4216.816250,6910.002028,31886.710083,11497.291944,23.799083,0.064417,0.001722,0.001917,770.974500,746.272472,776.228250,875.340889,1025.653806,1194.859722,3393.209583,4603.340833,48370.0,97839903.0,13910.0,Cat0,Cat0,Cat2,Cat0,Cat0,Cat0,Cat1,Cat4
1122450,33643,119.8,0,9674133.0,1479795.0,47433687.0,42439853.0,16743172.0,10130857.0,4936094.0,3103141.0,3550457.0,2114414.0,0.0,843.0,7074468.0,13604706.0,7747492.0,19035544.0,99046385.0,12961968.0,1096131.0,84180.0,6380.0,2627250.0,9948230.0,6662103.0,3860404.0,7832297.0,102436624.0,12137386.0,48572.0,0.0,0.0,0.0,12754.0,9879.0,1628.0,7641.0,13141.0,4942.0,3089.0,4850.0,2633.0,1636.0,5931.0,1681.0,0.0,4276952.0,4975988.0,10902.0,770.0,820966.0,1934864.0,4390111.0,55886.0,15672.0,175.0,210348.0,1024952.0,4025232.0,1667084.0,43698.0,7028.0,381.0,204614.0,606994.0,898070.0,42866.0,7992.0,48.0,235173.0,22732.0,757856.0,777630.0,123155.0,22357.0,32.0,180618.0,31067.0,7209930.0,3123852.0,6199946.0,71603.0,99682931.0,2618.597806,870.200778,3639.536139,5051.233389,4576.042861,7578.870472,34128.006472,12225.930444,24.648083,0.067500,0.001722,0.001917,828.064333,800.545139,829.958056,934.159111,1099.504528,1275.541861,3596.745472,4912.359278,51101.0,104628494.0,14854.0,Cat0,Cat0,Cat2,Cat0,Cat0,Cat0,Cat1,Cat4


# Output Combined (Raw) Dataset

This dataset is a combination of: 1) operational; 2) specifications; 3) Tte. 

In [25]:
df.to_csv('./data/raw_scania.csv.gz', compression={'method': 'gzip', 'compresslevel': 1, 'mtime': 1}, index=False)

# Impute Missings

### One-hot encode categorical features

In [26]:
# One-hot encoding of categorical features
df_ = pd.get_dummies(df, drop_first=True, dtype=int)

### Impute missings using groupby forward fill

In [27]:
# Fill missings
cols_missing_data = get_cols_with_missings(df_)
for col in cols_missing_data:  # forward fill, grouped by `vehicle_id`
    df_[col] = df_.groupby('vehicle_id')[col].transform('ffill')

### [MICE Imputation](https://medium.com/@kunalshrm175/multivariate-imputation-by-chained-equations-mice-2d3efb063434) using sklearn's [IterativeImputer](https://scikit-learn.org/stable/modules/generated/sklearn.impute.IterativeImputer.html), with [Bayesian ridige regression](https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.BayesianRidge.html) under-the-hood

In [28]:
imp = IterativeImputer(
    max_iter = 5,
    tol = 1e-10,
    imputation_order = 'roman',
    random_state = 42,
    initial_strategy = 'mean',
    verbose = 2,
    skip_complete=True
)

df_ = pd.DataFrame(imp.fit_transform(df_), index=df_.index, columns=df_.columns)

[IterativeImputer] Completing matrix with shape (1122452, 190)
[IterativeImputer] Ending imputation round 1/5, elapsed time 1147.64
[IterativeImputer] Change: 331846929.4486323, scaled tolerance: 3.4225874055000003 
[IterativeImputer] Ending imputation round 2/5, elapsed time 2262.16
[IterativeImputer] Change: 452899035.4151731, scaled tolerance: 3.4225874055000003 
[IterativeImputer] Ending imputation round 3/5, elapsed time 3365.85
[IterativeImputer] Change: 68021980.05074555, scaled tolerance: 3.4225874055000003 
[IterativeImputer] Ending imputation round 4/5, elapsed time 4446.31
[IterativeImputer] Change: 33423163.95587253, scaled tolerance: 3.4225874055000003 
[IterativeImputer] Ending imputation round 5/5, elapsed time 5548.94
[IterativeImputer] Change: 26982790.090393487, scaled tolerance: 3.4225874055000003 




### Remove on-hot encoded cols and re-attach original columns

In [11]:
non_cat_cols = [i for i in df_.columns if "Spec" not in i]
cat_cols = [i for i in df.columns if "Spec" in i]

In [12]:
imputed = pd.concat([df_[non_cat_cols], df[cat_cols]], axis=1)

In [14]:
imputed.isnull().mean().mean()

np.float64(0.0)

In [15]:
imputed.shape

(1122452, 116)

# Output Imputed (Raw) Dataset

These data are still raw data, no data transformations have been applied

In [16]:
imputed.to_csv('./data/imputed_scania.csv.gz', compression={'method': 'gzip', 'compresslevel': 1, 'mtime': 1}, index=False)