In [1]:
import pandas as pd
import numpy as np
import multiprocessing
import pyarrow.parquet as pq
import dask.dataframe as dd
import matplotlib.pyplot as plt
from IPython.display import display

## metadata

In [2]:
# Load the metadata tables for the train and test splits.
train_meta = pd.read_csv("../data/input/metadata_train.csv")
test_meta = pd.read_csv("../data/input/metadata_test.csv")
# Cell output: the (rows, columns) shape of each table.
tuple(meta.shape for meta in (train_meta, test_meta))

((8712, 4), (20337, 3))

In [3]:
# Preview the first rows of both metadata tables, train first and test second.
display(train_meta.head(), test_meta.head())

Unnamed: 0,signal_id,id_measurement,phase,target
0,0,0,0,0
1,1,0,1,0
2,2,0,2,0
3,3,1,0,1
4,4,1,1,1


Unnamed: 0,signal_id,id_measurement,phase
0,8712,2904,0
1,8713,2904,1
2,8714,2904,2
3,8715,2905,0
4,8716,2905,1


### signal_id

In [4]:
# One line per split: smallest signal_id, largest signal_id, number of distinct ids.
print(*train_meta["signal_id"].agg(["min", "max", "nunique"]))
print(*test_meta["signal_id"].agg(["min", "max", "nunique"]))

0 8711 8712
8712 29048 20337


### id_measurement

In [5]:
# One line per split: smallest id_measurement, largest id_measurement, number of distinct values.
print(*train_meta["id_measurement"].agg(["min", "max", "nunique"]))
print(*test_meta["id_measurement"].agg(["min", "max", "nunique"]))

0 2903 2904
2904 9682 6779


In [6]:
# Number of metadata rows recorded for each measurement, per split.
train_grp = train_meta.groupby("id_measurement").size()
test_grp = test_meta.groupby("id_measurement").size()
# One "min max std" line per split; min == max with zero std means every
# measurement has the same number of rows.
print(
    *(f"{sizes.min()} {sizes.max()} {sizes.std()}" for sizes in (train_grp, test_grp)),
    sep="\n",
)

3 3 0.0
3 3 0.0


### phase

In [7]:
# One line per split: min phase, max phase, number of distinct phases, and the distinct values.
print(
    *(
        f"{phase.min()} {phase.max()} {phase.nunique()} {phase.unique()}"
        for phase in (train_meta["phase"], test_meta["phase"])
    ),
    sep="\n",
)

0 2 3 [0 1 2]
0 2 3 [0 1 2]


In [8]:
# Number of metadata rows for each (measurement, phase) pair, per split.
# NOTE: this rebinds train_grp / test_grp, which previously held per-measurement sizes.
train_grp = train_meta.groupby(["id_measurement", "phase"]).size()
test_grp = test_meta.groupby(["id_measurement", "phase"]).size()
# One "min max std" line per split; all-ones with zero std means each pair occurs exactly once.
print(
    *(f"{sizes.min()} {sizes.max()} {sizes.std()}" for sizes in (train_grp, test_grp)),
    sep="\n",
)

1 1 0.0
1 1 0.0


### target

- targetは、signal_idに対して一意（3つのphase全てでfaultするとは限らない）
- 0: undamaged, 1: fault

In [9]:
# Class balance: number of training rows for each target value.
display(train_meta.groupby(by="target").size())
# Share of training rows labelled as fault (target == 1).
print("ratio:", len(train_meta[train_meta["target"] == 1]) / len(train_meta))

target
0    8187
1     525
dtype: int64

ratio: 0.060261707988980714


In [10]:
# Rows per (phase, target), pivoted so each target value becomes a column,
# plus the fault share within each phase: count(1) / (count(0) + count(1)).
grp_result = (
    train_meta.groupby(["phase", "target"])
    .size()
    .unstack()
    .assign(ratio=lambda counts: counts[1] / (counts[1] + counts[0]))
)
grp_result

target,0,1,ratio
phase,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,2726,178,0.061295
1,2738,166,0.057163
2,2723,181,0.062328


In [11]:
# Number of faulted phases per measurement (3 means every phase of that measurement faulted).
grp_result = train_meta.groupby("id_measurement")["target"].agg("sum")
# How many measurements have 0, 1, 2 or 3 faulted phases.
grp_result.to_frame().groupby("target").size()

target
0    2710
1      19
2      19
3     156
dtype: int64

In [12]:
# First 20 measurements whose fault count is 3.
grp_result.loc[grp_result.eq(3)].head(20)

id_measurement
1      3
76     3
90     3
93     3
95     3
98     3
136    3
144    3
145    3
152    3
172    3
190    3
226    3
233    3
235    3
236    3
292    3
304    3
313    3
334    3
Name: target, dtype: int64

In [13]:
# First 20 measurements whose fault count is 2.
grp_result.loc[grp_result.eq(2)].head(20)

id_measurement
67      2
601     2
608     2
620     2
706     2
944     2
988     2
1068    2
1076    2
1256    2
1304    2
1537    2
1668    2
1704    2
2328    2
2623    2
2693    2
2760    2
2807    2
Name: target, dtype: int64

In [14]:
# First 20 measurements whose fault count is 1.
grp_result.loc[grp_result.eq(1)].head(20)

id_measurement
96      1
126     1
159     1
271     1
301     1
443     1
518     1
894     1
1091    1
1132    1
1268    1
1277    1
1420    1
1561    1
1884    1
1899    1
1994    1
2753    1
2876    1
Name: target, dtype: int64

## submission file

In [15]:
# Load the sample submission file and show its (rows, columns) shape.
sub = pd.read_csv("../data/input/sample_submission.csv")
sub.shape

(20337, 2)

In [16]:
# Preview the first rows of the sample submission.
sub.head()

Unnamed: 0,signal_id,target
0,8712,0
1,8713,0
2,8714,0
3,8715,0
4,8716,0


## time-series data

In [57]:
%%time
n_cpu = multiprocessing.cpu_count()
train_ts = pq.ParquetDataset('../data/input/train.parquet').read(nthreads=n_cpu).to_pandas().transpose()
test_ts =  pq.ParquetDataset('../data/input/test.parquet').read(nthreads=n_cpu).to_pandas().transpose()
train_ts.shape, test_ts.shape

CPU times: user 2min 38s, sys: 1min, total: 3min 39s
Wall time: 1min 40s


In [20]:
# Column labels of both time-series frames (train first, then test).
tuple(ts.columns for ts in (train_ts, test_ts))

(RangeIndex(start=0, stop=800000, step=1),
 RangeIndex(start=0, stop=800000, step=1))

In [21]:
# Row labels of both time-series frames (train first, then test).
tuple(ts.index for ts in (train_ts, test_ts))

(Index(['0', '1', '2', '3', '4', '5', '6', '7', '8', '9',
        ...
        '8702', '8703', '8704', '8705', '8706', '8707', '8708', '8709', '8710',
        '8711'],
       dtype='object', length=8712),
 Index(['8712', '8713', '8714', '8715', '8716', '8717', '8718', '8719', '8720',
        '8721',
        ...
        '29039', '29040', '29041', '29042', '29043', '29044', '29045', '29046',
        '29047', '29048'],
       dtype='object', length=20337))

In [44]:
# Smallest gap between consecutive row labels of train_ts once the labels are cast to integers.
pd.Series(train_ts.index).astype(int).diff().dropna().min()

1.0

In [45]:
# Smallest gap between consecutive row labels of test_ts once the labels are cast to integers.
pd.Series(test_ts.index).astype(int).diff().dropna().min()

1.0

In [46]:
# Row index of both metadata tables, for comparison with the time-series frames.
tuple(meta.index for meta in (train_meta, test_meta))

(RangeIndex(start=0, stop=8712, step=1),
 RangeIndex(start=0, stop=20337, step=1))

In [58]:
# Replace the row labels of both frames with positional integers 0..n-1.
# NOTE(review): the previous labels are discarded, so test_ts rows are no longer
# labelled by the values they had before this cell — confirm that downstream
# code relies on position only.
train_ts.index = pd.RangeIndex(len(train_ts))
test_ts.index = pd.RangeIndex(len(test_ts))
(train_ts.index, test_ts.index)

(RangeIndex(start=0, stop=8712, step=1),
 RangeIndex(start=0, stop=20337, step=1))

In [65]:
# Check that the row labels are unique in each frame.
tuple(ts.index.is_unique for ts in (train_ts, test_ts))

(True, True)

In [64]:
# Check that the row labels are sorted in ascending order in each frame.
tuple(ts.index.is_monotonic_increasing for ts in (train_ts, test_ts))

(True, True)

In [48]:
# Print the index range, column count, dtypes and memory usage of train_ts.
train_ts.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8712 entries, 0 to 8711
Columns: 800000 entries, 0 to 799999
dtypes: int8(800000)
memory usage: 6.5 GB


In [49]:
# Print the index range, column count, dtypes and memory usage of test_ts.
test_ts.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20337 entries, 0 to 20336
Columns: 800000 entries, 0 to 799999
dtypes: int8(800000)
memory usage: 15.2 GB


In [50]:
# Overall (largest, smallest) value in train_ts: reduce per column first, then across columns.
train_ts.max().max(), train_ts.min().min()

(127, -128)

In [51]:
# Overall (largest, smallest) value in test_ts: reduce per column first, then across columns.
test_ts.max().max(), test_ts.min().min()

(127, -128)