In [1]:
import warnings
warnings.filterwarnings('ignore')

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from json import JSONDecoder, JSONDecodeError  # for reading the JSON data files
import re  # for regular expressions
import os  # for os related operations

import lightgbm as lgb
from tqdm import tqdm_notebook as tqdm


from sklearn import svm
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score

from collections import Counter

from datetime import datetime

from IPython.core.display import display, HTML
# Inject a CSS rule that forces elements with class `container` to full width
# (presumably the notebook page container, so wide frames are not clipped).
# NOTE(review): `IPython.core.display` is the legacy import location; newer
# IPython releases expose `display`/`HTML` from `IPython.display` -- confirm
# against the installed version.
display(HTML("<style>.container { width:100% !important; }</style>"))

# Pandas display limits applied when frames are rendered in this notebook.
for _option, _value in (
    ('display.max_rows', 500),
    ('display.max_columns', 500),
    ('display.width', 1000),
    ('display.max_colwidth', 199),
):
    pd.set_option(_option, _value)


# Relative path of the directory that holds the raw JSON fold/test files.
path_to_data = "../../data/raw/"

In [2]:
# Sanity check: show the entries found in the raw-data directory.
print(os.listdir(path_to_data))

['data_', 'fold1Training.json', 'fold2Training.json', 'fold3Training.json', 'testSet.json']


In [3]:
def decode_obj(line, pos=0, decoder=JSONDecoder()):
    """
    Lazily decode the JSON documents contained in a string.

    :param line: text holding zero or more JSON documents, optionally
        separated by whitespace.
    :param pos: offset in ``line`` at which decoding starts.
    :param decoder: the ``JSONDecoder`` used to parse each document. The
        default instance is created once and shared between calls.
    :return: a generator yielding each decoded object in order. Decoding
        stops at the end of the text, or at the first malformed document
        (the error is printed and nothing further is yielded).
    """
    no_white_space_regex = re.compile(r'[^\s]')
    while True:
        # `raw_decode` does not accept leading whitespace, so jump to the
        # next non-blank character first.
        match = no_white_space_regex.search(line, pos)
        if not match:
            return
        pos = match.start()
        try:
            obj, pos = decoder.raw_decode(line, pos)
        except JSONDecodeError as err:
            print('Oops! something went wrong. Error: {}'.format(err))
            # `pos` was not advanced, so retrying would fail at the same
            # spot forever, and `obj` is either unbound or stale. Stop here
            # instead of yielding a bogus value.
            return
        yield obj

In [4]:
def get_obj_with_last_n_val(line, n, series_length=60):
    """
    Parse one JSON line and keep the last ``n`` time steps of its series.

    :param line: text whose first JSON document is a dict with the keys
        'id' and 'values', and optionally 'classNum'.
    :param n: number of trailing time steps to keep.
    :param series_length: number of time steps per series; the kept rows
        are the index labels ``series_length - n`` to ``series_length - 1``.
        Defaults to 60, the value previously hard-coded here.
    :return: ``{'id', 'classType', 'values'}`` when the object carries a
        'classNum' entry, otherwise ``{'id', 'values'}``. 'values' is the
        trimmed dataframe.
    :raises KeyError: if 'id' or 'values' is missing, or if any of the
        requested index labels is absent from the series.
    """
    obj = next(decode_obj(line))  # type:dict
    obj_id = obj['id']

    data = pd.DataFrame.from_dict(obj['values'])  # type:pd.DataFrame
    # The index is cast to int so the integer labels below can be looked up
    # (assumes the JSON keys are numeric strings -- TODO confirm).
    data.index = data.index.astype(int)
    last_n_indices = np.arange(0, series_length)[-n:]
    data = data.loc[last_n_indices]

    # Unlabelled objects have no 'classNum'. Test for it explicitly rather
    # than catching every exception, so genuine data errors are not hidden.
    if 'classNum' in obj:
        return {'id': obj_id, 'classType': obj['classNum'], 'values': data}
    return {'id': obj_id, 'values': data}

In [10]:
def convert_json_data_to_csv_2(data_dir: str, file_name: str, having_class_type=True, n_last=60):
    """
    Generates a dataframe with one wide row per multi-variate time series.

    Every line of the file is parsed with ``get_obj_with_last_n_val`` and the
    ``n_last`` rows kept for each series are laid side by side, so a source
    column ``c`` becomes the columns ``c_0`` ... ``c_{n_last - 1}``.

    :param data_dir: the path to the data directory.
    :param file_name: name of the file to be read, with the extension.
    :param having_class_type: when True, each object's 'classType' is
        collected into the LABEL column. When False no labels are read and
        the LABEL column is still created but holds only missing values.
    :param n_last: number of trailing rows kept per series.
    :return: the generated dataframe: the feature columns followed by
        LABEL and ID.
    """
    fname = os.path.join(data_dir, file_name)

    # Count the lines for the progress bar by streaming the file, and close
    # the handle afterwards.
    with open(fname, 'r') as infile:
        total = sum(1 for _ in infile)
    print("processing {} with {} lines".format(fname, total))

    all_df, labels, ids = [], [], []
    with open(fname, 'r') as infile, tqdm(total=total) as pbar:
        for line in infile:  # Each 'line' is one MVTS with its single label (0 or 1).
            obj = get_obj_with_last_n_val(line, n_last)
            all_df.append(obj['values'])
            if having_class_type:
                labels.append(obj['classType'])
            ids.append(obj['id'])
            pbar.update(1)

    df = pd.concat(all_df).reset_index(drop=True)

    # Row j of `df` is time step j % n_last of its series, so a strided slice
    # selects step i of every series.
    # Assumes every series contributed exactly n_last rows -- TODO confirm.
    tmp_list = []
    for i in range(n_last):
        tmp = df.iloc[i::n_last].reset_index(drop=True)
        tmp.columns = [c + "_" + str(i) for c in list(df.columns.values)]
        tmp_list.append(tmp)

    # `axis` must be passed by keyword: pandas 2.x rejects it positionally.
    tmp_df = pd.concat(tmp_list, axis=1).reset_index(drop=True)
    tmp_df = tmp_df.assign(LABEL=pd.Series(labels))
    tmp_df = tmp_df.assign(ID=pd.Series(ids))

    return tmp_df

In [None]:
# File names (inside `path_to_data`) of the three training folds and the test set.
fold1 = "fold1Training.json"
fold2 = "fold2Training.json"
fold3 = "fold3Training.json"
# NOTE(review): `test_data` starts out as a file name here, but the conversion
# cells below rebind it to the converted result, so this cell must be re-run
# before either of them is executed again.
test_data = 'testSet.json'

In [None]:
## create data, each 25-variable point is in 1 row (i.e., 1 id takes 60 rows)
# NOTE(review): `convert_json_data_to_csv_to_many_rows` is not defined in the
# visible part of this notebook and this cell has no execution count --
# confirm the helper still exists before running.
# NOTE(review): `test_data` is rebound from the file name to the call's result
# below, so a later cell that passes `test_data` as a file name will not
# receive a string any more.
df1 = convert_json_data_to_csv_to_many_rows(path_to_data, fold1, n_last = 60) 
df2 = convert_json_data_to_csv_to_many_rows(path_to_data, fold2, n_last = 60)
df3 = convert_json_data_to_csv_to_many_rows(path_to_data, fold3, n_last = 60)
test_data = convert_json_data_to_csv_to_many_rows(path_to_data, test_data, having_class_type=False, n_last = 60)


# Report the size of each converted result.
print('df1.shape = {}'.format(df1.shape))
print('df2.shape = {}'.format(df2.shape))
print('df3.shape = {}'.format(df3.shape))
print('test_data.shape = {}'.format(test_data.shape))

# Persist the results as CSV without the index column.
df1.to_csv("../../data/processed/fold1Training_many_rows_raw.csv", index=False)
df2.to_csv("../../data/processed/fold2Training_many_rows_raw.csv", index=False)
df3.to_csv("../../data/processed/fold3Training_many_rows_raw.csv", index=False)
test_data.to_csv("../../data/processed/testSet_many_rows_raw.csv", index=False)

# Shapes noted by the author (presumably from an earlier run of this cell):
# df1.shape = (4606380, 27)
# df2.shape = (5548860, 27)
# df3.shape = (1620360, 27)
# test_data.shape = (10410720, 27)

In [12]:
## create data, all data from an ID is in 1 row
# `test_data` names the test-set file on first use and is rebound to the
# converted frame below. Resolve the file name once, so the cell still works
# when it is re-run or when another conversion cell has already rebound
# `test_data`.
test_file = test_data if isinstance(test_data, str) else 'testSet.json'

df1 = convert_json_data_to_csv_2(path_to_data, fold1, n_last = 60)
df2 = convert_json_data_to_csv_2(path_to_data, fold2, n_last = 60)
df3 = convert_json_data_to_csv_2(path_to_data, fold3, n_last = 60)
test_data = convert_json_data_to_csv_2(path_to_data, test_file, having_class_type=False, n_last = 60)


# Report the size of each converted frame.
print('df1.shape = {}'.format(df1.shape))
print('df2.shape = {}'.format(df2.shape))
print('df3.shape = {}'.format(df3.shape))
print('test_data.shape = {}'.format(test_data.shape))

# Persist the frames as CSV without the index column.
df1.to_csv("../../data/processed/fold1Training_60.csv", index=False)
df2.to_csv("../../data/processed/fold2Training_60.csv", index=False)
df3.to_csv("../../data/processed/fold3Training_60.csv", index=False)
test_data.to_csv("../../data/processed/testSet_60.csv", index=False)


processing ../../data/raw/fold1Training.json with 76773 lines


HBox(children=(IntProgress(value=0, max=76773), HTML(value='')))


processing ../../data/raw/fold2Training.json with 92481 lines


HBox(children=(IntProgress(value=0, max=92481), HTML(value='')))


processing ../../data/raw/fold3Training.json with 27006 lines


HBox(children=(IntProgress(value=0, max=27006), HTML(value='')))


processing ../../data/raw/testSet.json with 173512 lines


HBox(children=(IntProgress(value=0, max=173512), HTML(value='')))


df1.shape = (76773, 1502)
df2.shape = (92481, 1502)
df3.shape = (27006, 1502)
test_data.shape = (173512, 1502)


In [17]:
# Label frequencies of each training fold.
c1, c2, c3 = (Counter(_fold.LABEL) for _fold in (df1, df2, df3))

# For every fold print its name, shape, label counts and the number of
# label-1 rows per 100 label-0 rows.
for _tag, _fold, _counts in (("df1", df1, c1), ("df2", df2, c2), ("df3", df3, c3)):
    print(_tag, _fold.shape, _counts, _counts[1]*100/_counts[0])

df1 (76773, 1502) Counter({0: 64222, 1: 12551}) 19.543147208121827
df2 (92481, 1502) Counter({0: 78516, 1: 13965}) 17.786183707779305
df3 (27006, 1502) Counter({0: 22236, 1: 4770}) 21.45169994603346


In [18]:
# Folds 1 and 2 are stacked into the training frame (with a fresh 0..n-1
# index); fold 3 is used as the validation frame (same object, not a copy).
train = pd.concat([df1, df2], ignore_index=True)
valid = df3

# Label frequencies of both splits, printed with the number of label-1 rows
# per 100 label-0 rows.
c_train, c_valid = Counter(train.LABEL), Counter(valid.LABEL)
for _tag, _frame, _counts in (("c_train", train, c_train), ("c_valid", valid, c_valid)):
    print(_tag, _frame.shape, _counts, _counts[1]*100/_counts[0])

c_train (169254, 1502) Counter({0: 142738, 1: 26516}) 18.57669296193025
c_valid (27006, 1502) Counter({0: 22236, 1: 4770}) 21.45169994603346
