In [30]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from json import JSONDecoder, JSONDecodeError  # for reading the JSON data files
import re  # for regular expressions
import os  # for os related operations

import lightgbm as lgb

from sklearn import svm
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score

from collections import Counter
# Relative path of the directory that holds the raw JSON data files read below.
path_to_data = "../../data/raw/"

In [24]:
# Show what is in the raw-data directory before reading anything from it.
print(os.listdir(path_to_data))

['data_', 'fold1Training.json', 'fold2Training.json', 'fold3Training.json', 'testSet.json']


In [4]:
def decode_obj(line, pos=0, decoder=JSONDecoder()):
    """
    Lazily decode the JSON values that appear one after another in a string.

    :param line: text holding zero or more JSON values separated by whitespace.
    :param pos: offset in ``line`` at which scanning starts.
    :param decoder: decoder whose ``raw_decode`` parses each value.
    :return: a generator yielding each decoded value in order. The generator
        ends when only whitespace remains, or as soon as a value cannot be
        decoded (the error is printed, nothing further is yielded).
    """
    no_white_space_regex = re.compile(r'[^\s]')
    while True:
        # Skip leading whitespace; raw_decode() expects to start on the value itself.
        match = no_white_space_regex.search(line, pos)
        if not match:
            return
        pos = match.start()
        try:
            obj, pos = decoder.raw_decode(line, pos)
        except JSONDecodeError as err:
            print('Oops! something went wrong. Error: {}'.format(err))
            # Stop here: `pos` was not advanced, so continuing would either
            # hit an unbound `obj` or re-yield the previous object forever.
            return
        yield obj

In [5]:
def get_obj_with_last_n_val(line, n):
    """
    Decode the first JSON object in ``line`` and keep only its last ``n`` rows.

    The object's 'values' entry is turned into a DataFrame whose index is cast
    to int; the rows labelled ``60 - n`` .. ``59`` are then selected.
    NOTE(review): the fixed upper bound of 60 presumably matches the number of
    time steps per series - confirm against the data description. Because of
    how ``[-n:]`` slices, ``n == 0`` selects all 60 labels rather than none.

    :param line: text containing one JSON object with 'id' and 'values' keys
        and, for labeled data, a 'classNum' key.
    :param n: how many trailing rows to keep.
    :return: a dict with 'id' and 'values' (the sliced DataFrame), plus
        'classType' when the object carries a 'classNum' key.
    :raises ValueError: if no JSON object can be decoded from ``line``.
    """
    obj = next(decode_obj(line), None)  # type:dict
    if obj is None:
        raise ValueError('No JSON object could be decoded from the given line.')

    data = pd.DataFrame.from_dict(obj['values'])  # type:pd.DataFrame
    # Cast the index to int so the rows can be selected with integer labels.
    data.index = data.index.astype(int)
    last_n_indices = np.arange(0, 60)[-n:]
    data = data.loc[last_n_indices]

    result = {'id': obj['id']}
    # Only labeled records carry 'classNum'; unlabeled ones simply omit 'classType'.
    if 'classNum' in obj:
        result['classType'] = obj['classNum']
    result['values'] = data
    return result

In [6]:
def convert_json_data_to_csv(data_dir: str, file_name: str, having_class_type=True):
    """
    Generates a dataframe by concatenating the last values of each
    multi-variate time series. This method is designed as an example
    to show how a json object can be converted into a csv file.
    :param data_dir: the path to the data directory.
    :param file_name: name of the file to be read, with the extension.
    :param having_class_type: whether each record carries a class label.
        When False, the 'LABEL' column is still created (filled with NaN) so
        the column layout is the same as for labeled data.
    :return: the generated dataframe: one row per record, the value columns
        followed by 'LABEL' and 'ID', with a fresh 0..N-1 index.
    :raises ValueError: if the file contains no records (nothing to concatenate).
    """
    fname = os.path.join(data_dir, file_name)

    all_df, labels, ids = [], [], []
    with open(fname, 'r') as infile:  # Open the file for reading
        for line in infile:  # Each 'line' is one MVTS with its single label (0 or 1).
            if not line.strip():
                continue  # tolerate blank lines, e.g. a trailing newline at end of file
            obj = get_obj_with_last_n_val(line, 1)
            all_df.append(obj['values'])
            if having_class_type:
                labels.append(obj['classType'])
            ids.append(obj['id'])

    df = pd.concat(all_df).reset_index(drop=True)
    # Unlabeled data gets an explicit NaN column instead of relying on
    # index alignment against an empty Series.
    df = df.assign(LABEL=pd.Series(labels) if having_class_type else np.nan)
    df = df.assign(ID=pd.Series(ids))
    # Uncomment if you want to save this as CSV
    # df.to_csv(file_name + '_last_vals.csv', index=False)
    return df

In [11]:
file_name = "fold3Training.json"

# One row per line of the file, built from the last time step of each series.
df = convert_json_data_to_csv(path_to_data, file_name)  # shape: 27006 X 27
print('df.shape = {}'.format(df.shape))
# print(list(df))  # uncomment to list the column names

df.shape = (27006, 27)


In [12]:
# Preview the first rows of the labeled frame.
df.head()

Unnamed: 0,TOTUSJH,TOTBSQ,TOTPOT,TOTUSJZ,ABSNJZH,SAVNCPP,USFLUX,TOTFZ,MEANPOT,EPSZ,...,TOTFY,MEANJZD,MEANALP,TOTFX,EPSY,EPSX,R_VALUE,XR_MAX,LABEL,ID
0,2279.058608,41769100000.0,6.722922e+23,41514450000000.0,298.753182,14438310000000.0,4.25366e+22,-1.714706e+25,8492.605611,-0.309116,...,-9.851233e+23,-0.063377,0.010927,5.139045e+24,0.017759,-0.092643,4.961807,9.983e-07,1,1
1,324.136602,3044442000.0,1.842963e+22,7596014000000.0,64.312903,3644793000000.0,6.458115e+21,-2.912557e+24,1274.079337,-0.720368,...,1.059818e+23,-0.255003,0.024558,9.967036e+23,-0.026213,-0.246516,2.1485,1.639e-06,0,2
2,90.928971,641875900.0,5.420498e+21,1975487000000.0,0.886584,242710200000.0,1.151176e+21,-4.688949e+23,2220.655208,-0.550062,...,-1.457803e+23,0.256968,0.001782,2.034861e+23,0.171015,-0.23871,0.0,7.8871e-07,0,3
3,173.008586,2210899000.0,2.42231e+22,3389141000000.0,10.262131,588624700000.0,2.174629e+21,-9.750141e+23,5685.685977,-0.33207,...,-1.981958e+23,0.358722,-0.006969,3.362239e+22,0.067501,-0.011451,0.0,6.3656e-07,0,4
4,56.286406,381408900.0,2.659824e+21,1210523000000.0,8.744935,455853000000.0,5.944778e+20,-2.32126e+23,2002.81302,-0.458269,...,-1.028025e+23,0.881417,0.031445,-6.666904e+22,0.202955,0.13162,2.438045,7.7562e-07,0,5


In [13]:
# Count the samples per class to see how balanced the labels are.
df.LABEL.value_counts()

0    22236
1     4770
Name: LABEL, dtype: int64

In [14]:
# Drop every row that has a missing value. This rebinds `df`, so all later
# cells work on the reduced frame.
df = df.dropna()  # shape: 26666 X 27
print('df.shape = {}'.format(df.shape))

df.shape = (26666, 27)


In [15]:
# Split by ID: rows with ID <= t go to training, the rest to validation.
# NOTE(review): `t` is two thirds of the row count *after* dropna(), but it is
# compared against ID values, which come from the source file and are not
# renumbered when rows are dropped. The result is therefore not an exact 2:1
# split of the remaining rows - confirm this is intended.
t = (2/3) * df.shape[0]
df_train = df[df['ID'] <= t]  # shape: 17556 X 27 (per the recorded output)
df_val = df[df['ID'] > t]  # shape: 9110 X 27 (per the recorded output)
print('df_train.shape = {}'.format(df_train.shape))
print('df_val.shape = {}'.format(df_val.shape))

df_train.shape = (17556, 27)
df_val.shape = (9110, 27)


In [17]:
# Separate values and labels columns.
# The positional slice `:-2` relies on 'LABEL' and 'ID' being the last two
# columns, which is the order convert_json_data_to_csv() assigns them in.
df_train_data = df_train.iloc[:, :-2]  # all columns excluding 'ID' and 'LABEL'
df_train_labels = pd.DataFrame(df_train.LABEL)  # only 'LABEL' column

df_val_data = df_val.iloc[:, :-2]  # all columns excluding 'ID' and 'LABEL'
df_val_labels = pd.DataFrame(df_val.LABEL)  # only 'LABEL' column

In [18]:
# Features stay as DataFrames; the single-column label frames are flattened
# to 1-D arrays with np.ravel.
X_train, y_train  = df_train_data,np.ravel(df_train_labels)
X_valid, y_valid  = df_val_data,np.ravel(df_val_labels)


In [36]:
# Only n_jobs is set explicitly; every other hyper-parameter is left at the
# library default. The bare `clf` echoes the resulting configuration.
clf = lgb.LGBMClassifier(n_jobs = 8)
clf

LGBMClassifier(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
        importance_type='split', learning_rate=0.1, max_depth=-1,
        min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0,
        n_estimators=100, n_jobs=8, num_leaves=31, objective=None,
        random_state=None, reg_alpha=0.0, reg_lambda=0.0, silent=True,
        subsample=1.0, subsample_for_bin=200000, subsample_freq=0)

In [37]:
# Fit on the training split only; no validation data is passed to fit().
# NOTE(review): the `verbose` keyword of fit() is not accepted by every
# LightGBM release - confirm against the installed version.
clf.fit(X=X_train, y=y_train, verbose=10)

LGBMClassifier(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
        importance_type='split', learning_rate=0.1, max_depth=-1,
        min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0,
        n_estimators=100, n_jobs=8, num_leaves=31, objective=None,
        random_state=None, reg_alpha=0.0, reg_lambda=0.0, silent=True,
        subsample=1.0, subsample_for_bin=200000, subsample_freq=0)

In [44]:
# Test the model against the validation set
pred_val = clf.predict(df_val_data)

# Evaluate the predictions
# The flattened confusion matrix is unpacked below as TN, FP, FN, TP.
scores = confusion_matrix(df_val_labels, pred_val).ravel()
tn, fp, fn, tp = scores
print('TN:{}\tFP:{}\tFN:{}\tTP:{}'.format(tn, fp, fn, tp))
# F1 is computed with average='binary'.
f1 = f1_score(df_val_labels, pred_val, average='binary', labels=[0, 1])
print('f1-score = {}'.format(f1))

TN:7385	FP:114	FN:159	TP:1452
f1-score = 0.9140698772426818


In [25]:
# Rebinds `file_name`, which previously pointed at the training fold.
file_name = "testSet.json"

# Labels are not read for the test set (having_class_type=False), so the
# LABEL column comes back as NaN, as the preview below shows.
df_test_data = convert_json_data_to_csv(path_to_data, file_name, False)  # LABEL is NaN; the recorded prediction counts total 173512 rows
df_test_data.head()

Unnamed: 0,TOTUSJH,TOTBSQ,TOTPOT,TOTUSJZ,ABSNJZH,SAVNCPP,USFLUX,TOTFZ,MEANPOT,EPSZ,...,TOTFY,MEANJZD,MEANALP,TOTFX,EPSY,EPSX,R_VALUE,XR_MAX,LABEL,ID
0,87.732673,1060253000.0,4.965436e+21,1757834000000.0,5.639933,513042400000.0,1.992729e+21,-1.02767e+24,1258.044246,-0.729846,...,5.382202e+22,-0.161822,0.00615,2.988294e+23,-0.038224,-0.212227,0.0,2.2138e-07,,1
1,86.971296,821393000.0,6.515581e+21,1736808000000.0,3.810905,153292200000.0,1.261756e+21,-6.059452e+23,2496.767332,-0.555482,...,-1.047171e+23,-0.000818,-0.005965,-2.519539e+23,0.095996,0.230971,0.0,8.9551e-08,,2
2,28.729991,244112200.0,1.753274e+21,648821800000.0,3.915116,372670800000.0,4.801788e+20,-2.015783e+23,1686.069536,-0.621787,...,-7.54867e+22,0.881503,0.019778,-5.513476e+22,0.232846,0.170068,0.0,4.3783e-07,,3
3,326.870514,3694160000.0,2.77616e+22,7087296000000.0,8.820615,916046800000.0,6.285037e+21,-3.009802e+24,2033.475382,-0.613493,...,4.302972e+23,-0.174427,-0.00296,-9.818457e+23,-0.087708,0.200131,2.698849,1.0302e-07,,4
4,84.242295,600034200.0,3.23369e+21,1625718000000.0,7.943453,590278900000.0,9.452542e+20,-3.837011e+23,1712.3244,-0.481509,...,-7.405174e+22,0.767632,0.017871,7.043063e+21,0.092928,-0.008838,0.0,7.5515e-07,,5


In [26]:
# Predict from the feature columns only (the trailing LABEL and ID columns are
# sliced off). Unlike the training frame, the test frame is not passed
# through dropna() before prediction.
pred_labels = clf.predict(df_test_data.iloc[:, :-2])
pred_labels

array([0, 0, 0, ..., 0, 0, 0])

In [39]:
# Pair each test ID with its predicted class, then show how many samples
# were predicted per class.
submission = pd.DataFrame({"Id": df_test_data.ID, "ClassLabel": list(pred_labels)})
submission.ClassLabel.value_counts()

0    159340
1     14172
Name: ClassLabel, dtype: int64

In [41]:
# Write the submission without the DataFrame index, then preview the first rows.
submission.to_csv("../../data/output/baseline.csv", index=False)
submission.head(10)

Unnamed: 0,Id,ClassLabel
0,1,0
1,2,0
2,3,0
3,4,0
4,5,0
5,6,0
6,7,0
7,8,0
8,9,0
9,10,0


In [None]:
# baseline_1 = pd.read_csv("../../data/output/baseline.csv")
# baseline_2 = pd.read_csv("../../data/output/baseline.csv")