In [1]:
import os, shutil, glob
import ujson as json
import pandas as pd
import csv
import timeit
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

from matplotlib import pyplot as plt
from matplotlib.lines import Line2D
import matplotlib.cm as cm
import matplotlib.style as style
import seaborn as sns

In [2]:
# Root folder of the raw sensor dumps.
# NOTE(review): machine-specific absolute path; the notebook only runs where this folder exists.
path = 'C:/Users/SouthSystem/Documents/Pessoal/TCC/Impl/sensors_data/'

# Full path of every immediate sub-directory of `path` (plain files are skipped).
# Presumably there is one sub-directory per participant -- confirm against the data layout.
users_path = [entry.path for entry in os.scandir(path) if entry.is_dir()]

# Screen names searched for in the `screen` column, and the string code that
# replaces each of them; the two lists are paired by position.
screens = ['Focus', 'Mathisis', 'Memoria', 'Reacton', 'Speedy']
screens_code = [str(number) for number in range(1, len(screens) + 1)]

In [3]:
# Raw listing of the data folder (files and directories alike); kept because
# other cells may still read `list_dir`.
list_dir = os.listdir(path)

# Subject IDs are the names of the user sub-directories. They are derived from
# `users_path` (directories only) instead of the raw listing so that stray files
# in the data folder are not mistaken for users, and so that `users_list[i]`
# always names the directory stored in `users_path[i]`.
users_list = [os.path.basename(os.path.normpath(user_dir)) for user_dir in users_path]

In [149]:
def convert_to_csv(signal, path, users):
    """Flatten each user's JSON sensor dumps into one CSV per user.

    For every entry of ``users`` the matching directory is looked up in the
    module-level ``users_path`` list by directory name. Every ``*.json`` file in
    that directory is parsed, the records stored under the key ``path`` are
    normalised into rows, and only rows whose ``screen`` value contains one of
    the names in the module-level ``screens`` list are kept. The result is
    written to ``data/<signal>/<prefix>_<signal>.csv``, where ``<prefix>`` is the
    part of the last processed file name before the first underscore.

    :param signal: short signal name used in column names and output paths
        (called with 'accel' and 'gyro' in this notebook).
    :param path: key of the record list inside each JSON document
        (called with 'accelerometer' and 'gyroscope'). Note that this
        parameter shadows the module-level ``path`` inside the function.
    :param users: iterable of user directory names to process.
    :return: None. CSV files are written as a side effect.
    """
    raw_columns = ['x', 'y', 'z', 'screen', 'player_id', 'timestamp']
    renamed_axes = {axis: f'{axis}_{signal}' for axis in ('x', 'y', 'z')}
    screen_pattern = '|'.join(screens)

    # Resolve users by name rather than by list position, so that a subset or a
    # differently ordered `users` list still reads the right directories.
    dir_by_user = {os.path.basename(os.path.normpath(user_dir)): user_dir
                   for user_dir in users_path}

    for users_processed, user in enumerate(users, start=1):
        print('Progress: {}/{} users processed'.format(users_processed, len(users)))

        user_dir = dir_by_user.get(user)
        if user_dir is None:
            print('Skipping {}: no matching directory in users_path'.format(user))
            continue

        json_files = [name for name in os.listdir(user_dir) if name.endswith('.json')]
        if not json_files:
            # Without this guard the output name would be taken from the previous
            # user's files and that user's CSV would be overwritten.
            print('Skipping {}: no .json files found'.format(user))
            continue

        frames = []
        for file in json_files:
            # File names are expected to look like "<prefix>_<timestamp>.json".
            arr = file.replace('.json', '').split('_')

            with open(os.path.join(user_dir, file), 'r') as f:
                data = json.loads(f.read())

            df = pd.json_normalize(data, record_path=[path], meta=['player_id'])
            df['timestamp'] = arr[1]
            frames.append(df)

        # One concat per user instead of DataFrame.append per file: append was
        # removed in pandas 2.0 and copied the accumulated frame on every call.
        data_signal = pd.concat(frames, ignore_index=True).reindex(columns=raw_columns)

        # Rename by column name so the result does not depend on column order.
        new_df = data_signal.rename(columns=renamed_axes)

        # astype(str) keeps the mask strictly boolean even if `screen` has gaps.
        filter_values = new_df['screen'].astype(str).str.contains(screen_pattern, regex=True)
        new_df_filter = new_df[filter_values]

        saving_directory = f'data/{signal}/{arr[0]}_{signal}.csv'
        os.makedirs(os.path.dirname(saving_directory), exist_ok=True)
        new_df_filter.to_csv(saving_directory, index=False)

In [None]:
# Export the 'accelerometer' records of every entry in users_list to
# data/accel/<prefix>_accel.csv (one file per user).
convert_to_csv(signal='accel', path='accelerometer', users=users_list)

In [None]:
# Export the 'gyroscope' records of every entry in users_list to
# data/gyro/<prefix>_gyro.csv (one file per user).
convert_to_csv(signal='gyro', path='gyroscope', users=users_list)

In [106]:
def statistical_feature_extraction(window_size, signal, axis, subject_ID, sampling_rate=20):
    """Compute per-window statistics of one axis of one signal for one subject.

    Reads ``data/<signal>/<subject_ID>_<signal>.csv``, replaces every screen name
    from the module-level ``screens`` list by the matching entry of
    ``screens_code``, and then, separately for each screen code, cuts the column
    ``<axis>_<signal>`` into consecutive non-overlapping windows (the last window
    of a screen may be shorter). For every window it stores the mean, min, max,
    standard deviation, median, variance, zero-crossing count and mean-crossing
    count, plus the screen code in a ``Screen_ID`` column. The table is written to
    ``feature_label_tables/feature_<signal>/feature_<subject_ID>_<axis>_<signal>.csv``.

    :param window_size: window length; multiplied by ``sampling_rate`` to get the
        number of samples per window.
    :param signal: signal name used in file and column names ('accel' or 'gyro'
        in this notebook).
    :param axis: axis prefix of the column to process ('x', 'y' or 'z').
    :param subject_ID: subject identifier used in the input and output file names.
    :param sampling_rate: samples per unit of ``window_size``; defaults to 20,
        the value that used to be hard-coded.
    :raises ValueError: if the window would contain no samples.
    :raises FileNotFoundError: if the subject's CSV does not exist.
    :return: None. The feature table is written to disk and progress is printed.
    """
    start_running = timeit.default_timer()

    samples_per_window = int(sampling_rate * window_size)
    if samples_per_window <= 0:
        raise ValueError('window_size * sampling_rate must be at least one sample.')

    directory = f'data/{signal}/{subject_ID}_{signal}.csv'
    try:
        raw_signal = pd.read_csv(directory)
    except FileNotFoundError:
        print('Error! Can not find such directory.')
        raise

    # Replace screen names by their codes. astype(str) keeps the mask boolean
    # even when the column has gaps or the file holds no rows.
    raw_signal['screen'] = raw_signal['screen'].astype(str)
    for screen_name, screen_code in zip(screens, screens_code):
        raw_signal.loc[raw_signal['screen'].str.contains(screen_name), 'screen'] = screen_code

    column_title = f'{axis}_{signal}'
    statistic_names = ['mean', 'min', 'max', 'std', 'median', 'variance',
                       'zero_crossing', 'mean_crossing']
    feature_columns = [f'{column_title}_{name}' for name in statistic_names] + ['Screen_ID']

    records = []
    total_win_count = 0

    for screen_ID in screens_code:
        one_activity = raw_signal.loc[raw_signal['screen'] == screen_ID, column_title]
        one_activity = one_activity.reset_index(drop=True)

        win_count = 0
        for start in range(0, len(one_activity), samples_per_window):
            # A 1-D Series (not a one-column frame) is handed to the crossing
            # helpers so that differences are taken along the time axis.
            window = one_activity.iloc[start:start + samples_per_window]
            records.append({
                f'{column_title}_mean': window.mean(),
                f'{column_title}_min': window.min(),
                f'{column_title}_max': window.max(),
                f'{column_title}_std': window.std(),
                f'{column_title}_median': window.median(),
                f'{column_title}_variance': window.var(),
                f'{column_title}_zero_crossing': zero_crossing(window),
                f'{column_title}_mean_crossing': mean_crossing_rate(window),
                # Label column read back as 'Screen_ID' by the merge/split steps.
                'Screen_ID': screen_ID,
            })
            win_count += 1

        print('Window count', win_count)
        total_win_count += win_count

    # Built once from all records; yields an empty table with the right header
    # when no window was produced.
    features = pd.DataFrame(records, columns=feature_columns)

    save_as_directory = f'feature_label_tables/feature_{signal}/feature_{subject_ID}_{axis}_{signal}.csv'
    os.makedirs(os.path.dirname(save_as_directory), exist_ok=True)
    features.to_csv(save_as_directory, encoding='utf-8', index=False)

    finish_running = timeit.default_timer()
    print('Total number of windows: ', total_win_count)
    print('Running time: ', finish_running - start_running)
    
def feature_extraction_for_all_subjects():
    """Run statistical_feature_extraction for every signal, axis and subject.

    Signals ('accel', 'gyro') form the outermost loop, axes ('x', 'y', 'z') the
    middle one and the entries of the module-level ``users_list`` the innermost.
    Each call uses ``window_size=10``.
    """
    jobs = [
        (signal, axis, subject_ID)
        for signal in ('accel', 'gyro')
        for axis in ('x', 'y', 'z')
        for subject_ID in users_list
    ]

    for signal, axis, subject_ID in jobs:
        print('calculating: ', signal, axis, subject_ID)
        print('==============================================')
        statistical_feature_extraction(
            window_size=10,
            signal=signal,
            axis=axis,
            subject_ID=subject_ID,
        )


# feature_extraction_for_all_subjects()

In [6]:
def combine_all_data():
    """Join every subject's accel and gyro CSVs into ``data/data_all.csv``.

    For each entry of the module-level ``users_list`` the files
    ``data/accel/<id>_accel.csv`` and ``data/gyro/<id>_gyro.csv`` are read and
    screen names are replaced by their codes (``screens`` -> ``screens_code``).
    For every timestamp present in the accel file, the accel rows and the gyro
    rows with that timestamp are placed side by side by row position; the
    ``screen`` and ``player_id`` columns are taken from the accel side. Rows with
    any missing value (for example when one sensor has more samples than the
    other for a timestamp) are dropped.

    :return: None. The combined table is written to ``data/data_all.csv``.
    """
    all_columns = ['x_accel', 'y_accel', 'z_accel', 'x_gyro', 'y_gyro', 'z_gyro',
                   'screen', 'player_id', 'timestamp']
    accel_columns = ['x_accel', 'y_accel', 'z_accel', 'screen', 'player_id']
    gyro_columns = ['x_gyro', 'y_gyro', 'z_gyro', 'screen', 'player_id']

    subject_frames = []
    for subject_ID in users_list:
        print(subject_ID)
        df_accel = pd.read_csv(f'data/accel/{subject_ID}_accel.csv')
        df_gyro = pd.read_csv(f'data/gyro/{subject_ID}_gyro.csv')

        for screen_name, screen_code in zip(screens, screens_code):
            df_accel.loc[df_accel.screen.str.contains(screen_name), 'screen'] = screen_code
            df_gyro.loc[df_gyro.screen.str.contains(screen_name), 'screen'] = screen_code

        time_dfs = []
        for time in df_accel.timestamp.unique():
            sub_df1 = df_accel.loc[df_accel.timestamp == time, accel_columns].reset_index(drop=True)
            sub_df2 = df_gyro.loc[df_gyro.timestamp == time, gyro_columns].reset_index(drop=True)
            concat_df = pd.concat([sub_df1, sub_df2], axis=1)
            concat_df["timestamp"] = time
            time_dfs.append(concat_df)

        if not time_dfs:
            # pd.concat raises on an empty list; a subject without rows adds nothing.
            continue

        df = pd.concat(time_dfs).reset_index(drop=True)
        # Keep the first 'screen' / 'player_id' pair (the accel one).
        df = df.loc[:, ~df.columns.duplicated()]
        df = df.dropna()
        if not df.empty:
            subject_frames.append(df)

    # Single concat instead of DataFrame.append per subject: append was removed
    # in pandas 2.0 and copied the whole accumulated table each time.
    if subject_frames:
        data_signal = pd.concat(subject_frames, ignore_index=True).reindex(columns=all_columns)
    else:
        data_signal = pd.DataFrame(columns=all_columns)

    save_as_directory = 'data/data_all.csv'
    data_signal.to_csv(save_as_directory, encoding='utf-8', index=False)


In [None]:
# Build data/data_all.csv from the per-user accel and gyro CSVs under data/.
combine_all_data()

In [125]:
def merge_features(signal, subject_ID):
    """Merge the x, y and z feature tables of one subject into a single CSV.

    Reads ``feature_<subject_ID>_<axis>_<signal>.csv`` for the three axes from
    ``feature_label_tables/feature_<signal>/``, places them side by side, keeps
    only the first of any repeated column name (each axis file carries its own
    ``Screen_ID``) and moves ``Screen_ID`` to the last column. The result is
    written next to the inputs as ``feature_<subject_ID>_all_axis_<signal>.csv``.

    :param signal: signal name used in the folder and file names.
    :param subject_ID: subject identifier used in the file names.
    :raises KeyError: if no ``Screen_ID`` column is present.
    :return: None. The merged table is written to disk.
    """
    file_prefix = f'feature_label_tables/feature_{signal}/feature_{subject_ID}'

    axis_tables = [pd.read_csv(f'{file_prefix}_{axis}_{signal}.csv') for axis in ('x', 'y', 'z')]

    # `axis` must be passed by keyword: positional arguments after `objs` are
    # rejected by pd.concat from pandas 2.0 on.
    df = pd.concat(axis_tables, axis=1)
    df = df.loc[:, ~df.columns.duplicated()]

    # Put the label last without mutating a sliced frame.
    feature_names = [name for name in df.columns if name != 'Screen_ID']
    merged_df = df[feature_names + ['Screen_ID']]

    save_as_directory = f'{file_prefix}_all_axis_{signal}.csv'
    merged_df.to_csv(save_as_directory, encoding='utf-8', index=False)
    
def merge_features_for_all_subjects():
    """Call merge_features for every signal and every subject.

    All entries of the module-level ``users_list`` are processed for 'accel'
    first, then for 'gyro'.
    """
    pairs = [
        (signal, subject_ID)
        for signal in ('accel', 'gyro')
        for subject_ID in users_list
    ]
    for signal, subject_ID in pairs:
        merge_features(signal, subject_ID)

In [139]:
def input_features_labels(signal, subject_ID, test_size=0.2, random_state=None):
    """Load one subject's merged feature table and return scaled train/test splits.

    Reads ``feature_label_tables/feature_<signal>/feature_<subject_ID>_all_axis_<signal>.csv``,
    drops rows with missing values, removes the zero-crossing and mean-crossing
    columns of all three axes, and uses ``Screen_ID`` as the label. The features
    are split with shuffling and standardised with a ``StandardScaler`` fitted on
    the training part only.

    :param signal: signal name used in file and column names.
    :param subject_ID: subject identifier used in the file name.
    :param test_size: fraction (or count) of rows held out for testing; the
        default 0.2 is the value that used to be hard-coded.
    :param random_state: seed forwarded to ``train_test_split``; pass an int to
        make the split reproducible. ``None`` (default) keeps the previous
        unseeded behaviour.
    :return: tuple ``(normalized_feature_train, normalized_feature_test,
        label_train, label_test, normalized_all_feature, all_labels)``.
        The three feature frames are rebuilt from arrays, so they carry integer
        column labels and a fresh 0..n-1 index, while the label frames keep the
        index of the loaded table. Align them by position, not by index.
    """
    directory = f'feature_label_tables/feature_{signal}/feature_{subject_ID}_all_axis_{signal}.csv'
    data = pd.read_csv(directory).dropna()

    crossing_columns = [
        f'{axis}_{signal}_{kind}'
        for axis in ('x', 'y', 'z')
        for kind in ('zero_crossing', 'mean_crossing')
    ]
    features = data.drop(columns=crossing_columns + ['Screen_ID'])
    all_labels = data[['Screen_ID']]

    feature_train, feature_test, label_train, label_test = train_test_split(
        features, all_labels, test_size=test_size, shuffle=True, random_state=random_state)

    # Fit on the training part only so the test part does not leak into the
    # normalisation statistics.
    scaler = StandardScaler().fit(feature_train)

    # StandardScaler returns numpy arrays; wrap them back into DataFrames.
    normalized_feature_train = pd.DataFrame(scaler.transform(feature_train))
    normalized_feature_test = pd.DataFrame(scaler.transform(feature_test))
    normalized_all_feature = pd.DataFrame(scaler.transform(features))

    return (normalized_feature_train, normalized_feature_test, label_train, label_test,
            normalized_all_feature, all_labels)


In [None]:
# Build the per-axis feature tables, then merge them per subject and signal.
# NOTE(review): the extraction step calls zero_crossing and mean_crossing_rate,
# which are defined in a later cell of this notebook; that cell has to be run
# before this one.
feature_extraction_for_all_subjects()
merge_features_for_all_subjects()

In [None]:
# Raw sensor columns to plot, and the short names used for the panel titles
# (paired by position).
FEATURE_COLS = [
    "x_accel",
    "y_accel",
    "z_accel",
    "x_gyro",
    "y_gyro",
    "z_gyro"
]

RENAME_COLS = [
    "acc_x",
    "acc_y",
    "acc_z",
    "gyr_x",
    "gyr_y",
    "gyr_z",
]

# No cell above creates `data_all`; load the table written by combine_all_data()
# unless an earlier cell already defined it, so the notebook runs on a fresh kernel.
if 'data_all' not in globals():
    data_all = pd.read_csv('data/data_all.csv')

# Work on a copy so renaming columns never touches `data_all`.
df_temp = data_all[FEATURE_COLS].copy()
# Titles are wrapped in $...$ so matplotlib renders them as math text.
df_temp.columns = ["$" + c.capitalize() + "$" for c in RENAME_COLS]

# 3 rows (x, y, z) by 2 columns (accelerometer, gyroscope); axes are shared per column.
f, axes = plt.subplots(3, 2, sharex="col", sharey="col", dpi=300, figsize=(8, 2))
f.subplots_adjust(hspace=1.2, wspace=0.2)
# plt.get_cmap works on every Matplotlib release; cm.get_cmap was removed in 3.9.
cmap = plt.get_cmap("tab10")

for i, col in enumerate(df_temp.columns):
    # The first three columns fill the left plot column, the last three the right one.
    plot_column, plot_row = divmod(i, 3)

    # NOTE(review): distplot is deprecated in newer seaborn releases; kept so the
    # rendered histograms (and their binning) stay unchanged.
    g = sns.distplot(
        df_temp[col],
        kde=False,
        ax=axes[plot_row][plot_column],
        color=cmap(plot_column),
        hist_kws=dict(alpha=1),
    )
    g.set_title(f"{col}")
    g.set_yscale("log")
    g.axes.set_xlabel("")

In [145]:
# Scaled train/test split of the merged 'accel' features of subject '06mdn3c';
# the last two return values (all features, all labels) are discarded.
feature_train, feature_test, label_train, label_test, _, _ = input_features_labels('accel', '06mdn3c')

In [56]:
def zero_crossing(window):
    """
    Count how many times the signal changes sign inside a window.

    Samples equal to zero are treated as negative. Differences are taken along
    the first (time) axis, so a 1-D sequence and a single-column DataFrame give
    the same result; for several columns the per-column counts are added up.
    A NaN sample counts as a change on both of its sides.

    :param window: specific window of the raw dataset (1-D sequence, Series, or
        DataFrame with samples as rows) to calculate the zero crossings for.
    :return: an integer, the number of sign changes between consecutive samples
    """
    signs = np.sign(np.atleast_1d(np.asarray(window, dtype=float)))
    signs[signs == 0] = -1
    # axis=0 is essential: np.diff defaults to the last axis, which for an
    # (n, 1) window has a single element and therefore never shows a change.
    return int(np.count_nonzero(np.diff(signs, axis=0)))

def mean_crossing_rate(window):
    """
    Count how many times the signal crosses its own mean inside a window.

    The window mean (ignoring NaN) is subtracted from every sample and the
    number of sign changes between consecutive samples is counted. Samples equal
    to the mean are treated as lying below it. Differences are taken along the
    first (time) axis; for several columns each column is centred on its own
    mean and the per-column counts are added up.

    :param window: specific window of the raw dataset (1-D sequence, Series, or
        DataFrame with samples as rows) to calculate the mean crossings for.
    :return: an integer, the number of times consecutive samples lie on
        opposite sides of the window mean
    """
    values = np.atleast_1d(np.asarray(window, dtype=float))
    if values.shape[0] == 0:
        # Nothing to centre or compare in an empty window.
        return 0

    signs = np.sign(values - np.nanmean(values, axis=0))
    signs[signs == 0] = -1
    # Count sign changes, not samples: the earlier loop counted every sample
    # that differed from the mean, which is close to the window length.
    return int(np.count_nonzero(np.diff(signs, axis=0)))