In [11]:
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
from collections import Counter
import datetime
import sys
import logging

In [12]:
def init_database():
    """Load the persisted tester and sensor tables into the module globals.

    Creates ``../../data/database`` when it does not exist yet. If
    ``tester_info.csv`` is present there it is read into the global
    ``all_tester_info``; if ``sensor_data.csv`` is present it is read into the
    global ``all_sensor_df``. A global whose file is absent is left untouched.
    """
    global all_tester_info, all_sensor_df

    database_dir = '../../data/database'
    # makedirs (rather than mkdir) so a missing '../../data' parent does not
    # abort the very first run.
    if not os.path.isdir(database_dir):
        os.makedirs(database_dir)

    files = os.listdir(database_dir)
    # DataFrame.from_csv was removed in pandas 1.0. read_csv with index_col=0
    # and parse_dates=True reproduces the defaults from_csv used.
    if 'tester_info.csv' in files:
        all_tester_info = pd.read_csv(
            os.path.join(database_dir, 'tester_info.csv'), index_col=0, parse_dates=True)
    if 'sensor_data.csv' in files:
        all_sensor_df = pd.read_csv(
            os.path.join(database_dir, 'sensor_data.csv'), index_col=0, parse_dates=True)

In [13]:
# SENSORTYPE codes kept when measuring sampling rates (see count_fequency).
useful_sensor = [1, 2, 3, 4, 11, 26, 17, 9, 10]
# Minimum number of sensor rows that must fall inside a tag's time window
# for main() to accept that tag.
min_samples = 100
# In-memory copies of the CSV "database". They stay None until
# init_database() finds the matching file or a save_* function creates it.
all_tester_info = None
all_sensor_df = None
# Must run after the two globals above are defined: it only overwrites them
# when the corresponding CSV file already exists.
init_database()

In [14]:
def count_fequency(df, sensors=None):
    """Print and return the per-second sample rate of each sensor type.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide ``SENSORTYPE`` and a numeric ``TIMESTAMP`` column; the
        time span is divided by 1000, i.e. TIMESTAMP is treated as milliseconds.
    sensors : iterable, optional
        SENSORTYPE values to keep. Defaults to the module-level
        ``useful_sensor`` list.

    Returns
    -------
    collections.Counter
        Maps each kept sensor type to samples per second. When the kept rows
        span no time (single row, identical timestamps, or no rows at all) the
        rate is undefined, so the raw sample counts are returned instead of
        raising ZeroDivisionError.
    """
    if sensors is None:
        sensors = useful_sensor

    # only select the useful information
    df = df[df.SENSORTYPE.isin(sensors)]
    rates = Counter(df.SENSORTYPE)
    duration_s = float(df.TIMESTAMP.max() - df.TIMESTAMP.min()) / 1000.0

    print ("Count per 1 second")
    # `duration_s > 0` is also False for NaN, which is what an empty frame yields.
    if duration_s > 0:
        for sensor_type in rates.keys():
            rates[sensor_type] = rates[sensor_type] / duration_s
    else:
        print("Recording spans no time; showing raw sample counts instead")

    print(rates)
    return rates

In [15]:
def get_new_tester_id():
    """Return the next unused tester id as an int.

    The id is one past the largest index label of the global
    ``all_tester_info`` table, or 0 while that table has not been loaded.
    """
    if all_tester_info is not None:
        next_id = all_tester_info.index.max() + 1
        return int(next_id)
    return int(0)

In [16]:
def save_user_info_into_database(tag_df):
    """Append one tester record derived from ``tag_df`` to the tester CSV.

    The record is built from the first row of ``tag_df``, taking every column
    from the third one onward (presumably the first two are TimeStamp and
    TagName; confirm against the tag file layout). ``start_time`` is the
    earliest TimeStamp and ``end_time`` the latest TimeStamp plus 9 seconds.

    Side effects: rewrites ``../../data/database/tester_info.csv`` and rebinds
    the global ``all_tester_info``.
    """
    global all_tester_info

    tester_info_path = '../../data/database/tester_info.csv'

    cur_user_info = pd.DataFrame([tag_df.iloc[0].tolist()[2:]], columns=tag_df.columns[2:].values.tolist(), index=[0])
    cur_user_info['start_time'] = pd.Series([tag_df.TimeStamp.min()])
    # NOTE(review): the 9 second padding is not explained anywhere visible.
    cur_user_info['end_time'] = pd.Series([tag_df.TimeStamp.max() + pd.Timedelta('9 seconds')])

    if all_tester_info is None:
        # Write, then read back, so the in-memory table has exactly the shape
        # a later init_database() call would load.
        cur_user_info.to_csv(tester_info_path)
        # DataFrame.from_csv was removed in pandas 1.0; these read_csv
        # arguments reproduce its defaults.
        all_tester_info = pd.read_csv(tester_info_path, index_col=0, parse_dates=True)
    else:
        # DataFrame.append was removed in pandas 2.0; concat is the replacement.
        all_tester_info = pd.concat([all_tester_info, cur_user_info], ignore_index=True)
        all_tester_info.to_csv(tester_info_path)


In [17]:
def save_sensor_data_into_database(sensor_df):
    """Append the tagged rows of ``sensor_df`` to the sensor CSV.

    Only rows whose ``TagName`` is not null are kept.

    Side effects: rewrites ``../../data/database/sensor_data.csv``, rebinds the
    global ``all_sensor_df`` and prints ``saved``.
    """
    global all_sensor_df

    sensor_data_path = '../../data/database/sensor_data.csv'

    cur_sensor_df = sensor_df[sensor_df.TagName.notnull()]

    if all_sensor_df is None:
        # Write, then read back, so the in-memory table has exactly the shape
        # a later init_database() call would load.
        cur_sensor_df.to_csv(sensor_data_path)
        # DataFrame.from_csv was removed in pandas 1.0; these read_csv
        # arguments reproduce its defaults.
        all_sensor_df = pd.read_csv(sensor_data_path, index_col=0, parse_dates=True)
    else:
        # DataFrame.append was removed in pandas 2.0; concat is the replacement.
        all_sensor_df = pd.concat([all_sensor_df, cur_sensor_df], ignore_index=True)
        all_sensor_df.to_csv(sensor_data_path)

    print("saved")

In [18]:
def read_tag_data(dir):
    """Read a tag CSV and make its ``TimeStamp`` column Asia/Singapore-aware.

    Parameters
    ----------
    dir : str
        Path of the tag file. (The name shadows the ``dir`` builtin; it is
        kept because it is part of the call signature.)

    Returns
    -------
    pandas.DataFrame
        The file contents with ``TimeStamp`` parsed as wall-clock time and
        localised to Asia/Singapore.
    """
    tag_data = pd.read_csv(dir, skipinitialspace=True)
    # The original passed utc='True' (a string) and then called tz_localize.
    # Current pandas returns tz-aware values for a truthy `utc`, so the
    # localisation raised TypeError. Parsing naive and localising via .dt
    # works on every pandas version.
    # NOTE(review): assumes the file stores timestamps without a UTC offset.
    tag_data['TimeStamp'] = pd.to_datetime(tag_data['TimeStamp']).dt.tz_localize('Asia/Singapore')
    return tag_data

def read_sensor_data(dir):
    """Read a sensor CSV and convert ``TIMESTAMP`` to Asia/Singapore time.

    A file whose first line starts with ``"Found sensor 0 "`` carries a
    preamble, so its first 13 lines are skipped before parsing. The raw
    sampling rates are printed through ``count_fequency`` before the
    timestamps are converted.

    Parameters
    ----------
    dir : str
        Path of the sensor file. (The name shadows the ``dir`` builtin; it is
        kept because it is part of the call signature.)

    Returns
    -------
    pandas.DataFrame
        The file contents with ``TIMESTAMP`` converted from epoch
        milliseconds to tz-aware Asia/Singapore datetimes.
    """
    with open(dir, 'r') as f:
        has_preamble = f.readline().startswith("Found sensor 0 ")

    if has_preamble:
        sensor_data = pd.read_csv(dir, skiprows=13, skipinitialspace=True)
    else:
        sensor_data = pd.read_csv(dir, skipinitialspace=True)

    # Must run while TIMESTAMP is still numeric milliseconds.
    count_fequency(sensor_data)

    # The original passed utc='True' (a string) and then tz_localize('utc'),
    # which raises TypeError on current pandas because the values are already
    # tz-aware. Going epoch-ms -> naive -> UTC -> Singapore is version-independent.
    sensor_data['TIMESTAMP'] = (
        pd.to_datetime(sensor_data['TIMESTAMP'], unit='ms')
        .dt.tz_localize('UTC')
        .dt.tz_convert('Asia/Singapore')
    )
    return sensor_data

In [19]:
def time_difference(sensor_data, tag_data):
    """Return the mean phone-minus-watch clock offset from calibration events.

    Calibration events are the ``sensor_data`` rows with ``SENSORTYPE == -1``
    and the ``tag_data`` rows with ``TagName == 'TIME_CALIBRATION'``. The n-th
    watch event is paired with the n-th phone event.

    Returns
    -------
    pandas.Timedelta or None
        Mean of ``TimeStamp - TIMESTAMP`` over the pairs, or ``None`` (after
        printing a diagnostic) when the watch has no calibration rows or the
        two sides recorded a different number of events.
    """
    sensor_time_calibration = sensor_data[sensor_data.SENSORTYPE == -1]
    tag_time_calibration = tag_data[tag_data.TagName == 'TIME_CALIBRATION']
    if len(sensor_time_calibration) == 0:
        print("No time calibration on Watch! ********")
        return None
    if len(sensor_time_calibration) != len(tag_time_calibration):
        print('The number of times of time calibration on watch and phone is different! ********')
        return None

    # Series subtraction aligns on index labels. The two selections come from
    # unrelated frames, so their labels only match by accident; resetting the
    # index pairs the events by position instead of silently dropping pairs.
    phone_times = tag_time_calibration.TimeStamp.reset_index(drop=True)
    watch_times = sensor_time_calibration.TIMESTAMP.reset_index(drop=True)
    return (phone_times - watch_times).mean()


In [38]:
def check_tag_sequence(tags):
    """Tell whether the action tags begin with '1'..'9','0' in that order.

    Marker tags (ACTION_FINISH, TIME_CALIBRATION, wear_start, wear_stop) are
    ignored. Anything after the first ten action tags is not inspected.
    Returns False when fewer than ten action tags remain.
    """
    marker_tags = ('ACTION_FINISH', 'TIME_CALIBRATION', 'wear_start', 'wear_stop')
    expected = ['1', '2', '3', '4', '5', '6', '7', '8', '9', '0']

    action_tags = [tag for tag in tags if tag not in marker_tags]
    # A slice shorter than `expected` can never compare equal, which covers
    # the "not enough tags" case as well.
    return action_tags[:len(expected)] == expected

In [33]:
def find_last_divide_sign(s):
    """Return the index of the last '/' in ``s``, or None if there is none."""
    # Walk backwards so the first hit is already the last occurrence.
    for pos in range(len(s) - 1, -1, -1):
        if s[pos] == '/':
            return pos
    return None

In [34]:
def move_to_processed_file_folder(directory):
    """Move the file at ``directory`` into the processed-raw-data folder.

    Parameters
    ----------
    directory : str
        Path of the file to move; only its final component is kept as the
        name inside ``../../data/raw_data/processed_raw_data/``.
    """
    processed_dir = "../../data/raw_data/processed_raw_data/"
    # basename also copes with a bare file name; the previous manual search
    # for the last '/' returned None there and crashed on `None + 1`.
    file_name = os.path.basename(directory)
    # os.rename does not create missing destination folders.
    if not os.path.isdir(processed_dir):
        os.makedirs(processed_dir)
    os.rename(directory, os.path.join(processed_dir, file_name))

In [35]:
def consecutive_repeated_tag(tags):
    """Return positions of action tags that are immediately repeated.

    Marker tags (ACTION_FINISH, TIME_CALIBRATION, wear_start, wear_stop) are
    skipped and do not break a run. For each action tag equal to the previous
    action tag, the position of that *previous* occurrence is reported, so
    that only the latest tag of a run is kept by the caller.
    """
    marker_tags = ['ACTION_FINISH', 'TIME_CALIBRATION', 'wear_start', 'wear_stop']
    superseded = []
    # Sentinels for "no action tag seen yet".
    last_tag, last_pos = "None", -99
    for pos, tag in enumerate(tags):
        if tag in marker_tags:
            continue
        if tag == last_tag:
            superseded.append(last_pos)
        last_tag, last_pos = tag, pos
    return superseded

In [39]:
def main(sensordata_dir, tagdata_dir):
    """Label one sensor recording with its tags and store it in the database.

    Steps:
      1. Read both files; on any read/parse error print a message and stop.
      2. Shift the watch timestamps by the phone/watch clock offset.
      3. For every tester, drop superseded repeated tags, validate the tag
         order and label the sensor rows that fall inside each tag's window
         (3 seconds after the tag, lasting at most 5 seconds, cut short by
         the next tag).
      4. Save the labelled rows and the tester record, then move both raw
         files to the processed folder.

    Any validation failure prints a diagnostic and returns without saving or
    moving anything.

    Parameters
    ----------
    sensordata_dir : str
        Path of the SENSORDATA file.
    tagdata_dir : str
        Path of the matching Tags file.

    NOTE(review): ``get_new_tester_id()`` is called per tester, but the tester
    table is only extended after the loop (and with a single row), so every
    tester in one file currently receives the same id. Confirm whether files
    can contain more than one tester.
    NOTE(review): a wear_start presence/uniqueness check used to exist at the
    top of the per-tester loop but was commented out.
    """
    sensor_file_name = os.path.basename(sensordata_dir)
    print("Processing file " + sensor_file_name)
    try:
        sensor_data = read_sensor_data(sensordata_dir)
        tag_data = read_tag_data(tagdata_dir)
    except Exception as err:
        # Was a bare `except:`, which also swallowed KeyboardInterrupt and
        # hid the reason for the failure.
        print("Processing file " + sensor_file_name + " ERROR")
        print("Reason: %r" % (err,))
        return

    # The clock offset belongs to the recording, not to a tester. It used to
    # be applied inside the per-tester loop, which shifted the timestamps
    # once per tester.
    time_different_between_wear_phone = time_difference(sensor_data, tag_data)
    if time_different_between_wear_phone is None:
        return
    print("time_different_between_wear_phone " + str(time_different_between_wear_phone))
    sensor_data.TIMESTAMP = sensor_data.TIMESTAMP + time_different_between_wear_phone

    marker_tags = ['ACTION_FINISH', 'TIME_CALIBRATION', 'wear_start', 'wear_stop']
    user_groups = list(set(tag_data['Tester_Name'].values.tolist()))

    for user in user_groups:
        cur_user_tag_df = tag_data[tag_data['Tester_Name'] == user]
        cur_user_id = get_new_tester_id()

        # remove the consecutive repeated tag
        # only the latest tag will be used
        skip_tag_idx = set(consecutive_repeated_tag(cur_user_tag_df.TagName.tolist()))
        useful_tags_idx = [i for i in range(len(cur_user_tag_df)) if i not in skip_tag_idx]
        cur_user_tag_df = cur_user_tag_df.iloc[useful_tags_idx]
        tags = cur_user_tag_df.TagName.tolist()

        if not check_tag_sequence(tags):
            print("Tag Sequence Wrong *********")
            return

        for i, cur_tag in enumerate(tags):
            if cur_tag in marker_tags:
                continue

            # Skip the first 3 seconds after the tag was pressed.
            cur_tag_start_time = cur_user_tag_df.iloc[i].TimeStamp + pd.Timedelta('3 seconds')
            longest_end_time = cur_tag_start_time + pd.Timedelta('5 seconds')

            if i == len(tags) - 1:
                # last tag: nothing follows, so use the full window
                cur_tag_end_time = longest_end_time
            else:
                # the window ends at the next tag, capped at 5 seconds
                cur_tag_end_time = cur_user_tag_df.iloc[i + 1].TimeStamp
                if cur_tag_end_time - cur_tag_start_time > pd.Timedelta('5 seconds'):
                    cur_tag_end_time = longest_end_time

            # Build the row mask once instead of recomputing it per use.
            in_window = (sensor_data.TIMESTAMP < cur_tag_end_time) & (sensor_data.TIMESTAMP > cur_tag_start_time)
            samples_in_window = int(in_window.sum())

            if samples_in_window < min_samples:
                print("Something wrong with tag " + cur_tag + " for user " + str(user) + " id " + str(cur_user_id) + "**********")
                print("Number of samples is %d Less than threshold %d" % (samples_in_window, min_samples))
                return

            sensor_data.loc[in_window, 'TagName'] = cur_tag
            sensor_data.loc[in_window, 'tester_id'] = cur_user_id

    save_sensor_data_into_database(sensor_data)
    save_user_info_into_database(tag_data)

    move_to_processed_file_folder(sensordata_dir)
    move_to_processed_file_folder(tagdata_dir)


In [41]:
# Process every tag file that has a sensor file with the same suffix.
all_files = os.listdir('../../data/raw_data/')
tag_files = [i for i in all_files if i.startswith('Tags_')]
for tag_file in tag_files:
    # This variable used to be called `datetime`, which rebound the `datetime`
    # module imported at the top of the notebook for every later cell.
    recording_suffix = tag_file[len('Tags_'):]
    sensor_file = 'SENSORDATA_' + recording_suffix
    if sensor_file in all_files:
        main('../../data/raw_data/' + sensor_file, '../../data/raw_data/' + tag_file)


Processing file SENSORDATA_20180108-144348.txt
Count per 1 second
Counter({3: 188.0767015956072, 11: 188.07216733244763, 4: 187.78197449023546, 10: 181.5110885405567, 1: 181.50655427739716, 9: 181.50655427739716, 2: 98.18946872038559, 26: 0.040808368436087296})
time_different_between_wear_phone 0 days 00:00:01.088666
saved
Processing file SENSORDATA_20180108-143759.txt
Count per 1 second
Counter({11: 190.7586201391356, 3: 190.7586201391356, 4: 190.54708298243443, 9: 182.49669722646848, 1: 182.49669722646848, 10: 182.49669722646848, 2: 98.8277648505071, 26: 0.015965068430274558})
time_different_between_wear_phone 0 days 00:00:00.447000
saved
Processing file SENSORDATA_20180103-144428.txt
Count per 1 second
Counter({3: 188.70491248146288, 11: 188.70491248146288, 4: 188.47137402351734, 10: 181.6637279744042, 1: 181.6637279744042, 9: 181.6637279744042, 2: 98.60577540606499, 26: 0.04086923014047338})
time_different_between_wear_phone 0 days 00:00:00.859333
saved
Processing file SENSORDATA_2