In [1]:
import pandas as pd
import glob
import os
from tqdm import tqdm

In [2]:
df = pd.read_csv('pads-parkinsons-disease-smartwatch-dataset-1.0.0/movement/timeseries/001_CrossArms_LeftWrist.txt', delimiter=',')

In [3]:
column_names = ['Timestamp', 'Accelerometer_X', 'Accelerometer_Y', 'Accelerometer_Z', 'Gyroscope_X', 'Gyroscope_Y', 'Gyroscope_Z']
df.columns = column_names
df

Unnamed: 0,Timestamp,Accelerometer_X,Accelerometer_Y,Accelerometer_Z,Gyroscope_X,Gyroscope_Y,Gyroscope_Z
0,0.009805,0.002275,0.013380,0.005852,-0.014924,-0.025833,0.004746
1,0.019807,0.001332,0.016225,0.005543,-0.009529,-0.033264,0.005853
2,0.029791,0.001354,0.018107,0.007207,-0.007440,-0.029995,0.009008
3,0.039801,0.001441,0.016004,0.004999,-0.011757,-0.027815,0.017495
4,0.052410,0.000535,0.015800,0.005716,-0.013891,-0.029924,0.022822
...,...,...,...,...,...,...,...
1018,10.185437,0.001216,-0.002975,0.003254,-0.007474,-0.013167,-0.019772
1019,10.195432,0.002255,0.001852,0.003247,-0.005365,-0.010989,-0.016599
1020,10.205442,0.003365,0.001779,0.001302,-0.006427,-0.012057,-0.015529
1021,10.215447,0.001560,-0.000262,0.000336,-0.004265,-0.013159,-0.021900


The authors say - At the start of every recording, the smartwatches gave a vibration notification. Therefore, we recommend to cut out approximately the first 0.5 seconds per time series.

In [4]:
df = df[df['Timestamp']>0.5]
df

Unnamed: 0,Timestamp,Accelerometer_X,Accelerometer_Y,Accelerometer_Z,Gyroscope_X,Gyroscope_Y,Gyroscope_Z
50,0.509774,-0.050591,-0.051501,-0.083111,0.773790,0.420087,-0.436407
51,0.519775,-0.046225,-0.052912,-0.077044,0.854607,0.477981,-0.488896
52,0.529748,-0.055828,-0.035950,-0.079996,0.975002,0.527595,-0.545608
53,0.539816,-0.086870,-0.040002,-0.097529,1.063465,0.565223,-0.600095
54,0.549893,-0.109431,-0.042224,-0.107230,1.147920,0.575992,-0.667155
...,...,...,...,...,...,...,...
1018,10.185437,0.001216,-0.002975,0.003254,-0.007474,-0.013167,-0.019772
1019,10.195432,0.002255,0.001852,0.003247,-0.005365,-0.010989,-0.016599
1020,10.205442,0.003365,0.001779,0.001302,-0.006427,-0.012057,-0.015529
1021,10.215447,0.001560,-0.000262,0.000336,-0.004265,-0.013159,-0.021900


Other features

In [2]:
other_features = pd.read_csv('pads-parkinsons-disease-smartwatch-dataset-1.0.0/preprocessed/file_list.csv')
other_features

Unnamed: 0,resource_type,id,study_id,condition,disease_comment,age_at_diagnosis,age,height,weight,gender,handedness,appearance_in_kinship,appearance_in_first_grade_kinship,effect_of_alcohol_on_tremor,label
0,patient,1,PADS,Healthy,-,56,56,173,78,male,right,True,True,Unknown,0
1,patient,2,PADS,Other Movement Disorders,Left-Sided resting tremor and hypokinesia with...,69,81,193,104,male,right,False,,No effect,2
2,patient,3,PADS,Healthy,-,45,45,170,78,female,right,False,,Unknown,0
3,patient,4,PADS,Parkinson's,IPS akinetic-rigid type,63,67,161,90,female,right,False,,No effect,1
4,patient,5,PADS,Parkinson's,IPS tremordominant type,65,75,172,86,male,left,False,,Unknown,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
464,patient,465,PADS,Parkinson's,IPS mixed type,62,65,175,80,male,right,True,False,No effect,1
465,patient,466,PADS,Healthy,-,84,84,172,74,female,right,True,True,No effect,0
466,patient,467,PADS,Parkinson's,"Essential Tremor, starting IPS tremordominant ...",55,57,190,100,male,right,False,,Improvement,1
467,patient,468,PADS,Parkinson's,IPS mixed type,73,76,198,118,male,right,False,,No effect,1


This is the important stuff and the function

In [3]:
label_counts = other_features['label'].value_counts()

# Display the count of each label
print(label_counts)

label
1    276
2    114
0     79
Name: count, dtype: int64


In [None]:
new_columns = ['Accelerometer_X_mean', 'Accelerometer_Y_mean', 'Accelerometer_Z_mean', 'Gyroscope_X_mean', 'Gyroscope_Y_mean', 'Gyroscope_Z_mean', 'Label']

In [40]:
path = 'pads-parkinsons-disease-smartwatch-dataset-1.0.0/movement/timeseries/'

def get_two_dataframes(event_name):
    file_pattern = f"{path}/*_{event_name}.txt"
    
    file_list = glob.glob(file_pattern)
    first_five_seconds_list = []
    rest_list = []
    
    for filename in tqdm(file_list, desc='Processing files'):
        df = pd.read_csv(filename, delimiter=',')  
        df.columns = column_names
        df = df[(df['Timestamp'] > 0.5) & (df['Timestamp'] < 10.24)]
    
        base_name = os.path.basename(filename)
        unique_identifier = int(base_name.replace(f"_{event_name}.txt", ''))
    
        # Add the unique identifier as a new column
        df['Patient_ID'] = unique_identifier
    
        label_value = other_features.loc[other_features['id'] == unique_identifier, 'label'].iloc[0]
        
        # Assign this label value to the new column in df
        df['Label'] = label_value
    
        #now we will split the first 5 seconds and the rest into two different dataframes
        first_five_seconds_data = df[df['Timestamp'] <= 5]
    
        last_seconds_data = df[df['Timestamp'] > 5]
    
        mean_values = first_five_seconds_data[['Accelerometer_X', 'Accelerometer_Y', 'Accelerometer_Z',
                                 'Gyroscope_X', 'Gyroscope_Y', 'Gyroscope_Z']].mean()
    
        mean_dict = {
        'Accelerometer_X_mean': mean_values['Accelerometer_X'],
        'Accelerometer_Y_mean': mean_values['Accelerometer_Y'],
        'Accelerometer_Z_mean': mean_values['Accelerometer_Z'],
        'Gyroscope_X_mean': mean_values['Gyroscope_X'],
        'Gyroscope_Y_mean': mean_values['Gyroscope_Y'],
        'Gyroscope_Z_mean': mean_values['Gyroscope_Z'],
        'Label': df['Label'].iloc[0],
        'Patient_ID': df['Patient_ID'].iloc[0]
        }
        
        # Convert the dictionary to a DataFrame
        mean_df = pd.DataFrame([mean_dict])
    
        first_five_seconds_list.append(mean_df)
    
        mean_values = last_seconds_data[['Accelerometer_X', 'Accelerometer_Y', 'Accelerometer_Z',
                                 'Gyroscope_X', 'Gyroscope_Y', 'Gyroscope_Z']].mean()
    
        mean_dict = {
        'Accelerometer_X_mean': mean_values['Accelerometer_X'],
        'Accelerometer_Y_mean': mean_values['Accelerometer_Y'],
        'Accelerometer_Z_mean': mean_values['Accelerometer_Z'],
        'Gyroscope_X_mean': mean_values['Gyroscope_X'],
        'Gyroscope_Y_mean': mean_values['Gyroscope_Y'],
        'Gyroscope_Z_mean': mean_values['Gyroscope_Z'],
        'Label': df['Label'].iloc[0],
        'Patient_ID': df['Patient_ID'].iloc[0]
        }
        
        # Convert the dictionary to a DataFrame
        mean_df = pd.DataFrame([mean_dict])
    
        rest_list.append(mean_df)
    
    first_five_seconds_list_df = pd.concat(first_five_seconds_list, ignore_index=True)
    rest_df = pd.concat(rest_list, ignore_index=True)
    return first_five_seconds_list_df, rest_df

In [41]:
first_five, rest = get_two_dataframes('CrossArms_LeftWrist')
first_five

Processing files: 100%|██████████████████████| 469/469 [00:03<00:00, 153.81it/s]


Unnamed: 0,Accelerometer_X_mean,Accelerometer_Y_mean,Accelerometer_Z_mean,Gyroscope_X_mean,Gyroscope_Y_mean,Gyroscope_Z_mean,Label,Patient_ID
0,0.078888,0.022117,0.083376,0.270689,0.223543,-0.559312,2,224
1,0.048170,0.000354,0.012075,-0.060996,-0.155827,-0.602306,1,393
2,-0.014036,-0.014067,0.037576,-0.143914,0.052604,-0.433262,1,154
3,0.059558,0.036247,-0.010402,-0.311721,0.209145,-0.613398,1,458
4,-0.019137,-0.021360,0.018636,0.031197,0.039096,-0.169866,1,331
...,...,...,...,...,...,...,...,...
464,0.039211,-0.008724,0.023515,-0.082592,0.360485,-0.323836,1,25
465,0.047207,0.025210,0.026003,0.065835,0.007757,-0.572447,0,402
466,0.067727,-0.029562,0.009231,0.113793,0.506931,-0.216498,1,87
467,0.042661,0.009035,0.006255,-0.079779,-0.019240,-0.585376,1,240


In [42]:
rest

Unnamed: 0,Accelerometer_X_mean,Accelerometer_Y_mean,Accelerometer_Z_mean,Gyroscope_X_mean,Gyroscope_Y_mean,Gyroscope_Z_mean,Label,Patient_ID
0,-0.000233,0.002024,0.000749,-0.001099,-0.000927,-0.001258,2,224
1,-0.001614,0.003825,-0.000276,-0.010429,-0.004088,0.005676,1,393
2,0.002607,0.001212,0.000329,0.005582,0.001016,0.001927,1,154
3,-0.001622,0.000644,0.000079,0.001224,-0.002149,0.002126,1,458
4,-0.001445,0.001576,0.002075,-0.006636,0.002937,-0.001631,1,331
...,...,...,...,...,...,...,...,...
464,-0.000975,-0.001221,0.000929,0.005014,-0.007595,-0.007050,1,25
465,-0.002082,0.004585,0.003324,-0.000350,-0.006961,-0.000974,0,402
466,0.010538,-0.000418,0.001422,-0.001525,0.002047,0.001241,1,87
467,-0.000611,0.004454,0.003068,-0.001767,-0.004161,0.003792,1,240


Streaming Stuff - Not needed for now

In [13]:
import time
import pandas as pd

# Load data from a CSV file
data = combined_df[combined_df['Patient_ID']==1]
data.sort_values('Timestamp', inplace=True)

# Initialize the starting point
start_time = time.time()

# Simulate streaming
for index, row in data.iterrows():
    # Calculate the target time to send the data
    target_time = start_time + row['Timestamp']  # row['timestamp'] is the seconds from the start
    current_time = time.time()
    sleep_time = target_time - current_time
    
    if sleep_time > 0:
        time.sleep(sleep_time)  # Wait until it's time to emit the data

    print(row)  # Replace this with sending the row to your streaming platform

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data.sort_values('Timestamp', inplace=True)


Timestamp          0.509774
Accelerometer_X   -0.050591
Accelerometer_Y   -0.051501
Accelerometer_Z   -0.083111
Gyroscope_X        0.773790
Gyroscope_Y        0.420087
Gyroscope_Z       -0.436407
Patient_ID         1.000000
Label              0.000000
Name: 20431, dtype: float64
Timestamp          0.519775
Accelerometer_X   -0.046225
Accelerometer_Y   -0.052912
Accelerometer_Z   -0.077044
Gyroscope_X        0.854607
Gyroscope_Y        0.477981
Gyroscope_Z       -0.488896
Patient_ID         1.000000
Label              0.000000
Name: 20432, dtype: float64
Timestamp          0.529748
Accelerometer_X   -0.055828
Accelerometer_Y   -0.035950
Accelerometer_Z   -0.079996
Gyroscope_X        0.975002
Gyroscope_Y        0.527595
Gyroscope_Z       -0.545608
Patient_ID         1.000000
Label              0.000000
Name: 20433, dtype: float64
Timestamp          0.539816
Accelerometer_X   -0.086870
Accelerometer_Y   -0.040002
Accelerometer_Z   -0.097529
Gyroscope_X        1.063465
Gyroscope_Y        0