In [2]:
import os
import random
import cv2
import pandas as pd
import numpy as np

In [3]:
import numpy as np
import pandas as pd

def reduce_memory_usage(df):
    # Check if input is a Pandas DataFrame
    if not isinstance(df, pd.DataFrame):
        raise TypeError("Input must be a pandas DataFrame.")
    
    # Calculate the memory usage of the DataFrame before optimization
    start_mem = df.memory_usage(deep=True).sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))


    # Loop through each column of the DataFrame
    for col in df.columns:
        col_type = df[col].dtype
        
        # If column is of 'object' type, convert it to 'category'
        if col_type == 'object':
            df[col] = df[col].astype('category')
        
        # If column is a datetime type, leave it as is
        elif col_type.name.startswith('datetime'):
            pass
        
        # For numerical columns, downcast the data type to reduce memory usage
        else:
            c_min = df[col].min()
            c_max = df[col].max()
            
            if col_type == np.int64:
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                else:
                    df[col] = df[col].astype(np.int64)
            elif col_type == np.float64:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)
            else:
                # If column is of any other data type, convert it to 'category'
                df[col] = df[col].astype('category')
    
    # Calculate the memory usage of the DataFrame after optimization
    end_mem = df.memory_usage(deep=True).sum() / 1024**2
    print("Memory usage after optimization is: {:.2f} MB".format(end_mem))
    print("Decreased by {:.1f}%".format(100 * (start_mem - end_mem) / start_mem))

    # Remove intermediate variables to free up memory
    del start_mem, end_mem
    return df.copy()


In [None]:
import glob
import pandas as pd

# specify the root directory containing CSV files
DATA_ROOT_DEFOG = '/kaggle/input/tlvmc-parkinsons-freezing-gait-prediction/train/defog/'

# function to read and concatenate CSV files
def read_csv_files(root):
    
    # get a list of file paths that match the pattern `f"{root}/*.csv"`
    file_paths = glob.glob(f"{root}/*.csv")
    
    # if there are no files that match the pattern, raise an exception
    if not file_paths:
        raise ValueError(f"No CSV files found in {root}")
    
    # read each CSV file and append it to a list
    dfs = []
    for file_path in file_paths:
        df = pd.read_csv(file_path)
        df['file'] = file_path.split('/')[-1].split('.')[0]
        dfs.append(df)
    
    # concatenate all DataFrames in the list into a single DataFrame
    df = pd.concat(dfs, axis=0)
    
    # return the concatenated DataFrame
    return df

# read and concatenate all CSV files using the read_csv_files function
defog = read_csv_files(DATA_ROOT_DEFOG)

In [5]:
defog = reduce_memory_usage(defog)

Memory usage of dataframe is 1715.58 MB
Memory usage after optimization is: 309.59 MB
Decreased by 82.0%


In [6]:
defog = defog[(defog['Task']==1)&(defog['Valid']==1)]

In [7]:
print('the shape of defog dataset is {}'.format(defog.shape))

the shape of defog dataset is (4111322, 10)


In [8]:
defog_metadata = pd.read_csv("/kaggle/input/tlvmc-parkinsons-freezing-gait-prediction/defog_metadata.csv")
defog_metadata

Unnamed: 0,Id,Subject,Visit,Medication
0,02ab235146,ab54e1,2,on
1,02ea782681,bf608b,2,on
2,06414383cf,c0b71e,2,off
3,092b4c1819,b6a627,1,off
4,0a900ed8a2,b7bd52,2,on
...,...,...,...,...
132,f3a921edee,ce8b0b,1,off
133,f40e8c6ebe,d9529b,1,off
134,f8ddbdd98d,fc1e1b,1,on
135,f9efef91fb,fe5d84,2,off


In [9]:
defog_m= defog_metadata.merge(defog, how = 'inner', left_on = 'Id', right_on = 'file')
defog_m.drop(['file','Valid','Task'], axis = 1, inplace = True)
defog_m

Unnamed: 0,Id,Subject,Visit,Medication,Time,AccV,AccML,AccAP,StartHesitation,Turn,Walking
0,02ea782681,bf608b,2,on,1000,-0.970215,0.061615,-0.265625,0,0,0
1,02ea782681,bf608b,2,on,1001,-0.984375,0.044495,-0.265625,0,0,0
2,02ea782681,bf608b,2,on,1002,-0.984375,0.029022,-0.265625,0,0,0
3,02ea782681,bf608b,2,on,1003,-0.984375,0.015625,-0.265625,0,0,0
4,02ea782681,bf608b,2,on,1004,-0.984863,0.015327,-0.265625,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...
4111317,f9fc61ce85,a7d8c0,1,on,119027,-0.961426,0.142456,-0.289551,0,0,0
4111318,f9fc61ce85,a7d8c0,1,on,119028,-0.960449,0.142822,-0.290527,0,0,0
4111319,f9fc61ce85,a7d8c0,1,on,119029,-0.958008,0.145508,-0.290039,0,0,0
4111320,f9fc61ce85,a7d8c0,1,on,119030,-0.960449,0.145874,-0.291504,0,0,0


In [10]:
# summary table function
def summary(df):
    print(f'data shape: {df.shape}')
    summ = pd.DataFrame(df.dtypes, columns=['data type'])
    summ['#missing'] = df.isnull().sum().values * 100
    summ['%missing'] = df.isnull().sum().values / len(df)
    summ['#unique'] = df.nunique().values
    desc = pd.DataFrame(df.describe(include='all').transpose())
    summ['min'] = desc['min'].values
    summ['max'] = desc['max'].values
    summ['first value'] = df.loc[0].values
    summ['second value'] = df.loc[1].values
    summ['third value'] = df.loc[2].values
    
    return summ

In [11]:
summary(defog_m)

data shape: (4111322, 11)


Unnamed: 0,data type,#missing,%missing,#unique,min,max,first value,second value,third value
Id,object,0,0.0,91,,,02ea782681,02ea782681,02ea782681
Subject,object,0,0.0,38,,,bf608b,bf608b,bf608b
Visit,int64,0,0.0,2,1.0,2.0,2,2,2
Medication,object,0,0.0,2,,,on,on,on
Time,int32,0,0.0,338777,1000.0,414387.0,1000,1001,1002
AccV,float16,0,0.0,3608,-6.023438,4.457031,-0.970215,-0.984375,-0.984375
AccML,float16,0,0.0,28103,-2.115234,4.523438,0.061615,0.044495,0.029022
AccAP,float16,0,0.0,23951,-5.117188,4.386719,-0.265625,-0.265625,-0.265625
StartHesitation,int8,0,0.0,2,0.0,1.0,0,0,0
Turn,int8,0,0.0,2,0.0,1.0,0,0,0


In [12]:
# garbage collection for memory
import gc
gc.collect()

23

In [13]:
DATA_ROOT_TDCSFOG = '/kaggle/input/tlvmc-parkinsons-freezing-gait-prediction/train/tdcsfog/'
tdcsfog = pd.DataFrame()
for root, dirs, files in os.walk(DATA_ROOT_TDCSFOG):
    for name in files:       
        f = os.path.join(root, name)
        df_list= pd.read_csv(f)
        words = name.split('.')[0]
        df_list['file']= name.split('.')[0]
        tdcsfog = pd.concat([tdcsfog, df_list], axis=0)
tdcsfog

Unnamed: 0,Time,AccV,AccML,AccAP,StartHesitation,Turn,Walking,file
0,0,-9.665890,0.042550,0.184744,0,0,0,a171e61840
1,1,-9.672969,0.049217,0.184644,0,0,0,a171e61840
2,2,-9.670260,0.033620,0.193790,0,0,0,a171e61840
3,3,-9.673356,0.035159,0.184369,0,0,0,a171e61840
4,4,-9.671458,0.043913,0.197814,0,0,0,a171e61840
...,...,...,...,...,...,...,...,...
5153,5153,-9.915920,-0.105897,-1.123455,0,0,0,0506d9a39f
5154,5154,-9.693752,-0.066892,-1.114903,0,0,0,0506d9a39f
5155,5155,-9.548118,-0.098315,-1.112123,0,0,0,0506d9a39f
5156,5156,-9.469803,-0.111004,-1.130814,0,0,0,0506d9a39f


In [14]:
tdcsfog = reduce_memory_usage(tdcsfog)

Memory usage of dataframe is 882.35 MB
Memory usage after optimization is: 155.00 MB
Decreased by 82.4%


In [15]:
tdcsfog_metadata = pd.read_csv("/kaggle/input/tlvmc-parkinsons-freezing-gait-prediction/tdcsfog_metadata.csv")
tdcsfog_metadata

Unnamed: 0,Id,Subject,Visit,Test,Medication
0,003f117e14,13abfd,3,2,on
1,009ee11563,d81e3a,4,2,on
2,011322847a,203e85,2,2,on
3,01d0fe7266,203e85,2,1,off
4,024418ba39,cecfb8,19,3,on
...,...,...,...,...,...
828,feba449e1a,47860d,19,1,on
829,ff4f844fd3,43fcae,2,3,on
830,ff53514514,a2a051,2,3,on
831,ff92d9244d,a9e866,20,2,on


In [16]:
tdcsfog_m= tdcsfog_metadata.merge(tdcsfog, how = 'inner', left_on = 'Id', right_on = 'file')
tdcsfog_m.drop(['file'], axis = 1, inplace = True)
tdcsfog_m

Unnamed: 0,Id,Subject,Visit,Test,Medication,Time,AccV,AccML,AccAP,StartHesitation,Turn,Walking
0,003f117e14,13abfd,3,2,on,0,-9.531250,0.566406,-1.413086,0,0,0
1,003f117e14,13abfd,3,2,on,1,-9.539062,0.563965,-1.440430,0,0,0
2,003f117e14,13abfd,3,2,on,2,-9.531250,0.561523,-1.429688,0,0,0
3,003f117e14,13abfd,3,2,on,3,-9.531250,0.564453,-1.415039,0,0,0
4,003f117e14,13abfd,3,2,on,4,-9.539062,0.562012,-1.429688,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...
7062667,ffda8fadfd,bae0ce,20,1,off,4220,-9.406250,0.088989,-3.220703,0,0,0
7062668,ffda8fadfd,bae0ce,20,1,off,4221,-9.406250,0.090515,-3.216797,0,0,0
7062669,ffda8fadfd,bae0ce,20,1,off,4222,-9.406250,0.084351,-3.224609,0,0,0
7062670,ffda8fadfd,bae0ce,20,1,off,4223,-9.406250,0.084229,-3.236328,0,0,0


In [17]:
# garbage collection for memory
import gc
gc.collect()

92

In [18]:
conditions = [
    (defog_m['StartHesitation'] == 1),
    (defog_m['Turn'] == 1),
    (defog_m['Walking'] == 1)]
choices = ['StartHesitation', 'Turn', 'Walking']
defog_m['event'] = np.select(conditions, choices, default='Normal')

In [19]:
defog_m['event'].value_counts().to_frame().style.background_gradient()

Unnamed: 0,event
Normal,3626333
Turn,414380
Walking,70521
StartHesitation,88


In [20]:
train_df = defog_m[['AccV','AccML','AccAP','event']]

In [21]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()

train_df['target'] = le.fit_transform(train_df['event'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.


In [22]:
X = train_df.drop(['event','target'], axis=1)
y = train_df['target']

In [23]:
params={}
params['learning_rate']=0.03
params['boosting_type']='gbdt' #GradientBoostingDecisionTree
params['objective']='multiclass' #Multi-class target feature
params['metric']='multi_logloss' #metric for multi-class
params['max_depth']=7
params['num_class']=4 #no.of unique values in the target class not inclusive of the end value
params['verbose']=-1
#training the model
clf=lgb.train(params,d_train,1000)  #training the model on 1,000 epocs
#prediction on the test dataset
y_pred_1=clf.predict(X_test)

NameError: name 'lgb' is not defined

In [None]:
y_pred_1[:1]

In [None]:
# 'macro' option is to calculate metrics for each label, and find their unweighted mean. 
# This does not take label imbalance into account.
from sklearn.metrics import precision_score
precision_score(y_test, np.argmax(y_pred_1, axis=-1), average='macro')

In [None]:
test_defog_path = '/kaggle/input/tlvmc-parkinsons-freezing-gait-prediction/test/defog/02ab235146.csv'
test_defog = pd.read_csv(test_defog_path)
name = os.path.basename(test_defog_path)
id_value = name.split('.')[0]
test_defog['Id_value'] = id_value
test_defog['Id'] = test_defog['Id_value'].astype(str) + '_' + test_defog['Time'].astype(str)
test_defog = test_defog[['Id','AccV','AccML','AccAP']]
test_defog.set_index('Id',inplace=True)

In [None]:
# predict event probability
test_defog_pred=clf.predict(test_defog)
test_defog['event'] = np.argmax(test_defog_pred, axis=-1)

In [None]:
# expand event column it to three columns
test_defog['StartHesitation'] = np.where(test_defog['event']==1, 1, 0)
test_defog['Turn'] = np.where(test_defog['event']==2, 1, 0)
test_defog['Walking'] = np.where(test_defog['event']==3, 1, 0)

In [None]:
test_defog.head(10)

In [None]:
test_tdcsfog_path = '/kaggle/input/tlvmc-parkinsons-freezing-gait-prediction/test/tdcsfog/003f117e14.csv'
test_tdcsfog = pd.read_csv(test_tdcsfog_path)
name = os.path.basename(test_tdcsfog_path)
id_value = name.split('.')[0]
test_tdcsfog['Id_value'] = id_value
test_tdcsfog['Id'] = test_tdcsfog['Id_value'].astype(str) + '_' + test_tdcsfog['Time'].astype(str)
test_tdcsfog = test_tdcsfog[['Id','AccV','AccML','AccAP']]
test_tdcsfog.set_index('Id',inplace=True)

In [None]:
test_tdcsfog_pred=clf.predict(test_tdcsfog)
test_tdcsfog['event'] = np.argmax(test_tdcsfog_pred, axis=-1)

In [None]:
test_tdcsfog['StartHesitation'] = np.where(test_tdcsfog['event']==1, 1, 0)
test_tdcsfog['Turn'] = np.where(test_tdcsfog['event']==2, 1, 0)
test_tdcsfog['Walking'] = np.where(test_tdcsfog['event']==3, 1, 0)
test_tdcsfog.reset_index('Id', inplace=True)

In [None]:
test_tdcsfog.head(10)

In [None]:
submit = pd.concat([test_tdcsfog,test_defog])
submit = submit[['Id', 'StartHesitation', 'Turn','Walking']]

In [None]:
submit.head(10)

In [None]:
sample = pd.read_csv('/kaggle/input/tlvmc-parkinsons-freezing-gait-prediction/sample_submission.csv')

In [None]:
sample.head(10)