# Statistics

## Read and merge files

In [None]:
import pandas as pd
import numpy as np
from scipy import stats
import statistics
import re
import statsmodels.api as sm
import matplotlib.pyplot as plt

#### 1st rest

These files come from the first rest section

In [None]:
# Kubios HRV result files of the first rest section.
# header=1 makes the second line of each CSV the column header row.
# Raw strings keep the Windows-style backslash literal: "\K" is not a valid
# escape sequence and triggers a warning in a normal string literal.
Sub11 = pd.read_csv(r"path...\KubiosHRVresults1_1.csv", header=1)
Sub12 = pd.read_csv(r"path...\KubiosHRVresults1_2.csv", header=1)
#... add all files from Kubios HRV

##### Change the name of subjects (avoid duplicates)

In [None]:
Sub11 = Sub11.set_index("FileName") 
Sub12 = Sub12.set_index("FileName") #Change the index
#... do changes in all the df's

In [None]:
Sub11.rename(index=lambda x: x.replace('Sub1','Sub11'), inplace=True)
Sub12.rename(index=lambda x: x.replace('Sub2','Sub12'), inplace=True) #rename the subjects
#... do changes in all the df's

In [None]:
Sub11.reset_index(inplace=True)
Sub12.reset_index(inplace=True) #add the int index for future filtering
#... do changes in all the df's

#### 2nd rest

These files come from the second rest section

In [None]:
# Kubios HRV result files of the second rest section.
# header=1 makes the second line of each CSV the column header row.
# Raw strings keep the Windows-style backslash literal: "\K" is not a valid
# escape sequence and triggers a warning in a normal string literal.
Sub21 = pd.read_csv(r"path...\KubiosHRVresults2_1.csv", header=1)
# Each frame needs its own file: Sub22 reads "2_2" (it used to load "2_1" a second time,
# which duplicated the data behind Sub21).
Sub22 = pd.read_csv(r"path...\KubiosHRVresults2_2.csv", header=1)
#... add all files from Kubios HRV

##### Change the name of subjects (avoid duplicates)

In [None]:
Sub21 = Sub21.set_index("FileName") 
Sub22 = Sub22.set_index("FileName") #Change the index
#... do changes in all the df's

In [None]:
Sub21.rename(index=lambda x: x.replace('Sub1','Sub21'), inplace=True)
Sub22.rename(index=lambda x: x.replace('Sub2','Sub22'), inplace=True) #rename the subjects
#... do changes in all the df's

In [None]:
Sub21.reset_index(inplace=True)
Sub22.reset_index(inplace=True) #add the int index for future filtering
#... do changes in all the df's

In [None]:
df = pd.concat([Sub11,Sub12,Sub21,Sub22], axis = 0) # merge all files

In [None]:
df # visualize

In [None]:
cols = [1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20]
df = df.drop(df.columns[cols],axis=1) # eliminate informational columns

In [None]:
df.drop(columns=["Unnamed: 103"], inplace=True) # eliminate "random" column

In [None]:
df = df.sort_index(ascending=True) #sort the data

In [None]:
df1 = df.loc[df.index==0] # divide according to the source into df's (empatica)
df2 = df.loc[df.index==1] # divide according to the source into df's (faros)

In [None]:
df1 = df1.set_index("FileName") 
df2 = df2.set_index("FileName") #set the new indices

In [None]:
df1 #visualize

In [None]:
df2 #visualize

## Statistical Tests

### Mean Difference - Global

In [None]:
# Absolute difference between the column means of the two complete device tables.
for column in df1.columns:
    empatica_mean = df1[column].astype(float).mean()
    faros_mean = df2[column].astype(float).mean()
    x = abs(empatica_mean - faros_mean)
    print("The Mean Difference in {} is {}.".format(column, x))

<p><br>

 **Due to the existence of several NaN values, it is important to deal with them before the further calculations. Here we show 2 approaches:**



## Drop all NaN values

In [None]:
# Cast each device table to float and keep only the columns that have no missing value.
df1_min, df2_min = (frame.astype(float).dropna(axis="columns") for frame in (df1, df2))

<p><br>
Decide which dataset we will use for reference

In [None]:
df1_min.columns.isin(df2_min.columns) # boolean array: True where a df1_min column also exists in df2_min (author's run: all of them)

In [None]:
df2_min.columns.isin(df1_min.columns) # boolean array: True where a df2_min column also exists in df1_min (author's run: not all of them)

<p><br>
We then take df1_min as our reference list

### Pearson correlation

In [None]:
# Pearson correlation between the devices for every column of the reference table df1_min.
for column, empatica_values in df1_min.items():
    r, p = stats.pearsonr(empatica_values, df2_min[column])
    print("The correlation coefficient in {} is {} and the p-value {}.".format(column, r, p))

### Student t-test

In [None]:
# Paired t-test between the devices for every column of the reference table df1_min.
for column, empatica_values in df1_min.items():
    r, p = stats.ttest_rel(empatica_values, df2_min[column])
    print("The T-Score in {} is {} and the p-value {}.".format(column, r, p))

## Conservative approach with NaN

Here the idea is to keep as much data as possible. For this reason we do not delete the whole column that contains NaN values, but only
the specific value and its counterpart in the other DF (because a comparison is no longer possible for that pair)

In [None]:
df1_max = df1.astype(float)
df2_max = df2.astype(float) # create new datasets.

In [None]:
df1_max.rename(index=lambda x: x.replace('_empatica',''), inplace=True)
df2_max.rename(index=lambda x: x.replace('_faros',''), inplace=True) #delete the subfix for the index filename

In [None]:
df1_max #visualize

In [None]:
df_merged = df1_max.merge(df2_max, suffixes=[" _empatica", " _faros"], on="FileName") 
#merge both dataframe horizontally and add the suffixes to the columns where they originally came from

In [None]:
df_merged #visualize

In [None]:
column_names = df1_max.columns #we create our reference list of indeces for the analisys

In [None]:
x = pd.DataFrame(column_names) # transform our list into a dataset

In [None]:
# The column names contain brackets, which are special characters in a regular
# expression, so everything from the first opening bracket onwards is cut off
# before the names are used as search patterns.
def Clean_names(Names):
    """Return ``Names`` truncated right before its first opening bracket.

    Parameters
    ----------
    Names : str
        A single column name.

    Returns
    -------
    str
        The text in front of the first "(" (any whitespace before the bracket
        is kept), or ``Names`` unchanged when it contains no "(".
    """
    # Raw string: "\(" in a normal literal is an invalid escape sequence.
    bracket = re.search(r'\(', Names)
    if bracket is None:
        return Names  # no clean up needed
    return Names[:bracket.start()]
          
# Run the clean-up over every label held in the single column of x.
column_names1 = x.iloc[:, 0].apply(Clean_names)

### Pearson correlation

In [None]:
# Pairwise-complete Pearson correlation for every measure.
# The two device columns are addressed by their exact merged labels. A regex filter
# on the bracket-stripped name keeps every label that merely contains that text, so
# measures sharing a prefix return more than two columns and the first two need not
# be one empatica/faros pair. The printed label is the full column name.
for column in column_names:
    empatica_col = column + " _empatica"
    faros_col = column + " _faros"
    d = df_merged[[empatica_col, faros_col]].dropna()  # keep rows measured by both devices
    if len(d) > 2:  # threshold of at least 3 complete pairs
        r, p = stats.pearsonr(d[empatica_col], d[faros_col])  # Pearson correlation
        print("The correlation coefficient in {} is {} and the p-value {}.".format(column, r, p))

### Student t-test

In [None]:
# Pairwise-complete paired t-test for every measure.
# The two device columns are addressed by their exact merged labels. A regex filter
# on the bracket-stripped name keeps every label that merely contains that text, so
# measures sharing a prefix return more than two columns and the first two need not
# be one empatica/faros pair. The printed label is the full column name.
for column in column_names:
    empatica_col = column + " _empatica"
    faros_col = column + " _faros"
    d = df_merged[[empatica_col, faros_col]].dropna()  # keep rows measured by both devices
    if len(d) > 2:  # threshold of at least 3 complete pairs
        r, p = stats.ttest_rel(d[empatica_col], d[faros_col])  # paired t-test
        print("The T-Score in {} is {} and the p-value {}.".format(column, r, p))

### Bland-Altman Plot

In [None]:
# Title style shared by all Bland-Altman figures.
font1 = {'family':'serif','color':'blue','size':20}

# One Bland-Altman (mean-difference) plot per measure, built from the rows that both
# devices measured.
# The two device columns are addressed by their exact merged labels. A regex filter
# on the bracket-stripped name keeps every label that merely contains that text, so
# measures sharing a prefix return more than two columns and the first two need not
# be one empatica/faros pair. The figure title is the full column name.
for column in column_names:
    empatica_col = column + " _empatica"
    faros_col = column + " _faros"
    d = df_merged[[empatica_col, faros_col]].dropna()  # keep rows measured by both devices
    if len(d) > 2:  # threshold of at least 3 complete pairs
        # create Bland-Altman plot
        f, ax = plt.subplots(1, figsize = (8,5))
        sm.graphics.mean_diff_plot(d[empatica_col], d[faros_col], ax = ax)
        plt.title(column, fontdict = font1)

        # display Bland-Altman plot
        plt.show()