# Concatenate everything to save a single data array

In [2]:
# Imports

import csv
from datetime import datetime
import numpy as np
import openpyxl
import os
import pandas as pd
import shutil
import utils_cleaner

In [3]:
# Root folders: one per acquisition site, plus the destination of the merged dataset
basepath_BCN = "./data/Barcelona"
basepath_NAP = "./data/Naples"
basepath = "./data/BCN and NAP"

# Every root contains one sub-folder per scan type: FA, GM and RS
basepath_BCN_FA, basepath_BCN_GM, basepath_BCN_RS = [
    os.path.join(basepath_BCN, scan_dir) for scan_dir in ('FA', 'GM', 'RS')
]

basepath_NAP_FA, basepath_NAP_GM, basepath_NAP_RS = [
    os.path.join(basepath_NAP, scan_dir) for scan_dir in ('FA', 'GM', 'RS')
]

basepath_FA, basepath_GM, basepath_RS = [
    os.path.join(basepath, scan_dir) for scan_dir in ('FA', 'GM', 'RS')
]

## Cleaning the data from Barcelona and Naples separately

### Part 1: renaming filenames

1st: clean scan names to just Patient ID (only for Naples)

In [4]:
# Apply the rename helper to every Naples scan folder, in FA, GM, RS order.
# Judging by the helper's name, each file name is presumably cut to its first
# 8 characters (the patient ID) -- see utils_cleaner for the exact rule.
for naples_folder in (basepath_NAP_FA, basepath_NAP_GM, basepath_NAP_RS):
    utils_cleaner.rename_files_to_first_8_chars(naples_folder)

All files in ./data/Naples/FA renamed succesfully.
All files in ./data/Naples/GM renamed succesfully.
All files in ./data/Naples/RS renamed succesfully.


2nd: Create a file with corresponding old ID to new ID using the patient info file (.xlsx)

In [5]:
# Barcelona: clinical spreadsheet (input) and the CSV handed to the helper (output path)
excel_filename_BCN = os.path.join(basepath_BCN, 'subject_clinical_data.xlsx')
csv_filename_BCN = os.path.join(basepath_BCN, 'ID_corr_BCN.csv')

# The third argument is 0 here, while the Naples call passes the Barcelona patient
# count instead -- presumably it is the first new ID to assign; TODO confirm in utils_cleaner
utils_cleaner.save_ids_to_csv(excel_filename_BCN, csv_filename_BCN, 0)

In [6]:
# Read back the Barcelona ID CSV and display it as a visual check
df_BCN = pd.read_csv(csv_filename_BCN)
df_BCN

Unnamed: 0,ID,ID_old
0,0,002MSVIS
1,1,003MSVIS
2,2,004MSVIS
3,3,005MSVIS
4,4,010MSVIS
...,...,...
160,160,sFIS_04
161,161,sFIS_05
162,162,sFIS_06
163,163,sFIS_07


In [7]:
# Row count of the Barcelona ID table; passed to the helper below as its third argument
# (presumably so that Naples IDs continue after the Barcelona ones -- TODO confirm in utils_cleaner)
number_of_patients_BCN = len(df_BCN)

# Naples: spreadsheet (input) and the CSV handed to the helper (output path)
excel_filename_NAP = os.path.join(basepath_NAP, 'naples2barcelona_multilayer.xlsx')
csv_filename_NAP = os.path.join(basepath_NAP, 'ID_corr_NAP.csv')

utils_cleaner.save_ids_to_csv(excel_filename_NAP, csv_filename_NAP, number_of_patients_BCN)

In [8]:
# Read back the Naples ID CSV and display it as a visual check
df_NAP = pd.read_csv(csv_filename_NAP)
df_NAP

Unnamed: 0,ID,ID_old
0,165,sub-0001
1,166,sub-0002
2,167,sub-0003
3,168,sub-0004
4,169,sub-0005
...,...,...
100,265,sub-0101
101,266,sub-0102
102,267,sub-0103
103,268,sub-0104


In [9]:
# Combine the Barcelona and Naples ID tables through the helper; 'ID_corr.csv' is
# presumably the output file name (where it is written is decided inside utils_cleaner)
utils_cleaner.concatenate_csv(df_BCN, df_NAP, 'ID_corr.csv')

3rd: Change file names according to the corresponding ID.

In [10]:
# Load the ID mapping stored in the Barcelona CSV file
id_to_name_mapping_BCN = utils_cleaner.read_csv_mapping(csv_filename_BCN)

# Each Barcelona scan folder has its own file-name suffix, which is handed to
# the rename helper together with the mapping (FA, GM, RS order)
bcn_folders_and_suffixes = (
    (basepath_BCN_FA, '_FA_factor.csv'),
    (basepath_BCN_GM, '_GM_matrix.csv'),
    (basepath_BCN_RS, '_r_matrix.csv'),
)
for bcn_folder, bcn_suffix in bcn_folders_and_suffixes:
    utils_cleaner.rename_files_in_folder_removing_suffix(bcn_folder, id_to_name_mapping_BCN, bcn_suffix)

Renamed 0 files.
Renamed 0 files.
Renamed 0 files.


In [11]:
# Load the ID mapping stored in the Naples CSV file
id_to_name_mapping_NAP = utils_cleaner.read_csv_mapping(csv_filename_NAP)

# Apply the mapping to every Naples scan folder (FA, GM, RS order)
for nap_folder in (basepath_NAP_FA, basepath_NAP_GM, basepath_NAP_RS):
    utils_cleaner.rename_files_in_folder(nap_folder, id_to_name_mapping_NAP)

Renamed 0 files.
Renamed 0 files.
Renamed 0 files.


### Part 2: Saving patient's information

#### SAVED

origin: NAP for Naples, BCN for Barcelona

mstype: Nap vs Bcn: RR = 0, SP = 1, PP = 2, (healthy) empty = -1

edss: if healthy: Nap empty, Bcn 0 -> unify to 0; else ok

dobirth: date of birth

doscan: date of scan

dostart: date of disease start

gender: both have 0 for female, 1 for male


#### CALCULATED

age = (doscan - dobirth) / 365.25

DD: disease duration = (doscan - dostart) / 365.25

In [12]:
# NOTE(review): this is the same path as csv_filename_BCN, so the ID correspondence
# CSV written earlier is presumably overwritten by the call below -- confirm this is intended
csv_corr_BCN = os.path.join(basepath_BCN, 'ID_corr_BCN.csv')

# Let the helper build the Barcelona patient-information CSV from the spreadsheet,
# then reload and display it. df_BCN is re-bound here: from this cell on it holds
# the patient-information table rather than the ID table loaded earlier.
utils_cleaner.save_columns_to_csv_BCN(excel_filename_BCN, csv_corr_BCN)
df_BCN = pd.read_csv(csv_corr_BCN)
df_BCN

Unnamed: 0,ID,origin,gender,mstype,edss,dobirth,doscan,dostart,age,DD
0,0,BCN,1,1,7.5,1963-09-04,2015-03-16,1993-01-10,51.53,22.18
1,1,BCN,0,2,6.0,1959-01-18,2017-02-08,2007-07-15,58.06,9.57
2,2,BCN,1,0,3.0,1956-09-16,2017-06-29,2010-09-15,60.78,6.79
3,3,BCN,0,0,1.5,1978-02-01,2016-01-13,2007-08-01,37.95,8.45
4,4,BCN,0,0,2.0,1964-02-13,2016-10-04,2007-09-15,52.64,9.05
...,...,...,...,...,...,...,...,...,...,...
160,160,BCN,1,-1,0.0,,,,0.00,0.00
161,161,BCN,0,-1,0.0,,,,0.00,0.00
162,162,BCN,0,-1,0.0,,,,0.00,0.00
163,163,BCN,0,-1,0.0,,,,0.00,0.00


In [13]:
# NOTE(review): this is the same path as csv_filename_NAP, so the ID correspondence
# CSV written earlier is presumably overwritten by the call below -- confirm this is intended
csv_corr_NAP = os.path.join(basepath_NAP, 'ID_corr_NAP.csv')

# Let the helper build the Naples patient-information CSV from the spreadsheet
# (the Barcelona patient count is passed as the third argument), then reload and
# display it. df_NAP is re-bound here to the patient-information table.
utils_cleaner.save_columns_to_csv_NAP(excel_filename_NAP, csv_corr_NAP, number_of_patients_BCN)
df_NAP = pd.read_csv(csv_corr_NAP)
df_NAP

Unnamed: 0,ID,origin,gender,mstype,edss,dobirth,doscan,dostart,age,DD
0,165,NAP,0,1,6.0,1964-12-05,2016-11-23,2009-09-01,51.97,7.23
1,166,NAP,1,1,6.5,1977-12-05,2016-12-02,2009-02-15,38.99,7.79
2,167,NAP,1,0,2.0,2000-09-09,2017-02-11,2015-07-15,16.42,1.58
3,168,NAP,0,0,3.5,1983-04-11,2017-01-21,2000-08-01,33.78,16.47
4,169,NAP,0,0,2.0,1982-12-12,2016-12-09,2011-09-15,33.99,5.23
...,...,...,...,...,...,...,...,...,...,...
100,265,NAP,0,-1,0.0,1970-07-12,2017-05-12,,46.83,0.00
101,266,NAP,1,-1,0.0,1993-02-17,2016-03-03,,23.04,0.00
102,267,NAP,0,-1,0.0,1971-03-16,2017-04-21,,46.10,0.00
103,268,NAP,1,-1,0.0,1985-07-01,2016-03-08,,30.69,0.00


## Save the information to concatenated files

In [14]:
# Combine the Barcelona and Naples patient-information tables through the helper;
# 'ID_info.csv' is presumably the output file name (location decided inside utils_cleaner)
utils_cleaner.concatenate_csv(df_BCN, df_NAP, 'ID_info.csv')

In [17]:
# Multi-class labels are taken directly from the "mstype" column of each site
target_class_BCN = df_BCN["mstype"].values
target_class_NAP = df_NAP["mstype"].values

# Binary labels: shift every class by one and cap the result at 1,
# so that -1 maps to 0 while 0, 1 and 2 all map to 1
target_BCN = np.minimum(target_class_BCN + 1, 1)
target_NAP = np.minimum(target_class_NAP + 1, 1)

In [None]:
# Join the multi-class labels of both sites: Barcelona first, then Naples
target_class = np.concatenate([target_class_BCN, target_class_NAP], axis=0)
target_class

array([ 1,  2,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  2,  0,  0,  0,  1,  1,  0,  0,  0,  2,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  2,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  1,  0,  0,  0,  0,  0,  0,  0,  2,  0,  2, -1,
       -1, -1, -1, -1, -1, -1, -1, -1, -1,  0,  0,  0,  0,  0,  0,  0,  1,
        0,  1,  0,  0,  1,  0,  0,  1,  1,  0,  1,  0,  1,  0,  0,  0,  0,
        1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  1,  0,  0,  0,  0,  0,  0,  0,  1,  0,  0,  0,  0,  0,
        0,  0,  0,  1, -1, -1, -1, -1, -1, -1, -1, -1,  1,  1,  0,  0,  0,
        2,  0,  2,  0,  0,  2,  0,  1,  1,  2,  0,  0,  2,  0,  1,  0,  0,
        0,  0,  0,  0,  1,  1,  0,  0,  1,  0,  1,  2,  0,  0,  0,  0,  0,
        1,  0,  1,  2,  1,  1,  0,  0,  0,  1,  0,  1,  0, -1, -1, -1, -1,
       -1, -1, -1, -1, -1

In [20]:
# Join the binary labels of both sites: Barcelona first, then Naples
target = np.concatenate([target_BCN, target_NAP], axis=0)
target

array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0])

In [21]:
# Save the targets to .npy files inside `basepath`.
# np.save does not create missing folders, and nothing earlier in the notebook
# creates `basepath`, so make sure it exists before writing.
os.makedirs(basepath, exist_ok=True)

np.save(os.path.join(basepath, 'target_class.npy'), target_class)
print("Data saved to target_class.npy")
np.save(os.path.join(basepath, 'target.npy'), target)
print("Data saved to target.npy")

Data saved to target_class.npy
Data saved to target.npy


Create a list with all input filenames, using the ID of each subject.

In [18]:
# Subject IDs of both sites, Barcelona first and Naples second
all_subject_ids = [*df_BCN["ID"], *df_NAP["ID"]]

# One file name per subject: the ID zero-padded to four digits, plus ".csv"
filenames = ["{:04d}.csv".format(subject_id) for subject_id in all_subject_ids]
filenames

['0000.csv',
 '0001.csv',
 '0002.csv',
 '0003.csv',
 '0004.csv',
 '0005.csv',
 '0006.csv',
 '0007.csv',
 '0008.csv',
 '0009.csv',
 '0010.csv',
 '0011.csv',
 '0012.csv',
 '0013.csv',
 '0014.csv',
 '0015.csv',
 '0016.csv',
 '0017.csv',
 '0018.csv',
 '0019.csv',
 '0020.csv',
 '0021.csv',
 '0022.csv',
 '0023.csv',
 '0024.csv',
 '0025.csv',
 '0026.csv',
 '0027.csv',
 '0028.csv',
 '0029.csv',
 '0030.csv',
 '0031.csv',
 '0032.csv',
 '0033.csv',
 '0034.csv',
 '0035.csv',
 '0036.csv',
 '0037.csv',
 '0038.csv',
 '0039.csv',
 '0040.csv',
 '0041.csv',
 '0042.csv',
 '0043.csv',
 '0044.csv',
 '0045.csv',
 '0046.csv',
 '0047.csv',
 '0048.csv',
 '0049.csv',
 '0050.csv',
 '0051.csv',
 '0052.csv',
 '0053.csv',
 '0054.csv',
 '0055.csv',
 '0056.csv',
 '0057.csv',
 '0058.csv',
 '0059.csv',
 '0060.csv',
 '0061.csv',
 '0062.csv',
 '0063.csv',
 '0064.csv',
 '0065.csv',
 '0066.csv',
 '0067.csv',
 '0068.csv',
 '0069.csv',
 '0070.csv',
 '0071.csv',
 '0072.csv',
 '0073.csv',
 '0074.csv',
 '0075.csv',
 '0076.csv',

In [None]:
# Move all scans from Barcelona and Naples to the merged folders.
# Pairs are (source folder, destination folder); Barcelona is processed before Naples.
scan_moves = (
    (basepath_BCN_FA, basepath_FA),
    (basepath_BCN_GM, basepath_GM),
    (basepath_BCN_RS, basepath_RS),
    (basepath_NAP_FA, basepath_FA),
    (basepath_NAP_GM, basepath_GM),
    (basepath_NAP_RS, basepath_RS),
)
for source_folder, destination_folder in scan_moves:
    utils_cleaner.move_files_to_folder(source_folder, destination_folder)

Files moved successfully: 30.
Files moved successfully: 18.
Files moved successfully: 48.
Files moved successfully: 30.
Files moved successfully: 30.
Files moved successfully: 30.


The following lines of code will create a **4D _numpy_ array** (`data`) to store the whole dataset:
- The shape of the array should be: _(num_subjects, num_nodes, num_nodes, num_matrices)_

where:
- "num_subjects" is 165 + 105 = 270,
- "num_nodes" is 76 (parcellation scheme)
- "num_matrices" is 3 (DTI, GM and rs-fMRI)

In [None]:
# Build the 4D dataset array with shape (num_subjects, num_nodes, num_nodes, num_matrices).
num_nodes = 76  # nodes per connectivity matrix (parcellation scheme)

# The position in this tuple is the index along the last axis of `data`: 0 = FA, 1 = GM, 2 = RS
scan_folders = (basepath_FA, basepath_GM, basepath_RS)

data = np.zeros(shape=(len(filenames), num_nodes, num_nodes, len(scan_folders)))

for i, filename in enumerate(filenames):
    for channel, scan_folder in enumerate(scan_folders):
        matrix_path = os.path.join(scan_folder, filename)
        matrix = pd.read_csv(matrix_path, header=None).values

        # Fail loudly on malformed files: a matrix read as (1, 76), (76, 1) or (1, 1)
        # would otherwise be broadcast silently into the (76, 76) slot.
        if matrix.shape != (num_nodes, num_nodes):
            raise ValueError(
                "{}: expected a matrix of shape {}, got {}".format(
                    matrix_path, (num_nodes, num_nodes), matrix.shape
                )
            )

        data[i, :, :, channel] = matrix

print(data.shape)

# Save the data to a .npy file
np.save(os.path.join(basepath, 'data.npy'), data)
print("Data saved to data.npy")

Since the full array is too heavy to be uploaded to GitHub, we will split it by scan type and save each one to a separate data file.

In [14]:
# NOTE(review): `basepath` is re-bound here (it was "./data/BCN and NAP" at the top of
# the notebook). Re-running an earlier cell after this one would make it use './' instead.
basepath = './'
# Destination folder for the per-scan-type files
basepath_final = os.path.join(basepath, 'data/')

# Scan type labels passed to the saving helper
scan_types = ["FA", "GM", "RS"]

In [15]:
# Reload utils_cleaner so that edits made to the module file are picked up
# without restarting the kernel; the reloaded module is shown as the cell output
import importlib
importlib.reload(utils_cleaner)

<module 'utils_cleaner' from '/home/merles/Documents/git/utils_cleaner.py'>

In [16]:
# Hand the full 4D array to the helper together with the output folder and the scan
# type labels; judging by the cell output it writes one .npy file per scan type
utils_cleaner.save_3d_slices(data, basepath_final, scan_types)

Saved all FA scans as ./data/FA_original.npy
Saved all GM scans as ./data/GM_original.npy
Saved all RS scans as ./data/RS_original.npy
