In [65]:
import pandas as pd
import numpy as np
import os
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder

In [66]:
# Directory the input TSV files are read from (joined with the file names below).
data_dir = './GSE61260/'

In [67]:
# Read the tab-separated expression table; it carries a 'Gene' column that
# identifies each row.
df_bio = pd.read_csv(
    os.path.join(data_dir, 'GSE61260.tsv'),
    sep='\t',
)

# Pivot the table so every original column becomes a row: 'Gene' values turn
# into the column labels, the former column labels become the index, and that
# index is then named 'Sample_ID' and moved back into a regular column.
df_bio_t = (
    df_bio
    .set_index('Gene')
    .T
    .rename_axis(index='Sample_ID')
    .reset_index()
)
df_bio_t.head()

Gene,Sample_ID,ENSG00000000003,ENSG00000000005,ENSG00000000419,ENSG00000000457,ENSG00000000460,ENSG00000000938,ENSG00000000971,ENSG00000001036,ENSG00000001084,...,ENSG00000283740,ENSG00000283787,ENSG00000283907,ENSG00000283913,ENSG00000284032,ENSG00000284373,ENSG00000284387,ENSG00000284395,ENSG00000284505,ENSG00000284552
0,GSM1501013,8.002176,0.175238,4.949994,3.285351,0.959516,0.672894,59.325067,4.168056,8.104327,...,0.324708,-0.246719,3.031789,0.903138,1.222211,0.132392,0.491394,-0.079424,-0.255411,3.071849
1,GSM1501014,23.292853,-0.031148,5.657237,2.786927,1.203578,0.358625,46.002714,4.889903,7.58806,...,0.598294,0.040158,1.819534,-0.021741,0.487849,-0.066907,0.195198,0.178077,-0.190626,1.39932
2,GSM1501015,10.55789,0.097413,7.680354,2.795125,0.789783,0.733455,43.835288,5.133417,9.559194,...,0.647441,-0.248596,1.914932,0.424076,0.423539,-0.155705,0.086307,0.23039,-0.48985,1.632879
3,GSM1501016,10.272135,0.066298,8.835539,3.166677,0.941032,0.755181,51.390227,4.168056,10.980774,...,0.446519,-0.08686,1.313128,0.501803,0.357365,-0.321937,0.232894,-0.066718,-0.233382,2.117271
4,GSM1501017,8.290414,-0.039647,5.792685,2.203175,0.846344,0.612263,55.42344,3.657327,10.710821,...,0.82074,0.337941,4.507792,0.120017,0.452607,-0.263615,0.443599,0.291308,-0.474034,3.032463


In [68]:
# Load the tab-separated metadata table that sits next to the expression file.
df_meta = pd.read_csv(
    os.path.join(data_dir, 'metadata_GSE61260.tsv'),
    sep='\t',
)
df_meta.head()

Unnamed: 0,refinebio_accession_code,experiment_accession,refinebio_age,refinebio_cell_line,refinebio_compound,refinebio_developmental_stage,refinebio_disease,refinebio_disease_stage,refinebio_genetic_information,refinebio_organism,...,scan_protocol,series_id,source_name_ch1,status,submission_date,supplementary_file,taxid_ch1,title,treatment_protocol_ch1,type
0,GSM1501013,GSE61260,70.0,,,,normal control,,,HOMO_SAPIENS,...,according to the manufacturers protocols,"['GSE61256', 'GSE61260']",liver tissue,Public on Nov 03 2014,Sep 09 2014,ftp://ftp.ncbi.nlm.nih.gov/geo/samples/GSM1501...,9606,Human liver expression data from subject S11a1...,not applicable,RNA
1,GSM1501014,GSE61260,49.0,,,,healthy obese,,,HOMO_SAPIENS,...,according to the manufacturers protocols,"['GSE61256', 'GSE61260']",liver tissue,Public on Nov 03 2014,Sep 09 2014,ftp://ftp.ncbi.nlm.nih.gov/geo/samples/GSM1501...,9606,Human liver expression data from subject S11a1...,not applicable,RNA
2,GSM1501015,GSE61260,76.0,,,,normal control,,,HOMO_SAPIENS,...,according to the manufacturers protocols,"['GSE61256', 'GSE61260']",liver tissue,Public on Nov 03 2014,Sep 09 2014,ftp://ftp.ncbi.nlm.nih.gov/geo/samples/GSM1501...,9606,Human liver expression data from subject S11a1...,not applicable,RNA
3,GSM1501016,GSE61260,48.0,,,,normal control,,,HOMO_SAPIENS,...,according to the manufacturers protocols,"['GSE61256', 'GSE61260']",liver tissue,Public on Nov 03 2014,Sep 09 2014,ftp://ftp.ncbi.nlm.nih.gov/geo/samples/GSM1501...,9606,Human liver expression data from subject S11a1...,not applicable,RNA
4,GSM1501017,GSE61260,73.0,,,,normal control,,,HOMO_SAPIENS,...,according to the manufacturers protocols,"['GSE61256', 'GSE61260']",liver tissue,Public on Nov 03 2014,Sep 09 2014,ftp://ftp.ncbi.nlm.nih.gov/geo/samples/GSM1501...,9606,Human liver expression data from subject S11a1...,not applicable,RNA


In [69]:
# Attach the disease label to every expression row.
# Only the accession code (join key) and the disease column are taken from the
# metadata; the inner join keeps just the samples present in both tables.
# The metadata key is dropped after the join because it duplicates 'Sample_ID',
# and the label column is renamed to the shorter 'Disease'.
df_combined = (
    df_bio_t
    .merge(
        df_meta[['refinebio_accession_code', 'refinebio_disease']],
        how='inner',
        left_on='Sample_ID',
        right_on='refinebio_accession_code',
    )
    .drop(columns=['refinebio_accession_code'])
    .rename(columns={"refinebio_disease": "Disease"})
)
df_combined.head()

Unnamed: 0,Sample_ID,ENSG00000000003,ENSG00000000005,ENSG00000000419,ENSG00000000457,ENSG00000000460,ENSG00000000938,ENSG00000000971,ENSG00000001036,ENSG00000001084,...,ENSG00000283787,ENSG00000283907,ENSG00000283913,ENSG00000284032,ENSG00000284373,ENSG00000284387,ENSG00000284395,ENSG00000284505,ENSG00000284552,Disease
0,GSM1501013,8.002176,0.175238,4.949994,3.285351,0.959516,0.672894,59.325067,4.168056,8.104327,...,-0.246719,3.031789,0.903138,1.222211,0.132392,0.491394,-0.079424,-0.255411,3.071849,normal control
1,GSM1501014,23.292853,-0.031148,5.657237,2.786927,1.203578,0.358625,46.002714,4.889903,7.58806,...,0.040158,1.819534,-0.021741,0.487849,-0.066907,0.195198,0.178077,-0.190626,1.39932,healthy obese
2,GSM1501015,10.55789,0.097413,7.680354,2.795125,0.789783,0.733455,43.835288,5.133417,9.559194,...,-0.248596,1.914932,0.424076,0.423539,-0.155705,0.086307,0.23039,-0.48985,1.632879,normal control
3,GSM1501016,10.272135,0.066298,8.835539,3.166677,0.941032,0.755181,51.390227,4.168056,10.980774,...,-0.08686,1.313128,0.501803,0.357365,-0.321937,0.232894,-0.066718,-0.233382,2.117271,normal control
4,GSM1501017,8.290414,-0.039647,5.792685,2.203175,0.846344,0.612263,55.42344,3.657327,10.710821,...,0.337941,4.507792,0.120017,0.452607,-0.263615,0.443599,0.291308,-0.474034,3.032463,normal control


In [70]:
# NOTE(review): disabled cell, kept for reference only. If re-enabled it would
# fit a LabelEncoder on the 'Disease' column of df_combined, store the encoded
# labels in `y`, and print the encoder's classes.
# le = LabelEncoder()

# y = le.fit_transform(df_combined['Disease'])

# print(le.classes_)

In [71]:
# NOTE(review): disabled cell, kept for reference only. If re-enabled it would
# one-hot encode the 'Disease' column, replace that column in a new frame
# (df_combined_encoded) with the generated indicator columns, and rename two of
# those columns so their names contain underscores instead of spaces.
# encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)

# disease_encoded = encoder.fit_transform(df_combined[['Disease']])

# disease_df = pd.DataFrame(disease_encoded, columns=encoder.get_feature_names_out(['Disease']))

# df_combined_encoded = pd.concat([df_combined.drop('Disease', axis=1), disease_df], axis=1)

# df_combined_encoded.rename(columns = {"Disease_normal control" : "Disease_normal_control",
#                                       "Disease_healthy obese" : "Disease_healthy_obese"}, inplace=True)

# df_combined_encoded.head()

In [75]:



# Write the merged expression + 'Disease' table to CSV without the row index.
# The output path is built from data_dir, like the reads in the earlier cells,
# instead of repeating a hard-coded 'GSE61260/' literal; this keeps the output
# next to the inputs if data_dir is ever changed.
df_combined.to_csv(os.path.join(data_dir, 'combined.csv'), index=False)

df_combined.head()

Unnamed: 0,Sample_ID,ENSG00000000003,ENSG00000000005,ENSG00000000419,ENSG00000000457,ENSG00000000460,ENSG00000000938,ENSG00000000971,ENSG00000001036,ENSG00000001084,...,ENSG00000283787,ENSG00000283907,ENSG00000283913,ENSG00000284032,ENSG00000284373,ENSG00000284387,ENSG00000284395,ENSG00000284505,ENSG00000284552,Disease
0,GSM1501013,8.002176,0.175238,4.949994,3.285351,0.959516,0.672894,59.325067,4.168056,8.104327,...,-0.246719,3.031789,0.903138,1.222211,0.132392,0.491394,-0.079424,-0.255411,3.071849,normal control
1,GSM1501014,23.292853,-0.031148,5.657237,2.786927,1.203578,0.358625,46.002714,4.889903,7.58806,...,0.040158,1.819534,-0.021741,0.487849,-0.066907,0.195198,0.178077,-0.190626,1.39932,healthy obese
2,GSM1501015,10.55789,0.097413,7.680354,2.795125,0.789783,0.733455,43.835288,5.133417,9.559194,...,-0.248596,1.914932,0.424076,0.423539,-0.155705,0.086307,0.23039,-0.48985,1.632879,normal control
3,GSM1501016,10.272135,0.066298,8.835539,3.166677,0.941032,0.755181,51.390227,4.168056,10.980774,...,-0.08686,1.313128,0.501803,0.357365,-0.321937,0.232894,-0.066718,-0.233382,2.117271,normal control
4,GSM1501017,8.290414,-0.039647,5.792685,2.203175,0.846344,0.612263,55.42344,3.657327,10.710821,...,0.337941,4.507792,0.120017,0.452607,-0.263615,0.443599,0.291308,-0.474034,3.032463,normal control
