# Obesity Data Preprocessing

## Import packages

In [None]:
!pip install bed-reader

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting bed-reader
  Downloading bed_reader-0.2.36-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (4.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m4.0/4.0 MB[0m [31m35.1 MB/s[0m eta [36m0:00:00[0m
Collecting chardet>=5.1.0 (from bed-reader)
  Downloading chardet-5.1.0-py3-none-any.whl (199 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m199.1/199.1 kB[0m [31m19.1 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: chardet, bed-reader
  Attempting uninstall: chardet
    Found existing installation: chardet 4.0.0
    Uninstalling chardet-4.0.0:
      Successfully uninstalled chardet-4.0.0
Successfully installed bed-reader-0.2.36 chardet-5.1.0




In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from bed_reader import to_bed, tmp_path, open_bed, sample_file

In [None]:
# Mount Google Drive at /content/drive so the project files stored there can be read (Colab-only).
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Make the project folder the working directory; the data files below are opened by relative name.
import os
os.chdir('/content/drive/My Drive/Bioinformatics/Obesity_NGS/Obesity/')

## Metadata considerations

In [None]:
# Load the SRA run metadata table. Its 'Run', 'sex', 'Library Name' and
# 'Sample Name' columns are used further down.
obs = pd.read_csv('SraRunTable.txt')

In [None]:
# Lift the column display limit so wide frames are shown without eliding columns.
pd.set_option("display.max_columns", None)

In [None]:
obs.head()

Unnamed: 0,Run,Age,Assay Type,AssemblyName,AvgSpotLen,Bases,BIOMATERIAL_PROVIDER,BioProject,BioSample,BioSampleModel,Bytes,Center Name,Consent,DATASTORE filetype,DATASTORE provider,DATASTORE region,Experiment,Instrument,Isolate,Library Name,LibraryLayout,LibrarySelection,LibrarySource,Organism,Platform,ReleaseDate,Sample Name,sex,SRA Study,tissue
0,SRR6996662,46.619178,AMPLICON,GCA_000001405.13,162,16736251,"Chang\, SC",PRJNA449974,SAMN08924187,Human,11018904,CHANGSC137'S SHARED SUBMISSIONS,public,"bam,sra","gs,ncbi,s3","gs.US,ncbi.public,s3.us-east-1",SRX3931452,Ion Torrent PGM,CGMH,OBL_067,SINGLE,PCR,GENOMIC,Homo sapiens,ION_TORRENT,2018-04-17T00:00:00Z,OBL_067,male,SRP139885,Blood
1,SRR6996663,47.221918,AMPLICON,GCA_000001405.13,164,12417372,"Chang\, SC",PRJNA449974,SAMN08924186,Human,8309201,CHANGSC137'S SHARED SUBMISSIONS,public,"bam,sra","gs,ncbi,s3","gs.US,ncbi.public,s3.us-east-1",SRX3931451,Ion Torrent PGM,CGMH,OBL_066,SINGLE,PCR,GENOMIC,Homo sapiens,ION_TORRENT,2018-04-17T00:00:00Z,OBL_066,male,SRP139885,Blood
2,SRR6996664,57.441096,AMPLICON,GCA_000001405.13,153,15245589,"Chang\, SC",PRJNA449974,SAMN08924185,Human,10191254,CHANGSC137'S SHARED SUBMISSIONS,public,"bam,sra","gs,ncbi,s3","gs.US,ncbi.public,s3.us-east-1",SRX3931450,Ion Torrent PGM,CGMH,OBL_065,SINGLE,PCR,GENOMIC,Homo sapiens,ION_TORRENT,2018-04-17T00:00:00Z,OBL_065,male,SRP139885,Blood
3,SRR6996665,49.950685,AMPLICON,GCA_000001405.13,156,22546458,"Chang\, SC",PRJNA449974,SAMN08924184,Human,14776552,CHANGSC137'S SHARED SUBMISSIONS,public,"bam,sra","gs,ncbi,s3","gs.US,ncbi.public,s3.us-east-1",SRX3931449,Ion Torrent PGM,CGMH,OBL_064,SINGLE,PCR,GENOMIC,Homo sapiens,ION_TORRENT,2018-04-17T00:00:00Z,OBL_064,male,SRP139885,Blood
4,SRR6996666,50.906849,AMPLICON,GCA_000001405.13,155,20378605,"Chang\, SC",PRJNA449974,SAMN08924183,Human,13410438,CHANGSC137'S SHARED SUBMISSIONS,public,"bam,sra","gs,ncbi,s3","gs.US,ncbi.public,s3.us-east-1",SRX3931448,Ion Torrent PGM,CGMH,OBL_063,SINGLE,PCR,GENOMIC,Homo sapiens,ION_TORRENT,2018-04-17T00:00:00Z,OBL_063,male,SRP139885,Blood


In [None]:
obs['AvgSpotLen'].unique()

array([162, 164, 153, 156, 155, 152, 158, 157, 165, 124, 167, 139, 149,
       159, 169, 170, 163, 177, 151, 150, 144, 160, 172, 182, 175, 174,
       146, 184, 154, 187, 161, 127, 148, 142, 171, 138, 178, 166, 129,
       136, 132, 173, 180, 186, 185, 140, 176, 179, 133])

In [None]:
lib_name = obs['Library Name']

In [None]:
# Tally samples per group from the tag embedded in the library name.
# A name is counted as OBH only when it does not also contain OBL.
obl = sum('OBL' in name for name in lib_name)
obh = sum('OBL' not in name and 'OBH' in name for name in lib_name)
print ('nonobesity, obl:', obl, '; obisity, obh:', obh)

nonobesity, obl: 64 ; obisity, obh: 75


## Write NGS genotype-phenotype data to PLINK

In [None]:
# Load the genotype table produced by the GATK step (whitespace-delimited, no header row).
# The separator is a raw string: '\s' inside a plain string literal is an invalid escape sequence.
geno = pd.read_csv('geno_snps', sep = r'\s+', header = None)

In [None]:
geno

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99,100,101,102,103,104,105,106,107,108,109,110,111,112,113,114,115,116,117,118,119,120,121,122,123,124,125,126,127,128,129,130,131,132,133,134,135,136,137,138,139,140,141,142,143
0,chr1,1152303,rs9442380,T,C,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,1/1,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,0/0,./.,./.,./.,./.,./.,./.,./.,./.,./.,0/0,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,0/0,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.
1,chr1,4918530,.,A,C,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,1/1,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.
2,chr1,10379664,.,T,C,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,1/1,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.
3,chr1,18236545,rs6660120,A,G,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,1/1,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.
4,chr1,18236600,.,T,C,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,0/1,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
610,chrX,37986330,rs12852089,G,A,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,1/1,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.
611,chrX,56860352,rs1927218,T,C,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,1/1,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,1/1,./.,./.,./.,./.,./.,./.,./.,1/1,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.
612,chrX,64030265,.,T,G,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,0/1,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.
613,chrX,150240446,.,A,G,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,1/1,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.


In [None]:
# Column labels: five variant-description fields followed by one label per sequencing run.
# NOTE(review): assumes the sample columns of geno_snps are in the same order as
# the rows of the run table — confirm against the genotype export.
cl = ['CHR', 'POS', 'rsID', 'Allele_1', 'Allele_2']
cl.extend(obs['Run'])

In [None]:
# Relabel the columns of the genotype table with the names built above.
df = geno.set_axis(cl, axis = 1)

In [None]:
df

Unnamed: 0,CHR,POS,rsID,Allele_1,Allele_2,SRR6996662,SRR6996663,SRR6996664,SRR6996665,SRR6996666,SRR6996667,SRR6996668,SRR6996669,SRR6996670,SRR6996671,SRR6996672,SRR6996673,SRR6996674,SRR6996675,SRR6996676,SRR6996677,SRR6996678,SRR6996679,SRR6996680,SRR6996681,SRR6996682,SRR6996683,SRR6996684,SRR6996685,SRR6996686,SRR6996687,SRR6996688,SRR6996689,SRR6996690,SRR6996691,SRR6996692,SRR6996693,SRR6996694,SRR6996695,SRR6996696,SRR6996697,SRR6996698,SRR6996699,SRR6996700,SRR6996701,SRR6996702,SRR6996703,SRR6996704,SRR6996705,SRR6996706,SRR6996707,SRR6996708,SRR6996709,SRR6996710,SRR6996711,SRR6996712,SRR6996713,SRR6996714,SRR6996715,SRR6996716,SRR6996717,SRR6996718,SRR6996719,SRR6996720,SRR6996721,SRR6996722,SRR6996723,SRR6996724,SRR6996725,SRR6996726,SRR6996727,SRR6996728,SRR6996729,SRR6996730,SRR6996731,SRR6996732,SRR6996733,SRR6996734,SRR6996735,SRR6996736,SRR6996737,SRR6996738,SRR6996739,SRR6996740,SRR6996741,SRR6996742,SRR6996743,SRR6996744,SRR6996745,SRR6996746,SRR6996747,SRR6996748,SRR6996749,SRR6996750,SRR6996751,SRR6996752,SRR6996753,SRR6996754,SRR6996755,SRR6996756,SRR6996757,SRR6996758,SRR6996759,SRR6996760,SRR6996761,SRR6996762,SRR6996763,SRR6996764,SRR6996765,SRR6996766,SRR6996767,SRR6996768,SRR6996769,SRR6996770,SRR6996771,SRR6996772,SRR6996773,SRR6996774,SRR6996775,SRR6996776,SRR6996777,SRR6996778,SRR6996779,SRR6996780,SRR6996781,SRR6996782,SRR6996783,SRR6996784,SRR6996785,SRR6996786,SRR6996787,SRR6996788,SRR6996789,SRR6996790,SRR6996791,SRR6996792,SRR6996793,SRR6996794,SRR6996795,SRR6996796,SRR6996797,SRR6996798,SRR6996799,SRR6996800
0,chr1,1152303,rs9442380,T,C,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,1/1,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,0/0,./.,./.,./.,./.,./.,./.,./.,./.,./.,0/0,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,0/0,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.
1,chr1,4918530,.,A,C,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,1/1,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.
2,chr1,10379664,.,T,C,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,1/1,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.
3,chr1,18236545,rs6660120,A,G,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,1/1,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.
4,chr1,18236600,.,T,C,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,0/1,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
610,chrX,37986330,rs12852089,G,A,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,1/1,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.
611,chrX,56860352,rs1927218,T,C,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,1/1,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,1/1,./.,./.,./.,./.,./.,./.,./.,1/1,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.
612,chrX,64030265,.,T,G,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,0/1,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.
613,chrX,150240446,.,A,G,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,1/1,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.


In [None]:
# Encode bi-allelic genotypes by 0, 1, 2
def get_genotype(data):
    """Encode VCF-style genotype calls as alternate-allele counts.

    The first five columns of ``data`` are treated as variant annotation and
    passed through unchanged; every later column is treated as one sample's
    genotype calls. Both the unphased (``/``) and phased (``|``) separators
    are accepted:

        ``./.``            -> NaN (missing call)
        ``0/0``            -> 0
        ``0/1`` or ``1/0`` -> 1
        ``1/1``            -> 2

    Any row containing a value outside this table (for example a
    multi-allelic call such as ``1/2``) is removed entirely.

    Parameters
    ----------
    data : pandas.DataFrame
        Variant table with genotype strings from column position 5 onwards.
        It is NOT modified, so the cell calling this function can be re-run.

    Returns
    -------
    pandas.DataFrame
        A new frame with the same columns, the surviving rows under their
        original index labels, and numeric genotype columns.
    """
    codes = {
        "./.": np.nan, ".|.": np.nan,
        "0/0": 0, "0|0": 0,
        "0/1": 1, "0|1": 1,
        # A phased heterozygote may carry the alternate allele first.
        "1/0": 1, "1|0": 1,
        "1/1": 2, "1|1": 2,
    }
    calls = data.iloc[:, 5:]
    # Positional boolean mask, so row removal does not depend on the index labels.
    keep = calls.isin(list(codes)).all(axis=1).to_numpy()
    annotation = data.iloc[keep, :5]
    encoded = calls.iloc[keep].apply(lambda column: column.map(codes))
    return pd.concat([annotation, encoded], axis=1)

In [None]:
data = get_genotype(df)

In [None]:
data

Unnamed: 0,CHR,POS,rsID,Allele_1,Allele_2,SRR6996662,SRR6996663,SRR6996664,SRR6996665,SRR6996666,SRR6996667,SRR6996668,SRR6996669,SRR6996670,SRR6996671,SRR6996672,SRR6996673,SRR6996674,SRR6996675,SRR6996676,SRR6996677,SRR6996678,SRR6996679,SRR6996680,SRR6996681,SRR6996682,SRR6996683,SRR6996684,SRR6996685,SRR6996686,SRR6996687,SRR6996688,SRR6996689,SRR6996690,SRR6996691,SRR6996692,SRR6996693,SRR6996694,SRR6996695,SRR6996696,SRR6996697,SRR6996698,SRR6996699,SRR6996700,SRR6996701,SRR6996702,SRR6996703,SRR6996704,SRR6996705,SRR6996706,SRR6996707,SRR6996708,SRR6996709,SRR6996710,SRR6996711,SRR6996712,SRR6996713,SRR6996714,SRR6996715,SRR6996716,SRR6996717,SRR6996718,SRR6996719,SRR6996720,SRR6996721,SRR6996722,SRR6996723,SRR6996724,SRR6996725,SRR6996726,SRR6996727,SRR6996728,SRR6996729,SRR6996730,SRR6996731,SRR6996732,SRR6996733,SRR6996734,SRR6996735,SRR6996736,SRR6996737,SRR6996738,SRR6996739,SRR6996740,SRR6996741,SRR6996742,SRR6996743,SRR6996744,SRR6996745,SRR6996746,SRR6996747,SRR6996748,SRR6996749,SRR6996750,SRR6996751,SRR6996752,SRR6996753,SRR6996754,SRR6996755,SRR6996756,SRR6996757,SRR6996758,SRR6996759,SRR6996760,SRR6996761,SRR6996762,SRR6996763,SRR6996764,SRR6996765,SRR6996766,SRR6996767,SRR6996768,SRR6996769,SRR6996770,SRR6996771,SRR6996772,SRR6996773,SRR6996774,SRR6996775,SRR6996776,SRR6996777,SRR6996778,SRR6996779,SRR6996780,SRR6996781,SRR6996782,SRR6996783,SRR6996784,SRR6996785,SRR6996786,SRR6996787,SRR6996788,SRR6996789,SRR6996790,SRR6996791,SRR6996792,SRR6996793,SRR6996794,SRR6996795,SRR6996796,SRR6996797,SRR6996798,SRR6996799,SRR6996800
0,chr1,1152303,rs9442380,T,C,,,,,,,,,,,,,,,,,,,,2,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0,,,,,,,,,,0,,,,,,,,,,,,,,,,,,,,,,0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
1,chr1,4918530,.,A,C,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,2,,,,,,,,,,,,,
2,chr1,10379664,.,T,C,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,2,,,,,,,,,,,,,
3,chr1,18236545,rs6660120,A,G,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,2,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
4,chr1,18236600,.,T,C,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
610,chrX,37986330,rs12852089,G,A,,,,,,,,,,,2,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
611,chrX,56860352,rs1927218,T,C,,,,,,,,,,,,,,,,,,,,,,,,,,,,,2,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,2,,,,,,,,2,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
612,chrX,64030265,.,T,G,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
613,chrX,150240446,.,A,G,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,2,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,


In [None]:
# Collect sex data as PLINK .fam sex codes: 1 = male, 2 = female, 0 = unknown.
# (-9 is the PLINK missing-value code for the phenotype column, not for sex.)
sex_codes = {'male': 1, 'female': 2}
s = list(obs['sex'])
se = [sex_codes.get(x, 0) for x in s]

In [None]:
# Collect phenotype data: 1 for OBL samples, 2 for OBH samples.
# Exactly one code is emitted per sample so that pheno stays aligned with the
# sample list. A name carrying neither tag gets -9 (PLINK's missing-phenotype
# code) instead of being skipped silently.
pn = list(obs['Sample Name'])
pheno = []
for x in pn:
    if 'OBL' in x:
        pheno.append(1)
    elif 'OBH' in x:
        pheno.append(2)
    else:
        pheno.append(-9)

In [None]:
# Collect rsID and idd
rsid = list(data['rsID'])   # one SNP identifier per row kept by get_genotype()
idd = list(obs['Run'])      # one run accession per sample
n = len(idd)                # number of samples
l = len(rsid)               # number of SNPs

In [None]:
# Write NGS data to PLINK files
# NOTE(review): output_file is never used below — to_bed() is given the bare
# relative name "obs_ngs.bed" instead of this temporary path.
output_file = tmp_path() / "obs_ngs.bed"
# Transpose the genotype columns so that rows are samples and columns are SNPs,
# then cast to float32 (missing calls stay NaN).
val = data.iloc[:, 5::].T.values.astype(np.float32)
# Per-sample entries are built from the run table (length n);
# per-SNP entries are built from the encoded genotype table (length l).
properties = {
   "fid": idd,
   "iid": idd,
   "father": [0]*n,
   "mother": [0]*n,
   "sex": se,
   "pheno": pheno,
   "chromosome": list(data['CHR']),
   "sid": rsid,
   "cm_position": [0]*l,
   "bp_position": list(data['POS']),
   "allele_1": list(data['Allele_1']),
   "allele_2": list(data['Allele_2']),
}
to_bed("obs_ngs.bed", val, properties=properties)

## Check data after the subsequent data preprocessing

In [None]:
# Load bed files after QC.
# open_bed is used as a context manager so the reader is released as soon as
# the genotype matrix has been read into memory.
with open_bed('obs_ngs.QC.bed') as bed_qc:
    val_qc = bed_qc.read()

In [None]:
val_qc

array([[0., 0., 0., ..., 1., 0., 2.],
       [0., 0., 0., ..., 1., 0., 1.],
       [0., 0., 0., ..., 0., 1., 1.],
       ...,
       [0., 0., 0., ..., 1., 0., 1.],
       [0., 0., 0., ..., 2., 0., 1.],
       [1., 0., 0., ..., 0., 2., 1.]], dtype=float32)

In [None]:
len(val_qc)

139

In [None]:
val_qc.shape

(139, 135)

In [None]:
# Element-wise missing-value mask. The printed repr is truncated to the corners of
# the matrix, so an all-False display does not show that the matrix is free of NaN.
np.isnan(val_qc)

array([[False, False, False, ..., False, False, False],
       [False, False, False, ..., False, False, False],
       [False, False, False, ..., False, False, False],
       ...,
       [False, False, False, ..., False, False, False],
       [False, False, False, ..., False, False, False],
       [False, False, False, ..., False, False, False]])

In [None]:
np.unique(val_qc)

array([ 0.,  1.,  2., nan], dtype=float32)

In [None]:
# Load the bim file after QC (whitespace-delimited, no header row).
# The separator is a raw string: '\s' inside a plain string literal is an invalid escape sequence.
ngs_bim = pd.read_csv('obs_ngs.QC.bim', sep = r'\s+', header = None)

In [None]:
ngs_bim

Unnamed: 0,0,1,2,3,4,5
0,1,rs11208659,0,65513597,T,C
1,1,rs3101337,0,72285451,T,C
2,1,rs3101336,0,72285502,C,T
3,1,rs9425089,0,72299399,A,C
4,1,rs2568958,0,72299433,A,G
...,...,...,...,...,...,...
130,19,rs29941,0,33818627,A,G
131,19,rs442398,0,33831087,A,G
132,19,rs11084753,0,33831232,A,G
133,20,rs13041126,0,52476457,T,C


In [None]:
# Generate SNP list file after QC: the second .bim column (the rsIDs), one per line,
# written without a header row or index column.
ngs_bim.iloc[:, 1].to_csv('rsID.filename', header = False, index = False)

## Prepare data for training model

In [None]:
# Read genotype-phenotype data (whitespace-delimited; raw-string regex avoids the invalid '\s' escape).
# Note: this rebinds `obs`, which earlier in the notebook held the SRA run metadata table.
obs_data = pd.read_csv('obs_snp.QC.txt', sep = r'\s+')
# Copy the column slice before adding to it, so the assignment below acts on an
# independent frame rather than on a slice of obs_data (SettingWithCopyWarning).
obs = obs_data.iloc[:, 9:].copy()
obs['ID'] = obs_data.iloc[:, 2]
obs.set_index('ID', inplace = True)
# The remaining columns are labelled "<sample>_<suffix>"; keep only the part before the first underscore.
list_col = obs.columns
l_col = [e.split("_")[0] for e in list_col]

In [None]:
# Transpose so that each row is a sample; reset_index() moves the old column labels
# into a leading column. Because `obs` is rebound to a transformation of itself,
# this cell is not idempotent: run it once per fresh load of obs_snp.QC.txt.
obs = obs.T.reset_index()

In [None]:
# Drop the leading column created by reset_index() and index the rows by the
# cleaned sample names instead. The slice is copied first so that the column
# assignment acts on an independent frame (avoids SettingWithCopyWarning).
obs = obs.iloc[:, 1:].copy()
obs['sample'] = l_col
obs.set_index('sample', inplace = True)

In [None]:
# Data for training
obs

ID,rs11208659,rs3101337,rs3101336,rs9425089,rs2568958,rs2815752,rs10789336,rs4322186,rs1514176,rs1514175,...,rs476828,rs12970134,rs477181,rs502933,rs4450508,rs29941,rs442398,rs11084753,rs13041126,rs4823006
sample,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
SRR6996662,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/1,0/1,...,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/1,0/0,1/1
SRR6996663,0/0,0/0,0/0,0/0,0/0,0/0,./.,0/0,0/1,0/1,...,0/1,0/1,0/1,0/1,0/1,0/1,0/1,0/1,0/0,0/1
SRR6996664,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/1,0/1,...,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/1,0/1
SRR6996665,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0,...,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/1,0/1
SRR6996666,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0,1/1,1/1,...,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
SRR6996796,0/0,0/1,0/1,0/1,0/1,0/1,0/1,0/1,0/0,0/0,...,0/0,0/0,0/0,0/1,0/1,0/0,0/0,0/0,0/1,0/0
SRR6996797,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/1,0/1,...,0/0,0/0,0/0,0/0,0/0,0/1,1/1,1/1,0/1,0/1
SRR6996798,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0,...,0/0,0/0,0/0,0/0,0/0,0/1,0/1,0/1,0/0,0/1
SRR6996799,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0,...,0/1,0/1,0/1,0/1,0/1,1/1,0/1,1/1,0/0,0/1


In [None]:
# Encode genotype to 0, 1, 2
# NOTE(review): a missing call ('./.') is mapped to 0 here, which makes it
# indistinguishable from homozygous-reference '0/0', whereas get_genotype()
# earlier in this notebook maps it to NaN — confirm this imputation is intended.
# Only the four unphased strings listed are replaced.
pre_obs = obs.replace(['0/0', './.', '0/1', '1/1'], [0, 0, 1, 2])

In [None]:
pre_obs

ID,rs11208659,rs3101337,rs3101336,rs9425089,rs2568958,rs2815752,rs10789336,rs4322186,rs1514176,rs1514175,...,rs476828,rs12970134,rs477181,rs502933,rs4450508,rs29941,rs442398,rs11084753,rs13041126,rs4823006
sample,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
SRR6996662,0,0,0,0,0,0,0,0,1,1,...,0,0,0,0,0,0,0,1,0,2
SRR6996663,0,0,0,0,0,0,0,0,1,1,...,1,1,1,1,1,1,1,1,0,1
SRR6996664,0,0,0,0,0,0,0,0,1,1,...,0,0,0,0,0,0,0,0,1,1
SRR6996665,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,1
SRR6996666,0,0,0,0,0,0,0,0,2,2,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
SRR6996796,0,1,1,1,1,1,1,1,0,0,...,0,0,0,1,1,0,0,0,1,0
SRR6996797,0,0,0,0,0,0,0,0,1,1,...,0,0,0,0,0,1,2,2,1,1
SRR6996798,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,1,1,0,1
SRR6996799,0,0,0,0,0,0,0,0,0,0,...,1,1,1,1,1,2,1,2,0,1


In [None]:
# Read fam file after QC (whitespace-delimited, no header row).
# The separator is a raw string: '\s' inside a plain string literal is an invalid escape sequence.
obs_fam = pd.read_csv('obs_ngs.QC.fam',header = None, sep = r'\s+')

In [None]:
obs_fam

Unnamed: 0,0,1,2,3,4,5
0,SRR6996662,SRR6996662,0,0,1,1
1,SRR6996663,SRR6996663,0,0,1,1
2,SRR6996664,SRR6996664,0,0,1,1
3,SRR6996665,SRR6996665,0,0,1,1
4,SRR6996666,SRR6996666,0,0,1,1
...,...,...,...,...,...,...
134,SRR6996796,SRR6996796,0,0,2,1
135,SRR6996797,SRR6996797,0,0,2,1
136,SRR6996798,SRR6996798,0,0,2,1
137,SRR6996799,SRR6996799,0,0,1,1


In [None]:
# Add sex and phenotype
# Columns 4 and 5 of the .fam table are attached by position, not by sample ID.
# NOTE(review): assumes the .fam rows are in the same sample order as pre_obs — confirm.
# Phenotype is appended last, which is what the train/test split relies on
# when it takes the final column as the label.
pre_obs['Sex'] = list(obs_fam.iloc[:, 4])
pre_obs['Phenotype'] = list(obs_fam.iloc[:, 5])

In [None]:
pre_obs

ID,rs11208659,rs3101337,rs3101336,rs9425089,rs2568958,rs2815752,rs10789336,rs4322186,rs1514176,rs1514175,...,rs477181,rs502933,rs4450508,rs29941,rs442398,rs11084753,rs13041126,rs4823006,Sex,Phenotype
sample,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
SRR6996662,0,0,0,0,0,0,0,0,1,1,...,0,0,0,0,0,1,0,2,1,1
SRR6996663,0,0,0,0,0,0,0,0,1,1,...,1,1,1,1,1,1,0,1,1,1
SRR6996664,0,0,0,0,0,0,0,0,1,1,...,0,0,0,0,0,0,1,1,1,1
SRR6996665,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,1,1,1
SRR6996666,0,0,0,0,0,0,0,0,2,2,...,0,0,0,0,0,0,0,0,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
SRR6996796,0,1,1,1,1,1,1,1,0,0,...,0,1,1,0,0,0,1,0,2,1
SRR6996797,0,0,0,0,0,0,0,0,1,1,...,0,0,0,1,2,2,1,1,2,1
SRR6996798,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,1,1,0,1,2,1
SRR6996799,0,0,0,0,0,0,0,0,0,0,...,1,1,1,2,1,2,0,1,1,1


In [None]:
# Hold out 20% of the samples for testing, with a fixed seed for repeatability.
# The last column (Phenotype) is the target; every column before it is a feature.
features, target = pre_obs.iloc[:, :-1], pre_obs.iloc[:, -1]
X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.2, random_state=42)

In [None]:
# Save training and testing data
# Each split is written to its own CSV in the working directory; the index is
# kept in the files because to_csv() is called with its default index setting.
X_train.to_csv('X_train.csv')
X_test.to_csv('X_test.csv')
y_train.to_csv('y_train.csv')
y_test.to_csv('y_test.csv')