# Data Cleaner

Scripts that clean and trim each of our data files and then load the cleaned results into a database.

In [1]:
from db_scripts import *

Functional data:

In [2]:
# Functional data: one row per (variant, allele) pair, reduced to the CADD columns.
fileIn = getPath('Func_trimmed.csv')
fileOut = getPath('Func_trimmed_clean.tsv')

# Load the raw table and keep only the first row seen for each
# (Uploaded_variation, Allele) combination.
data = pd.read_csv(fileIn).drop_duplicates(subset=['Uploaded_variation', 'Allele'])

# Strip special characters (removeSpecial), then expose the variant
# identifier under the shared 'rsid' name.
data = removeSpecial(data).rename(columns={'uploaded_variation': 'rsid'})

# Only the identifier, the allele and the two CADD columns are written out.
data = data.loc[:, ['rsid', 'allele', 'cadd_phred', 'cadd_raw']]

data.to_csv(fileOut, sep='\t', index=False)

Population data:

In [3]:
# Population data: strip special characters and rename the SNP id column to 'rsid'.
fileIn = getPath('population_variation.tsv')
fileOut = getPath('population_variation_noSpecial.tsv')

# Read the tab-separated table and pass it straight through removeSpecial.
data = removeSpecial(pd.read_csv(fileIn, sep='\t'))

# Use the same identifier column name as the other tables.
data = data.rename(columns={'snp_rsid': 'rsid'})

data.to_csv(fileOut, sep='\t', index=False)

GWAS:

In [4]:
# GWAS: normalise column names, keep only rows carrying an rsID, drop duplicate SNPs.
fileIn = getPath("T1D_GWAS_add.tsv")
fileOut = getPath("T1D_GWAS_add_clean.tsv")

data = pd.read_csv(fileIn, sep='\t')  # reads the GWAS tsv
data = data.rename(columns={'snps': 'rsid', 'unnamed_0': 'i'})

# Cast the chromosome id to an integer column. The result must be assigned
# back to `data`; it was previously stored in an unused variable, so the
# cast never reached the output file.
data = data.astype({'chr_id': 'int64'})

data = removeSpecial(data)  # removes special characters in column names

# Keep only rows whose rsid contains an "rs<digits>" identifier.
# na=False treats a missing rsid as "no match" instead of producing a NaN
# entry in the mask, which pandas refuses to index with.
has_rsid = data.rsid.str.contains(r'rs[0-9]+', na=False)
data = data.loc[has_rsid]

# Remove duplicate SNPs (per the original note: leaving the entry with the
# largest p value -- behaviour is defined by removeDupeSNP in db_scripts).
data = removeDupeSNP(data)

data.to_csv(fileOut, sep='\t', index=False)

Ontology:

In [5]:
# Ontology: expand each variant's comma-separated GO annotation string into
# one (rsid, go, term) row per GO entry.
fileIn = getPath("GO_trimmed.csv")
fileOut = getPath("GO_new.tsv")

data = pd.read_csv(fileIn)
data = removeSpecial(data)    # removes special characters in column names
data = data.rename(columns={'uploaded_variation': 'rsid'})

data2 = {'rsid': [], 'go': [], 'term': []}
for _, row in data.iterrows():
    rs = row['rsid']
    go = row['go']

    # A missing annotation is read by pandas as a non-string (NaN);
    # there is nothing to split, so skip the row instead of crashing.
    if not isinstance(go, str):
        continue

    # Underscores stand in for spaces; entries are comma-separated and each
    # one is split on ':' into at least three fields (prefix, id, term).
    entries = go.replace('_', ' ').split(',')

    try:
        goCols = ["GO:" + entry.split(':')[1] for entry in entries]
        termCols = [entry.split(':')[2] for entry in entries]
    except IndexError:  # an entry has fewer than three ':'-separated fields:
        continue        # data unavailable, skip the whole row.

    data2['rsid'].extend([rs] * len(goCols))
    data2['go'].extend(goCols)
    data2['term'].extend(termCols)

data2 = pd.DataFrame(data2)

# Keep the first row for each GO id.
# NOTE(review): duplicates are judged on 'go' alone, so when several rsids
# share a GO term only the first rsid keeps that association. Confirm this is
# intended; otherwise the subset should be ['rsid', 'go'].
dupeList = data2.duplicated(subset='go')
data2 = data2[~dupeList]

data2.to_csv(fileOut, sep='\t', index=False)

Create the database:

In [6]:
# Rebuild the database from scratch out of the cleaned TSV files written by
# the cleaning cells of this notebook.
gwas = getPath("T1D_GWAS_add_clean.tsv")
pop  = getPath("population_variation_noSpecial.tsv")
# Load the cleaned functional table produced by the "Functional data" cell
# (it writes 'Func_trimmed_clean.tsv'; 'Func_trimmed.tsv' is not written
# anywhere in this notebook).
func = getPath('Func_trimmed_clean.tsv')
ont  = getPath("GO_new.tsv")
DB = DBpath()

# Start from an empty database file so that re-running the cell does not
# stack tables on top of a previous run.
if os.path.exists(DB):
    os.remove(DB)

# pdDB(file, table name, {column: SQL type/constraint}) -- presumably loads
# each file into the named table; see db_scripts for the exact behaviour.
# 'gwas' owns the rsid key; the other tables reference it, so their rsid
# columns are declared TEXT to match the referenced TEXT PRIMARY KEY.
pdDB(gwas, "gwas",       {"rsid": "TEXT PRIMARY KEY", "chr_id": "INTEGER", "chr_pos": "INTEGER"})
pdDB(pop,  "population", {"rsid": "TEXT REFERENCES gwas(rsid)"})
pdDB(func, "functional", {"rsid": "TEXT REFERENCES gwas(rsid)"})
pdDB(ont,  "ontology",   {"rsid": "TEXT REFERENCES gwas(rsid)"})

print("\ndone\n")


done

