# Data Cleaner

Scripts that clean and trim each source dataset (functional, population, GWAS and ontology) and then load the results into the database.

In [1]:
from db_scripts import *

Functional data:

In [2]:
# Functional annotations: load, keep the first row for each
# (variant, allele) pair, clean up the column names, and keep only the
# CADD score columns.
fileIn = getPath('Func_trimmed.csv')

func = (
    pd.read_csv(fileIn)
    # De-duplicate on the raw header names, i.e. before removeSpecial runs.
    .drop_duplicates(subset=['Uploaded_variation', 'Allele'])
    .pipe(removeSpecial)
    .rename(columns={'uploaded_variation': 'rsid'})
)
func = func[['rsid', 'allele', 'cadd_phred', 'cadd_raw']]

Population data:

In [3]:
# Population variation: tab-separated input, cleaned column names, and the
# SNP identifier column renamed to the shared 'rsid' key.
fileIn = getPath('population_variation.tsv')

pop = (
    pd.read_csv(fileIn, sep='\t')
    .pipe(removeSpecial)  # removes special characters
    .rename(columns={'snp_rsid': 'rsid'})
)

GWAS:

In [4]:
# GWAS associations: load, normalise column names, keep rows that carry an
# rsid, and collapse duplicate SNPs.
fileIn = getPath("T1D_GWAS_add.tsv")

gwas = pd.read_csv(fileIn, sep='\t')  # reads the GWAS tsv

# Normalise the column names FIRST, as the other cells do. The rename keys
# below ('snps', 'unnamed_0') are written in the normalised form, so renaming
# before normalising would silently do nothing on a raw header.
# NOTE(review): assumes removeSpecial maps 'Unnamed: 0' -> 'unnamed_0' and
# leaves already-clean names untouched -- confirm in db_scripts.
gwas = removeSpecial(gwas)
gwas = gwas.rename(columns={'snps': 'rsid', 'unnamed_0': 'i'})
gwas = gwas.astype({'chr_id': 'int64'})

# Keep only rows whose rsid contains an "rs<digits>" identifier. na=False
# treats a missing rsid as "no match"; without it the mask contains NaN and
# boolean indexing raises.
# NOTE(review): `contains` also accepts multi-SNP strings such as
# "rs1 x rs2" -- confirm that is intended, since rsid becomes a primary key.
has_rsid = gwas.rsid.str.contains(r'rs[0-9]+', na=False)
gwas = gwas.loc[has_rsid]

# Remove duplicate SNPs. The original note says the entry with the largest
# p value is kept -- NOTE(review): confirm against removeDupeSNP.
gwas = removeDupeSNP(gwas)

Ontology:

In [5]:
# Gene-ontology annotations: load, normalise column names, then unpack the
# comma-separated 'go' field into one (rsid, go, term) row per GO term.
fileIn = getPath("GO_trimmed.csv")

ont = pd.read_csv(fileIn)
ont = removeSpecial(ont)    # removes special characters in column names
ont = ont.rename(columns={'uploaded_variation': 'rsid'})

ont2 = {'rsid': [], 'go': [], 'term': []}
for rs, go in zip(ont['rsid'], ont['go']):
    # A missing annotation is read by pandas as NaN (a float), not a string;
    # skip it rather than crashing on .replace().
    if not isinstance(go, str):
        continue

    # Each entry is split on ':' and fields 1 and 2 are used as the GO id and
    # the term; underscores in the text stand in for spaces.
    entries = go.replace('_', ' ').split(',')

    try:
        goCols = ["GO:" + entry.split(':')[1] for entry in entries]
        termCols = [entry.split(':')[2] for entry in entries]
    except IndexError:  # An entry lacks the id or term field (data unavailable):
        continue        # skip the whole row, as before.

    ont2['rsid'].extend([rs] * len(goCols))
    ont2['go'].extend(goCols)
    ont2['term'].extend(termCols)

ont = pd.DataFrame(ont2)

Create DB

In [6]:
DB = DBpath()

# Always rebuild from scratch: drop any database file left by a previous run.
if os.path.exists(DB):
    os.remove(DB)

# (frame, table name, column declarations) for each table, in creation order.
# gwas goes first because population and functional reference gwas(rsid).
# NOTE(review): the dicts look like per-column SQL type overrides passed to
# pdDB -- confirm against db_scripts.
tables = [
    (gwas, "gwas",       {"rsid": "TEXT PRIMARY KEY", "chr_id": "INTEGER", "chr_pos": "INTEGER"}),
    (pop,  "population", {"rsid": "TEXT REFERENCES gwas(rsid)"}),
    (func, "functional", {"rsid": "TEXT REFERENCES gwas(rsid)"}),
    (ont,  "ontology",   {"rsid": "TEXT"}),
]
for frame, table, col_types in tables:
    pdDB(frame, table, col_types)

print("\ndone\n")


done

