# Data Cleaner

A collection of scripts that clean and trim our data files.

**TODO: convert rsID values to integers by stripping the "rs" prefix**

In [2]:
from db_scripts import *

GWAS dataset:

In [2]:
# Trim the GWAS Catalog associations export down to Type 1 diabetes SNPs that
# carry an rsID, keep a handful of columns, and write the result as a TSV.
fileIn = getPath('gwas_catalog_v1.0-associations_e108_r2023-01-14.tsv') # https://www.ebi.ac.uk/gwas/docs/file-downloads
# fileOut = getPath('gwas_trimmed_beeg.tsv')
fileOut = getPath('gwas_trimmed.tsv')

data = pd.read_csv(fileIn, sep='\t', low_memory=False)    # Reads gwas tsv
data = removeSpecial(data)    # removes special characters in column names
data = data.query("disease_trait=='Type 1 diabetes' or study.str.contains('type 1 diabetes')")
# na=False: a row whose SNP id is missing is treated as "no rsid" and dropped.
# Without it the mask would contain NaN, which .loc refuses to index with.
data = data.loc[data.snps.str.contains(r'rs[0-9]+', na=False)]        # get only snps with rsids
#data = data.loc[data['CHR_ID']=='6']                        # Select only rows for chromosome 6

# maybe also include STRONGEST SNP-RISK ALLELE and RISK ALLELE FREQUENCY ?
data = data[["snps", "region", "chr_pos", "chr_id", "p_value", "mapped_gene"]]

# Remove duplicate SNPs. The original note says the entry with the largest
# p value is kept. NOTE(review): confirm against removeDupeSNP in db_scripts.
data = removeDupeSNP(data)

# Replace mapped_gene with its de-duplicated form. Only that one column is
# needed, so iterate it directly instead of materialising every row with
# iterrows(); assign() returns a new frame rather than writing into a slice.
data = data.assign(
    mapped_gene=[removeDupeGeneMap(gene) for gene in data["mapped_gene"]]
)

data = data.rename(columns={'snps': 'rsid'})

# to_csv overwrites an existing file, so no explicit delete is needed first.
data.to_csv(fileOut, sep='\t', index=False)

Functional data:

In [3]:
# Functional annotation table: drop repeated (variant, allele) pairs, normalise
# the column names and keep only the rsid, allele and CADD score columns.
fileIn = getPath('Functional_and_Ontology_data.tsv')
# fileIn=getPath('func_data.tsv')
fileOut = getPath('Func_trimmed.tsv')

data = pd.read_csv(fileIn, sep='\t')

# Keep the first occurrence of every (#Uploaded_variation, Allele) combination.
# This runs before removeSpecial, so the raw column names are used here.
data = data.drop_duplicates(subset=['#Uploaded_variation', 'Allele'])

data = (
    removeSpecial(data)
    .rename(columns={'uploaded_variation': 'rsid'})
    [['rsid', 'allele', 'cadd_phred', 'cadd_raw']]   # maybe re-add 'PolyPhen' and 'SIFT' later
)

# data=castRS(data, "Uploaded_variation") # converts rs value column to integer
data.to_csv(fileOut, sep='\t', index=False)

Population data:

In [4]:
# Population variation table: normalise the column names, rename the SNP id
# column to 'rsid' and write the table back out as a TSV.
fileIn = getPath('population_variation.tsv')
fileOut = getPath('population_variation_noSpecial.tsv')

data = (
    pd.read_csv(fileIn, sep='\t')
    .pipe(removeSpecial)                       # Removes special characters
    .rename(columns={'snp_rsid': 'rsid'})
)

# data=castRS(data,"SNP_rsID")    # converts rs value column to integer
data.to_csv(fileOut, sep='\t', index=False)

Functional data, alternative version (currently disabled):

In [5]:
# Alternative trimming of the functional data that also keeps PolyPhen and SIFT.
# Disabled for the time being. It writes to the same 'Func_trimmed.tsv' as the
# "Functional data" cell, so enabling it would overwrite that cell's output.
if False:
    fileIn = getPath('Functional_and_Ontology_data.tsv')
    fileOut = getPath('Func_trimmed.tsv')

    df = pd.read_csv(fileIn, sep='\t')
    # Columns are selected by their raw names, before removeSpecial renames them.
    columns_to_keep = ['#Uploaded_variation', 'Allele', 'CADD_PHRED', 'CADD_RAW', 'PolyPhen', 'SIFT']
    df = df[columns_to_keep]
    df = removeSpecial(df)    # removes special characters in column names
    df.to_csv(fileOut, sep='\t', index=False)

GWAS data from `SNPS_filled.tsv`:

In [6]:
# Clean the SNPS_filled table: keep rows whose SNP id contains an rsID, keep a
# handful of columns, remove duplicate SNPs and write the result as a TSV.
fileIn = getPath("SNPS_filled.tsv")
# fileOut = getPath('gwas_trimmed_beeg.tsv')
fileOut = getPath("SNPS_filled_clean.tsv")

data = pd.read_csv(fileIn, sep='\t')# low_memory=False)    # Reads gwas tsv
data = removeSpecial(data)    # removes special characters in column names
# data=data.query("DISEASE_TRAIT=='Type 1 diabetes' or STUDY.str.contains('type 1 diabetes')")
# na=False: a row whose SNP id is missing is treated as "no rsid" and dropped.
# Without it the mask would contain NaN, which .loc refuses to index with.
data = data.loc[data.snps.str.contains(r'rs[0-9]+', na=False)]        # get only snps with rsids
#data = data.loc[data['CHR_ID']=='6']                        # Select only rows for chromosome 6

# maybe also include STRONGEST SNP-RISK ALLELE and RISK ALLELE FREQUENCY ?
data = data[["snps", "region", "chr_pos", "chr_id", "p_value", "mapped_gene"]]

# Remove duplicate SNPs. The original note says the entry with the largest
# p value is kept. NOTE(review): confirm against removeDupeSNP in db_scripts.
data = removeDupeSNP(data)
data = data.rename(columns={'snps': 'rsid'})

# to_csv overwrites an existing file, so no explicit delete is needed first.
data.to_csv(fileOut, sep='\t', index=False)

Ontology:

In [3]:
# Expand the comma-separated GO annotations of every variant into one
# (rsid, go, term) row per GO entry, then write the table as a TSV.
fileIn = getPath("GO_trimmed.csv")
# fileOut = getPath("GO_trimmed_clean.tsv")
fileOut = getPath("GO_new.tsv")

data = pd.read_csv(fileIn)
data = removeSpecial(data)    # removes special characters in column names
data = data.rename(columns={'uploaded_variation': 'rsid'})

data2 = {'rsid': [], 'go': [], 'term': []}
for rs, goField in zip(data['rsid'], data['go']):
    # A missing annotation read as NaN is not a string; skip it instead of
    # crashing on .replace().
    if not isinstance(goField, str):
        continue

    # "_" stands in for spaces. Entries are separated by "," and each entry is
    # split on ":" into at least three parts; parts 1 and 2 are the id and term.
    entries = [entry.split(':') for entry in goField.replace('_', ' ').split(',')]

    # If any entry lacks an id or a term (data unavailable), skip the whole row.
    if any(len(parts) < 3 for parts in entries):
        continue

    data2['rsid'].extend([rs] * len(entries))
    data2['go'].extend("GO:" + parts[1] for parts in entries)
    data2['term'].extend(parts[2] for parts in entries)

data2 = pd.DataFrame(data2)

# NOTE(review): de-duplicating on 'go' alone keeps only the first rsid seen for
# each GO id, so later variants sharing that GO id lose the association.
# Confirm this is intended; otherwise de-duplicate on ['rsid', 'go'].
data2 = data2.drop_duplicates(subset='go')

data2.to_csv(fileOut, sep='\t', index=False)