# Deserts of Introgression 

In [1]:
import pandas as pd
from pybedtools import BedTool

**INDEX**

**Data processing**

Human high-frequency changes within Deserts

Human high-frequency changes not in Deserts

**1. Deserts vs NO Deserts (Whole genome)** 
    
    Difference mean frequency
    Difference percentage fixed alleles
    
**2. Mean frequency Deserts vs NO Deserts (per chromosome)** 

    Difference mean frequency
    Difference percentage fixed alleles
    
**3. Genes associated to fixed alleles within Deserts**


### **Data processing** 

#### Intersection hHF with Deserts 

##### In-house deserts and HF bed files 

In [2]:
# Open both BED inputs first, then report the number of intervals in each
# (deserts first, high-frequency SNPs second).
akey = BedTool('2020_akeydeserts_coords.bed')
hf = BedTool('2020_Nahigh_freq.bed')
for bed in (akey, hf):
    print(len(bed))

4
136414


In [3]:
# Deserts are the A file, hHF SNPs the B file. wo=True requests one output row
# per overlapping (desert, SNP) pair, with both records side by side.
w_hf = akey.intersect(b=hf, wo=True)
print(len(w_hf))

3652


#### hHF outside Deserts (no overlap with any Desert) 

In [4]:
# v=True keeps only the hHF SNPs (A file) that overlap no desert (B file).
# The previous call also passed wo=True, which asks for A+B overlap columns and
# contradicts v=True. The saved result is read back with only the four SNP
# columns, so the option had no effect and is dropped here for clarity.
wo_hf = hf.intersect(akey, v=True)
print(len(wo_hf))

132762


#### Result | Intersection hHF with Deserts: Bed files

In [5]:
# Write both SNP sets to disk; these BED files are re-read with pandas below.
# The last call's return value (a BedTool on the saved file) is the cell output.
w_hf.saveas(fn="Nahigh_freq_WITHIN_deserts.bed")
wo_hf.saveas(fn="Nahigh_freq_NOT_in_deserts.bed")

<BedTool(Nahigh_freq_NOT_in_deserts.bed)>

### **1. COMPARISON | Deserts vs NO Deserts (Whole genome):** 

In [6]:
# Load the two saved BED files as header-less, tab-separated tables:
# df1 = SNPs within deserts, df2 = SNPs not in deserts.
df1, df2 = (
    pd.read_csv(bed_path, header=None, sep="\t")
    for bed_path in (
        "Nahigh_freq_WITHIN_deserts.bed",
        "Nahigh_freq_NOT_in_deserts.bed",
    )
)

In [7]:
# Remove column 7 (the 8th, last column; presumably the overlap length that
# bedtools -wo appends -- TODO confirm against the BED file).
# Rebinding instead of mutating in place, together with errors="ignore", makes
# the cell safe to re-run: the in-place drop raised KeyError on a second run
# because column 7 no longer existed.
df1 = df1.drop([7], axis=1, errors="ignore")

#### Pandas dataframes 

In [8]:
# Name the columns: df1 holds the desert interval followed by the SNP record,
# df2 holds the SNP record only.
df1.columns = [
    'chr', 'Desert_start', 'Desert_end',
    'chr_hg19', 'POS_start', 'POS_end', 'dbSNP',
]
df2.columns = [
    'chr', 'POS_start', 'POS_end', 'dbSNP',
]

In [9]:
# Load the full high-frequency SNP table (tab-separated, with a header row).
# NOTE(review): the recorded output of this cell ends with the tail of a Python
# warning, presumably pandas' DtypeWarning about mixed-type columns -- confirm.
# low_memory=False makes pandas infer each column's dtype from the whole file
# instead of chunk by chunk, so column types are consistent.
martin = pd.read_csv("Na_high_freq.tsv", sep='\t', low_memory=False)

  interactivity=interactivity, compiler=compiler, result=result)


In [10]:
# Keep only the columns used downstream: SNP id, human derived allele
# frequency and the associated gene name(s).
martin_c = martin.loc[:, ['dbSNP', 'human_DAF', 'Gene_name']]

In [11]:
# Discard desert SNPs whose dbSNP field contains the string "Nan"
df1_f = df1.loc[~df1['dbSNP'].str.contains("Nan")]
print(len(df1_f))
# Right join on dbSNP: every retained desert SNP is kept and annotated with
# human_DAF / Gene_name where its id is found in the frequency table.
df1_wF = pd.merge(martin_c, df1_f, how='right', on='dbSNP')
df1_wF.head(2)

3199


Unnamed: 0,dbSNP,human_DAF,Gene_name,chr,Desert_start,Desert_end,chr_hg19,POS_start,POS_end
0,rs75934053,0.99401,,chr1,105399999,120600000,chr1,105415054,105415055
1,rs75782466,0.978235,,chr1,105399999,120600000,chr1,105431905,105431906


#### **Result | Mean frequency HF SNP in Deserts:** 

In [12]:
# Mean human derived allele frequency over the SNPs within deserts
df1_wF.loc[:, 'human_DAF'].mean()

0.9656156648952797

In [13]:
# Discard non-desert SNPs whose dbSNP field contains the string "Nan"
df2_f = df2.loc[~df2['dbSNP'].str.contains("Nan")]
print(len(df2_f))
# Right join on dbSNP: every retained non-desert SNP is kept and annotated with
# human_DAF / Gene_name where its id is found in the frequency table.
df2_wF = pd.merge(martin_c, df2_f, how='right', on='dbSNP')

124842


#### **Result | Mean frequency HF SNP in NO Deserts (whole genome):** 

In [14]:
# Mean human derived allele frequency over the SNPs not in deserts
df2_wF.loc[:, 'human_DAF'].mean()

0.9614257742025918

##### Result | Mean frequency HF SNP difference deserts vs NO deserts

In [15]:
# Mean DAF within deserts minus mean DAF outside deserts (whole genome)
(
    df1_wF.loc[:, 'human_DAF'].mean()
    - df2_wF.loc[:, 'human_DAF'].mean()
)

0.0041898906926879365

#### **Result | Percentage of fixed alleles in Deserts:** 

In [16]:
# Percentage of desert SNPs with human_DAF exactly 1 (fixed), over all rows of df1_wF
len(df1_wF.loc[df1_wF['human_DAF'].eq(1)]) / len(df1_wF) * 100

4.3763676148796495

#### **Result | Percentage of fixed alleles in NO Deserts (whole genome):** 

In [17]:
# Percentage of non-desert SNPs with human_DAF exactly 1 (fixed), over all rows of df2_wF
len(df2_wF.loc[df2_wF['human_DAF'].eq(1)]) / len(df2_wF) * 100

2.875634802390221

##### Result | Ratio of the percentage of fixed alleles, Deserts / NO Deserts (whole genome):

In [18]:
# Ratio: percentage of fixed alleles within deserts divided by the percentage
# of fixed alleles outside deserts (whole genome).
(
    (len(df1_wF.loc[df1_wF['human_DAF'].eq(1)]) / len(df1_wF) * 100)
    / (len(df2_wF.loc[df2_wF['human_DAF'].eq(1)]) / len(df2_wF) * 100)
)

1.5218787904646385

### **2. COMPARISON | Deserts vs NO Deserts (Per chromosome: restricted to the desert-containing chromosomes chr1, chr3, chr7 and chr8):** 

In [19]:
# Restrict the non-desert SNPs to the chromosomes that carry a desert.
# The chromosome set is taken from the desert table itself rather than typed by
# hand, so it cannot drift from the data:
#   df1_wF['chr'].unique() --> array(['chr1', 'chr3', 'chr7', 'chr8'], dtype=object)
desert_chromosomes = df1_wF['chr'].unique()
df2_wF_chr = df2_wF[df2_wF['chr'].isin(desert_chromosomes)]
print(len(df2_wF_chr))

29374


#### **Result | Mean frequency HF SNP in NO Deserts (per chr):** 

In [20]:
# Mean human derived allele frequency over non-desert SNPs on the desert chromosomes
df2_wF_chr.loc[:, 'human_DAF'].mean()

0.9618477823585484

##### Result | Mean frequency HF SNP difference deserts vs NO deserts (per chr)

In [21]:
# Mean DAF within deserts minus mean DAF outside deserts (desert chromosomes only)
(
    df1_wF.loc[:, 'human_DAF'].mean()
    - df2_wF_chr.loc[:, 'human_DAF'].mean()
)

0.0037678825367313262

#### **Result | Percentage of fixed alleles in NO Deserts (per chr):** 

In [22]:
# Percentage of non-desert SNPs on the desert chromosomes with human_DAF exactly 1 (fixed)
len(df2_wF_chr.loc[df2_wF_chr['human_DAF'].eq(1)]) / len(df2_wF_chr) * 100

2.99925103833322

##### Result | Ratio of the percentage of fixed alleles, Deserts / NO Deserts (per chr):

In [23]:
# Ratio: percentage of fixed alleles within deserts divided by the percentage
# of fixed alleles outside deserts on the desert chromosomes.
(
    (len(df1_wF.loc[df1_wF['human_DAF'].eq(1)]) / len(df1_wF) * 100)
    / (len(df2_wF_chr.loc[df2_wF_chr['human_DAF'].eq(1)]) / len(df2_wF_chr) * 100)
)

1.459153488302779

### **3. Genes associated to *fixed alleles within Deserts*:** 

In [24]:
# Distinct Gene_name entries of the desert SNPs with human_DAF exactly 1,
# wrapped in a one-column DataFrame (column label 0).
genesnames_fixed = pd.DataFrame(
    df1_wF.loc[df1_wF['human_DAF'].eq(1), 'Gene_name'].unique()
)

The data need reformatting: the `Gene_name` column can contain several gene names separated by semicolons, so each entry is split into one gene name per row.

In [25]:
# Cast column 0 to str so the .str accessor can be used on every entry
# (missing values become the literal text "nan").
genesnames_fixed = genesnames_fixed.astype({0: str})

In [26]:
# Split each entry on ";" into a list of names, spread the lists over the
# columns of a temporary frame, then stack them into one Series
# holding a single gene name per element.
gene_name_lists = genesnames_fixed[0].str.split(";").tolist()
genesnames_fixed = pd.DataFrame(gene_name_lists).stack()

In [27]:
# Replace the index left by stack() with a plain 0..n-1 index
genesnames_fixed = genesnames_fixed.reset_index(drop=True)

In [28]:
# Remove the "nan" text that missing gene names were turned into by the
# earlier cast to str. An exact comparison is used: the previous substring
# match (str.contains("nan")) would also have discarded any real gene name
# that merely contains the letters "nan".
genesnames_fixed = genesnames_fixed[genesnames_fixed != "nan"]

In [29]:
# Keep the first occurrence of each gene name and renumber the index 0..n-1
genesnames_fixed = (
    genesnames_fixed
    .drop_duplicates()
    .reset_index(drop=True)
)

##### **Result | Genes associated to fixed alleles within Deserts** (Martin & Boeckx 2019 SNP-gene association)

In [30]:
# Show the first ten gene names, then the total number of genes on its own line
print(genesnames_fixed.head(10), len(genesnames_fixed), sep="\n")

0           VAV3
1          GPSM2
2         AKNAD1
3          KCNC4
4        FAM19A3
5    RP3-522D1.1
6        SLC16A1
7    SLC16A1-AS1
8         VANGL1
9       SLC22A15
dtype: object
49


In [31]:
# Export the gene list as a tab-separated file with a header row and no index column
genesnames_fixed.to_csv(
    "Fixed_alleles_inDeserts_Genenames.csv",
    sep='\t',
    index=False,
    header=True,
)

### Permutation test (R code)

### biomaRt - Bioconductor: *Extracting gene names from Akey deserts* (R code) 