# Generating genomic feature tracks

In [38]:
!reference="/project/pi_sarah_gignouxwolfsohn_uml_edu/Reference_genomes/Cvirginica_genome/CV_working_genomic.gff"

In [39]:
pwd

'/project/pi_sarah_gignouxwolfsohn_uml_edu/julia/CE_MethylRAD_analysis_2018/analysis/genomic_feature_tracks'

In [28]:
genome="/project/pi_sarah_gignouxwolfsohn_uml_edu/Reference_genomes/Cvirginica_genome/GCF_C_virginica-3.0_genomic.fai"
!cpgList="/project/pi_sarah_gignouxwolfsohn_uml_edu/julia/CE_methyl_assembly/assembly_pipeline_files/genomic_bed_files/C_virginica-3.0_CG-motif.bed"
gff="/project/pi_sarah_gignouxwolfsohn_uml_edu/Reference_genomes/Cvirginica_genome/CV_working_genomic.gff"
chromLengths="/project/pi_sarah_gignouxwolfsohn_uml_edu/julia/CE_MethylRAD_analysis_2018/analysis/genomic_feature_tracks/Venkataraman_files/2018-06-15-bedtools-Chromosome-Lengths.txt"

In [10]:
!cut -f1 $chromLengths > Chromosome_Names.txt

!chromNames="/project/pi_sarah_gignouxwolfsohn_uml_edu/julia/CE_MethylRAD_analysis_2018/analysis/genomic_feature_tracks/Chromosome_Names.txt"

^C


## Genes

In [21]:
# find features with type == gene
!grep "Gnomon	gene" $gff > CV_gene.gff3

^C


38262 genes in CV genome

In [None]:
# sort for downstream use
sortBed -faidx $chromNames -i CV_gene.gff3 > CV_gene_sorted.gff3

In [40]:
!geneList="CV_gene_sorted.gff3"

In [None]:
# convert gff file to BED file
!awk '{print $1"\t"$4"\t"$5}' $geneList \
> CV_gene_sorted.bed

## Exons

In [None]:
# find features with type == exon
!grep "Gnomon	exon" $gff > CV_exon.gff3

731279 exons in CV genome

In [None]:
sortBed -faidx $chromNames -i CV_exon.gff3 > CV_exon_sorted.gff3

In [29]:
!exonList="CV_exon_sorted.gff3"

In [None]:
# convert gff file to BED file
!awk '{print $1"\t"$4"\t"$5}' $exonList \
> CV_exon_sorted.bed

## CDS

In [None]:
# find features with type == exon
!grep "Gnomon	CDS" $gff > CV_CDS.gff3

645355 CDS in CV genome

In [None]:
sortBed -faidx $chromNames -i CV_CDS.gff3 > CV_CDS_sorted.gff3

In [30]:
!cdsList="CV_CDS_sorted.gff3"

In [None]:
# convert gff file to BED file
!awk '{print $1"\t"$4"\t"$5}' $cdsList \
> CV_CDS_sorted.bed

## mRNA

In [None]:
# find features with type == exon
!grep "Gnomon	mRNA" $gff > CV_mRNA.gff3

60201 mRNA in CV genome

In [31]:
!mRNAList="CV_mRNA.gff3"

In [None]:
# convert gff file to BED file
!awk '{print $1"\t"$4"\t"$5}' $mRNAList \
> CV_mRNA.bed

## Intergenic regions
regions that aren't genes

`complementBed` to find regions that aren't genes, and `subtractBed` to remove exons and create this

In [None]:
complementBed \
-i $geneList -sorted \
-g $chromLengths \
| subtractBed \
-a - \
-b $exonList \
> CV_intergenic.gff3

In [32]:
!intergenicList="CV_intergenic.gff3"

39467 intergenic regions in CV genome

In [None]:
awk '{print $1"\t"$2"\t"$3}' $intergenicList \
> CV_intergenic.bed

## non-coding sequences

In [None]:
complementBed -i $exonList -g $chromLengths > CV_noncoding.gff3

In [33]:
!nonCDS="CV_noncoding.gff3"

336677 non-coding sequences in CV genome

In [None]:
awk '{print $1"\t"$2"\t"$3}' $nonCDS \
> CV_noncoding.bed

### introns

introns are the space between exons within a gene - so to pull this out, I have to look within a gene (LOC number), subtract the end of exon 1 from the start of exon 2


can create GFF file of non-coding regions based on the original GFF file - then introns, by definition, are the intersections of non-coding regions and genes


following pipeline from [Venkataraman et al 2020](https://www.frontiersin.org/journals/marine-science/articles/10.3389/fmars.2020.00225/full#h7)

In [None]:
# run in command line
!intersectBed \
-a $nonCDS \
-b $geneList -sorted \
> CV_intron.gff3

In [34]:
!intronList="CV_intron.gff3"

311168 introns in CV genome

In [None]:
awk '{print $1"\t"$2"\t"$3}' $intronList \
> CV_intron.bed

### exon UTRs

In [None]:
subtractBed -a $exonList -b $cdsList -sorted -g $chromLengths > CV_exonUTR.gff3

In [35]:
!exonUTR="CV_exonUTR.gff3"

9473 untranslated regions of exons in CV genome

In [None]:
awk '{print $1"\t"$4"\t"$5}' $exonUTR \
> CV_exonUTR.bed

### putative promoters
1KB upstream of transcription start site (TSS)

can use `bedtools flank` to find flanking regions 1000bp upstream and downstream of mRNA - then can filter rows to only grab the upstream flank (odd rows) on the + strand which would be our putative promoters



In [None]:
flankBed -i $mRNAList -g $chromLengths -b 1000 > mRNA_1000bp_flanks.bed

In [None]:
flanks='mRNA_1000bp_flanks.bed'

In [None]:
wc -l mRNA_1000bp_flanks.bed

120400

if row number is odd = upstream flank

if row number is even = downstream flank

In [None]:
awk '{ if (NR%2) print > "mRNA_upstream_flanks.bed"; \
else print > "mRNA_downstream_flanks.bed" }' \
$flanks

In [None]:
upstream="mRNA_upstream_flanks.bed"
downstream="mRNA_downstream_flanks.bed"

**upstream flanks**

60200 upstream flanks

**downstream flanks**
60200 downstream flanks



upstream flanks should only include + strands, downstream should only include - strands

In [None]:
grep ".	+	." $upstream \
> mRNA_upstream_forward.bed

wc -l mRNA_upstream_forward.bed

30218

In [None]:
grep ".	-	." $downstream \
> mRNA_downstream_reverse.bed

wc -l mRNA_downstream_reverse.bed

29982

now combine upstream + and downstream - flanks to create promoter track

In [None]:
cat \
mRNA_upstream_forward.bed \
mRNA_downstream_reverse.bed \
> mRNA_promoter_track.bed

In [36]:
!promoterTrack="mRNA_promoter_track.bed"

60200

# Overlaps with CpG dinucleotides

In [57]:
!conda run -n bedtools bedtools intersect -u -a $cpgList -b /project/pi_sarah_gignouxwolfsohn_uml_edu/julia/CE_MethylRAD_analysis_2018/analysis/genomic_feature_tracks/CV_gene_sorted.bed | wc -l
#echo "total CpG dinucleotides in genes"

Error: Unable to open file -b. Exiting.

ERROR conda.cli.main_run:execute(125): `conda run bedtools intersect -u -a -b /project/pi_sarah_gignouxwolfsohn_uml_edu/julia/CE_MethylRAD_analysis_2018/analysis/genomic_feature_tracks/CV_gene_sorted.bed` failed. (See above for error)
0


## Generating gene counts matrix
Current DESeq analysis uses a counts matrix from `featureCounts` which binned sequences into gene regions - in theory, doing the same but with `bedtools` should generate the same result, but for consistency purposes just going to generate it and do the same analysis I've done for the CpG dinucleotides in  [methylation_level_of_features.ipynb](https://github.com/jgmcdonough/CE18_methylRAD_analysis/blob/master/analysis/genomic_feature_tracks/methylation_level_of_features.ipynb)

In [1]:
geneList = '/project/pi_sarah_gignouxwolfsohn_uml_edu/julia/CE_MethylRAD_analysis_2018/analysis/genomic_feature_tracks/CV_gene_sorted.gff3'

In [2]:
!bedtools multicov -bams /project/pi_sarah_gignouxwolfsohn_uml_edu/julia/CE_methyl_assembly/assembly_pipeline_files/BEDtools/working_BAM_sequences/*.bam -bed {geneList}  > geneGFF_Multicov.csv

In [None]:
import os
import re
import pandas as pd

In [25]:
# read in BAM file names from the dir
bam_files = [file for file in os.listdir("/project/pi_sarah_gignouxwolfsohn_uml_edu/julia/CE_methyl_assembly/assembly_pipeline_files/BEDtools/working_BAM_sequences/") 
              if file.endswith(".bam")]

# clean names so only sample name remains
bam_files_clean = [re.sub(r"^2018--|-CV_sorted\.bam$", "", file) for file in bam_files]
bam_files_clean

['WBV-WBR-W12',
 'BBB-WBO-B21',
 'BBR-BBG-B38',
 'BBB-WBV-B70',
 'WBY-BBV-W65',
 'WBV-WBO-W23',
 'WBO-WBV-W64',
 'WBO-BBR-W03',
 'WBB-WBV-W69',
 'BBR-BBB-B50',
 'BBY-WBG-B42',
 'WBR-BBY-W25',
 'BBO-WBV-B64',
 'BBO-BBO-B16',
 'BBO-WBO-B16',
 'WBG-BBB-W56',
 'BBO-BBY-B27',
 'BBR-BBY-B26',
 'WBY-BBY-W30',
 'WBG-WBG-W44']

In [22]:
# reading in CSV generated above
geneMulticov = pd.read_csv('/project/pi_sarah_gignouxwolfsohn_uml_edu/julia/CE_MethylRAD_analysis_2018/analysis/genomic_feature_tracks/geneMulticov.csv', sep='\t', header = None)
geneMulticov.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,13,14,15,16,17,18,19,20,21,22
0,NC_035780.1,13578,14594,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,NC_035780.1,28961,33324,0,0,0,0,0,2,0,...,0,0,0,0,0,0,0,0,0,0
2,NC_035780.1,43111,66897,0,0,0,0,0,0,2,...,0,0,0,0,4,2,0,0,2,0
3,NC_035780.1,85606,95254,8,12,0,0,6,4,0,...,2,1,0,0,4,0,6,0,0,10
4,NC_035780.1,99840,106460,14,4,2,0,6,0,0,...,6,2,0,0,4,0,2,0,0,2


In [24]:
# rename columns - add bam file sample names to the columns
# columns are assumed to be in same position as they are listed in the dir
geneMulticov.columns = ['chromosome', 'start', 'stop'] + bam_files_clean
geneMulticov.head()

Unnamed: 0,chromosome,start,stop,WBV-WBR-W12,BBB-WBO-B21,BBR-BBG-B38,BBB-WBV-B70,WBY-BBV-W65,WBV-WBO-W23,WBO-WBV-W64,...,BBY-WBG-B42,WBR-BBY-W25,BBO-WBV-B64,BBO-BBO-B16,BBO-WBO-B16,WBG-BBB-W56,BBO-BBY-B27,BBR-BBY-B26,WBY-BBY-W30,WBG-WBG-W44
0,NC_035780.1,13578,14594,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,NC_035780.1,28961,33324,0,0,0,0,0,2,0,...,0,0,0,0,0,0,0,0,0,0
2,NC_035780.1,43111,66897,0,0,0,0,0,0,2,...,0,0,0,0,4,2,0,0,2,0
3,NC_035780.1,85606,95254,8,12,0,0,6,4,0,...,2,1,0,0,4,0,6,0,0,10
4,NC_035780.1,99840,106460,14,4,2,0,6,0,0,...,6,2,0,0,4,0,2,0,0,2


In [30]:
# create base columns
base_cols = geneMulticov[["chromosome", "start", "stop"]]

############################################################
## HYPOXIC CONTROL
# Additional columns matching pattern
HC_cols = geneMulticov.filter(regex="BB.-WB.-\\w{3}")

# Create new data frame
HC_multicov = pd.concat([base_cols, HC_cols], axis=1)

############################################################
## HYPOXIC HYPOXIC
# Additional columns matching pattern
HH_cols = geneMulticov.filter(regex="BB.-BB.-\\w{3}")

# Create new data frame
HH_multicov = pd.concat([base_cols, HH_cols], axis=1)

############################################################
## CONTROL HYPOXIC
# Additional columns matching pattern
CH_cols = geneMulticov.filter(regex="WB.-BB.-\\w{3}")

# Create new data frame
CH_multicov = pd.concat([base_cols, CH_cols], axis=1)

############################################################
## CONTROL CONTROL
# Additional columns matching pattern
CC_cols = geneMulticov.filter(regex="WB.-WB.-\\w{3}")

# Create new data frame
CC_multicov = pd.concat([base_cols, CC_cols], axis=1)