# Lung Function Project - SNP lookup
Parse two sets of GWAS results, (1) Forced Vital Capacity and (2) Forced Expiratory Volume in 1 second, to extract the results for the following two SNPs: rs11693320 (chr. 2) and rs7795837 (chr. 7). The GWAS results are from the following website:
https://sites.google.com/broadinstitute.org/ukbbgwasresults/home?authuser=0 

The downloaded GWAS results are located at the following path:

**//rcdcollaboration01.rti.ns/GxG/R21_GxNutrients/PriorGWASresults/UKBiobank**
 
The GWAS result files have been compressed again (gzip) after the lookup. The extracted SNP results are shown in this notebook. Alternatively, the results are in two text files at the path:

 
**//rcdcollaboration01.rti.ns/GxG/R21_GxNutrients/PriorGWASresults/UKBiobank/SNP_finder/test**
 
The two test files are SNP-results-1.txt and SNP-results-2.txt, which correspond to Forced Vital Capacity and Forced Expiratory Volume in 1 second, respectively.

## Forced Vital Capacity

In [None]:
import os

import pandas as pd

# Switch into the folder that holds the extracted SNP results so that this
# cell and the following one can refer to the result files by bare name.
os.chdir("//rcdcollaboration01.rti.ns/gxg/R21_GxNutrients/PriorGWASresults/UKBiobank/SNP_finder/test/")

# SNP-results-1.txt is tab-delimited; load it and display it as the cell output.
vital_cap = pd.read_csv("SNP-results-1.txt", delimiter="\t")
vital_cap

## Forced Expiratory Volume in 1-second

In [None]:
# SNP-results-2.txt is tab-delimited and is looked up relative to the current
# working directory; load it and display it as the cell output.
expiratory_vol = pd.read_csv("SNP-results-2.txt", delimiter="\t")
expiratory_vol

# Code & Commands 

### Command Line 

In [None]:
# Download the Forced Vital Capacity (3062) and Forced Expiratory Volume in
# 1-second (3063) GWAS results, respectively. The URLs are quoted so the shell
# never treats the '?' in them as a glob character.
cd //rcdcollaboration01.rti.ns/GxG/R21_GxNutrients/PriorGWASresults/UKBiobank
wget "https://www.dropbox.com/s/5d1x9vwzf3jj2v9/3062.assoc.tsv.gz?dl=0" -O 3062.assoc.tsv.gz
wget "https://www.dropbox.com/s/ujjmyxwbs0o1wgp/3063.assoc.tsv.gz?dl=0" -O 3063.assoc.tsv.gz

# Make a copy of the directory that contains the perl script that extracts the
# SNP results. -r is required to copy a directory, and '.' is the destination
# (the current directory) so that the following cd finds the copy.
cp -r /cygdrive/c/Users/jmarks/Desktop/Code/SNP_finder .
cd SNP_finder

# Results are written to a folder called test; create it if it is not there yet.
mkdir -p test

# Run the following lines in the terminal.
# NOTE: the loop reads the uncompressed 3062/3063 .assoc.tsv files, while the
# downloads above are .assoc.tsv.gz. Either gunzip them first or change the
# suffix below to .assoc.tsv.gz (extract_rows.pl reads names ending in .gz
# through "gunzip -c").
j=0
for i in /cygdrive/c/Users/jmarks/rcdcollaboration01.rti.ns/gxg/R21_GxNutrients/PriorGWASresults/UKBiobank/306{2..3}.assoc.tsv
do
    # j numbers the output files: 1 for 3062, 2 for 3063.
    j=$((j + 1))
    perl extract_rows.pl \
        --source "$i" \
        --id_list SNP_ids.txt \
        --out "test/SNP-results-$j.txt" \
        --header 1 \
        --id_column 1
done

### Perl Script (extract_rows.pl)

In [None]:
#!/usr/bin/perl

# extract_rows.pl - copy (or drop) the rows of a whitespace-delimited text file
# whose ID column matches an entry in a list of identifiers.
#
# Options:
#   --source FILE    File from which rows will be extracted. A name ending in
#                    .gz is read through "gunzip -c".
#   --id_list FILE   File with one identifier per line (.gz also accepted).
#                    Leading/trailing whitespace is trimmed; blank lines are
#                    ignored.
#   --out FILE       Output file; created or overwritten.
#   --remove         Rows whose ID is in the list are removed rather than
#                    extracted.
#   --header N       Number of header rows in the source file; they are copied
#                    to the output unchanged (default 0).
#   --id_column N    Column in the source file containing the ID; numbering
#                    starts with 0 (default 0).
#
# Only the part of the ID field before the first ':' is compared with the ID
# list. Blank lines in the data part of the source file are skipped.
#
# Example:
# perl /share/nas02/bioinformatics_group/software/perl/extract_rows.pl \
#  --source /share/nas02/bioinformatics_group/data/ref_panels/hapmap_phase_3/beagle/ALL/hapmap3_r2_b36_chr22.5MB_chunk.0.marker \
#  --id_list /share/nas02/bioinformatics_group/data/ref_panels/hapmap_phase_3/beagle/ALL/hapmap3_r2_b36_chr22.5MB_chunk.0.keep_snps \
#  --out /share/nas02/bioinformatics_group/data/ref_panels/hapmap_phase_3/beagle/ALL/hapmap3_r2_b36_chr22.5MB_chunk.0.keep.marker \
#  --header 0 \
#  --id_column 0

use strict;
use warnings;
use Getopt::Long;
use constant FALSE => 0;
use constant TRUE  => 1;

# Autoflush STDOUT (the currently selected handle) so progress messages show
# up immediately.
$| = 1;

my $fileSource = '';
my $fileIdList = '';
my $fileOut = '';
my $remove = FALSE;
my $sourceHeaderRowCount = 0;
my $sourceIdColumn = 0;

GetOptions(
    'source=s'    => \$fileSource,              # Name of file from which rows will be extracted
    'id_list=s'   => \$fileIdList,              # Name of file containing list of identifiers for rows to be extracted or removed
    'out=s'       => \$fileOut,                 # Name of output file
    'remove'      => \$remove,                  # Rows corresponding to items in the ID list will be removed rather than extracted
    'header:i'    => \$sourceHeaderRowCount,    # Number of header rows in source file
    'id_column:i' => \$sourceIdColumn,          # Column in source file containing ID (column numbering starts with 0)
) or die "Invalid command line options\n";

# Fail early with a clear message instead of a bare "Cannot open" later on.
die "Missing required option --source\n"  if $fileSource eq '';
die "Missing required option --id_list\n" if $fileIdList eq '';
die "Missing required option --out\n"     if $fileOut eq '';

if ($sourceIdColumn < 0) {
    die "Invalid --id_column\n";
}

# Open $path for reading and return the file handle. Names ending in .gz are
# decompressed through "gunzip -c"; the list form of open is used so the file
# name is never interpreted by a shell.
sub open_input {
    my ($path) = @_;
    my $handle;
    if ($path =~ /\.gz$/) {
        # A pipe open succeeds even when gunzip cannot read its input, so
        # check the file itself first.
        -r $path or die "Cannot open $path: file is missing or not readable\n";
        open($handle, '-|', 'gunzip', '-c', $path) or die "Cannot open $path: $!\n";
    }
    else {
        open($handle, '<', $path) or die "Cannot open $path: $!\n";
    }
    return $handle;
}

# Close a handle returned by open_input. For a gunzip pipe, close() reports a
# non-zero exit of the child, which would otherwise go unnoticed.
sub close_input {
    my ($handle, $path) = @_;
    return if close($handle);
    my $reason = $! ? "$!" : "gunzip ended with wait status $?";
    die "Error reading $path: $reason\n";
}

my %idList = ();    # IDs of rows to extract or remove

# Read in ID list
print "Reading ID list...\n";
my $idListHandle = open_input($fileIdList);
while (my $line = <$idListHandle>) {
    chomp $line;
    $line =~ s/^\s+|\s+$//g;    # also strips a trailing \r from CRLF files
    next if $line eq '';        # a blank line is not an ID
    $idList{$line} = 1;
}
close_input($idListHandle, $fileIdList);
print scalar(keys %idList) . " IDs read\n";

# Process source file. The source is opened before the output so that a bad
# --source does not leave an empty output file behind.
print "Extracting rows...\n";
my $sourceHandle = open_input($fileSource);
open(my $outHandle, '>', $fileOut) or die "Cannot open $fileOut: $!\n";

my $row = 0;    # Current line number in source file (1-based once a line is read)
while (my $line = <$sourceHandle>) {
    chomp $line;
    $row++;

    # Header rows are passed through untouched.
    if ($row <= $sourceHeaderRowCount) {
        print {$outHandle} $line, "\n" or die "Cannot write to $fileOut: $!\n";
        next;
    }

    my @fields = split ' ', $line;
    next unless @fields;    # skip blank lines instead of aborting on them

    if ($sourceIdColumn >= @fields) {
        die "Invalid --id_column: line $row of $fileSource has only "
            . scalar(@fields) . " column(s)\n";
    }

    # Compare only the part of the ID before the first ':'.
    my ($id) = split /:/, $fields[$sourceIdColumn];
    $id = '' unless defined $id;    # field consisted of ':' characters only

    my $inIdList = exists $idList{$id};

    # Keep listed rows in extract mode, unlisted rows in remove mode.
    if ($inIdList xor $remove) {
        print {$outHandle} $line, "\n" or die "Cannot write to $fileOut: $!\n";
    }
}
close_input($sourceHandle, $fileSource);

# Buffered write errors (e.g. a full disk) only surface when the handle is closed.
close($outHandle) or die "Cannot close $fileOut: $!\n";

print "Done\n";