# Final Project Workbook
### BIOF 309 Spring 2020 

**Author:** Ramita Karra <br>
**Last edited:** 04-23-2020

In [1]:
pwd

'/Users/dewanr2/Documents/GitHub/project_spring_2020/project_spring_2020'

### Initialize and set up package files

#### Create a new directory and directory structure for the package (slightly modified code from class)

In [1]:
%%writefile initialize.py

from pathlib import Path
import os
import shutil 

def create_package_dir(package_name='BIOF309_RDK'):
    """Create a fresh, empty package directory and make it the working directory.

    If the current working directory is already named ``package_name``, that
    directory itself is recreated; otherwise a child directory of that name is
    used. Any existing directory at the target location is removed first,
    including everything inside it.

    Parameters
    ----------
    package_name : str
        Name of the directory to (re)create.
    """
    origin = Path.cwd()
    print(f"Starting in {origin}")

    # When we are already inside the package directory, step out of it so it
    # can be deleted and recreated.
    already_inside = origin.name == package_name
    if already_inside:
        os.chdir(origin.parent)
    target = origin if already_inside else origin / package_name

    if target.exists():
        print("Removing old directory...")
        shutil.rmtree(target)

    print(f"Creating {target}...")
    target.mkdir()
    print(f"The current working directory is now {target}")
    os.chdir(target)
    
def create_package_str(package_name='EHT_RDK'):
    """Create the skeleton of a Python package in the current working directory.

    Creates a ``tests`` directory, a ``package_name`` directory containing an
    empty ``__init__.py``, and empty ``setup.py``, ``LICENSE`` and ``README.md``
    files. All paths are relative to the current working directory.

    The function is safe to call again: directories and files that already
    exist are left in place, and existing file contents are not modified.

    Parameters
    ----------
    package_name : str
        Name of the importable package directory to create.
    """
    # exist_ok=True keeps a re-run of the notebook cell from raising FileExistsError
    Path('tests').mkdir(exist_ok=True)
    python_dir = Path(package_name)
    python_dir.mkdir(exist_ok=True)
    (python_dir / '__init__.py').touch()

    # touch() creates the file when missing and leaves existing content alone
    for placeholder in ('setup.py', 'LICENSE', 'README.md'):
        Path(placeholder).touch()

Writing initialize.py


In [2]:
# Order matters: create_package_dir() finishes by changing the working
# directory into the directory it created, and create_package_str() builds
# its files and folders relative to the current working directory.
from initialize import create_package_dir, create_package_str

# Outer project directory (removed and recreated if it already exists)
create_package_dir('BIOF309_RDK')
# Package directory with __init__.py, plus tests/, setup.py, LICENSE and README.md
create_package_str('EHT_RDK')

Starting in /Users/dewanr2/Documents/GitHub/project_spring_2020/project_spring_2020
Creating /Users/dewanr2/Documents/GitHub/project_spring_2020/project_spring_2020/BIOF309_RDK...
The current working directory is now /Users/dewanr2/Documents/GitHub/project_spring_2020/project_spring_2020/BIOF309_RDK


#### Add metadata and installation details (slightly modified code from class)

In [3]:
%%writefile setup.py

import setuptools

# The README is used as the long description of the package.
# An explicit encoding keeps the read independent of the platform default.
with open("README.md", "r", encoding="utf-8") as fh:
    long_description = fh.read()

setuptools.setup(
    name="EHT_RDK",
    version="0.0.1",
    author="Ramita D. Karra",
    author_email="ramita.karra@nih.gov",
    description="A package for processing Expansion Hunter Targeted (EHT) repeat data",
    long_description=long_description,
    long_description_content_type="text/markdown",
    # NOTE(review): this URL points at a pypa repository rather than this
    # project - confirm and replace with the project's own repository URL.
    url="https://github.com/pypa/packaging_demo",
    # Pick up every package directory (those containing __init__.py) automatically
    packages=setuptools.find_packages(),
    classifiers=[
        "Programming Language :: Python :: 3",
        "License :: OSI Approved :: MIT License",
        "Operating System :: OS Independent",
    ],
    python_requires='>=3.6',
)

Overwriting setup.py


#### Modify README

In [4]:
%%writefile README.md

# EHT_RDK
## Package Description
<br>
The aim of this package is to process raw tab-delimited output returned from the ExpansionHunter-Targeted 
software tool, used for making sequence-graph-based predictions of repeat lengths for known genetic repeat loci. 
The ultimate goal is to clean, compile, and process data for many different loci into a summary table, and to 
provide visualizations pertinent to the functional relevance of this data (i.e. number of samples containing 
repeat numbers above the pathogenic threshold for each gene).  

More information on ExpansionHunter can be found [here](https://academic.oup.com/bioinformatics/article/35/22/4754/5499079). 

Overwriting README.md


#### Write license (from class)

In [5]:
%%writefile LICENSE

Copyright (c) 2018 The Python Packaging Authority

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

Overwriting LICENSE


### Input files

#### Create a list of file names for all '.txt' files in the directory containing raw data

In [7]:
%%writefile input_files.py

import os
import glob
import pandas as pd

def create_input_list(directory_name):
    """Collect the names of the '.txt' entries found in a directory.

    Every entry whose name does not end in '.txt' is reported on standard
    output and left out of the result.

    Parameters
    ----------
    directory_name : str
        Directory to scan (not searched recursively).

    Returns
    -------
    list of str
        Bare entry names (not full paths), in the order os.listdir() yields them.
    """
    txt_names = []

    for entry in os.listdir(directory_name):
        # Report and skip anything that is not a '.txt' file
        if not entry.endswith('.txt'):
            print('Found non .txt file in directory: ' + entry)
            continue
        txt_names.append(entry)

    return txt_names

Overwriting input_files.py


In [8]:
from input_files import create_input_list

# Pass directory containing raw ExpansionHunter - Targeted data.
# The returned list holds bare file names (not full paths), limited to names ending in '.txt'.
files_to_import = create_input_list('/Users/dewanr2/Documents/Ramitas_Docs/NIH_Classes/BIOF309/ExpansionHunterTargeted')

In [9]:
# Show the collected file names
display(files_to_import)

['ExpansionHunterTargeted.ftd.ATXN7.GCC.txt',
 'ExpansionHunterTargeted.ftd.ATXN7.GCA.txt',
 'ExpansionHunterTargeted.ftd.TCF4.txt',
 'ExpansionHunterTargeted.ftd.CACNA1A.txt',
 'ExpansionHunterTargeted.ftd.HTT.CAG.txt',
 'ExpansionHunterTargeted.ftd.PPP2R2B.txt',
 'ExpansionHunterTargeted.ftd.CNBP.CAGG.txt',
 'ExpansionHunterTargeted.ftd.JPH3.txt',
 'ExpansionHunterTargeted.ftd.FXN.GAA.txt',
 'ExpansionHunterTargeted.ftd.CBL.txt',
 'ExpansionHunterTargeted.ftd.CSTB.txt',
 'ExpansionHunterTargeted.ftd.DIP2B.txt',
 'ExpansionHunterTargeted.ftd.NOP56.CGCCTG.txt',
 'ExpansionHunterTargeted.ftd.CNBP.CAGA.txt',
 'ExpansionHunterTargeted.ftd.AR.txt',
 'ExpansionHunterTargeted.ftd.TBP.txt',
 'ExpansionHunterTargeted.ftd.HTT.CCG.txt',
 'ExpansionHunterTargeted.ftd.DMPK.txt',
 'ExpansionHunterTargeted.ftd.FMR1.txt',
 'ExpansionHunterTargeted.ftd.ATXN8OS.CTG.txt',
 'ExpansionHunterTargeted.ftd.NOP56.GGCCTG.txt',
 'ExpansionHunterTargeted.ftd.ATXN8OS.CTA.txt',
 'ExpansionHunterTargeted.ftd.ATXN10

#### EXPLORATORY DATA ANALYSIS

In [25]:
# Change into directory containing raw data.
# Later cells read the data files by bare file name, so they depend on this
# working directory.

import os  # imported here so the cell also runs on a fresh kernel

directory_name = '/Users/dewanr2/Documents/Ramitas_Docs/NIH_Classes/BIOF309/ExpansionHunterTargeted'
os.chdir(directory_name)

In [26]:
pwd

'/Users/dewanr2/Documents/Ramitas_Docs/NIH_Classes/BIOF309/ExpansionHunterTargeted'

In [28]:
import pandas as pd  # imported here so the cell also runs on a fresh kernel

# Import the first file collected by create_input_list().
# The list holds bare file names, so this relies on the working directory
# being the raw-data directory.
test_df = pd.read_csv(files_to_import[0], sep='\t')

# Examine columns
print(test_df.columns)

Index(['Cohort', 'SampleID', 'chr', 'pos', 'INFO',
       'GT:SO:REPCN:REPCI:ADSP:ADFL:ADIR:LC', 'REPCN:ATXN7_GCC_allele1',
       'REPCN:ATXN7_GCC_allele2'],
      dtype='object')


In [29]:
# Create 'min' and 'max' columns for minimum and maximum allele repeat values respectively
allele_cols = ['REPCN:ATXN7_GCC_allele1', 'REPCN:ATXN7_GCC_allele2']
test_df['min'] = test_df[allele_cols].min(axis=1)
test_df['max'] = test_df[allele_cols].max(axis=1)

# Select only desired columns
test_df = test_df[['SampleID','chr','pos','min','max']]

print(test_df.columns)
# info() prints its report itself and returns None, so wrapping it in print()
# would add a stray "None" line to the output
test_df.info()

Index(['SampleID', 'chr', 'pos', 'min', 'max'], dtype='object')
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1938 entries, 0 to 1937
Data columns (total 5 columns):
SampleID    1938 non-null object
chr         1938 non-null object
pos         1938 non-null int64
min         1938 non-null int64
max         1938 non-null int64
dtypes: int64(3), object(2)
memory usage: 75.8+ KB
None


*EDA suggests that when importing dataframes, need to apply the following:*
- evaluate data to create 'min' and 'max' columns
- rename 'max' column with gene name
- select only desired columns: 'SampleID','chr','pos','max'

*EDA did not show any null entries for this df, but files should be imported with a default null value in case not every gene was evaluated for every sample*

#### Import all files from list of filepaths created earlier

In [30]:
# Confirm that current directory contains raw data

pwd

'/Users/dewanr2/Documents/Ramitas_Docs/NIH_Classes/BIOF309/ExpansionHunterTargeted'

In [63]:
# Create list of dataframes with columns: 'SampleID', 'chr', 'pos', <gene>,
# where <gene> holds the larger of the two allele repeat counts for each sample

import pandas as pd  # imported here so the cell also runs on a fresh kernel

EHT_FILE_PREFIX = "ExpansionHunterTargeted.ftd."
EHT_FILE_SUFFIX = ".txt"

df_list = []

# files_to_import is the list of bare file names returned by create_input_list();
# reading them by name relies on the working directory being the raw-data directory.
for filename in files_to_import:

    # Get gene name from the file name. A file without the expected prefix is
    # skipped: without a gene name its column could not be labelled correctly.
    if not filename.startswith(EHT_FILE_PREFIX):
        print("filename does not contain prefix")
        continue
    gene = filename[len(EHT_FILE_PREFIX):]
    if gene.endswith(EHT_FILE_SUFFIX):
        gene = gene[:-len(EHT_FILE_SUFFIX)]
    else:
        print("filename does not contain suffix")

    # Import and format df.
    # The two allele repeat-count columns are taken by position (7th and 8th
    # column) because their headers embed the gene name and differ per file.
    df_temp = pd.read_csv(filename, sep='\t')
    allele_counts = df_temp.iloc[:, [6, 7]]
    df_temp[gene] = allele_counts.max(axis=1)
    df_list.append(df_temp[['SampleID','chr','pos',gene]])

In [65]:
# Preview the first rows of the first per-gene dataframe
display(df_list[0].head())

Unnamed: 0,SampleID,chr,pos,ATXN7.GCC
0,RES04914,chr3,63912714,16
1,RES08323,chr3,63912714,15
2,RES04107,chr3,63912714,14
3,RES05106,chr3,63912714,13
4,RES04513,chr3,63912714,13


### Create dictionary of gene info

In [70]:
# Create dictionary with keys as genes, subdictionaries as key:value pairs for "chr", "pos"

gene_dict = {}

for df in df_list:
    # Get gene name (the fourth column header of each dataframe)
    gene = df.columns[3]

    # Get chromosome from the first row
    # NOTE(review): assumes chr/pos are the same on every row of a file - confirm
    chr_num = df.iloc[0, 1]
    chr_num = chr_num.replace("chr", "", 1)  # Remove "chr" prefix

    # Get gene position from the first row
    pos = df.iloc[0, 2]

    # Edit dictionary
    gene_dict[gene] = {"chr": chr_num, "pos": pos}

In [73]:
# Spot-check the entry stored for one gene
display(gene_dict['C9ORF72'])

{'chr': '9', 'pos': 27573528}

### Merge gene-specific dataframes into master dataframe