In [49]:
#!/bin/env python
# Author: Teja Koganti

# 02_calculate_tmb_from_mafdb.py
# This script does the following - 
#  1. Establishes a connection with sqlite 
#  2. Groupby column name "Tumor_Sample_Barcode" and counts the number of var
            # for each sample from table "mutect2"
#  3. Loads metadata file(pbta-histologies for PBTA)   
#  4. Use join to match up sample names and disease types and prints the final DF
#
# Note: requires pandas and sqlite to be installed, and expects python3
# conda install -c anaconda pandas
# conda install -c anaconda sqlite


import pandas as pd
import numpy as np
import sqlite3

# Will use argparse after first PR
metadatafile="/Users/kogantit/Documents/git_repos/d3b-bix-analysis-toolkit/analyses/TMBanalysis/inputs/pbta-histologies.tsv"
disease_col = "short_histology"
samplename_col = "Kids_First_Biospecimen_ID"
db_name="/Users/kogantit/Documents/git_repos/d3b-bix-analysis-toolkit/analyses/TMBanalysis/output/var_db.sqlite"
target_bed_size=77462866    # This will be counted in bash script when it is fully setup 
out_tmb = "/Users/kogantit/Documents/git_repos/d3b-bix-analysis-toolkit/analyses/TMBanalysis/output/pbta-snv-mutect2-tmbscores.txt"


# Establiching a connection to the MAF db 
def create_connection(db_file):
    """ create a database connection to a SQLite database """
    conn = None
    try:
        conn = sqlite3.connect(db_file)
        print(sqlite3.version)
    except Error as e:
        print("Error connecting to database")
    return(conn)    


# Creating and empty DF with columns for sample name and TMB
sample_tmb_colnames = ["Samplenames", "TMB"]
sample_tmb_df = pd.DataFrame(columns=sample_tmb_colnames)


# Starting a database connection     
con = create_connection(db_name)
# Using GROUPBY to count vars in each sample from table mutect2
var_db = con.execute('select Tumor_Sample_Barcode,count(Tumor_Sample_Barcode) from mutect2 GROUP BY  Tumor_Sample_Barcode')

# Calculating TMB based on the results from MAF DB
for each_sample in var_db.fetchall():
    samplename = each_sample[0]
    #print(samplename,  (int(each_sample[1])*1000000)/target_bed_size)
    sample_tmb_df = sample_tmb_df.append({"Samplenames" : samplename ,
                                          "TMB" : (int(each_sample[1])*1000000)/target_bed_size} , ignore_index=True)

# Reading in metadata file to extract disease types
metadata = pd.read_csv(metadatafile, sep="\t", index_col=False)


#Checking of all samples within db_name are in the metadata file 
if not (np.in1d(sample_tmb_df["Samplenames"].astype(str), manifest[samplename_col].astype(str))).all():
    print("\n-----Error message: Samples not in metadata file!-----\n")
    sys.exit()
    

# Joining TMB results and metedata file and keeping only relevant columns 
final_tmb = sample_tmb_df.join(manifest.set_index(samplename_col), 
                               on="Samplenames")[["Samplenames",  "TMB", disease_col]]
final_tmb.to_csv(out_tmb, index=False)




2.6.0
