# Plotting MapQ Distribution in Genome Graphs Constructed with VG
Here we are looking to see what the majority of MapQ scores are for reads mapped to the graphs constructed using deletion calls only, insertion calls only, and both insertions and deletions, as called by Sniffles. Read mapping for all comparisons was completed with `vg map`. 

To start, we import `numpy`, `matplotlib` and `pandas` packages.  

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

Next, we define our plotting functions: `plot_from_file()`, which plots the distribution of all MapQ scores, and `closer_look()`, which takes a closer look at the number of reads with a MapQ of 0 and 60.  

In [None]:
def plot_from_file(file_path, title, xlabel, ylabel):
    """Plot a MapQ score distribution stored in a tab-separated file.

    Draws a green bar chart of the 'count' column against the 'MapQ' column
    and overlays a dashed red first-degree least-squares trendline, then
    shows the figure.

    Args:
        file_path: Path to a tab-separated file with 'MapQ' and 'count' columns.
        title: Title of the plot.
        xlabel: Label for the x-axis.
        ylabel: Label for the y-axis.
    """
    # Read data from the file
    data = pd.read_csv(file_path, sep='\t')

    plt.bar(data['MapQ'], data['count'], color='green')

    # Fit against the MapQ values rather than the row index: the line is drawn
    # against MapQ, so fitting on the index is only correct when the rows
    # happen to be exactly MapQ 0, 1, 2, ... in order.
    coefficients = np.polyfit(data['MapQ'], data['count'], 1)
    trendline = np.polyval(coefficients, data['MapQ'])

    plt.plot(data['MapQ'], trendline, color='red', linestyle='--', label='Trendline')

    # Title and labels
    plt.title(title)
    plt.xlabel(xlabel)
    plt.ylabel(ylabel)
    plt.legend()

    # Show the plot
    plt.show()

def closer_look(file_path, title, xlabel, ylabel):
    """Plot the 'count' values of the first and last rows of a score file.

    The first row is drawn as the 'Lowest' bar and the last row as the
    'Highest' bar, with the y-axis zoomed to a 50000-read margin around them.
    NOTE(review): presumably the file is sorted by MapQ, so these rows are the
    lowest and highest MapQ scores -- this function does not check that.

    Args:
        file_path: Path to a tab-separated file with a 'count' column.
        title: Title of the plot.
        xlabel: Label for the x-axis.
        ylabel: Label for the y-axis.
    """
    data = pd.read_csv(file_path, sep='\t')

    first_value = data['count'].iloc[0]
    last_value = data['count'].iloc[-1]

    plt.bar(['Lowest', 'Highest'], [first_value, last_value], color=['blue', 'green'])

    plt.title(title)
    plt.xlabel(xlabel)
    plt.ylabel(ylabel)
    # Pad around the smaller and larger of the two counts. Using
    # (first - 50000, last + 50000) directly clips the second bar, or inverts
    # the axis, whenever the first count is much larger than the last.
    plt.ylim(min(first_value, last_value) - 50000, max(first_value, last_value) + 50000)

    plt.show()

This first plot is looking at reads mapped to the graph constructed with Jane's genome, before it is augmented with SVs. 

In [None]:
if __name__ == "__main__":
    # Score-distribution table for reads mapped to the Jane-as-graph reference.
    Jane = "/scale_wlg_nobackup/filesets/nobackup/uc03718/graphs/Jane_as_graph/score_distribution.tsv"

    # Bar chart of every MapQ value with its trendline.
    plot_from_file(
        file_path=Jane,
        title='Jane as Graph MapQ Score Distribution',
        xlabel='MapQ',
        ylabel='Counts',
    )

In [None]:
if __name__ == "__main__":
    # Zoom in on the first and last rows of the Jane-as-graph table.
    closer_look(
        file_path=Jane,
        title='Jane as Graph Low and High MapQ Comparison',
        xlabel='MapQ',
        ylabel='Counts',
    )

In [None]:
# Per-read table whose second column is averaged to give the mean MapQ;
# missing values are counted as 0.
Jane_total = pd.read_csv("/path/to/Ariki_chr7_Jane_as_graph_jqOutput.tsv", sep='\t').fillna(0)
Jane_mean = Jane_total.iloc[:, 1].mean()

# From here on `Jane` holds the loaded score-distribution table, not its path.
Jane = pd.read_csv(Jane, sep='\t')
Jane_diff = Jane['count'].iloc[0] - Jane['count'].iloc[-1]

print(f"The number of reads with a MapQ of 0 is {Jane['count'].iloc[0]}")
print(f"The number of reads with a MapQ of 60 is {Jane['count'].iloc[-1]}")
print(
    f"The difference in the # of reads with MapQ 0 - # reads with MapQ 60 is {Jane_diff}"
    f", while the mean MapQ is {np.round(Jane_mean, decimals=4)}"
)

In [None]:
if __name__ == "__main__":
    # Score-distribution table for reads mapped to the deletion graph.
    DEL = "/scale_wlg_nobackup/filesets/nobackup/uc03718/graphs/DEL/vg_maps/score_distribution.tsv"
    plot_from_file(
        file_path=DEL,
        title='Deletion MapQ Score Distribution',
        xlabel='MapQ',
        ylabel='Counts',
    )

In [None]:
if __name__ == "__main__":
    # Zoom in on the first and last rows of the deletion-graph table.
    closer_look(
        file_path=DEL,
        title='Deletion Graph Low and High MapQ Comparison',
        xlabel='MapQ',
        ylabel='Counts',
    )

In [None]:
# Per-read table whose second column is averaged to give the mean MapQ;
# missing values are counted as 0.
# NOTE(review): this path names the Jane_as_graph output although this cell
# summarises the DEL graph -- confirm it is the intended input.
DEL_total = pd.read_csv("/path/to/Ariki_chr7_Jane_as_graph_jqOutput.tsv", sep='\t').fillna(0)
DEL_mean = DEL_total.iloc[:, 1].mean()

# From here on `DEL` holds the loaded score-distribution table, not its path.
DEL = pd.read_csv(DEL, sep='\t')
DEL_diff = DEL['count'].iloc[0] - DEL['count'].iloc[-1]

print(f"The number of reads with a MapQ of 0 is {DEL['count'].iloc[0]}")
print(f"The number of reads with a MapQ of 60 is {DEL['count'].iloc[-1]}")
print(
    f"The difference in the # of reads with MapQ 0 - # reads with MapQ 60 is {DEL_diff}"
    f", while the mean MapQ is {np.round(DEL_mean, decimals=4)}"
)

In [None]:
if __name__ == "__main__":
    # Specify the path to your data file
    INS = "/scale_wlg_nobackup/filesets/nobackup/uc03718/graphs/INS/vg_maps/score_distribution.tsv"

    # Plotting. The title names the insertion graph; it previously carried the
    # deletion graph's title.
    plot_from_file(INS, 'Insertion MapQ Score Distribution', 'MapQ', 'Counts')

In [None]:
if __name__ == "__main__":
    # The title names the insertion graph; it previously carried the deletion
    # graph's title.
    closer_look(INS, 'Insertion Graph Low and High MapQ Comparison', 'MapQ', 'Counts')

In [None]:
# Per-read table whose second column is averaged to give the mean MapQ;
# missing values are counted as 0.
# NOTE(review): this path names the Jane_as_graph output although this cell
# summarises the INS graph -- confirm it is the intended input.
INS_total = pd.read_csv("/path/to/Ariki_chr7_Jane_as_graph_jqOutput.tsv", sep='\t').fillna(0)
INS_mean = INS_total.iloc[:, 1].mean()

# From here on `INS` holds the loaded score-distribution table, not its path.
INS = pd.read_csv(INS, sep='\t')
INS_diff = INS['count'].iloc[0] - INS['count'].iloc[-1]

print(f"The number of reads with a MapQ of 0 is {INS['count'].iloc[0]}")
print(f"The number of reads with a MapQ of 60 is {INS['count'].iloc[-1]}")
print(
    f"The difference in the # of reads with MapQ 0 - # reads with MapQ 60 is {INS_diff}"
    f", while the mean MapQ is {np.round(INS_mean, decimals=4)}"
)

In [None]:
if __name__ == "__main__":
    # Specify the path to your data file
    INDEL = "/scale_wlg_nobackup/filesets/nobackup/uc03718/graphs/INDEL/vg_maps/score_distribution.tsv"

    # Plotting. The title names the INDEL graph; it previously carried the
    # deletion graph's title.
    plot_from_file(INDEL, 'INDEL MapQ Score Distribution', 'MapQ', 'Counts')

In [None]:
if __name__ == "__main__":
    # The title names the INDEL graph; it previously carried the deletion
    # graph's title.
    closer_look(INDEL, 'INDEL Graph Low and High MapQ Comparison', 'MapQ', 'Counts')

In [None]:
# Per-read table whose second column is averaged to give the mean MapQ;
# missing values are counted as 0.
# NOTE(review): this path names the Jane_as_graph output although this cell
# summarises the INDEL graph -- confirm it is the intended input.
INDEL_total = pd.read_csv("/path/to/Ariki_chr7_Jane_as_graph_jqOutput.tsv", sep='\t').fillna(0)
INDEL_mean = INDEL_total.iloc[:, 1].mean()

# From here on `INDEL` holds the loaded score-distribution table, not its path.
INDEL = pd.read_csv(INDEL, sep='\t')
INDEL_diff = INDEL['count'].iloc[0] - INDEL['count'].iloc[-1]

print(f"The number of reads with a MapQ of 0 is {INDEL['count'].iloc[0]}")
print(f"The number of reads with a MapQ of 60 is {INDEL['count'].iloc[-1]}")
print(
    f"The difference in the # of reads with MapQ 0 - # reads with MapQ 60 is {INDEL_diff}"
    f", while the mean MapQ is {np.round(INDEL_mean, decimals=4)}"
)

## Quick Summary

In [None]:
# Text recap: first-minus-last count difference and mean MapQ for every graph.
print(f"Jane's graph difference: {Jane_diff} Jane's graph mean: {np.round(Jane_mean, decimals=4)}")
print(f"DEL graph: {DEL_diff} DEL graph mean: {np.round(DEL_mean, decimals=4)}")
print(f"INS graph: {INS_diff} INS graph mean: {np.round(INS_mean, decimals=4)}")
print(f"INDEL graph: {INDEL_diff} INDEL graph mean: {np.round(INDEL_mean, decimals=4)}")

# Gather the per-graph numbers into one table, one row per graph, in the
# order Jane, DEL, INS, INDEL.
graph_tables = (Jane, DEL, INS, INDEL)
total_plt = pd.DataFrame({
    'Mean MapQ': [Jane_mean, DEL_mean, INS_mean, INDEL_mean],
    'MapQ Difference': [Jane_diff, DEL_diff, INS_diff, INDEL_diff],
    'MapQ 0': [table.loc[table.index[0], 'count'] for table in graph_tables],
    'MapQ 60': [table.loc[table.index[-1], 'count'] for table in graph_tables],
    'Data': ['Jane as Graph', 'Deletions Only', 'Insertions Only', 'INDEL Graph'],
})

# One bar chart per metric: (column, bar colour, title, y-label, fixed y-range).
summary_charts = (
    ('Mean MapQ', 'green', 'Mean MapQ by Graph', 'Mean MapQ', (29.78, 29.82)),
    ('MapQ 0', 'red', 'Number of Reads with MapQ of 0', 'Read Count', (2951500, 2953000)),
    ('MapQ Difference', 'blue', 'Difference in Number of Reads with MapQ 0 & MapQ 60', 'Difference', (32800, 35000)),
)
for column, bar_colour, heading, y_label, y_range in summary_charts:
    plt.bar(total_plt['Data'], total_plt[column], color=bar_colour)
    plt.title(heading)
    plt.xlabel('Graph Construction Type')
    plt.ylabel(y_label)
    plt.ylim(*y_range)
    plt.show()
