In [20]:
import pandas as pd

In [21]:
# Load VEST4 predictions
VEST4_predictions = pd.read_csv("C:\\Users\\InbarBlech\\PycharmProjects\\Thesis\\benchmarking\\VEST4\\all_vest4_predictions.csv")
VEST4_predictions = VEST4_predictions.drop(columns=["S.O. transcript", "Amino acid position"])

# rename columns
# Change name of column "HUGO symbol" to "gene"
VEST4_predictions = VEST4_predictions.rename(columns={"HUGO symbol": "gene"})
# Change name of column "Protein sequence change" to "variant"
VEST4_predictions = VEST4_predictions.rename(columns={"Protein sequence change": "variant"})
# Change name of column "VEST4 score (missense)" to "VEST4_score"
VEST4_predictions = VEST4_predictions.rename(columns={"VEST score (missense)": "VEST_score"})
VEST4_predictions

Unnamed: 0,variant,gene,VEST_score
0,G6S,SLC26A4,0.104
1,M21V,SLC26A4,0.037
2,A51T,SLC26A4,0.188
3,C53G,SLC26A4,0.062
4,V163I,SLC26A4,0.725
...,...,...,...
4209,V1669I,MYO7A,0.345
4210,R816H,MYO7A,0.322
4211,P1724A,MYO7A,0.277
4212,I1157V,MYO7A,0.361


In [22]:
# Remove VUS variants
VEST4_predictions["VEST_score"] = pd.to_numeric(VEST4_predictions['VEST_score'], errors='coerce')
# Remove rows with revel score: 0.29 < revel score < 0.644
VEST4_VUS = VEST4_predictions[(VEST4_predictions["VEST_score"] < 0.764) & (VEST4_predictions["VEST_score"] >= 0.449)]
VEST4_predictions = VEST4_predictions[(VEST4_predictions["VEST_score"] >= 0.764) | (VEST4_predictions["VEST_score"] <= 0.449)]
VEST4_predictions

Unnamed: 0,variant,gene,VEST_score
0,G6S,SLC26A4,0.104
1,M21V,SLC26A4,0.037
2,A51T,SLC26A4,0.188
3,C53G,SLC26A4,0.062
5,A180T,SLC26A4,0.177
...,...,...,...
4208,Y235D,MYO7A,0.912
4209,V1669I,MYO7A,0.345
4210,R816H,MYO7A,0.322
4211,P1724A,MYO7A,0.277


In [23]:
# Add benign or pathogenic label to each variant, according to the VEST4 score.
VEST4_predictions["VEST_pathogenicity"] = VEST4_predictions["VEST_score"].apply(lambda x: 1 if x >= 0.764 else 0)
VEST4_predictions

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  VEST4_predictions["VEST_pathogenicity"] = VEST4_predictions["VEST_score"].apply(lambda x: 1 if x >= 0.764 else 0)


Unnamed: 0,variant,gene,VEST_score,VEST_pathogenicity
0,G6S,SLC26A4,0.104,0
1,M21V,SLC26A4,0.037,0
2,A51T,SLC26A4,0.188,0
3,C53G,SLC26A4,0.062,0
5,A180T,SLC26A4,0.177,0
...,...,...,...,...
4208,Y235D,MYO7A,0.912,1
4209,V1669I,MYO7A,0.345,0
4210,R816H,MYO7A,0.322,0
4211,P1724A,MYO7A,0.277,0


In [24]:
# print how many pathogenic and benign variants are in the EVE file
print(f"Number of pathogenic variants in VEST4: {len(VEST4_predictions[VEST4_predictions['VEST_pathogenicity'] == 1])}")
print(f"Number of benign variants in MutPred: {len(VEST4_predictions[VEST4_predictions['VEST_pathogenicity'] == 0])}")

Number of pathogenic variants in VEST4: 1616
Number of benign variants in MutPred: 1847


In [25]:
# print all the rows with nan
VEST4_predictions[VEST4_predictions.isna().any(axis=1)]
### None of the rows have nan values.

Unnamed: 0,variant,gene,VEST_score,VEST_pathogenicity


In [26]:
# Save VEST4_predictions to csv
VEST4_predictions.to_csv("C:\\Users\\InbarBlech\\PycharmProjects\\Thesis\\benchmarking\\VEST4\\VEST4_predictions_no_VUS.csv")

In [27]:
print(f"length of revel without VUS {len(VEST4_predictions)}")
print(f"length of revel VUS {len(VEST4_VUS)}")

length of revel without VUS 3463
length of revel VUS 750


In [28]:
# Combine the prediction files into one file
my_prediction = pd.read_csv("C:\\Users\\InbarBlech\\PycharmProjects\\Thesis\\predictions_vs_real\\all_inbar_predictions.csv")
my_prediction

Unnamed: 0,position,pathogenicity,predictions,variant,gene
0,6,0,0,G6S,SLC26A4
1,21,0,1,M21V,SLC26A4
2,51,0,0,A51T,SLC26A4
3,53,0,1,C53G,SLC26A4
4,163,0,1,V163I,SLC26A4
...,...,...,...,...,...
4516,1045,1,1,G1045V,COL4A3
4517,871,1,1,G871C,COL4A3
4518,853,1,1,G853R,COL4A3
4519,25,0,0,P25S,COL4A3


In [29]:
# Merge the two files
merged = pd.merge(my_prediction, VEST4_predictions, on=["gene", "variant"])
merged

Unnamed: 0,position,pathogenicity,predictions,variant,gene,VEST_score,VEST_pathogenicity
0,6,0,0,G6S,SLC26A4,0.104,0
1,21,0,1,M21V,SLC26A4,0.037,0
2,51,0,0,A51T,SLC26A4,0.188,0
3,53,0,1,C53G,SLC26A4,0.062,0
4,180,0,1,A180T,SLC26A4,0.177,0
...,...,...,...,...,...,...,...
3488,1045,1,1,G1045V,COL4A3,0.986,1
3489,871,1,1,G871C,COL4A3,0.968,1
3490,853,1,1,G853R,COL4A3,0.979,1
3491,25,0,0,P25S,COL4A3,0.101,0


In [30]:
# To make sure that all tools will be tested on the same variants, save the merged with REVEL file and do so for all other tools. Then merge all the files together.
merged_only_VEST4_columns = merged[['gene', 'variant', 'VEST_score', 'VEST_pathogenicity']]
merged_only_VEST4_columns.to_csv("C:\\Users\\InbarBlech\\PycharmProjects\\Thesis\\benchmarking\\VEST_on_dvd_data_predictions.csv")

In [31]:
print(f"length of merged {len(merged)}")
print(f"length of my_prediction {len(my_prediction)}")
print(f"length of vest4 {len(VEST4_predictions)}")

length of merged 3493
length of my_prediction 4521
length of vest4 3463


In [13]:
# # Save the merged file to csv
# merged.to_csv("C:\\Users\\InbarBlech\\PycharmProjects\\Thesis\\predictions_vs_real\\merged_my_prediction_and_mutpred.csv", index=False)

In [14]:
#############################################################################################################
# All next rows were used to calculate the MCC, before realised I must apply first the threshold and remove the VUS variants. Now all these rows aren't in use, since I'm doing the calculations after removing the VUS variants and with all tools together.
#############################################################################################################

In [15]:
## Calculate the MCC for the mutpred predictions, according to the real pathogenicity (0 or 1)
from sklearn.metrics import matthews_corrcoef

In [16]:
# separate the merged dataframe according to gene
genes = merged["gene"].unique()
print(f"Number of genes: {len(genes)}")

Number of genes: 6


In [17]:
# Calculate MCC for each gene specific predictor for mutpred

# Build dictionary with gene names as keys.
mccs = {gene: 0 for gene in genes}

merged['VEST_score'] = merged['VEST_score'].astype(float)

for gene in genes:
    gene_df = merged[merged["gene"] == gene]
    # change VEST score to float
    # Assuming you have a DataFrame called 'data' with 'prediction' and 'MutPred_score' columns
    # Create binary predictions based on the 0.5 threshold
    gene_df.loc[:, 'binary_prediction_VEST4'] = (gene_df['VEST_score'] > 0.764).astype(int)
    
    gene_df['pathogenicity'] = gene_df['pathogenicity'].astype(int)
    
    # Calculate MCC
    mcc = matthews_corrcoef(gene_df['pathogenicity'], gene_df['binary_prediction_VEST4'])
    
    # Get gene name for the use for the dictionary
    gene = gene_df['gene'].unique()[0]
    
    # Append mcc to dictionary
    mccs[gene] = mcc

print("MCCs of VEST4 predictions for each gene:")
for gene in mccs:
    print(f"{gene}: {mccs[gene]}")

MCCs of VEST4 predictions for each gene:
SLC26A4: 0.8877453966975621
FGFR1: 0.7315749764637576
COL2A1: 0.8896864408552987
COL4A5: 0.9578736572324906
MYO7A: 0.5764098969871279
COL4A3: 0.8042819314699176


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  gene_df.loc[:, 'binary_prediction_VEST4'] = (gene_df['VEST_score'] > 0.764).astype(int)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  gene_df['pathogenicity'] = gene_df['pathogenicity'].astype(int)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  gene_df.loc[:, 'binary_prediction_VEST4'] = (gene_df[

In [18]:
from sklearn.metrics import roc_auc_score, roc_curve, auc, precision_recall_curve, average_precision_score, f1_score, log_loss, confusion_matrix
# store the results in a dictionary
results = {gene: 0 for gene in genes}

for gene in genes:
    gene_df = merged[merged["gene"] == gene]
    y_true = gene_df["pathogenicity"].astype(int)
    predicted_probabilities = gene_df["VEST_score"]
    gene_df.loc[:, 'binary_prediction_VEST4'] = (gene_df['VEST_score'] > 0.5).astype(int)
    predicted_labels = gene_df["binary_prediction_VEST4"]
    
    results_gene = {}
    # Calculate AUC-ROC
    roc_auc = roc_auc_score(y_true, predicted_probabilities)
    # append to dictionary
    results_gene["roc_auc"] = roc_auc
    
    # Calculate ROC curve
    fpr, tpr, thresholds = roc_curve(y_true, predicted_probabilities)
    roc_auc = auc(fpr, tpr)
    # append to dictionary
    results_gene["auc"] = roc_auc
    
    # Calculate AUC-PR
    precision, recall, _ = precision_recall_curve(y_true, predicted_probabilities)
    pr_auc = auc(recall, precision)
    # append to dictionary
    results_gene["pr_auc"] = pr_auc
    
    # Calculate F1 Score
    f1 = f1_score(y_true, predicted_labels)
    # append to dictionary
    results_gene["f1"] = f1
    
    # Calculate Log Loss
    logloss = log_loss(y_true, predicted_probabilities)
    # append to dictionary
    results_gene["logloss"] = logloss
    
    # Calculate confusion matrix
    conf_matrix = confusion_matrix(y_true, predicted_labels)
    # append to dictionary
    results_gene["confusion_matrix"] = conf_matrix
    
    # Get gene name for the use for the dictionary
    gene = gene_df['gene'].unique()[0]
    # Append results to dictionary
    results[gene] = results_gene
    
## Print results
print("Results of VEST4 predictions for each gene:")
for gene in results:
    print(f"{gene}: {results[gene]}")

Results of VEST4 predictions for each gene:
SLC26A4: {'roc_auc': 0.9666724376731299, 'auc': 0.9666724376731299, 'pr_auc': 0.9862471167874429, 'f1': 0.9758241758241758, 'logloss': 0.18240817560180683, 'confusion_matrix': array([[ 71,   5],
       [  6, 222]], dtype=int64)}
FGFR1: {'roc_auc': 0.948674297305129, 'auc': 0.948674297305129, 'pr_auc': 0.8777934347036447, 'f1': 0.8172043010752686, 'logloss': 0.47658369472864753, 'confusion_matrix': array([[189,  49],
       [  2, 114]], dtype=int64)}
COL2A1: {'roc_auc': 0.991268115942029, 'auc': 0.991268115942029, 'pr_auc': 0.9821705636525003, 'f1': 0.9280821917808219, 'logloss': 0.2891593764899346, 'confusion_matrix': array([[514,  38],
       [  4, 271]], dtype=int64)}
COL4A5: {'roc_auc': 0.9944981316840613, 'auc': 0.9944981316840613, 'pr_auc': 0.9973909854963336, 'f1': 0.9862327909887358, 'logloss': 0.13979559700576102, 'confusion_matrix': array([[188,   7],
       [  4, 394]], dtype=int64)}
MYO7A: {'roc_auc': 0.9436705931078169, 'auc': 0.9

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  gene_df.loc[:, 'binary_prediction_VEST4'] = (gene_df['VEST_score'] > 0.5).astype(int)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  gene_df.loc[:, 'binary_prediction_VEST4'] = (gene_df['VEST_score'] > 0.5).astype(int)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  gene_df.loc[:, 'binary_prediction

In [19]:
# present the confusion matrix for each gene
import matplotlib.pyplot as plt

for gene in results:
    # Plot confusion matrix
    plt.imshow(results[gene]["confusion_matrix"], cmap=plt.cm.Blues)
    plt.xlabel("Predicted labels")
    plt.ylabel("True labels")
    plt.xticks([0, 1], ["Benign", "Pathogenic"])
    plt.yticks([0, 1], ["Benign", "Pathogenic"])
    plt.title(f"Confusion matrix for {gene}, VEST4 predictions")
    plt.style.use("seaborn-white")
    # Add text annotations
    for i in range(2):
        for j in range(2):
            plt.text(j, i, results[gene]["confusion_matrix"][i, j], ha="center", va="center", color="black", backgroundcolor="white")
    plt.colorbar()
    plt.show()

KeyboardInterrupt: 

In [None]:
#present the ROC curve for each gene
for gene in results:
    # Plot ROC curve
    plt.plot(fpr, tpr, color="darkorange", lw=2, label=f"ROC curve (area = {results[gene]['roc_auc']:.2f})")
    plt.plot([0, 1], [0, 1], color="navy", lw=2, linestyle="--")
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel("False Positive Rate")
    plt.ylabel("True Positive Rate")
    plt.title(f"ROC curve for {gene}, VEST4 predictions")
    plt.legend(loc="lower right")
    plt.show()