In [39]:
import pandas as pd

In [40]:
# Gene symbols covered by this notebook; one AlphaMissense prediction file
# is read per entry when the AlphaMissense table is built.
genes = [
    "GJB2",
    "COL4A5",
    "COL4A3",
    "COL2A1",
    "SLC26A4",
    "FGFR1",
    "MYO7A",
    "WFS1",
]

In [41]:
# Load the already-combined table of this project's predictions next to the
# real labels (columns shown below: variant, prediction, reality, gene).
my_prediction = pd.read_csv(
    filepath_or_buffer="C:\\Users\\InbarBlech\\PycharmProjects\\Thesis\\predictions_vs_real\\predictions_vs_real_with_variant_all_genes_updated_221123.csv"
)
my_prediction

Unnamed: 0,variant,prediction,reality,gene
0,M1R,0,0,COL2A1
1,I2A,0,0,COL2A1
2,I2L,0,0,COL2A1
3,I2P,0,0,COL2A1
4,I2T,0,0,COL2A1
...,...,...,...,...
4799,Y217F,0,0,GJB2
4800,G220A,0,0,GJB2
4801,G220K,0,0,GJB2
4802,K221R,0,0,GJB2


In [42]:
# Build a dataframe with all the predictions from AlphaMissense, for all genes in list genes.
# Each per-gene CSV is read once and collected in a list, then a single
# pd.concat builds the result. Concatenating inside the loop would re-copy
# the accumulated frame on every iteration (quadratic work) and would
# concatenate against an empty seed frame.
# The row index of every per-gene file is kept as-is (no ignore_index).
per_gene_frames = []
for gene in genes:
    gene_predictions = pd.read_csv(f"C:\\Users\\InbarBlech\\PycharmProjects\\Thesis\\benchmarking\\AlphaMissense\\am_predictions_per_gene\\am_predictions_{gene}.csv")
    per_gene_frames.append(gene_predictions)

# pd.concat raises on an empty list, so fall back to an empty frame when
# `genes` is empty.
am_predictions = pd.concat(per_gene_frames) if per_gene_frames else pd.DataFrame()
am_predictions

Unnamed: 0,uniprot_id,protein_variant,am_pathogenicity,am_class,gene
0,P29033,M1A,0.8186,pathogenic,GJB2
1,P29033,M1C,0.7324,pathogenic,GJB2
2,P29033,M1D,0.9895,pathogenic,GJB2
3,P29033,M1E,0.9607,pathogenic,GJB2
4,P29033,M1F,0.5237,ambiguous,GJB2
...,...,...,...,...,...
16905,O76024,A890S,0.0789,benign,WFS1
16906,O76024,A890T,0.1216,benign,WFS1
16907,O76024,A890V,0.1489,benign,WFS1
16908,O76024,A890W,0.7340,pathogenic,WFS1


In [43]:
# Rename the AlphaMissense variant column to "variant" so it carries the same
# name as the variant column of my_prediction.
am_predictions = am_predictions.rename({"protein_variant": "variant"}, axis="columns")

In [44]:
# Keep aside the rows that AlphaMissense classifies as "ambiguous"
# (nothing is printed here; the subset is only stored in am_vus).
am_vus = am_predictions.loc[am_predictions['am_class'].eq("ambiguous")]

In [45]:
# No threshold is applied to the VUS calls: per the author's note, EVE has its
# VUS removed up front, so the "ambiguous" AlphaMissense rows are simply
# dropped here as well.
am_predictions = am_predictions[am_predictions['am_class'] != "ambiguous"]
am_predictions

Unnamed: 0,uniprot_id,variant,am_pathogenicity,am_class,gene
0,P29033,M1A,0.8186,pathogenic,GJB2
1,P29033,M1C,0.7324,pathogenic,GJB2
2,P29033,M1D,0.9895,pathogenic,GJB2
3,P29033,M1E,0.9607,pathogenic,GJB2
5,P29033,M1G,0.9077,pathogenic,GJB2
...,...,...,...,...,...
16904,O76024,A890R,0.3313,benign,WFS1
16905,O76024,A890S,0.0789,benign,WFS1
16906,O76024,A890T,0.1216,benign,WFS1
16907,O76024,A890V,0.1489,benign,WFS1


In [46]:
# Encode the AlphaMissense class as a binary label: "benign" -> 0, "pathogenic" -> 1.
# am_predictions was produced by boolean filtering in an earlier cell, so
# assigning into it through .loc is the pattern that raises
# SettingWithCopyWarning in pandas without copy-on-write. Building a new
# frame with assign() avoids writing into that filtered frame.
# Values other than the two class names (including labels already encoded
# as 0/1) are left untouched, so re-running this cell is harmless.
am_class_to_label = {"benign": 0, "pathogenic": 1}
am_predictions = am_predictions.assign(
    am_class=am_predictions['am_class'].map(
        lambda am_class: am_class_to_label.get(am_class, am_class)
    )
)

In [47]:
# Report how many pathogenic (1) and benign (0) variants remain in the
# AlphaMissense table.
print(f"Number of pathogenic variants in am: {int((am_predictions['am_class'] == 1).sum())}")
print(f"Number of benign variants in am: {int((am_predictions['am_class'] == 0).sum())}")

Number of pathogenic variants in am: 93802
Number of benign variants in am: 70716


In [48]:
# Join this project's predictions with the AlphaMissense predictions.
# Inner join on (gene, variant): only variants present in both tables are kept.
merged = my_prediction.merge(
    am_predictions,
    how="inner",
    on=["gene", "variant"],
)
merged


Unnamed: 0,variant,prediction,reality,gene,uniprot_id,am_pathogenicity,am_class
0,M1R,0,0,COL2A1,P02458,0.1676,0
1,I2A,0,0,COL2A1,P02458,0.1500,0
2,I2L,0,0,COL2A1,P02458,0.0908,0
3,I2P,0,0,COL2A1,P02458,0.1957,0
4,I2T,0,0,COL2A1,P02458,0.2038,0
...,...,...,...,...,...,...,...
4495,Y217F,0,0,GJB2,P29033,0.0702,0
4496,G220A,0,0,GJB2,P29033,0.0855,0
4497,G220K,0,0,GJB2,P29033,0.0955,0
4498,K221R,0,0,GJB2,P29033,0.0586,0


In [49]:
# To make sure that all tools are tested on the same variants, the merged
# AlphaMissense table is saved here; the same is done for every other tool,
# and the saved files are merged together afterwards.
merged_only_am_columns = merged.loc[:, ['gene', 'variant', 'am_pathogenicity', 'am_class']]
merged.to_csv(
    path_or_buf="C:\\Users\\InbarBlech\\PycharmProjects\\Thesis\\benchmarking\\alphamissense_on_dvd_data_predictions_LOPO.csv"
)

In [50]:
# Row counts before and after the merge, to show how many variants the
# inner join kept.
print(
    f"length of merged {len(merged)}",
    f"length of my_prediction {len(my_prediction)}",
    f"length of alpha_missense {len(am_predictions)}",
    sep="\n",
)

length of merged 4500
length of my_prediction 4804
length of alpha_missense 164518


In [51]:
#############################################################################################################
# All of the following cells were used to calculate the MCC before I realised that the threshold must be applied and the VUS variants removed first. These cells are no longer in use, since the calculations are now done after removing the VUS variants and with all tools together.
#############################################################################################################

In [52]:
# ## Calculate the MCC for the mutpred predictions, according to the real pathogenicity (0 or 1)
# from sklearn.metrics import matthews_corrcoef

In [53]:
# # separate the merged dataframe according to gene
# genes = merged["gene"].unique()
# print(f"Number of genes: {len(genes)}")

In [54]:
# # Calculate MCC for each gene specific predictor for EVE
# genes = merged_without_nan["gene"].unique()
# print(genes)
# 
# ### EVE ####
# # Build dictionary with gene names as keys.
# mccs = {gene: 0 for gene in genes}
# 
# for gene in genes:
#     gene_df = merged_without_nan[merged_without_nan["gene"] == gene]
#     # Assuming you have a DataFrame called 'data' with 'prediction' and 'MutPred_score' columns
#     gene_df['EVE_scores_ASM'] = gene_df['EVE_scores_ASM'].astype(float)
#     # Create binary predictions based on the 0.5 threshold
#     gene_df.loc[:, 'binary_prediction_EVE'] = (gene_df['EVE_scores_ASM'] > 0.5).astype(int)
# 
#     gene_df['predictions'] = gene_df['predictions'].astype(int)
# 
#     # Calculate MCC
#     mcc = matthews_corrcoef(gene_df['predictions'], gene_df['binary_prediction_EVE'])
# 
#     # Get gene name for the use for the dictionary
#     gene = gene_df['gene'].unique()[0]
# 
#     # Append mcc to dictionary
#     mccs[gene] = mcc
# 
# print("MCCs of EVE predictions for each gene:")
# for gene in mccs:
#     print(f"{gene}: {mccs[gene]}")

In [55]:
# from sklearn.metrics import roc_auc_score, roc_curve, auc, precision_recall_curve, average_precision_score, f1_score, log_loss, confusion_matrix
# # store the results in a dictionary
# results = {gene: 0 for gene in genes}
# 
# for gene in genes:
#     gene_df = merged_without_nan[merged_without_nan["gene"] == gene]
#     gene_df['EVE_scores_ASM'] = gene_df['EVE_scores_ASM'].astype(float)
#     y_true = gene_df["pathogenicity"].astype(int)
#     predicted_probabilities = gene_df["EVE_scores_ASM"]
#     gene_df.loc[:, 'binary_prediction_EVE'] = (gene_df['EVE_scores_ASM'] > 0.5).astype(int)
#     predicted_labels = gene_df["binary_prediction_EVE"]
#     
#     results_gene = {}
#     # Calculate AUC-ROC
#     roc_auc = roc_auc_score(y_true, predicted_probabilities)
#     # append to dictionary
#     results_gene["roc_auc"] = roc_auc
#     
#     # Calculate ROC curve
#     fpr, tpr, thresholds = roc_curve(y_true, predicted_probabilities)
#     roc_auc = auc(fpr, tpr)
#     # append to dictionary
#     results_gene["auc"] = roc_auc
#     
#     # Calculate AUC-PR
#     precision, recall, _ = precision_recall_curve(y_true, predicted_probabilities)
#     pr_auc = auc(recall, precision)
#     # append to dictionary
#     results_gene["pr_auc"] = pr_auc
#     
#     # Calculate F1 Score
#     f1 = f1_score(y_true, predicted_labels)
#     # append to dictionary
#     results_gene["f1"] = f1
#     
#     # Calculate Log Loss
#     logloss = log_loss(y_true, predicted_probabilities)
#     # append to dictionary
#     results_gene["logloss"] = logloss
#     
#     # Calculate confusion matrix
#     conf_matrix = confusion_matrix(y_true, predicted_labels)
#     # append to dictionary
#     results_gene["confusion_matrix"] = conf_matrix
#     
#     # Get gene name for the use for the dictionary
#     gene = gene_df['gene'].unique()[0]
#     # Append results to dictionary
#     results[gene] = results_gene
#     
# ## Print results
# print("Results of EVE predictions for each gene:")
# for gene in results:
#     print(f"{gene}: {results[gene]}")

In [56]:
# # present the confusion matrix for each gene
# import matplotlib.pyplot as plt
# 
# for gene in results:
#     # Plot confusion matrix
#     plt.imshow(results[gene]["confusion_matrix"], cmap=plt.cm.Blues)
#     plt.xlabel("Predicted labels")
#     plt.ylabel("True labels")
#     plt.xticks([0, 1], ["Benign", "Pathogenic"])
#     plt.yticks([0, 1], ["Benign", "Pathogenic"])
#     plt.title(f"Confusion matrix for {gene}, EVE predictions")
#     plt.style.use("seaborn-white")
#     # Add text annotations
#     for i in range(2):
#         for j in range(2):
#             plt.text(j, i, results[gene]["confusion_matrix"][i, j], ha="center", va="center", color="black", backgroundcolor="white")
#     plt.colorbar()
#     plt.show()

In [57]:
# present the ROC curve for each gene
# for gene in results:
#     # Plot ROC curve
#     plt.plot(fpr, tpr, color="darkorange", lw=2, label=f"ROC curve (area = {results[gene]['roc_auc']:.2f})")
#     plt.plot([0, 1], [0, 1], color="navy", lw=2, linestyle="--")
#     plt.xlim([0.0, 1.0])
#     plt.ylim([0.0, 1.05])
#     plt.xlabel("False Positive Rate")
#     plt.ylabel("True Positive Rate")
#     plt.title(f"ROC curve for {gene}, MutPred2 predictions")
#     plt.legend(loc="lower right")
#     plt.show()