In [6]:
from pathlib import PureWindowsPath

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import xgboost as xgb
from imblearn.over_sampling import SMOTE
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, matthews_corrcoef, roc_auc_score, roc_curve
from sklearn.model_selection import train_test_split

In [7]:
# Path of the variant table analysed by this notebook.
# NOTE(review): absolute, machine-specific path — consider a configurable data directory.
filename = 'C:\\Users\\InbarBlech\\PycharmProjects\\Thesis\\gene_specific_df\\GJB2_combined_with_source_and_position.csv'
data = pd.read_csv(filename)

# Label used in the result printouts: the part of the file's base name that
# precedes the first underscore (e.g. "GJB2"). The base name is isolated first
# because searching the whole path stops at the underscore of a parent folder
# such as "gene_specific_df" and yields a path fragment instead of the gene.
# PureWindowsPath splits on both "\" and "/" regardless of the host OS.
gene = PureWindowsPath(filename).stem.partition('_')[0]

# Files whose name starts with "combined" / "features" hold several genes;
# give them a descriptive label instead of the file-name prefix.
if gene == "combined":
    gene = "7 genes"
if gene == "features":
    gene = "190 genes"

# Numeric encoding of the target label. Series.map turns any label that is
# not a key of this dict into NaN.
mapping = {"benign": 0, "pathogenic": 1}

# Columns removed before modelling.
columns_to_discard = [
    "source", "uniprot_id", "stability_WT", "stability_MUT", "hydrophobicity_WT",
    "hydrophobicity_MUT", "volume_WT", "volume_MUT", "sequence_length", "oda_MUT",
    "oda_WT", "sasa_WT", "sasa_MUT", "RSA_MUT", "variant", "gene",
    "protein_contain_transmembrane", "is_residue_transmembranal", "aa_WT", "aa_MUT",
]

# 1) one-hot encode the categorical "secondary_structure" column,
# 2) replace the textual pathogenicity label with its numeric code,
# 3) remove the columns listed above.
data = (
    pd.get_dummies(data, columns=["secondary_structure"])
    .assign(pathogenicity=lambda frame: frame["pathogenicity"].map(mapping))
    .drop(columns=columns_to_discard)
)

data

Unnamed: 0,stability_delta,pathogenicity,blosum,hydrophobicity_delta,volume_delta,plddt_residue,oda_delta,sasa_delta,RSA_WT,pssm,entropy,position,secondary_structure_Beta strand,secondary_structure_Helix,secondary_structure_Loop,secondary_structure_Turn
0,4.8675,1,-3,-3.6,-54.4,93.72,-0.47,2.03,0.033636,6.426,0.000,77,0,1,0,0
1,-0.0958,1,-3,7.0,-64.9,66.22,-0.11,-106.94,0.712264,4.554,1.874,127,0,0,1,0
2,2.7659,1,-3,-8.0,-52.6,72.33,0.21,3.66,0.022205,4.126,1.426,121,0,0,1,0
3,2.1518,1,-2,-8.3,6.7,92.46,16.80,26.31,0.131414,4.650,0.040,81,0,1,0,0
4,-1.0403,1,0,2.2,28.5,95.06,2.20,20.58,0.258144,4.448,1.984,200,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
272,-0.6363,0,0,2.2,28.5,67.75,4.18,17.56,0.477423,2.786,2.657,220,0,0,1,0
273,-0.7241,0,-2,-3.5,108.5,67.75,3.70,81.36,0.477423,2.786,2.657,220,0,0,1,0
274,0.1560,0,2,-0.6,4.8,58.02,1.52,40.66,0.513261,3.017,2.023,221,0,0,1,0
275,-1.0018,0,2,-0.6,4.8,45.22,-4.47,21.13,0.752957,4.153,2.317,223,0,0,1,0


In [8]:
# Remove the structure-derived features so that the model below is trained
# without them.
# NOTE(review): "secondary_structure_Turn" is not in this list and therefore
# stays in the frame — confirm that this is intended.
structural_columns = [
    "oda_delta",
    "stability_delta",
    "secondary_structure_Beta strand",
    "secondary_structure_Helix",
    "secondary_structure_Loop",
    "sasa_delta",
    "RSA_WT",
]
data = data.drop(columns=structural_columns)

In [9]:

# Leave-one-position-out evaluation: each distinct value of the "position"
# column is held out once; a model is trained on all remaining rows and then
# used to predict the held-out rows.
positions = data["position"].unique()

# Per-position confusion counts; one entry is appended per held-out position.
tps, fps, fns, tns = [], [], [], []
errors = []
mistakes = 0

counter = 0

# Neither the label nor the grouping column is used as a model feature.
label_and_group_columns = ["pathogenicity", "position"]

for counter, pos in enumerate(positions, start=1):
    print(f"Position: {pos} ({counter}/{len(positions)})")

    # Rows at the current position form the test set, all other rows the train set.
    is_held_out = data["position"] == pos
    train = data[~is_held_out]
    test = data[is_held_out]

    X_test = test.drop(columns=label_and_group_columns)
    y_test = test["pathogenicity"]
    X_train = train.drop(columns=label_and_group_columns)
    y_train = train["pathogenicity"]

    # Oversample the minority class of the training set only, using SMOTE.
    oversample = SMOTE(sampling_strategy='minority', random_state=42)
    X_train_resampled, y_train_resampled = oversample.fit_resample(X_train, y_train)
    # class_distribution = y_train_resampled.value_counts()
    # print(f"Training set: (SMOTE)\n{class_distribution}")

    # A fresh classifier is fitted on the resampled training data for every held-out position.
    xgb_classifier = xgb.XGBClassifier(learning_rate=0.1, max_depth=3, n_estimators=100, random_state=42)
    xgb_classifier.fit(X_train_resampled, y_train_resampled)
    y_pred = xgb_classifier.predict(X_test)

    # Confusion counts for the held-out rows (label 1 is the positive class).
    actual_pos, actual_neg = y_test == 1, y_test == 0
    predicted_pos, predicted_neg = y_pred == 1, y_pred == 0
    tp = sum(actual_pos & predicted_pos)
    fp = sum(actual_neg & predicted_pos)
    fn = sum(actual_pos & predicted_neg)
    tn = sum(actual_neg & predicted_neg)

    print(f"TP: {tp}, FP: {fp}, TN: {tn}, FN: {fn}")

    for bucket, count in zip((tps, fps, fns, tns), (tp, fp, fn, tn)):
        bucket.append(count)

    # Running totals over the positions processed so far.
    print(f"tps: {sum(tps)}, fps: {sum(fps)}, fns: {sum(fns)}, tns: {sum(tns)}")

    print(f"Prediction: {y_pred}. Reality: {y_test.values}")
    # # print error if at least one prediction is wrong
    # if not np.array_equal(y_pred, y_test.values):
    #     print(f"Classification is wrong for position {pos}!")
    #     for i in range(len(y_pred)):
    #         if y_pred[i] != y_test.values[i]:
    #             mistakes+=1
    #             print(f"Predicted: {y_pred[i]}, reality: {y_test.values[i]}")

# Aggregate the per-position confusion counts into overall totals.
# int() guards against fixed-width overflow in the four-term product below
# in case the accumulated counts are NumPy integers.
TP = int(sum(tps))
FP = int(sum(fps))
FN = int(sum(fns))
TN = int(sum(tns))


def _ratio(numerator, denominator):
    """Return numerator / denominator, or NaN when the denominator is zero."""
    return numerator / denominator if denominator else float("nan")


# Matthews correlation coefficient computed from the pooled confusion matrix.
# It is undefined (NaN) when any row or column of the matrix sums to zero.
mcc = _ratio(
    TP * TN - FP * FN,
    np.sqrt(float((TP + FP) * (TP + FN) * (TN + FP) * (TN + FN))),
)

# Each label is printed next to its own total (TN and FN must not be swapped).
print(f"TP: {TP}, FP: {FP}, TN: {TN}, FN: {FN}")

sensitivity = _ratio(TP, TP + FN)  # recall / true-positive rate
specificity = _ratio(TN, TN + FP)  # true-negative rate
precision = _ratio(TP, TP + FP)    # positive predictive value
accuracy = _ratio(TP + TN, TP + TN + FP + FN)
print(f"Results for {gene}:")
print(f"Sensitivity (Recall): {sensitivity}")
print(f"Specificity: {specificity}")
print(f"Precision: {precision}")
print(f"MCC for {gene}: {mcc}")
print(f"Accuracy: {accuracy}")

Position: 77 (1/149)
TP: 1, FP: 0, TN: 0, FN: 0
tps: 1, fps: 0, fns: 0, tns: 0
Prediction: [1]. Reality: [1]
Position: 127 (2/149)
TP: 2, FP: 0, TN: 1, FN: 0
tps: 3, fps: 0, fns: 0, tns: 1
Prediction: [1 1 0]. Reality: [1 1 0]
Position: 121 (3/149)
TP: 1, FP: 0, TN: 0, FN: 0
tps: 4, fps: 0, fns: 0, tns: 1
Prediction: [1]. Reality: [1]
Position: 81 (4/149)
TP: 2, FP: 0, TN: 0, FN: 0
tps: 6, fps: 0, fns: 0, tns: 1
Prediction: [1 1]. Reality: [1 1]
Position: 200 (5/149)
TP: 0, FP: 0, TN: 0, FN: 1
tps: 6, fps: 0, fns: 1, tns: 1
Prediction: [0]. Reality: [1]
Position: 46 (6/149)
TP: 2, FP: 0, TN: 0, FN: 0
tps: 8, fps: 0, fns: 1, tns: 1
Prediction: [1 1]. Reality: [1 1]
Position: 45 (7/149)
TP: 2, FP: 0, TN: 0, FN: 0
tps: 10, fps: 0, fns: 1, tns: 1
Prediction: [1 1]. Reality: [1 1]
Position: 82 (8/149)
TP: 2, FP: 0, TN: 0, FN: 0
tps: 12, fps: 0, fns: 1, tns: 1
Prediction: [1 1]. Reality: [1 1]
Position: 151 (9/149)
TP: 1, FP: 0, TN: 0, FN: 0
tps: 13, fps: 0, fns: 1, tns: 1
Prediction: [1]. R

In [10]:
# # # Access feature importances
# feature_importances = xgb_classifier.feature_importances_
# 
# # Calculate the confusion matrix
# conf_matrix = confusion_matrix(y_test, y_pred)
# 
# # Overwrite the matrix cells with the TP, FN, TN and FP counts aggregated over all positions
# conf_matrix[1, 1] = TP
# conf_matrix[1, 0] = FN
# conf_matrix[0, 0] = TN
# conf_matrix[0, 1] = FP
# 
# # Create a heatmap for visualization
# plt.figure(figsize=(8, 6))
# sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues', cbar=False)
# plt.xlabel('Predicted Labels')
# plt.ylabel('True Labels')
# plt.title(f'Confusion Matrix for {gene}')
# plt.show()