In [104]:
from pathlib import PureWindowsPath

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import xgboost as xgb
from imblearn.over_sampling import SMOTE
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, matthews_corrcoef, roc_auc_score, roc_curve
from sklearn.model_selection import train_test_split

In [105]:
# Load the per-gene feature table and derive a human-readable gene label
# (`gene`) that later cells use in report text and in the output file name.
filename = 'C:\\Users\\InbarBlech\\PycharmProjects\\Thesis\\gene_specific_df\\SLC26A4_with_position.csv'
data = pd.read_csv(filename)

# The label must come from the file name only. Searching the full path for the
# first underscore would stop inside the directory name "gene_specific_df" and
# yield a path fragment instead of the gene symbol. PureWindowsPath accepts
# both "\\" and "/" separators, so this parses the same way on any OS.
file_stem = PureWindowsPath(filename).stem

# The label is the part of the file name before the first underscore
# (the whole stem when there is no underscore).
gene = file_stem.split("_", 1)[0]

# Multi-gene tables get a descriptive label. "combined_with_source" shares the
# "combined" prefix, so it is covered by the first check.
if gene == "combined":
    gene = "7 genes"
elif gene == "features":
    gene = "190 genes"

data

Unnamed: 0,gene,variant,pathogenicity,uniprot_id,stability_WT,stability_MUT,blosum,hydrophobicity_WT,hydrophobicity_MUT,volume_WT,...,oda_MUT,sasa_WT,sasa_MUT,RSA_WT,RSA_MUT,oda_delta,sasa_delta,pssm,entropy,position
0,SLC26A4,G5R,benign,O43511,212.020,211.846,-2,-0.4,-4.5,60.1,...,3.71,76.60,198.67,0.789691,0.749698,3.52,122.07,3.169,2.538,5
1,SLC26A4,G6S,benign,O43511,212.020,212.039,0,-0.4,-0.8,60.1,...,1.69,72.69,108.13,0.749381,0.756154,1.71,35.44,2.239,2.903,6
2,SLC26A4,S8L,benign,O43511,210.619,210.978,-2,-0.8,3.8,89.0,...,-6.48,101.96,149.01,0.713007,0.780157,-7.14,47.05,2.329,3.161,8
3,SLC26A4,S17R,benign,O43511,211.182,212.727,-1,-0.8,-4.5,89.0,...,-15.41,107.43,198.47,0.751259,0.748943,1.78,91.04,2.589,2.106,17
4,SLC26A4,M21V,benign,O43511,211.925,211.821,1,1.9,4.2,162.9,...,-22.06,176.08,127.29,0.867389,0.771455,-6.77,-48.79,-0.901,2.771,21
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
345,SLC26A4,S642P,benign,,211.795,213.213,-1,-0.8,-1.6,89.0,...,-1.06,56.04,71.40,0.391888,0.463636,-1.04,15.36,3.696,2.115,642
346,SLC26A4,G740S,benign,,211.894,211.168,0,-0.4,-0.8,60.1,...,0.70,66.51,93.78,0.685670,0.655804,0.82,27.27,-3.738,1.271,740
347,SLC26A4,G6V,benign,,212.020,212.059,-3,-0.4,4.2,60.1,...,-1.75,72.69,139.56,0.749381,0.845818,-1.73,66.87,2.239,2.903,6
348,SLC26A4,I300L,benign,,208.623,207.343,2,4.5,3.8,166.7,...,-34.76,72.14,80.09,0.369949,0.419319,2.44,7.95,4.521,0.166,300


In [106]:

# Prepare the modelling table: one-hot encode the categorical feature, encode
# the label as 0/1, and drop columns that are not used as model inputs.
# NOTE: this cell rebinds `data` and consumes the "secondary_structure" column,
# so re-run the loading cell before running this cell a second time.

# One-hot encode the secondary-structure annotation.
data = pd.get_dummies(data, columns=["secondary_structure"])

# Encode the target: benign -> 0, pathogenic -> 1.
mapping = {"benign": 0, "pathogenic": 1}
encoded_labels = data["pathogenicity"].map(mapping)

# Series.map turns every value missing from `mapping` into NaN. Fail fast
# instead of letting NaN labels flow silently into training and evaluation.
unmapped_rows = encoded_labels.isna()
if unmapped_rows.any():
    unexpected = data.loc[unmapped_rows, "pathogenicity"].unique().tolist()
    raise ValueError(
        f"Unexpected pathogenicity labels {unexpected}; expected one of {list(mapping)}"
    )
data["pathogenicity"] = encoded_labels

# Remove identifiers and the raw WT/MUT columns listed below.
data = data.drop(
    columns=["uniprot_id", "stability_WT", "stability_MUT", "hydrophobicity_WT", "hydrophobicity_MUT", "volume_WT",
             "volume_MUT", "sequence_length", "oda_MUT", "oda_WT", "sasa_WT", "sasa_MUT", "RSA_MUT", "gene",
             "protein_contain_transmembrane", "is_residue_transmembranal", "aa_WT", "aa_MUT"])

data

Unnamed: 0,variant,pathogenicity,blosum,plddt_residue,stability_delta,hydrophobicity_delta,volume_delta,RSA_WT,oda_delta,sasa_delta,pssm,entropy,position,secondary_structure_Loop
0,G5R,0,-2,42.05,-0.174,-4.1,113.3,0.789691,3.52,122.07,3.169,2.538,5,1
1,G6S,0,0,43.98,0.019,-0.4,28.9,0.749381,1.71,35.44,2.239,2.903,6,1
2,S8L,0,-2,50.88,0.359,4.6,77.7,0.713007,-7.14,47.05,2.329,3.161,8,1
3,S17R,0,-1,57.11,1.545,-3.7,84.4,0.751259,1.78,91.04,2.589,2.106,17,1
4,M21V,0,1,85.90,-0.104,2.3,-22.9,0.867389,-6.77,-48.79,-0.901,2.771,21,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
345,S642P,0,-1,69.80,1.418,-0.8,23.7,0.391888,-1.04,15.36,3.696,2.115,642,1
346,G740S,0,0,41.01,-0.726,-0.4,28.9,0.685670,0.82,27.27,-3.738,1.271,740,1
347,G6V,0,-3,43.98,0.039,4.6,79.9,0.749381,-1.73,66.87,2.239,2.903,6,1
348,I300L,0,2,80.13,-1.280,-0.7,0.0,0.369949,2.44,7.95,4.521,0.166,300,1


In [107]:
# structural_features = ["oda_delta", "stability_delta", "secondary_structure_Beta strand", "secondary_structure_Helix", "secondary_structure_Loop", "sasa_delta", "RSA_WT"]
# # Drop structural features
# data = data.drop(labels=["stability_delta"], axis=1, inplace=False)

In [1]:

# Leave-one-position-out (LOPO) cross-validation.
# For every distinct residue position, all variants at that position form the
# test set and all remaining variants form the training set. The training set
# is oversampled with SMOTE, an XGBoost classifier is fitted, and the
# per-position confusion counts are accumulated into overall metrics.


def _safe_ratio(numerator, denominator):
    """Return numerator / denominator, or NaN when the denominator is zero."""
    return numerator / denominator if denominator else float("nan")


# Create list of all positions.
positions = data["position"].unique()

# Columns that identify a variant or hold the label; they are not model inputs.
non_feature_columns = ["variant", "pathogenicity", "position"]

# Per-position confusion-matrix counts.
tps = []
fps = []
fns = []
tns = []

# One record per held-out variant. The DataFrame is built once after the loop:
# DataFrame.append was removed in pandas 2.0 and row-wise growth is quadratic.
prediction_records = []

for counter, pos in enumerate(positions, start=1):
    print(f"Position: {pos} ({counter}/{len(positions)})")

    # Create train and test sets.
    is_held_out = data["position"] == pos
    train = data[~is_held_out]
    test = data[is_held_out]

    X_test = test.drop(columns=non_feature_columns)
    y_test = test["pathogenicity"]

    # Oversample the minority class of the train set using SMOTE.
    X_train = train.drop(columns=non_feature_columns)
    y_train = train["pathogenicity"]
    oversample = SMOTE(sampling_strategy='minority', random_state=42)
    X_train_resampled, y_train_resampled = oversample.fit_resample(X_train, y_train)

    xgb_classifier = xgb.XGBClassifier(learning_rate=0.1, max_depth=3, n_estimators=100, random_state=42)
    xgb_classifier.fit(X_train_resampled, y_train_resampled)  # Fit the model with the resampled data
    y_pred = xgb_classifier.predict(X_test)

    # Record the prediction for every variant at the held-out position.
    # y_pred has exactly one entry per row of `test`, in the same order.
    prediction_records.extend(
        {"variant": variant, "prediction": prediction, "reality": reality}
        for variant, prediction, reality in zip(test["variant"], y_pred, y_test)
    )

    tp = int(((y_test == 1) & (y_pred == 1)).sum())
    fp = int(((y_test == 0) & (y_pred == 1)).sum())
    fn = int(((y_test == 1) & (y_pred == 0)).sum())
    tn = int(((y_test == 0) & (y_pred == 0)).sum())

    print(f"TP: {tp}, FP: {fp}, TN: {tn}, FN: {fn}")

    tps.append(tp)
    fps.append(fp)
    fns.append(fn)
    tns.append(tn)

    print(f"tps: {sum(tps)}, fps: {sum(fps)}, fns: {sum(fns)}, tns: {sum(tns)}")

    print(f"Prediction: {y_pred}. Reality: {y_test.values}")

# Aggregate the confusion matrix over all held-out positions.
TP = sum(tps)
FP = sum(fps)
FN = sum(fns)
TN = sum(tns)

# Matthews correlation coefficient. It is undefined (NaN) when any marginal of
# the confusion matrix is empty, which makes the denominator zero.
mcc = _safe_ratio(TP * TN - FP * FN, np.sqrt((TP + FP) * (TP + FN) * (TN + FP) * (TN + FN)))

# Each label is paired with its own count (TN with TN, FN with FN).
print(f"TP: {TP}, FP: {FP}, TN: {TN}, FN: {FN}")

sensitivity = _safe_ratio(TP, TP + FN)
specificity = _safe_ratio(TN, TN + FP)
precision = _safe_ratio(TP, TP + FP)
accuracy = _safe_ratio(TP + TN, TP + TN + FP + FN)
print(f"Results for {gene}:")
print(f"Sensitivity (Recall): {sensitivity}")
print(f"Specificity: {specificity}")
print(f"Precision: {precision}")
print(f"MCC for {gene}: {mcc}")
print(f"Accuracy: {accuracy}")

# Create prediction dataframe and save it to a csv file.
predictions_df = pd.DataFrame(prediction_records, columns=["variant", "prediction", "reality"])
predictions_df.to_csv(f"{gene}_predictions_LOPO_XGB.csv", index=False)

NameError: name 'data' is not defined