# InSyBio - Data for Harry

In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, fbeta_score, roc_auc_score, classification_report
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.neighbors import LocalOutlierFactor
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_classif
from sklearn.naive_bayes import GaussianNB
from sklearn import tree
from sklearn import svm
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.utils.validation import check_X_y, check_array, check_is_fitted
from sklearn.utils.multiclass import unique_labels
import xgboost as xgb
from sklearn.ensemble import IsolationForest
from sklearn import preprocessing
from sklearn.impute import KNNImputer
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn import linear_model
from sklearn.metrics import recall_score
from sklearn.metrics import make_scorer
from sklearn.model_selection import ParameterGrid

In [3]:
from sklearn import preprocessing
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import SimpleImputer

def Preprocessing(params, X, y): # FULL
    # Normalize=1
    normalize = params['normalization']
    if (normalize == 'Yes'):
        # Subset of columns to transform
        already_normalized = (X.max() == 1) & (X.min() == 0)
        set1 = set(X.columns)
        set2 = set(already_normalized)
        cols = list(set1 - set2)
        X_normalized = X
        X_normalized.loc[:,cols] = preprocessing.MinMaxScaler(feature_range=(0,1)).fit_transform(X_normalized[cols])
    else:
        X_normalized = X
        print('No normalization.')

    # Imputation
    imputation_method = params['imputation_method']
    #print(imputation_method)
    X_imputed = X_normalized
    if (imputation_method == 'zero'):
        X_imputed = X
        X_imputed.update(X_imputed.fillna(0))
    elif (imputation_method == 'simple'):
        imputer = SimpleImputer(strategy='mean') # 'mean' 'most_frequent'

        X_imputed = imputer.fit_transform(X)
        X_imputed = pd.DataFrame(X_imputed, columns = imputer.get_feature_names_out()) 
    elif (imputation_method == 'knn'):
       # Define KNN imputer and fill missing values
        imputer = KNNImputer(n_neighbors=5, weights='uniform', metric='nan_euclidean')

        X_imputed = imputer.fit_transform(X)
        X_imputed = pd.DataFrame(X_imputed, columns = imputer.get_feature_names_out())
    elif (imputation_method == 'mice'):
        # Define MICE Imputer and fill missing values
        imputer = IterativeImputer(estimator=linear_model.BayesianRidge(), max_value = 1, min_value=0, n_nearest_features=None, imputation_order='ascending', max_iter=50)

        X_imputed = imputer.fit_transform(X)
        X_imputed = pd.DataFrame(X_imputed, columns=imputer.get_feature_names_out())
    else:
        X_imputed = X
        print('No imputation.')
                
    # Instance selection
    instance_selection = params['instance_selection']
    #print(instance_selection)
    if (imputation_method != 'None'):
        if (instance_selection == 'local'):
            lof = LocalOutlierFactor()
            yhat = lof.fit_predict(X_imputed)

            # select all rows that are not outliers
            mask = pd.Series(yhat != -1)

            X_imputed = X_imputed[mask.values]
            y_imputed = y[mask.values]
        elif (instance_selection == 'isolation_forest'):
            clf = IsolationForest(n_estimators=10, warm_start=True)
            yhat = clf.fit_predict(X_imputed)  # fit 10 trees  
            #clf.set_params(n_estimators=20)  # add 10 more trees  
            #clf.fit(X)  # fit the added trees  

            # select all rows that are not outliers
            mask = pd.Series(yhat != -1)

            X_imputed = X_imputed[mask.values]
            y_imputed = y[mask.values]
        else:
            print('No instance selection.')
            y_imputed = y
    else:
            print('No instance selection without imputation.')
            y_imputed = y
        
    # Feature selection
    feature_selection = params['feature_selection']
    #print(feature_selection)
    if (feature_selection == 'best'):       
        X_new = SelectKBest(f_classif, k=params['k']).fit(X_imputed, y_imputed)
        full_filtered_features = X_new.get_feature_names_out()
        print(full_filtered_features)
        X_imputed = X_imputed[full_filtered_features]
    else:
        print('No feature selection.')
        
    return X_imputed, y_imputed

In [6]:
df_train = pd.read_csv("C:\\Users\\harry\\Desktop\\train_ds.csv")
df_test = pd.read_csv("C:\\Users\\harry\\Desktop\\test_dataset\\TEST_DATASET_PROCESSED.csv")
#df_test=df_test.drop(['uidA', 'uidB'])

features_all = ['BP_similarity', 'MF_similarity', 'CC_similarity',  # drop homologous features
            'Exists in MINT?', 'Exists in DIP?', 'Exists in APID?',
            'Exists in BIOGRID?', 'Sequence_similarity', 'pfam_interaction',
            'MW dif', 'Aromaticity dif', 'Instability dif', 'helix_fraction_dif', 'turn_fraction_dif',
            'sheet_fraction_dif', 'cys_reduced_dif', 'gravy_dif', 'ph7_charge_dif', 'A %', 'L %', 'F %', 'I %', 'M %', 'V %',
            'S %', 'P %', 'T %', 'Y %', 'H %', 'Q %', 'N %', 'K %', 'D %', 'E %', 'C %', 'W %', 'R %', 'G %', 
            'GSE227375_spearman', 'GSE228702_spearman']

features_not_exists = ['BP_similarity', 'MF_similarity', 'CC_similarity',
            'Sequence_similarity', 'pfam_interaction',
            'MW dif', 'Aromaticity dif', 'Instability dif', 'helix_fraction_dif', 'turn_fraction_dif',
            'sheet_fraction_dif', 'cys_reduced_dif', 'gravy_dif', 'ph7_charge_dif',  'A %', 'L %', 'F %', 'I %', 'M %', 'V %',
            'S %', 'P %', 'T %', 'Y %', 'H %', 'Q %', 'N %', 'K %', 'D %', 'E %', 'C %', 'W %', 'R %', 'G %', 
            'GSE227375_spearman', 'GSE228702_spearman']

X_train_all = df_train[features_all]
y_train_all = df_train['PPI_type']

X_train_not_exists = df_train[features_not_exists]
y_train_not_exists = df_train['PPI_type']

X_test_all = df_test[features_all]
y_test_all = df_test['PPI_type']

X_test_not_exists = df_test[features_not_exists]
y_test_not_exists = df_test['PPI_type']

In [7]:
from scipy import stats

for i in features_all:
    value = stats.ks_2samp(X_train_all[i], X_test_all[i])
    if (value[1] <= 0.05):
        print(i, value[1]) # Same distribution...

BP_similarity 5.320367398649628e-116
MF_similarity 5.320367398649628e-116
CC_similarity 5.320367398649628e-116
Exists in MINT? 0.007107157790916954
Exists in APID? 7.285269393806746e-12
Exists in BIOGRID? 3.313452490087753e-75
pfam_interaction 9.8095001936916e-78
MW dif 0.0
Aromaticity dif 0.0
Instability dif 0.0
helix_fraction_dif 0.0
turn_fraction_dif 3.1595208300568364e-141
sheet_fraction_dif 0.0
cys_reduced_dif 0.0
gravy_dif 0.0
ph7_charge_dif 0.0
A % 0.0
L % 0.0
F % 0.0
I % 0.0
M % 0.0
V % 0.0
S % 0.0
P % 0.0
T % 0.0
Y % 0.0
H % 0.0
Q % 0.0
N % 1.8357566280008424e-145
K % 0.0
D % 0.0
E % 0.0
C % 0.0
W % 0.0
R % 0.0
G % 0.0
GSE227375_spearman 0.0
GSE228702_spearman 0.0


In [8]:
params = {'imputation_method': 'simple',
          'normalization': 'Yes',
          'feature_selection': 'None',
          'k': 0,
          'instance_selection': 'None',
          'ml_algorithm': 'xgboost'}

In [9]:
X_imputed_train, y_imputed_train = Preprocessing(params, X_train_all, y_train_all)
    
clf_FULL = xgb.XGBClassifier(n_estimators=50, objective='binary:logistic', random_state=1, 
                             tree_method='hist', eta=0.3, gamma=0.4, max_depth=10)
clf_FULL.fit(X_imputed_train, y_imputed_train)

conf_matrix = confusion_matrix(y_train_all, clf_FULL.predict(X_train_all))

# Calculate specificity
tn, fp, fn, tp = conf_matrix.ravel()
specificity = tn / (tn + fp)

conf_matrix_test = confusion_matrix(y_test_all, clf_FULL.predict(X_test_all))

# Calculate specificity
tn, fp, fn, tp = conf_matrix_test.ravel()
specificity_test = tn / (tn + fp)
print("\ Metrics for Training Set:")
print("Accuracy:", accuracy_score(y_train_all, clf_FULL.predict(X_train_all)))
print("Specificity:", specificity)
print("Recall:", recall_score(y_train_all, clf_FULL.predict(X_train_all)))
print("F1 Score:", f1_score(y_train_all, clf_FULL.predict(X_train_all)))
print("F2 Score:", fbeta_score(y_train_all, clf_FULL.predict(X_train_all), beta=2))
print("ROC-AUC Score:", roc_auc_score(y_train_all, clf_FULL.predict(X_train_all)))
print("\ Metrics for Test Set:")
print("Accuracy:", accuracy_score(y_test_all, clf_FULL.predict(X_test_all)))
print("Specificity:", specificity_test)
print("Recall:", recall_score(y_test_all, clf_FULL.predict(X_test_all)))
print("F1 Score:", f1_score(y_test_all, clf_FULL.predict(X_test_all)))
print("F2 Score:", fbeta_score(y_test_all, clf_FULL.predict(X_test_all), beta=2))
print("ROC-AUC Score:", roc_auc_score(y_test_all, clf_FULL.predict(X_test_all)))

clf_FULL.save_model('train_full_features.json')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_normalized.loc[:,cols] = preprocessing.MinMaxScaler(feature_range=(0,1)).fit_transform(X_normalized[cols])
  X_normalized.loc[:,cols] = preprocessing.MinMaxScaler(feature_range=(0,1)).fit_transform(X_normalized[cols])


No instance selection.
No feature selection.
\ Metrics for Training Set:
Accuracy: 0.8814064362336115
Specificity: 1.0
Recall: 0.6442193087008343
F1 Score: 0.7836172526277637
F2 Score: 0.693571153599384
ROC-AUC Score: 0.8221096543504172
\ Metrics for Test Set:
Accuracy: 0.7562217416945154
Specificity: 0.9863164517143755
Recall: 0.5261813035616008
F1 Score: 0.6834137499680886
F2 Score: 0.5795124907996708
ROC-AUC Score: 0.7562488776379882


In [7]:
from sklearn.model_selection import cross_validate

def specificity_scorer(y_true, y_pred):
    tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel()
    specificity = tn / (tn + fp)
    return specificity
scoring = {
    'accuracy': make_scorer(accuracy_score),
    'specificity': make_scorer(specificity_scorer),
    'precision': make_scorer(precision_score),
    'recall': make_scorer(recall_score),
    'f1': make_scorer(f1_score),
    'f2': make_scorer(lambda y_true, y_pred: fbeta_score(y_true, y_pred, beta=2)),
    'roc_auc': make_scorer(roc_auc_score)
}
cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=1)
results = cross_validate(estimator=clf_FULL, X=X_imputed_train, y=y_imputed_train, 
                cv=cv,scoring=scoring, n_jobs = 1, verbose = 2)
print("CV Mean Metrics:")
for metric, values in results.items():
    mean_value = np.mean(values)
    print(f"{metric}: {mean_value}")

print("\nCV Standard Deviations:")
for metric, values in results.items():
    std_value = np.std(values)
    print(f"{metric}: {std_value}")



[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV] END .................................................... total time=   0.2s


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.2s remaining:    0.0s


[CV] END .................................................... total time=   0.2s
[CV] END .................................................... total time=   0.2s
[CV] END .................................................... total time=   0.2s
[CV] END .................................................... total time=   0.2s
[CV] END .................................................... total time=   0.2s
[CV] END .................................................... total time=   0.3s
[CV] END .................................................... total time=   0.2s
[CV] END .................................................... total time=   0.2s
[CV] END .................................................... total time=   0.3s
CV Mean Metrics:
fit_time: 0.30356712341308595
score_time: 0.028536319732666016
test_accuracy: 0.9707994098898671
test_specificity: 0.9657364960909739
test_precision: 0.93483821336077
test_recall: 0.9809345594525235
test_f1: 0.9572793806278668
test_f2: 0.9713200805379005

[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:    3.3s finished
