In [None]:
from autoencoders import *
from pdb_utils import *

import pickle
import numpy as np
import pandas as pd
import random

import numpy as np
import tensorflow as tf
from tensorflow.contrib.layers import fully_connected
from functools import partial
import sys

from Bio import PDB
import numpy as np
import os
import cv2

import matplotlib
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score, precision_score, recall_score
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import *
from sklearn.model_selection import GridSearchCV

import time

In [None]:
# Load the two pickled lookup tables used by the rest of the notebook:
#   fold_dict  - fold name -> collection of PDB ids (iterated later to build file paths)
#   label_dict - PDB id -> dot-separated label string
# NOTE: pickle.load can execute arbitrary code; only load pickles from a trusted source.
with open('pickle files/fold_groups.pkl', 'rb') as fold_groups_file:
    fold_dict = pickle.load(fold_groups_file)
with open('pickle files/label_dict.pkl', 'rb') as label_dict_file:
    label_dict = pickle.load(label_dict_file)

In [None]:
# Keep the first N_SELECTED_FOLDS folds (in fold_dict iteration order) whose
# member count lies in [MIN_FOLD_SIZE, MAX_FOLD_SIZE].
# Each entry of selected_folds is a (fold_name, member_count) tuple.
MIN_FOLD_SIZE, MAX_FOLD_SIZE = 10, 30
N_SELECTED_FOLDS = 5

selected_folds = [
    (fold, len(members))
    for fold, members in fold_dict.items()
    if MIN_FOLD_SIZE <= len(members) <= MAX_FOLD_SIZE
][:N_SELECTED_FOLDS]
len(selected_folds)

In [None]:
# Parse every file found in the SCOP test directory into a Bio.PDB structure.
TEST_PDB_DIR = "SCOP_Test"

# A single parser instance is reused for all files instead of being rebuilt
# on every iteration.
parser = PDB.PDBParser()

test_structures = []
# os.listdir returns entries in arbitrary order; sorting keeps the resulting
# list reproducible between runs and machines.
for pdb in sorted(os.listdir(TEST_PDB_DIR)):
    pdb_path = os.path.join(TEST_PDB_DIR, pdb)
    structure = parser.get_structure(pdb, pdb_path)
    test_structures.append(structure)

print(len(test_structures))

In [None]:
# Parse the PDB file of every member of the selected folds and report how
# long the whole pass took.
s_time = time.time()
structures = []

# One parser is enough for all files; creating it once avoids rebuilding it
# inside the nested loop.
parser = PDB.PDBParser()

for fold, _ in selected_folds:
    for pdb in fold_dict[fold]:
        # Files are expected at PDBs/<pdb id>.pdb
        pdb_path = os.path.join("PDBs", pdb + ".pdb")
        structure = parser.get_structure(pdb, pdb_path)
        structures.append(structure)
end = time.time()
nice_time(s_time, end)
print(len(structures))

In [None]:
# Experiment sweep: for every resize strategy, distance-matrix size and
# encoding size, compress the distance matrices with LinearAutoencoder and
# score a random-forest classifier on the encoded features.
# results[strategy] collects tuples of the form
#   ((filter_size, encoding_size, input_size), (mean_train_acc, mean_test_acc)).

# Number of outer train/test resamplings; also the divisor for the mean accuracies.
N_OUTER_SPLITS = 5

# Keyword arguments that differ between the resize strategies.
strategy_kwargs = {
    "strategy1": {"removeSymmetry": True},
    "strategy2": {"sample_size": 60},
    "strategy3": {},
}

# Random-forest hyper-parameter grid; identical for every run, so built once.
params = {'max_depth': [3, 4, 5, 6, 7, 8, 9, 10, 15, 20],
          'criterion': ('gini', 'entropy'),
          'warm_start': (True, False),
          'n_estimators': (10, 50, 100, 200, 500)}

results = {}
for strategy in ["strategy1", "strategy2", "strategy3"]:
    results[strategy] = []
    for filter_size in [32, 64, 96, 128, 256]:
        # Resize every structure's distance matrix with the current strategy.
        matrixdict_s1 = DistanceMatrixDict(
            structures,
            resize_strategy=strategy,
            resize_to=(filter_size, filter_size),
            **strategy_kwargs[strategy]
        )

        for encoding_size in [50, 100, 200, 500]:
            pdb_names, features = list(matrixdict_s1.keys()), list(matrixdict_s1.values())
            input_size = len(features[0])

            # AutoEncoder
            # NOTE(review): the positional 100 is presumably the epoch count -
            # confirm against LinearAutoencoder's signature.
            new_features, loss = LinearAutoencoder(features, input_size, encoding_size, 100, learning_rate=0.0001)

            # Preparing X and y
            # Group encoded vectors per structure: everything before "sample"
            # in an entry name is taken as the PDB id (names without "sample"
            # are used unchanged, since split() then returns the whole name).
            new_feature_dict = {}
            for idx, name in enumerate(pdb_names):
                pdb = name.split("sample")[0]
                new_feature_dict.setdefault(pdb, []).append(new_features[idx])

            # One averaged feature vector per structure; the class label is the
            # first two dot-separated fields of the structure's label string.
            X = []
            y_fold = []
            for pdb, vector in new_feature_dict.items():
                X.append(np.average(vector, axis=0))
                y_fold.append(".".join(label_dict[pdb].split(".")[:2]))

            # Sorted so the label -> id mapping (and therefore the stratified
            # splits below) does not depend on set/hash ordering.
            uniques = sorted(set(y_fold))
            group2id = dict(zip(uniques, range(len(uniques))))

            y = np.array([group2id[label] for label in y_fold])
            X = np.array(X)

            # Fold Classification with Random Forest
            train_acc = 0
            test_acc = 0

            sss = StratifiedShuffleSplit(n_splits=N_OUTER_SPLITS, test_size=0.2, random_state=42)
            for train_index, test_index in sss.split(X, y):
                X_test, y_test = X[test_index], y[test_index]
                X_rest, y_rest = X[train_index], y[train_index]

                # Carve a validation set (25%) out of the non-test samples.
                sss1 = StratifiedShuffleSplit(n_splits=1, test_size=0.25, random_state=42)
                a, b = next(sss1.split(X_rest, y_rest))
                X_train, y_train = X_rest[a], y_rest[a]
                X_validation, y_validation = X_rest[b], y_rest[b]

                # Hyperparameter Optimization with validation set
                rf = RandomForestClassifier(random_state=42)
                clf = GridSearchCV(rf, params, cv=2, refit=True)
                clf.fit(X_validation, y_validation)

                # Training best model with train set
                # Build a fresh forest from the winning hyper-parameters.
                # Calling fit() again on clf.best_estimator_ grows no new trees
                # when the winner has warm_start=True (the forest already holds
                # n_estimators trees), which would leave the model trained on
                # the validation split only.
                model = RandomForestClassifier(random_state=42, **clf.best_params_)
                model.fit(X_train, y_train)

                # Train and Test Accuracy Scores
                train_acc += model.score(X_train, y_train)
                test_acc += model.score(X_test, y_test)

            # Mean accuracy over the outer splits.
            train_acc = train_acc / float(N_OUTER_SPLITS)
            test_acc = test_acc / float(N_OUTER_SPLITS)

            # Saving Results
            results[strategy].append(((filter_size, encoding_size, input_size), (train_acc, test_acc)))
            print(((filter_size, encoding_size, input_size), (train_acc, test_acc)))

In [None]:
# Persist the experiment results.
# pickle writes bytes, so the file must be opened in binary mode, and
# pickle.dump takes the object first and the file handle second.
with open('pickle files/results.pkl', 'wb') as f:
    pickle.dump(results, f)