In [66]:
#!/usr/bin/python
import sys, math, numpy as np, pandas as pd

# /Users/ila/01-Unibo/02_Lab2/files_lab2_project/all_data/test_data/prediction/obs/
# /Users/ila/01-Unibo/02_Lab2/files_lab2_project/all_data/test_data/prediction/pred/

def get_three_cm():
    '''
    Creates an empty three-class (H, E, C) matrix.

    The returned 3x3 NumPy array holds only zeroes and is laid out as
    [P_HH, P_HE, P_HC]
    [P_EH, P_EE, P_EC]
    [P_CH, P_CE, P_CC]
    so that it can be handed to fill_three_class_m as the starting matrix.
    '''
    # Every call allocates a fresh array, so callers never share counts.
    return np.zeros(shape=(3, 3))

# Notebook-level 3x3 zero matrix; later cells pass it to fill_three_class_m.
a = get_three_cm()
# NOTE(review): this bare expression sits in the middle of the cell, so it is
# not rendered; only the last expression of a cell is displayed.
a

def read_dssp(dssp):
    '''
    Takes a two-line, fasta-like dssp file as input and returns the string
    describing the secondary structure.

    Parameters
    ----------
    dssp : str or path-like
        Path of the file to read. The first line is skipped (presumably the
        header/identifier line - verify against the data) and the second line
        is returned.

    Returns
    -------
    str
        The second line of the file WITHOUT its trailing line terminator, so
        that the length check and the per-residue pairing in
        fill_three_class_m are not thrown off by a final newline.

    Raises
    ------
    IndexError
        If the file has fewer than two lines (same exception type as indexing
        the list of lines would raise).
    '''
    with open(dssp) as ssfile:
        ssfile.readline()              # skip the first line
        ss_line = ssfile.readline()    # only the second line is needed

    # readline() returns '' only at end-of-file; an empty second line is '\n'.
    if ss_line == '':
        raise IndexError('file has no second line: {}'.format(dssp))
    return ss_line.rstrip('\r\n')


def fill_three_class_m(obs_file1, pred_file2, zero_matrix):
    '''
    Counts observed/predicted secondary-structure pairs into a 3x3 matrix.

    obs_file1 and pred_file2 are paths to two fasta-like files that are read
    with read_dssp; zero_matrix is the three-class matrix the counts are added
    to (it is updated in place and also returned).

    Rows are indexed by the observed class and columns by the predicted class,
    both in the order H, E, C. If the two strings differ in length an error
    message is printed and None is returned. A character pair outside H/E/C
    raises KeyError.
    '''
    observed = read_dssp(obs_file1)
    predicted = read_dssp(pred_file2)

    # Position-by-position comparison only makes sense for equal lengths.
    if len(observed) != len(predicted):
        print('Error: strings are NOT the same length\n observed:', obs_file1+'\n predicted:', pred_file2)
        return None

    # Map every "observed+predicted" pair to its (row, column) cell.
    classes = 'HEC'
    coord_di = {obs_class + pred_class: (row, col)
                for row, obs_class in enumerate(classes)
                for col, pred_class in enumerate(classes)}

    for obs_char, pred_char in zip(observed, predicted):
        zero_matrix[coord_di[obs_char + pred_char]] += 1
    return zero_matrix

In [19]:
# Length-mismatch demonstration: the two files belong to different entries
# (4uiq vs 1111), so fill_three_class_m prints its error message and returns
# None without touching `a` (see the recorded output below).
fill_three_class_m('/Users/ila/01-Unibo/02_Lab2/files_lab2_project/all_data/blindset/blind_dssp/4uiq.dssp', '/Users/ila/01-Unibo/02_Lab2/files_lab2_project/all_data/test_data/prediction/obs/1111.dssp', a)

Error: strings are NOT the same length
 observed: /Users/ila/01-Unibo/02_Lab2/files_lab2_project/all_data/blindset/blind_dssp/4uiq.dssp
 predicted: /Users/ila/01-Unibo/02_Lab2/files_lab2_project/all_data/test_data/prediction/obs/1111.dssp


## Transforming the 3 - Class Matrix

I decided to put the 3-class array into a pandas DataFrame. That way I can access the fields by row and column label. Each 3-class matrix must be converted as shown in the image.

![](./imgs/conversions.png)


In [3]:
# Fill the 3-class matrix from the observed and predicted files of entry 1111.
# NOTE(review): fill_three_class_m adds to `a` in place, so re-running this
# cell without first re-creating `a` via get_three_cm() double-counts.
a = fill_three_class_m('/Users/ila/01-Unibo/02_Lab2/files_lab2_project/all_data/test_data/prediction/obs/1111.dssp', '/Users/ila/01-Unibo/02_Lab2/files_lab2_project/all_data/test_data/prediction/pred/1111.dssp', a)
a
                   

array([[4., 0., 8.],
       [0., 2., 1.],
       [0., 2., 9.]])

### Converting the array a into pd df

In [4]:
# Label the 3x3 count matrix with the class letters. fill_three_class_m indexes
# cells as (observed, predicted), so the index (rows) is the observed class and
# the columns are the predicted class.
df_hec = pd.DataFrame(a, columns=['H', 'E', 'C'], index=['H', 'E', 'C'])
df_hec


Unnamed: 0,H,E,C
H,4.0,0.0,8.0
E,0.0,2.0,1.0
C,0.0,2.0,9.0


### Converting the **H** 3 Class Matrix to a 2 Class Matrix

In [5]:
# Initialise a 2x2 array of zeroes. It serves as the template for the three
# per-class (H, E, C) two-class matrices, which are made from copies of df2.
zero2 = np.zeros((2,2))
# NOTE(review): a bare expression in the middle of a cell is not displayed.
zero2

# Template DataFrame for a two-class matrix: rows are the observed state
# (obs_SS / obs_not), columns are the predicted state (pred_SS / pred_not).
df2 = pd.DataFrame(zero2, columns=['pred_SS', 'pred_not'], index=['obs_SS', 'obs_not'])
df2


Unnamed: 0,pred_SS,pred_not
obs_SS,0.0,0.0
obs_not,0.0,0.0


In [6]:
display(df_hec)

# One independent two-class matrix per secondary-structure class.
df2_h = df2.copy()
df2_e = df2.copy()
df2_c = df2.copy()

# Reduce the 3-class matrix to three 2-class matrices.
# Correct predictions m[0,0]: the diagonal cell of each class.
# Write with .loc[row, column]: chained indexing (df[col][row] = value) assigns
# into an intermediate object and is not guaranteed to update the frame (it
# never does under pandas Copy-on-Write).
df2_h.loc['obs_SS', 'pred_SS'] = df_hec.loc['H', 'H']
df2_e.loc['obs_SS', 'pred_SS'] = df_hec.loc['E', 'E']
df2_c.loc['obs_SS', 'pred_SS'] = df_hec.loc['C', 'C']
display(df2_h)
display(df2_e)
display(df2_c)

Unnamed: 0,H,E,C
H,4.0,0.0,8.0
E,0.0,2.0,1.0
C,0.0,2.0,9.0


Unnamed: 0,pred_SS,pred_not
obs_SS,4.0,0.0
obs_not,0.0,0.0


Unnamed: 0,pred_SS,pred_not
obs_SS,2.0,0.0
obs_not,0.0,0.0


Unnamed: 0,pred_SS,pred_not
obs_SS,9.0,0.0
obs_not,0.0,0.0


In [7]:
# Under-predictions m[0,1]: observed as the class but predicted as another one.
# Rows of df_hec are observed classes, columns are predicted classes, and all
# accesses use .loc[row, column]; chained assignment (df[col][row] = value) is
# not guaranteed to update the frame.
# H: row H, columns E..C
df2_h.loc['obs_SS', 'pred_not'] = df_hec.loc['H', 'E':'C'].sum()
# E: row E, columns H and C
df2_e.loc['obs_SS', 'pred_not'] = df_hec.loc['E', ['H', 'C']].sum()
# C: row C, columns H..E
df2_c.loc['obs_SS', 'pred_not'] = df_hec.loc['C', 'H':'E'].sum()

display(df2_h)
display(df2_e)
display(df2_c)
display(df_hec)

Unnamed: 0,pred_SS,pred_not
obs_SS,4.0,8.0
obs_not,0.0,0.0


Unnamed: 0,pred_SS,pred_not
obs_SS,2.0,1.0
obs_not,0.0,0.0


Unnamed: 0,pred_SS,pred_not
obs_SS,9.0,2.0
obs_not,0.0,0.0


Unnamed: 0,H,E,C
H,4.0,0.0,8.0
E,0.0,2.0,1.0
C,0.0,2.0,9.0


In [164]:
    # Chained lookup selects the column first and then the row:
    # df_hec[column][row], i.e. predicted 'E' for observed 'H' here.
df_hec['E']['H']

0.0

In [8]:
# Over-predictions m[1,0]: not the class, but reported as the class.
# ("GELB" in the original note is German for yellow - presumably the colour of
# these cells in the conversion image.)
# All accesses use .loc[row, column]; chained assignment (df[col][row] = value)
# is not guaranteed to update the frame.
# H: column H, rows E..C
df2_h.loc['obs_not', 'pred_SS'] = df_hec.loc['E':'C', 'H'].sum()
# E: column E, rows H and C
df2_e.loc['obs_not', 'pred_SS'] = df_hec.loc[['H', 'C'], 'E'].sum()
# C: column C, rows H..E
df2_c.loc['obs_not', 'pred_SS'] = df_hec.loc['H':'E', 'C'].sum()


Unnamed: 0,pred_SS,pred_not
obs_SS,4.0,8.0
obs_not,0.0,0.0


Unnamed: 0,pred_SS,pred_not
obs_SS,2.0,1.0
obs_not,2.0,0.0


Unnamed: 0,pred_SS,pred_not
obs_SS,9.0,2.0
obs_not,9.0,0.0


Unnamed: 0,H,E,C
H,4.0,0.0,8.0
E,0.0,2.0,1.0
C,0.0,2.0,9.0


In [33]:
# Negatives m[1,1]: not the class and NOT predicted as the class.
# Each selection is a 2x2 sub-frame; .to_numpy().sum() collapses it to a single
# number (DataFrame.sum() alone would return one value per column).
# All accesses use .loc[row, column]; chained assignment (df[col][row] = value)
# is not guaranteed to update the frame.
# H: rows E..C and columns E..C
df2_h.loc['obs_not', 'pred_not'] = df_hec.loc['E':'C', 'E':'C'].to_numpy().sum()
# E: only the four corner cells (H,H), (H,C), (C,H) and (C,C)
df2_e.loc['obs_not', 'pred_not'] = df_hec.loc[['H', 'C'], ['H', 'C']].to_numpy().sum()
# C: rows H..E and columns H..E
df2_c.loc['obs_not', 'pred_not'] = df_hec.loc['H':'E', 'H':'E'].to_numpy().sum()

display(df2_h)
display(df2_e)
display(df2_c)
display(df_hec)

Unnamed: 0,pred_SS,pred_not
obs_SS,4.0,8.0
obs_not,0.0,14.0


Unnamed: 0,pred_SS,pred_not
obs_SS,2.0,1.0
obs_not,2.0,21.0


Unnamed: 0,pred_SS,pred_not
obs_SS,9.0,2.0
obs_not,9.0,6.0


Unnamed: 0,H,E,C
H,4.0,0.0,8.0
E,0.0,2.0,1.0
C,0.0,2.0,9.0


In [7]:
# IPython shell escape: list the files in the prediction directory.
!ls /Users/ila/01-Unibo/02_Lab2/files_lab2_project/all_data/test_data/prediction/pred/

1111.dssp 2222.dssp


In [None]:
#     n = 0
#     m = 0
#     kind = 1
#     for i in data: # for every list in 'data'
#         if i[eval]<th and i[kind]==1: # lower than threshold and true positive, 1
#             cm[0][0] += 1
#         if i[eval]>=th and i[kind]==1: # false negative
#             cm[1][0] += 1
#             n +=1
#             if n < 10:
#                 print(i[2], 'FN') # to save the ID  of the FN
#         if i[eval]<th and i[kind]==0: # true negative
#             cm[0][1] += 1
#             m += 1
#             if m < 10:
#                 print(i[2], 'FP') # to save ID of the FP
#         if i[eval]>=th and i[kind] ==0:
#             cm[1][1] += 1
#     return cm

# def accuracy(cm): 
#     '''Takes the confusion matrix as input and calculates the accuracy
#     (TP + TN) / (TP + FP + FN + TN) if denominator is zero the result must be set to 0'''
#     if cm[0][0] == 0: return 0
#     return float(m[0][0]+m[1][1])/(sum(m[0])+sum(m[1]))

# def matthew_cc(m):
#     '''Takes the confusion matrix as input and returns the Matthews correlation coefficient'''
#     d=(m[0][0]+m[1][0])*(m[0][0]+m[0][1])*(m[1][1]+m[1][0])*(m[1][1]+m[0][1])
#     if d == 0: d = 1
#     return float((m[0][0]*m[1][1]-m[0][1]*m[1][0])/math.sqrt(d))

# def tpr(cm):
#     ''' returns the true positive rate'''
#     if cm[0][0] == 0: return 0
#     return cm[0][0]/(cm[0][0]+cm[1][0])

# def fpr(cm):
#     ''' returns the false positive rate'''
#     if cm[0][1] == 0: return 0
#     return cm[0][1]/(cm[0][1]+cm[1][1])

# def tnr(cm):
#     ''' returns the true negative rate'''
#     if cm[1][1] == 0: return 0
#     return cm[1][1]/(cm[1][1]+cm[0][1])

# def ppv(cm):
#     '''returns the positive predictive value = tp/ (tp + fp) = n of tp / all positive calls'''
#     if cm[0][0] == 0: return 0
#     return cm[0][0]/(cm[0][0]+cm[0][1])

# def npv(cm):
#     '''returns the negative predictive value = tn/ (tn + fn) = n of tn / all negative calls'''
#     if cm[1][1] == 0: return 0
#     return cm[1][1]/(cm[1][1]+cm[1][0])

# if __name__== "__main__":
#     filename=sys.argv[1]
#     #th=float(sys.argv[2])
#     data = get_hmm(filename)
#     for i in range(200): # provides 200 different E-value thresholds
#         th=10**-i  # initial threshold: th
#         cm= get_conf_mtrx(data,th) # takes eval and class (neg or pos)
#         print('Threshold:',th,'\nACC:', accuracy(cm),'\nMatthews:',matthew_cc(cm), "\nTPR:", tpr(cm), '\nFPR:', fpr(cm), '\nTNR:', tnr(cm), '\nPositivePredVal:', ppv(cm), '\nNegPredVal:', npv(cm), "\nThe Matrix:", cm,)
        


In [41]:
def accuracy(cm): 
    '''
    Takes a two-class confusion matrix as input and calculates the accuracy.

    Expected layout (rows = observed, columns = predicted):
    [[TP, FN],
     [FP, TN]]

    accuracy = (TP + TN) / (TP + FN + FP + TN)

    Returns 0 if the denominator is zero (empty matrix); otherwise a float.
    '''
    correct = cm[0][0] + cm[1][1]
    total = sum(cm[0]) + sum(cm[1])
    # Guard on the denominator, not on TP: a matrix with no true positives can
    # still contain correctly predicted negatives.
    if total == 0:
        return 0
    return float(correct)/total


In [43]:
# Disabled example call: accuracy() indexes positionally (cm[0][0]), hence the
# plain array from .values rather than the labelled DataFrame.
# accuracy(df2_h.values)
# Show the two-class matrix for class C.
df2_c


Unnamed: 0,pred_SS,pred_not
obs_SS,9.0,2.0
obs_not,9.0,6.0


In [None]:
def Sen_tpr(cm):
    '''
    Sensitivity, also known as the True Positive Rate (TPR).

    cm is a two-class matrix indexed as cm[0][0] = TP and cm[0][1] = FN.
    TPR = TP/(TP+FN); 0 is returned when there are no true positives.
    '''
    true_pos = cm[0][0]
    if true_pos != 0:
        return true_pos/(true_pos+cm[0][1])
    # No true positives: the rate is 0 (this also avoids dividing by zero).
    return 0

def ppv(cm):
    '''
    Positive predictive value: tp / (tp + fp), i.e. the number of true
    positives over all positive calls.

    cm is a two-class matrix indexed as cm[0][0] = TP and cm[1][0] = FP.
    0 is returned when there are no true positives.
    '''
    true_pos = cm[0][0]
    if true_pos != 0:
        return true_pos/(true_pos+cm[1][0])
    # No true positives: the value is 0 (this also avoids dividing by zero).
    return 0

def matthew_cc(m):
    '''
    Matthews correlation coefficient of a two-class confusion matrix.

    m is indexed as [[TP, FN], [FP, TN]]. When the product under the square
    root is zero it is replaced by 1, so the result is then simply the
    numerator. The result is returned as a float.
    '''
    tp, fn = m[0][0], m[0][1]
    fp, tn = m[1][0], m[1][1]

    product = (tp+fp)*(tp+fn)*(tn+fp)*(tn+fn)
    # A zero product would mean dividing by zero; fall back to 1.
    denominator = math.sqrt(product if product != 0 else 1)
    return float((tp*tn-fn*fp)/denominator)

def npv(cm):
    '''
    Returns the negative predictive value = tn / (tn + fn), i.e. the number of
    true negatives over all negative calls.

    cm is a two-class matrix laid out as [[TP, FN], [FP, TN]] (rows = observed,
    columns = predicted), so TN is cm[1][1] and FN is cm[0][1].
    0 is returned when there are no true negatives.
    '''
    true_neg = cm[1][1]
    if true_neg == 0:
        return 0
    # FN lives in cm[0][1]; cm[1][0] is FP and would give the specificity.
    return true_neg/(true_neg+cm[0][1])

def tnr(cm):
    ''' 
    Returns the true negative rate (specificity) = tn / (tn + fp).

    cm is a two-class matrix laid out as [[TP, FN], [FP, TN]] (rows = observed,
    columns = predicted), so TN is cm[1][1] and FP is cm[1][0].
    0 is returned when there are no true negatives.
    '''
    true_neg = cm[1][1]
    if true_neg == 0:
        return 0
    # FP lives in cm[1][0]; cm[0][1] is FN and would give the NPV instead.
    return true_neg/(true_neg+cm[1][0])