# Compute performance metrics for the given Y and Y_score without sklearn

In [1]:
import numpy as np
import pandas as pd
# other than these two you should not import any other packages

<pre>
<font color='red'><b>A.</b></font> Compute performance metrics for the given data <strong>5_a.csv</strong>
   <b>Note 1:</b> in this data you can see number of positive points >> number of negatives points
   <b>Note 2:</b> use pandas or numpy to read the data from <b>5_a.csv</b>
   <b>Note 3:</b> you need to derive the class labels from given score</pre> $y^{pred}= \text{[0 if y_score < 0.5 else 1]}$

<pre>
<ol>
<li> Compute Confusion Matrix </li>
<li> Compute F1 Score </li>
<li> Compute AUC Score, you need to compute different thresholds and for each threshold compute tpr,fpr and then use               numpy.trapz(tpr_array, fpr_array) <a href='https://stackoverflow.com/q/53603376/4084039'>https://stackoverflow.com/q/53603376/4084039</a>, <a href='https://stackoverflow.com/a/39678975/4084039'>https://stackoverflow.com/a/39678975/4084039</a> Note: it should be numpy.trapz(tpr_array, fpr_array) not numpy.trapz(fpr_array, tpr_array)</li>
<li> Compute Accuracy Score </li>
</ol>
</pre>

In [2]:
data = pd.read_csv('data_performance_metrix/5_a.csv')

In [3]:
data['y'].value_counts()

1.0    10000
0.0      100
Name: y, dtype: int64

In [4]:
data.head()

Unnamed: 0,y,proba
0,1.0,0.637387
1,1.0,0.635165
2,1.0,0.766586
3,1.0,0.724564
4,1.0,0.889199


In [5]:
labels = [int(p>=0.5) for p in data['proba'].values]

In [6]:
data['pred'] = labels

In [7]:
data.head()

Unnamed: 0,y,proba,pred
0,1.0,0.637387,1
1,1.0,0.635165,1
2,1.0,0.766586,1
3,1.0,0.724564,1
4,1.0,0.889199,1


### Computing confusion metrix:

In [8]:
fp,tp,fn,tn = 0,0,0,0
for i, groundtruth in enumerate(data['y'].values):
    if groundtruth ==1.0 and data['pred'].values[i]==1: ##for tp
        tp = tp+1
    elif groundtruth ==0.0 and data['pred'].values[i]==1: ##for fp
        fp = fp+1
    elif groundtruth ==1.0 and data['pred'].values[i]==0: ##for fn
        fn = fn+1
    elif groundtruth ==0.0 and data['pred'].values[i]==0: ##for tn
        tn = tn+1        

In [9]:
print("True Positive:\t",tp)
print("False Positive:\t",fp)
print("True Negative:\t",tn)
print("False Negative:\t",fn)

True Positive:	 10000
False Positive:	 100
True Negative:	 0
False Negative:	 0


In [10]:
from sklearn.metrics import confusion_matrix,f1_score,roc_auc_score,accuracy_score #confusion_metric
confusion_matrix(data['y'],data['pred'])

array([[    0,   100],
       [    0, 10000]], dtype=int64)

### computing the f1_score:

In [11]:
## calculating precision and recall to calculate f1_score
precision = tp/(tp+fp)
recall = tp/(tp+fn)

In [12]:
f1 = (2*precision*recall)/(precision+recall)
print("F1_Score calculated manually:\t",f1)

F1_Score calculated manually:	 0.9950248756218906


In [13]:
print("F1 Score using sklearn:\t",f1_score(data['y'],data['pred']))

F1 Score using sklearn:	 0.9950248756218906


### computing AUC

In [14]:
d1 = data.sort_values(['proba'],ascending=False)  #sorting the data based on the probability values

In [15]:
d1.head()

Unnamed: 0,y,proba,pred
1664,1.0,0.899965,1
2099,1.0,0.899828,1
1028,1.0,0.899825,1
9592,1.0,0.899812,1
8324,1.0,0.899768,1


In [16]:
from tqdm import tqdm
tpr,fpr = [],[]
for j in tqdm(range(d1.shape[0])):
    labels = [int(p>=d1['proba'].values[j]) for p in d1['proba'].values]  ##converting probability to class
    fp,tp,fn,tn = 0,0,0,0
    for i, groundtruth in enumerate(d1['y'].values):
        ##condition for tp,fp,tn,fn
        if groundtruth ==1.0 and labels[i]==1:
            tp = tp+1
        elif groundtruth == 0.0 and labels[i]==1:
            fp = fp+1
        elif groundtruth ==1.0 and labels[i]==0:
            fn = fn+1
        elif groundtruth ==0.0 and labels[i]==0:
            tn = tn+1
    #calculating tpr and fpr
    true_rate, false_rate = tp/(tp+fn),fp/(tn+fp) 
    tpr.append(true_rate)
    fpr.append(false_rate)

100%|████████████████████████████████████████████████████████████████████████████| 10100/10100 [06:10<00:00, 27.26it/s]


In [17]:
print("Manually calculated AUC ROC Score:\t",np.trapz(tpr,fpr))

Manually calculated AUC ROC Score:	 0.48829900000000004


In [18]:
print("AUC ROC Score using sklearn:\t",roc_auc_score(d1['y'],d1['proba']))

AUC ROC Score using sklearn:	 0.48829900000000004


In [19]:
import matplotlib.pyplot as plt
plt.plot(fpr,tpr)
plt.ylabel("True Positive Rate",size=15)
plt.xlabel("False Positive Rate",size=15)
plt.title("AUC ROC Curve")
plt.show()

<Figure size 640x480 with 1 Axes>

### computing the accuracy score:
1. we have already calculated the predicted values, by taking .50 as threshold

In [20]:
data.head()

Unnamed: 0,y,proba,pred
0,1.0,0.637387,1
1,1.0,0.635165,1
2,1.0,0.766586,1
3,1.0,0.724564,1
4,1.0,0.889199,1


In [21]:
corr_classified = 0
for i, groundtruth in enumerate(data['y'].values):
        if (groundtruth ==1.0 and data['pred'].values[i]==1) or (groundtruth ==0.0 and data['pred'].values[i]==0):##finding number points correctly classified
            corr_classified+=1
print("Manually calculated accuracy score\t",corr_classified/data.shape[0])

Manually calculated accuracy score	 0.9900990099009901


In [22]:
print("Accuracy score using sklearn,\t",accuracy_score(data['y'],data['pred']))

Accuracy score using sklearn,	 0.9900990099009901
