# Compute performance metrics for the given Y and Y_score without sklearn

<pre>
<font color='red'><b>B.</b></font> Compute performance metrics for the given data <strong>5_b.csv</strong>
   <b>Note 1:</b> in this data you can see number of positive points << number of negatives points
   <b>Note 2:</b> use pandas or numpy to read the data from <b>5_b.csv</b>
   <b>Note 3:</b> you need to derive the class labels from given score</pre> $y^{pred}= \text{[0 if y_score < 0.5 else 1]}$

<pre>
<ol>
<li> Compute Confusion Matrix </li>
<li> Compute F1 Score </li>
<li> Compute AUC Score, you need to compute different thresholds and for each threshold compute tpr,fpr and then use               numpy.trapz(tpr_array, fpr_array) <a href='https://stackoverflow.com/q/53603376/4084039'>https://stackoverflow.com/q/53603376/4084039</a>, <a href='https://stackoverflow.com/a/39678975/4084039'>https://stackoverflow.com/a/39678975/4084039</a></li>
<li> Compute Accuracy Score </li>
</ol>
</pre>

In [1]:
import numpy as np
import pandas as pd

In [2]:
data = pd.read_csv('data_performance_metrix/5_b.csv')

In [3]:
data['y'].value_counts()

0.0    10000
1.0      100
Name: y, dtype: int64

In [4]:
data.head()

Unnamed: 0,y,proba
0,0.0,0.281035
1,0.0,0.465152
2,0.0,0.352793
3,0.0,0.157818
4,0.0,0.276648


In [5]:
labels = [int(p>=0.5) for p in data['proba'].values]  ##converting probability to classes

In [6]:
data['pred'] = labels ##svaing predicted labels to the data frame

In [7]:
data.head()

Unnamed: 0,y,proba,pred
0,0.0,0.281035,0
1,0.0,0.465152,0
2,0.0,0.352793,0
3,0.0,0.157818,0
4,0.0,0.276648,0


### Computing confusion metrix:

In [8]:
fp,tp,fn,tn = 0,0,0,0
for i, groundtruth in enumerate(data['y'].values):
    if groundtruth ==1.0 and data['pred'].values[i]==1: ##for true positive
        tp = tp+1
    elif groundtruth ==0.0 and data['pred'].values[i]==1:  ##for false positive
        fp = fp+1
    elif groundtruth ==1.0 and data['pred'].values[i]==0: ## for false negative
        fn = fn+1 
    elif groundtruth ==0.0 and data['pred'].values[i]==0: ## for true negative
        tn = tn+1        

In [9]:
print("True Positive:\t",tp)
print("False Positive:\t",fp)
print("True Negative:\t",tn)
print("False Negative:\t",fn)

True Positive:	 55
False Positive:	 239
True Negative:	 9761
False Negative:	 45


In [10]:
from sklearn.metrics import confusion_matrix,f1_score,roc_auc_score,accuracy_score #confusion_metric
confusion_matrix(data['y'],data['pred'])

array([[9761,  239],
       [  45,   55]], dtype=int64)

### computing the f1_score:

In [11]:
##calculating precision and recall 
precision = tp/(tp+fp)
recall = tp/(tp+fn)

In [12]:
#calculating f1score
f1 = (2*precision*recall)/(precision+recall)
print("F1_Score calculated manually:\t",f1)

F1_Score calculated manually:	 0.2791878172588833


In [13]:
print("F1 Score using sklearn:\t",f1_score(data['y'],data['pred']))

F1 Score using sklearn:	 0.2791878172588833


### computing AUC

In [14]:
d1 = data.sort_values(['proba'],ascending=False)  #sorting the data based on the probability values

In [15]:
d1.head()

Unnamed: 0,y,proba,pred
8446,1.0,0.595294,1
1978,1.0,0.594808,1
1657,1.0,0.592198,1
110,1.0,0.590171,1
8578,1.0,0.588718,1


In [16]:
from tqdm import tqdm
tpr,fpr = [],[]
for j in tqdm(range(d1.shape[0])):
    labels = [int(p>=d1['proba'].values[j]) for p in d1['proba'].values]
    fp,tp,fn,tn = 0,0,0,0
    for i, groundtruth in enumerate(d1['y'].values):
        ## conditions for tp,fp,fn,tn
        if groundtruth ==1.0 and labels[i]==1:
            tp = tp+1
        elif groundtruth == 0.0 and labels[i]==1:
            fp = fp+1
        elif groundtruth ==1.0 and labels[i]==0:
            fn = fn+1
        elif groundtruth ==0.0 and labels[i]==0:
            tn = tn+1
    true_rate, false_rate = tp/(tp+fn),fp/(tn+fp) 
    tpr.append(true_rate)
    fpr.append(false_rate)

100%|████████████████████████████████████████████████████████████████████████████| 10100/10100 [06:12<00:00, 29.59it/s]


In [17]:
print("Manually calculated AUC ROC Score:\t",np.trapz(tpr,fpr))

Manually calculated AUC ROC Score:	 0.9377570000000001


In [18]:
print("AUC ROC Score using sklearn:\t",roc_auc_score(d1['y'],d1['proba']))

AUC ROC Score using sklearn:	 0.9377570000000001


In [19]:
import matplotlib.pyplot as plt
plt.plot(fpr,tpr)
plt.ylabel("True Positive Rate",size=15)
plt.xlabel("False Positive Rate",size=15)
plt.title("AUC ROC Curve")
plt.show()

<Figure size 640x480 with 1 Axes>

### computing the accuracy score:
1. we have already calculated the predicted classes, by taking .50 as threshold

In [20]:
data.head()

Unnamed: 0,y,proba,pred
0,0.0,0.281035,0
1,0.0,0.465152,0
2,0.0,0.352793,0
3,0.0,0.157818,0
4,0.0,0.276648,0


In [21]:
corr_classified = 0
for i, groundtruth in enumerate(data['y'].values):
        if (groundtruth ==1.0 and data['pred'].values[i]==1) or (groundtruth ==0.0 and data['pred'].values[i]==0):
            corr_classified+=1
print("Manually calculated accuracy score\t",corr_classified/data.shape[0])

Manually calculated accuracy score	 0.9718811881188119


In [22]:
print("Accuracy score using sklearn,\t",accuracy_score(data['y'],data['pred']))

Accuracy score using sklearn,	 0.9718811881188119


### Best Threshold of probability for 5_C 

<font color='red'><b>C.</b></font> Compute the best threshold (similarly to ROC curve computation) of probability which gives lowest values of metric <b>A</b> for the given data <strong>5_c.csv</strong>
<br>

you will be predicting label of a data points like this: $y^{pred}= \text{[0 if y_score < threshold  else 1]}$

$ A = 500 \times \text{number of false negative} + 100 \times \text{numebr of false positive}$

<pre>
   <b>Note 1:</b> in this data you can see number of negative points > number of positive points
   <b>Note 2:</b> use pandas or numpy to read the data from <b>5_c.csv</b>
</pre>

In [23]:
d1 = pd.read_csv('data_performance_metrix/5_c.csv')

FileNotFoundError: [Errno 2] File 5_c.csv does not exist: '5_c.csv'

In [None]:
d1.head()

In [None]:
from tqdm import tqdm
tpr,fpr = [],[]
opt_threshold = 0
opt_score = 999999999999999999 ##very large number for comparison
for j in tqdm(range(d1.shape[0])):
    threshold = d1['prob'].values[j]  ##taking each threshold one by one
    labels = [int(p>=threshold) for p in d1['prob'].values]  #converting probabilities to class
    fp,fn = 0,0
    for i, groundtruth in enumerate(d1['y'].values):
        ##calculating fp and fn
        if groundtruth == 0.0 and labels[i]==1:
            fp = fp+1
        if groundtruth ==1.0 and labels[i]==0:
            fn = fn+1
    𝐴 = 500*fn + 100*fp
    #choosing optimal scoe and threshold
    if A<opt_score:
        opt_score = A
        opt_threshold = threshold

In [None]:
print("Optimal Threshold:\t",opt_threshold)
print("Opt Score at optimal threshold:\t",opt_score)

### Performance metrices for regression:

In [None]:
from sklearn.metrics import mean_squared_error

In [None]:
data = pd.read_csv('data_performance_metrix/5_d.csv')

In [None]:
data.head()

In [None]:
def meansquarederror(y1,y2):
    error = 0
    for i,val in enumerate(y1):
        error = error + (val-y2[i])**2
    return error/len(y1)

In [None]:
error = meansquarederror(data['y'].values,data['pred'].values)
print("Manually calculated mean squared error:\t",error)

In [None]:
print("Mean squared error using sklearn:\t",mean_squared_error(data['y'].values,data['pred'].values))

#### calculating MAPE

In [None]:
import math
def MAPE(y1,y2):
    error = 0
    for i,val in enumerate(y1):
        error = error + abs(val-y2[i])/np.mean(y1)  ##to avoid the division by zero error we have added 1 to the denominator
    return (error/len(y1))*100

In [None]:
error = MAPE(data['y'].values,data['pred'].values)
print("Manually calculated Mean Absolute Percentage Error:\t",error)

In [None]:
#reference : https://en.wikipedia.org/wiki/Coefficient_of_determination
def r2square(y1,y2):
    residual = y1-y2
    mean = sum(y1)/len(y1)
    sstot,ssreg,ssres = 0,0,0
    for i,val in enumerate(y1):
        sstot = sstot + (val-mean)**2 
    for i,val in enumerate(y2):
        ssreg = ssreg + (val-mean)**2
    for i,val in enumerate(y1):
        ssres = ssres + (val-y2[i])**2 
        
    r2 = 1 - (ssres/sstot)
    return r2

In [None]:
error = r2square(data['y'].values,data['pred'].values)
print("Manually calculated R^2 squared error:\t",error)

In [None]:
from sklearn.metrics import r2_score

In [None]:
print("R^2 error calculated using sklearn:\t",r2_score(data['y'].values,data['pred'].values))

<pre>
<font color='red'><b>D.</b></font> Compute performance metrics(for regression) for the given data <strong>5_d.csv</strong>
    <b>Note 2:</b> use pandas or numpy to read the data from <b>5_d.csv</b>
    <b>Note 1:</b> <b>5_d.csv</b> will having two columns Y and predicted_Y both are real valued features
<ol>
<li> Compute Mean Square Error </li>
<li> Compute MAPE: https://www.youtube.com/watch?v=ly6ztgIkUxk</li>
<li> Compute R^2 error: https://en.wikipedia.org/wiki/Coefficient_of_determination#Definitions </li>
</ol>
</pre>