# Compute performance metrics for the given Y and Y_score without sklearn

In [1]:
import numpy as np
import pandas as pd
# other than these two you should not import any other packages

<pre>
<font color='red'><b>A.</b></font> Compute performance metrics for the given data <strong>5_a.csv</strong>
   <b>Note 1:</b> in this data you can see number of positive points >> number of negatives points
   <b>Note 2:</b> use pandas or numpy to read the data from <b>5_a.csv</b>
   <b>Note 3:</b> you need to derive the class labels from given score</pre> $y^{pred}= \text{[0 if y_score < 0.5 else 1]}$

<pre>
<ol>
<li> Compute Confusion Matrix </li>
<li> Compute F1 Score </li>
<li> Compute AUC Score, you need to compute different thresholds and for each threshold compute tpr,fpr and then use               numpy.trapz(tpr_array, fpr_array) <a href='https://stackoverflow.com/q/53603376/4084039'>https://stackoverflow.com/q/53603376/4084039</a>, <a href='https://stackoverflow.com/a/39678975/4084039'>https://stackoverflow.com/a/39678975/4084039</a> Note: it should be numpy.trapz(tpr_array, fpr_array) not numpy.trapz(fpr_array, tpr_array)</li>
<li> Compute Accuracy Score </li>
</ol>
</pre>

In [2]:
def confusion_matrix(y, yhat):
    TP = ((y == 1) & (yhat==1)).sum()
    TN = ((y == 0) & (yhat==0)).sum()
    FP = ((y == 0) & (yhat==1)).sum()
    FN = ((y == 1) & (yhat==0)).sum()
    return TP, TN, FP, FN
def Precision(y, yhat):
    TP, TN, FP, FN = confusion_matrix(y, yhat)
    return TP/(TP+FP)
def Recall(y, yhat):
    TP, TN, FP, FN = confusion_matrix(y, yhat)
    return TP/(TP+FN)
def F1score(y, yhat):
    precision = Precision(y,yhat)
    recall = Recall(y,yhat)
    return (2 * (precision*recall))/(precision+recall)
def Acccuracy(y, yhat):
    TP, TN, FP, FN = confusion_matrix(y, yhat)
    return (TP+TN)/(TP+TN+FP+FN)
def TPR(y,yhat):
    TP, TN, FP, FN = confusion_matrix(y, yhat)
    return TP/(TP+FN)
def FPR(y,yhat):
    TP, TN, FP, FN = confusion_matrix(y, yhat)
    return FP/(TN+FP)
def AUC(y,yhat_prob):
    FPR_array = []
    TPR_array = []
    for threshold in np.arange(0,1,0.001):
        yhat = yhat_prob.apply(lambda a : 1 if a > threshold else 0)
        TP, TN, FP, FN = confusion_matrix(y, yhat)
        tpr = TPR(y,yhat)
        fpr = 1 - FPR(y,yhat)
        FPR_array.append(fpr)
        TPR_array.append(tpr)
    return abs(np.trapz(FPR_array,TPR_array))

In [3]:
data_5_a = pd.read_csv(r'D:\appliedai\AppliedAi/5_a.csv')

In [4]:
data_5_a['proba_class'] = data_5_a['proba'].apply(lambda a : 1 if a > 0.5 else 0)

In [5]:
data_5_a['y'] = data_5_a['y'].astype(int)
data_5_a['proba_class'] = data_5_a['proba_class'].astype(int)

In [6]:
data_5_a.head()

Unnamed: 0,y,proba,proba_class
0,1,0.637387,1
1,1,0.635165,1
2,1,0.766586,1
3,1,0.724564,1
4,1,0.889199,1


In [7]:
TP, TN, FP, FN = confusion_matrix(data_5_a['y'], data_5_a['proba_class'])

f1score = F1score(data_5_a['y'], data_5_a['proba_class'])

acc = Acccuracy(data_5_a['y'], data_5_a['proba_class'])

auc = AUC(data_5_a['y'], data_5_a['proba'])
cm = pd.DataFrame({'Predicted:true':[TP, FN], 'Predicted:False':[FP, TN]},index = ['Actual : True','Actual : False'])
print("Confusion Matrix : \n", cm)
print("F1 score: {}".format(f1score))
print("Accuracy: {}".format(acc))
print("Area under the curve (auc): {}".format(auc))

Confusion Matrix : 
                 Predicted:true  Predicted:False
Actual : True            10000              100
Actual : False               0                0
F1 score: 0.9950248756218906
Accuracy: 0.9900990099009901
Area under the curve (auc): 0.488309


<pre>
<font color='red'><b>B.</b></font> Compute performance metrics for the given data <strong>5_b.csv</strong>
   <b>Note 1:</b> in this data you can see number of positive points << number of negatives points
   <b>Note 2:</b> use pandas or numpy to read the data from <b>5_b.csv</b>
   <b>Note 3:</b> you need to derive the class labels from given score</pre> $y^{pred}= \text{[0 if y_score < 0.5 else 1]}$

<pre>
<ol>
<li> Compute Confusion Matrix </li>
<li> Compute F1 Score </li>
<li> Compute AUC Score, you need to compute different thresholds and for each threshold compute tpr,fpr and then use               numpy.trapz(tpr_array, fpr_array) <a href='https://stackoverflow.com/q/53603376/4084039'>https://stackoverflow.com/q/53603376/4084039</a>, <a href='https://stackoverflow.com/a/39678975/4084039'>https://stackoverflow.com/a/39678975/4084039</a></li>
<li> Compute Accuracy Score </li>
</ol>
</pre>

In [8]:
data_5_b = pd.read_csv(r'D:\appliedai\AppliedAi/5_b.csv')

In [9]:
data_5_b['proba_class'] = data_5_b['proba'].apply(lambda a : 1 if a > 0.5 else 0)

In [10]:
data_5_b['y'] = data_5_b['y'].astype(int)
data_5_b['proba_class'] = data_5_b['proba_class'].astype(int)

In [11]:
TP, TN, FP, FN = confusion_matrix(data_5_b['y'], data_5_b['proba_class'])
f1score = F1score(data_5_b['y'], data_5_b['proba_class'])
acc = Acccuracy(data_5_b['y'], data_5_b['proba_class'])
auc = AUC(data_5_b['y'], data_5_b['proba'])
cm = pd.DataFrame({'Predicted:true':[TP, FN], 'Predicted:False':[FP, TN]},index = ['Actual : True','Actual : False'])
print("Confusion Matrix : \n", cm)
print("F1 score: {}".format(f1score))
print("Accuracy: {}".format(acc))
print("Area under the curve (auc): {}".format(auc))

Confusion Matrix : 
                 Predicted:true  Predicted:False
Actual : True               55              239
Actual : False              45             9761
F1 score: 0.2791878172588833
Accuracy: 0.9718811881188119
Area under the curve (auc): 0.9377150000000001


<font color='red'><b>C.</b></font> Compute the best threshold (similarly to ROC curve computation) of probability which gives lowest values of metric <b>A</b> for the given data <strong>5_c.csv</strong>
<br>

you will be predicting label of a data points like this: $y^{pred}= \text{[0 if y_score < threshold  else 1]}$

$ A = 500 \times \text{number of false negative} + 100 \times \text{numebr of false positive}$

<pre>
   <b>Note 1:</b> in this data you can see number of negative points > number of positive points
   <b>Note 2:</b> use pandas or numpy to read the data from <b>5_c.csv</b>
</pre>

In [12]:
data_5_c = pd.read_csv(r'D:\appliedai\AppliedAi/5_c.csv')
data_5_c.head()

Unnamed: 0,y,prob
0,0,0.458521
1,0,0.505037
2,0,0.418652
3,0,0.412057
4,0,0.375579


In [13]:
def a_metric(FN,FP):
    return ((500*FN)+(100*FP))

def best_threshold_with_metric_a(y,yhat_prob):
    A = []
    thres = []
    for threshold in np.arange(0,1,0.01):
        yhat = yhat_prob.apply(lambda a : 1 if a > threshold else 0)
        TP, TN, FP, FN = confusion_matrix(y, yhat)
        A.append(a_metric(FN,FP))
        thres.append(threshold)
    return thres[A.index(min(A))]

In [14]:
print(f"Best threshold with lowest A_metric : {best_threshold_with_metric_a(data_5_c['y'], data_5_c['prob'])}")

Best threshold with lowest A_metric : 0.23


<pre>
<font color='red'><b>D.</b></font> Compute performance metrics(for regression) for the given data <strong>5_d.csv</strong>
    <b>Note 2:</b> use pandas or numpy to read the data from <b>5_d.csv</b>
    <b>Note 1:</b> <b>5_d.csv</b> will having two columns Y and predicted_Y both are real valued features
<ol>
<li> Compute Mean Square Error </li>
<li> Compute MAPE: https://www.youtube.com/watch?v=ly6ztgIkUxk</li>
<li> Compute R^2 error: https://en.wikipedia.org/wiki/Coefficient_of_determination#Definitions </li>
</ol>
</pre>

In [15]:
data_5_d = pd.read_csv(r'D:\appliedai\AppliedAi/5_d.csv')
data_5_d.head()

Unnamed: 0,y,pred
0,101.0,100.0
1,120.0,100.0
2,131.0,113.0
3,164.0,125.0
4,154.0,152.0


In [16]:
def mse(y,yhat):
    return(np.mean((y-yhat)**2))

def mape(y,yhat):
    return(np.mean(np.abs((y-yhat)/y))*100)

def r2(y,yhat):
    return(1 - (np.sum((y-yhat)**2)/(np.sum((y-np.mean(y))**2))))

In [17]:
print(f"Mean square error : {mse(data_5_d['y'], data_5_d['pred'])}")
print(f"Mean absolute percentage error : {mape(data_5_d['y'], data_5_d['pred'])}")
print(f"R-square : {r2(data_5_d['y'], data_5_d['pred'])}")

Mean square error : 177.16569974554707
Mean absolute percentage error : inf
R-square : 0.9563582786990937
