# Compute performance metrics for the given Y and Y_score without sklearn

In [1]:
import numpy as np
import pandas as pd
# other than these two you should not import any other packages

<pre>
<font color='red'><b>A.</b></font> Compute performance metrics for the given data <strong>5_a.csv</strong>
   <b>Note 1:</b> in this data you can see number of positive points >> number of negatives points
   <b>Note 2:</b> use pandas or numpy to read the data from <b>5_a.csv</b>
   <b>Note 3:</b> you need to derive the class labels from given score</pre> $y^{pred}= \text{[0 if y_score < 0.5 else 1]}$

<pre>
<ol>
<li> Compute Confusion Matrix </li>
<li> Compute F1 Score </li>
<li> Compute AUC Score, you need to compute different thresholds and for each threshold compute tpr,fpr and then use               numpy.trapz(tpr_array, fpr_array) <a href='https://stackoverflow.com/q/53603376/4084039'>https://stackoverflow.com/q/53603376/4084039</a>, <a href='https://stackoverflow.com/a/39678975/4084039'>https://stackoverflow.com/a/39678975/4084039</a> Note: it should be numpy.trapz(tpr_array, fpr_array) not numpy.trapz(fpr_array, tpr_array)</li>
<li> Compute Accuracy Score </li>
</ol>
</pre>

In [2]:
# write your code here
data = pd.read_csv("5_a.csv")
data.rename(columns={'y':'y_act'},inplace=True)
data.head(10)

Unnamed: 0,y_act,proba
0,1.0,0.637387
1,1.0,0.635165
2,1.0,0.766586
3,1.0,0.724564
4,1.0,0.889199
5,1.0,0.6016
6,1.0,0.666323
7,1.0,0.567012
8,1.0,0.65023
9,1.0,0.829346


In [3]:
data['y_pred'] = (data.proba >=0.5).astype('float')

In [4]:
data.shape

(10100, 3)

In [5]:
data.dtypes

y_act     float64
proba     float64
y_pred    float64
dtype: object

## Confusion Matrix

In [6]:
def compute_confusion_matrix(yact,ypred):
    tp = sum((yact == 1.0) & (ypred == 1.0))
    tn = sum((yact == 0.0) & (ypred == 0.0))
    fn = sum((yact == 1.0) & (ypred == 0.0))
    fp = sum((yact == 0.0) & (ypred == 1.0))
    return tp,tn,fp,fn

In [7]:
tp = ((data['y_act'] == 1) & (data['y_pred'] == 1))
tn = ((data['y_act'] == 0) & (data['y_pred'] == 0))
fp = ((data['y_act'] == 0) & (data['y_pred'] == 1))
fn = ((data['y_act'] == 1) & (data['y_pred'] == 0))


print(sum(tp))
print(sum(tn))
print(sum(fp))
print(sum(fn))

# imbalanced dataset

10000
0
100
0


In [8]:
tp,tn,fp,fn = compute_confusion_matrix(data.y_act, data.y_pred)
print(tp)
print(tn)
print(fp)
print(fn)

10000
0
100
0


In [9]:
from sklearn.metrics import confusion_matrix
tn,fp,fn,tp = confusion_matrix(data.y_act , data.y_pred).ravel() # converted 2d to 1d

In [10]:
print(tp)
print(tn)
print(fp)
print(fn)

10000
0
100
0


## Compute F1 score

In [11]:
def compute_precision(tp,fp):
    return tp/(tp+fp)
def compute_recall(tp,fn):
    return tp/(tp+fn)

In [12]:
tp/(tp+fp)

0.9900990099009901

In [13]:
tp/(tp+fn)

1.0

In [14]:
from sklearn.metrics import precision_score
precision_score(data.y_act,data.y_pred)

0.9900990099009901

In [15]:
from sklearn.metrics import recall_score
recall_score(data.y_act,data.y_pred)

1.0

In [16]:
def compute_f1_score(yact,ypred):
    tp,tn,fp,fn = compute_confusion_matrix(yact,ypred)
    precision = compute_precision(tp,fp)
    recall = compute_recall(tp,fn)
    f1_score = (2*precision*recall)/(precision + recall)
    return f1_score

In [17]:
compute_f1_score(data.y_act,data.y_pred)

0.9950248756218906

In [18]:
from sklearn.metrics import f1_score
f1_score(data.y_act,data.y_pred)

0.9950248756218906

## Accuracy Score

In [19]:
def compute_accuracy(tp,tn,fp,fn):
    return ((tp + tn)*100) / float(tp+tn+fn+fp)

In [20]:
compute_accuracy(tp,tn,fp,fn)

99.00990099009901

In [21]:
from sklearn.metrics import accuracy_score
print(accuracy_score(data.y_act,data.y_pred))

0.9900990099009901


In [22]:
pd.crosstab(data.y_act,data.y_pred)

y_pred,1.0
y_act,Unnamed: 1_level_1
0.0,100
1.0,10000


## AUC score

In [23]:
data.head()

Unnamed: 0,y_act,proba,y_pred
0,1.0,0.637387,1.0
1,1.0,0.635165,1.0
2,1.0,0.766586,1.0
3,1.0,0.724564,1.0
4,1.0,0.889199,1.0


In [25]:
#sort_by_life = gapminder.sort_values('lifeExp')
sorted_data = data.sort_values('proba',axis=0,ascending=False)
sorted_data.drop(columns = ['y_pred'], inplace = True)
print(sorted_data.head())

      y_act     proba
1664    1.0  0.899965
2099    1.0  0.899828
1028    1.0  0.899825
9592    1.0  0.899812
8324    1.0  0.899768


In [28]:
threshold = sorted_data['proba'].values
N = len(threshold)
print(N)


10100


In [None]:
from tqdm import trange
tpr = []
fpr = []
for i in trange(N):     # for each threshold value
    ypred = []
    for prob in sorted_data.proba:   # for each proba values 
        if prob < threshold[i]:   # if threshold value is greater than proba == 0
            ypred.append(0.0)
        else:
            ypred.append(1.0)
            
    #print(ypred)
    #print(sorted_data['y_act'].values)
    tp,tn,fp,fn = compute_confusion_matrix(sorted_data['y_act'].values,ypred)
    
    
    tpr_array = (tp) / (tp+fn)
    fpr_array = (fp) / (tn+fp)
    tpr.append(tpr_array)
    fpr.append(fpr_array)
    





  0%|          | 2/10100 [00:00<13:24, 12.55it/s]

  0%|          | 4/10100 [00:00<13:22, 12.58it/s]

  0%|          | 6/10100 [00:00<13:15, 12.68it/s]

  0%|          | 8/10100 [00:00<13:11, 12.76it/s]

  0%|          | 10/10100 [00:00<13:12, 12.74it/s]

  0%|          | 12/10100 [00:00<13:15, 12.68it/s]

  0%|          | 14/10100 [00:01<13:13, 12.70it/s]

  0%|          | 16/10100 [00:01<13:16, 12.65it/s]

  0%|          | 18/10100 [00:01<13:16, 12.66it/s]

  0%|          | 20/10100 [00:01<13:12, 12.72it/s]

  0%|          | 22/10100 [00:01<13:15, 12.66it/s]

  0%|          | 24/10100 [00:01<13:18, 12.62it/s]

  0%|          | 26/10100 [00:02<13:16, 12.64it/s]

  0%|          | 28/10100 [00:02<13:12, 12.71it/s]

  0%|          | 30/10100 [00:02<13:14, 12.68it/s]

  0%|          | 32/10100 [00:02<13:16, 12.63it/s]

  0%|          | 34/10100 [00:02<13:18, 12.60it/s]

  0%|          | 36/10100 [00:02<13:12, 12.70it/s]

  0%|          | 38/10100 [00:02<13:09, 12.75it/s]

  0%|       

  6%|▌         | 608/10100 [00:48<12:28, 12.69it/s]

  6%|▌         | 610/10100 [00:48<12:27, 12.69it/s]

  6%|▌         | 612/10100 [00:48<12:26, 12.71it/s]

  6%|▌         | 614/10100 [00:48<12:22, 12.78it/s]

  6%|▌         | 616/10100 [00:48<12:30, 12.63it/s]

  6%|▌         | 618/10100 [00:48<12:30, 12.63it/s]

  6%|▌         | 620/10100 [00:49<12:32, 12.60it/s]

  6%|▌         | 622/10100 [00:49<12:29, 12.65it/s]

  6%|▌         | 624/10100 [00:49<12:28, 12.66it/s]

  6%|▌         | 626/10100 [00:49<12:34, 12.55it/s]

  6%|▌         | 628/10100 [00:49<12:49, 12.32it/s]

  6%|▌         | 630/10100 [00:49<12:44, 12.38it/s]

  6%|▋         | 632/10100 [00:50<13:04, 12.07it/s]

  6%|▋         | 634/10100 [00:50<12:55, 12.20it/s]

  6%|▋         | 636/10100 [00:50<12:52, 12.26it/s]

  6%|▋         | 638/10100 [00:50<12:45, 12.36it/s]

  6%|▋         | 640/10100 [00:50<12:42, 12.41it/s]

  6%|▋         | 642/10100 [00:50<12:32, 12.57it/s]

  6%|▋         | 644/10100 [00:50<12:31, 12.58

 12%|█▏        | 1220/10100 [01:37<11:36, 12.76it/s]

 12%|█▏        | 1222/10100 [01:37<11:35, 12.76it/s]

 12%|█▏        | 1224/10100 [01:37<11:32, 12.81it/s]

 12%|█▏        | 1226/10100 [01:37<11:34, 12.78it/s]

 12%|█▏        | 1228/10100 [01:37<11:30, 12.85it/s]

 12%|█▏        | 1230/10100 [01:37<11:28, 12.88it/s]

 12%|█▏        | 1232/10100 [01:37<11:26, 12.92it/s]

 12%|█▏        | 1234/10100 [01:38<11:27, 12.90it/s]

 12%|█▏        | 1236/10100 [01:38<11:34, 12.76it/s]

 12%|█▏        | 1238/10100 [01:38<11:35, 12.74it/s]

 12%|█▏        | 1240/10100 [01:38<11:30, 12.82it/s]

 12%|█▏        | 1242/10100 [01:38<11:30, 12.83it/s]

 12%|█▏        | 1244/10100 [01:38<11:32, 12.79it/s]

 12%|█▏        | 1246/10100 [01:39<11:51, 12.45it/s]

 12%|█▏        | 1248/10100 [01:39<11:47, 12.51it/s]

 12%|█▏        | 1250/10100 [01:39<11:37, 12.69it/s]

 12%|█▏        | 1252/10100 [01:39<11:35, 12.71it/s]

 12%|█▏        | 1254/10100 [01:39<11:41, 12.61it/s]

 12%|█▏        | 1256/10100 

In [410]:
tp,tn,fp,fn = compute_confusion_matrix(y_pred,sorted_data.y_act)
print(tp)
print(tn)
print(fp)
print(fn)


0
0
0
0


In [394]:
tp,tn,fp,fn = compute_confusion_matrix(sorted_data.y_act,sorted_data.y_pred)
print(tp)
print(tn)
print(fp)
print(fn)

print((tp) / (tp+fn))

print((tp)*100 / float(tp+fn))

10000
0
100
0
1.0
100.0


In [398]:
print(sorted_data.y_act)

1664    1.0
2099    1.0
1028    1.0
9592    1.0
8324    1.0
       ... 
8294    1.0
1630    1.0
7421    1.0
805     1.0
5012    1.0
Name: y_act, Length: 10100, dtype: float64


In [411]:
print(ypred)

[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 

<pre>
<font color='red'><b>B.</b></font> Compute performance metrics for the given data <strong>5_b.csv</strong>
   <b>Note 1:</b> in this data you can see number of positive points << number of negatives points
   <b>Note 2:</b> use pandas or numpy to read the data from <b>5_b.csv</b>
   <b>Note 3:</b> you need to derive the class labels from given score</pre> $y^{pred}= \text{[0 if y_score < 0.5 else 1]}$

<pre>
<ol>
<li> Compute Confusion Matrix </li>
<li> Compute F1 Score </li>
<li> Compute AUC Score, you need to compute different thresholds and for each threshold compute tpr,fpr and then use               numpy.trapz(tpr_array, fpr_array) <a href='https://stackoverflow.com/q/53603376/4084039'>https://stackoverflow.com/q/53603376/4084039</a>, <a href='https://stackoverflow.com/a/39678975/4084039'>https://stackoverflow.com/a/39678975/4084039</a></li>
<li> Compute Accuracy Score </li>
</ol>
</pre>

In [0]:
# write your code

<font color='red'><b>C.</b></font> Compute the best threshold (similarly to ROC curve computation) of probability which gives lowest values of metric <b>A</b> for the given data <strong>5_c.csv</strong>
<br>

you will be predicting label of a data points like this: $y^{pred}= \text{[0 if y_score < threshold  else 1]}$

$ A = 500 \times \text{number of false negative} + 100 \times \text{numebr of false positive}$

<pre>
   <b>Note 1:</b> in this data you can see number of negative points > number of positive points
   <b>Note 2:</b> use pandas or numpy to read the data from <b>5_c.csv</b>
</pre>

In [0]:
 # write your code

<pre>
<font color='red'><b>D.</b></font> Compute performance metrics(for regression) for the given data <strong>5_d.csv</strong>
    <b>Note 2:</b> use pandas or numpy to read the data from <b>5_d.csv</b>
    <b>Note 1:</b> <b>5_d.csv</b> will having two columns Y and predicted_Y both are real valued features
<ol>
<li> Compute Mean Square Error </li>
<li> Compute MAPE: https://www.youtube.com/watch?v=ly6ztgIkUxk</li>
<li> Compute R^2 error: https://en.wikipedia.org/wiki/Coefficient_of_determination#Definitions </li>
</ol>
</pre>