# Compute performance metrics for the given Y and Y_score without sklearn

In [1]:
import numpy as np
import pandas as pd
# other than these two you should not import any other packages

<pre>
<font color='red'><b>A.</b></font> Compute performance metrics for the given data <strong>5_a.csv</strong>
   <b>Note 1:</b> in this data you can see number of positive points >> number of negatives points
   <b>Note 2:</b> use pandas or numpy to read the data from <b>5_a.csv</b>
   <b>Note 3:</b> you need to derive the class labels from given score</pre> $y^{pred}= \text{[0 if y_score < 0.5 else 1]}$

<pre>
<ol>
<li> Compute Confusion Matrix </li>
<li> Compute F1 Score </li>
<li> Compute AUC Score, you need to compute different thresholds and for each threshold compute tpr,fpr and then use               numpy.trapz(tpr_array, fpr_array) <a href='https://stackoverflow.com/q/53603376/4084039'>https://stackoverflow.com/q/53603376/4084039</a>, <a href='https://stackoverflow.com/a/39678975/4084039'>https://stackoverflow.com/a/39678975/4084039</a> Note: it should be numpy.trapz(tpr_array, fpr_array) not numpy.trapz(fpr_array, tpr_array)</li>
<li> Compute Accuracy Score </li>
</ol>
</pre>

In [2]:
# write your code here
data = pd.read_csv('5_a.csv')
print(data.shape)
data.head(3)

(10100, 2)


Unnamed: 0,y,proba
0,1.0,0.637387
1,1.0,0.635165
2,1.0,0.766586


In [3]:
data['y_pred'] = (data.proba>=0.5).map({True:1,False:0})
print(data['y_pred'].value_counts())
data.head(3)

1    10100
Name: y_pred, dtype: int64


Unnamed: 0,y,proba,y_pred
0,1.0,0.637387,1
1,1.0,0.635165,1
2,1.0,0.766586,1


In [4]:
# 1. Confusion Matrix calculation
tn = ((data['y']==0)&(data['y_pred']==0)).sum()
fn = ((data['y']==1)&(data['y_pred']==0)).sum()
fp = ((data['y']==0)&(data['y_pred']==1)).sum()
tp = ((data['y']==1)&(data['y_pred']==1)).sum()

confusion_matrix = np.array([[tn,fn],[fp,tp]])
print(confusion_matrix)

[[    0     0]
 [  100 10000]]


In [5]:
# 2. F1-Score calcuation
precision = tp/(tp+fp)
recall = tp/(tp+fn)
f1_score = 2*(precision*recall/(precision+recall))
print(f1_score)

0.9950248756218906


In [6]:
# 4. Accuracy score calculation
accuracy_score = (tn+tp)/(tn+fn+fp+tp)
print(accuracy_score)

0.9900990099009901


In [7]:
# 3. AUC score calculation
tpr_array = []
fpr_array = []
sorted_data = data.sort_values(by='proba',ascending=False)
indices = list(range(0,sorted_data.shape[0],100))      #Taking only 1000 points else the computation takes lot of time
probabilities = [list(sorted_data['proba'])[i] for i in indices]

for prob in probabilities:
    y_pred = []
    threshold = prob
    for prob in list(sorted_data['proba']):
        if prob>=threshold:
            y_pred.append(1)
        else:
            y_pred.append(0)
    y_pred = pd.Series(y_pred)
    tn = ((sorted_data['y']==0) & (y_pred==0)).sum()
    fn = ((sorted_data['y']==1) & (y_pred==0)).sum()
    fp = ((sorted_data['y']==0) & (y_pred==1)).sum()
    tp = ((sorted_data['y']==1) & (y_pred==1)).sum()
    
    tpr = tp/(tp+fn)
    fpr = fp/(tn+fp)
    
    tpr_array.append(tpr)
    fpr_array.append(fpr)
   
print(np.trapz(tpr_array, fpr_array))

0.5158


<pre>
<font color='red'><b>B.</b></font> Compute performance metrics for the given data <strong>5_b.csv</strong>
   <b>Note 1:</b> in this data you can see number of positive points << number of negatives points
   <b>Note 2:</b> use pandas or numpy to read the data from <b>5_b.csv</b>
   <b>Note 3:</b> you need to derive the class labels from given score</pre> $y^{pred}= \text{[0 if y_score < 0.5 else 1]}$

<pre>
<ol>
<li> Compute Confusion Matrix </li>
<li> Compute F1 Score </li>
<li> Compute AUC Score, you need to compute different thresholds and for each threshold compute tpr,fpr and then use               numpy.trapz(tpr_array, fpr_array) <a href='https://stackoverflow.com/q/53603376/4084039'>https://stackoverflow.com/q/53603376/4084039</a>, <a href='https://stackoverflow.com/a/39678975/4084039'>https://stackoverflow.com/a/39678975/4084039</a></li>
<li> Compute Accuracy Score </li>
</ol>
</pre>

In [8]:
# write your code here
data = pd.read_csv('5_b.csv')
print(data.shape)
data.head(3)

(10100, 2)


Unnamed: 0,y,proba
0,0.0,0.281035
1,0.0,0.465152
2,0.0,0.352793


In [9]:
data['y_pred'] = (data.proba>=0.5).map({True:1,False:0})
print(data['y_pred'].value_counts())
data.head(3)

0    9806
1     294
Name: y_pred, dtype: int64


Unnamed: 0,y,proba,y_pred
0,0.0,0.281035,0
1,0.0,0.465152,0
2,0.0,0.352793,0


In [10]:
# 1. Confusion Matrix calculation
tn = ((data['y']==0)&(data['y_pred']==0)).sum()
fn = ((data['y']==1)&(data['y_pred']==0)).sum()
fp = ((data['y']==0)&(data['y_pred']==1)).sum()
tp = ((data['y']==1)&(data['y_pred']==1)).sum()

confusion_matrix = np.array([[tn,fn],[fp,tp]])
print(confusion_matrix)

[[9761   45]
 [ 239   55]]


In [11]:
# 2. F1-Score calcuation
precision = tp/(tp+fp)
recall = tp/(tp+fn)
f1_score = 2*(precision*recall/(precision+recall))
print(f1_score)

0.2791878172588833


In [12]:
# 4. Accuracy score calculation
accuracy_score = (tn+tp)/(tn+fn+fp+tp)
print(accuracy_score)

0.9718811881188119


In [13]:
# 3. AUC score calculation
tpr_array = []
fpr_array = []
sorted_data = data.sort_values(by='proba',ascending=False)
indices = list(range(0,sorted_data.shape[0],100))     #Taking only 1000 points else the computation takes lot of time
probabilities = [list(sorted_data['proba'])[i] for i in indices]

for prob in probabilities:
    y_pred = []
    threshold = prob
    for prob in list(sorted_data['proba']):
        if prob>=threshold:
            y_pred.append(1)
        else:
            y_pred.append(0)
    y_pred = pd.Series(y_pred)
    tn = ((sorted_data['y']==0) & (y_pred==0)).sum()
    fn = ((sorted_data['y']==1) & (y_pred==0)).sum()
    fp = ((sorted_data['y']==0) & (y_pred==1)).sum()
    tp = ((sorted_data['y']==1) & (y_pred==1)).sum()
    
    tpr = tp/(tp+fn)
    fpr = fp/(tn+fp)
    
    tpr_array.append(tpr)
    fpr_array.append(fpr)
   
print(np.trapz(tpr_array, fpr_array))

0.49734949999999994


<font color='red'><b>C.</b></font> Compute the best threshold (similarly to ROC curve computation) of probability which gives lowest values of metric <b>A</b> for the given data <strong>5_c.csv</strong>
<br>

you will be predicting label of a data points like this: $y^{pred}= \text{[0 if y_score < threshold  else 1]}$

$ A = 500 \times \text{number of false negative} + 100 \times \text{numebr of false positive}$

<pre>
   <b>Note 1:</b> in this data you can see number of negative points > number of positive points
   <b>Note 2:</b> use pandas or numpy to read the data from <b>5_c.csv</b>
</pre>

In [14]:
data = pd.read_csv('5_c.csv')
print(data.shape)
data.head(3)

(2852, 2)


Unnamed: 0,y,prob
0,0,0.458521
1,0,0.505037
2,0,0.418652


In [15]:
data['y_pred'] = (data.prob>=0.5).map({True:1,False:0})
print(data['y_pred'].value_counts())
data.head(3)

0    2099
1     753
Name: y_pred, dtype: int64


Unnamed: 0,y,prob,y_pred
0,0,0.458521,0
1,0,0.505037,1
2,0,0.418652,0


In [16]:
tn = ((data['y']==0)&(data['y_pred']==0)).sum()
fn = ((data['y']==1)&(data['y_pred']==0)).sum()
fp = ((data['y']==0)&(data['y_pred']==1)).sum()
tp = ((data['y']==1)&(data['y_pred']==1)).sum()

confusion_matrix = np.array([[tn,fn],[fp,tp]])
A = 500*fn+100*fp
print(confusion_matrix)
print('A = ',A)

[[1637  462]
 [ 168  585]]
A =  247800


In [17]:
 # write your code
A_Scores = []
Confusion_matrix = []
sorted_data = data.sort_values(by='prob',ascending=False)
indices = list(range(0,sorted_data.shape[0]))
probabilities = [list(sorted_data['prob'])[i] for i in indices]

for prob in probabilities:
    y_pred = []
    threshold = prob
    for prob in list(sorted_data['prob']):
        if prob>=threshold:
            y_pred.append(1)
        else:
            y_pred.append(0)
    y_pred = pd.Series(y_pred)
    tn = ((sorted_data['y']==0) & (y_pred==0)).sum()
    fn = ((sorted_data['y']==1) & (y_pred==0)).sum()
    fp = ((sorted_data['y']==0) & (y_pred==1)).sum()
    tp = ((sorted_data['y']==1) & (y_pred==1)).sum()
    
    A_Scores.append(500*fn+100*fp)
    Confusion_matrix.append(np.array([[tn,fn],[fp,tp]]))

min_index = A_Scores.index(min(A_Scores))
min_threshold = probabilities[min_index]
   
print('Required Threshold: ',min_threshold)
print('Confusion Matrix:')
print(Confusion_matrix[min_index])
print('A = ',A_Scores[min_index])

Required Threshold:  0.02803798623987141
Confusion Matrix:
[[   0    0]
 [1805 1047]]
A =  180500


<pre>
<font color='red'><b>D.</b></font> Compute performance metrics(for regression) for the given data <strong>5_d.csv</strong>
    <b>Note 2:</b> use pandas or numpy to read the data from <b>5_d.csv</b>
    <b>Note 1:</b> <b>5_d.csv</b> will having two columns Y and predicted_Y both are real valued features
<ol>
<li> Compute Mean Square Error </li>
<li> Compute MAPE: https://www.youtube.com/watch?v=ly6ztgIkUxk</li>
<li> Compute R^2 error: https://en.wikipedia.org/wiki/Coefficient_of_determination#Definitions </li>
</ol>
</pre>

In [18]:
data = pd.read_csv('5_d.csv')
print(data.shape)
data.head(3)

(157200, 2)


Unnamed: 0,y,pred
0,101.0,100.0
1,120.0,100.0
2,131.0,113.0


In [24]:
#Mean Square Error Calculation
mse = ((data['y']-data['pred'])**2).sum()
print('Mean Square Error: ',mse/data.shape[0])

Mean Square Error:  177.16569974554707


In [29]:
#Mean absolute percentage error calculation
mape = abs(data['y']-data['pred']).sum()/data['y'].sum()
print('Mean Absolute Percentage Error: ',mape)

Mean Absolute Percentage Error:  0.1291202994009687
