# Assignment 5: Compute Performance metrics without Sklearn

Name: Devendra Bharti<br>
Email: kumardev0614@gmail.com

In [1]:
import numpy as np
import pandas as pd

In [2]:
A = pd.read_csv("5_a.csv")            # Importing First Dataset to work with.
A

Unnamed: 0,y,proba
0,1.0,0.637387
1,1.0,0.635165
2,1.0,0.766586
3,1.0,0.724564
4,1.0,0.889199
...,...,...
10095,1.0,0.665371
10096,1.0,0.607961
10097,1.0,0.777724
10098,1.0,0.846036


In [3]:
# Just Checking Our Dataset is Imbalanced or not.
A.y.value_counts()

1.0    10000
0.0      100
Name: y, dtype: int64

In [4]:
# function to calculate Confusion Matrix, F1 score and Accuracy.

def basic_matrices(df):                 
    N = df.y.value_counts()[0]          # N = total actual -ve points(0s) in dataset.
    P = df.y.value_counts()[1]          # P = total actual +ve points(1s) in dataset.
    
    df['y_pred'] = df.proba.apply(lambda x: 0 if x<=0.5 else 1)  # Predicting Y values using 0.5 as a threshold.
    # Any point having proba score < 0.5 will be predicted as 0, else 1.

    TN = df[ (df.y == 0) & (df.y_pred == 0) ].shape[0] # Filtering df where actual Y and predicted Y' both are 0. Then counting 
    FP = N - TN                                                  # no. of rows in that df. Means counting True Negative values. 
    TP = df[ (df.y == 1) & (df.y_pred == 1) ].shape[0]
    FN = P - TP
    conf_mat = np.array([[TN, FN], [FP, TP]])          # Creating Confusion Matrix. 
    
    precision = TP/ (FP+TP)
    recall = TP/P
    F1 = (2*precision*recall)/(precision + recall)
    
    Accuracy = (TN+TP)/(N+P)                           # Acc = Correctly predicted points / total points in dataset.
    
    return conf_mat, F1, Accuracy

In [5]:
# Function to calculate tpr and fpr values for each threshold value.

def auc_score(data):
    data.sort_values(by='proba', inplace=True)  # sorting Dataset by probability score.
    temp = data.copy()                          # Need another copy of datset to calculate predicted Y' for each threshold.
    tpr = []                                    # To save tpr value for each threshold value.
    fpr = []                                    # To save tpr value for each threshold value.
    N = temp.y.value_counts()[0]
    P = temp.y.value_counts()[1]
    
    for index, row in data.iterrows():      # Iterating through each prob. score value. For a.csv this will run for 10100 times.
        # Because our a.csv has 10100 differnt prob. values. Means we have 10100 threshold values. And for each threshold value
        # We will calculate Pred Y' values. For thsese Y' values we will get TPR and FPR.
        
        threshold = row['proba']            # assigning each prob. score as thresold value for each itaration. 
        temp['y_pred'] = temp.proba.apply(lambda x: 0 if x<=threshold else 1) # predicting Y' and saving values in temp 
        # dataframe as column "y_pred". This "y_pred" column values will change in each itteration for each threshold.
        # Because for each threshold we will get differnt Y' values.
        
        # Now we will calc. TP,FP,TN,FN,con_mat to get TPR,FPR For actual Y and newly predicted Y' values.
        TN = temp[ (temp.y == 0) & (temp.y_pred == 0) ].shape[0]    
        FP = N - TN
        TP = temp[ (temp.y == 1) & (temp.y_pred == 1) ].shape[0]
        FN = P - TP
        conf_mat = np.array([[TN, FN], [FP, TP]])
        
        tpr.append(TP/P)        # Saving calculated TPR and FPR we got in each itteration.
        fpr.append(FP/N)
        
    return tpr, fpr

In [6]:
tpr_array, fpr_array = auc_score(A.copy())      # Calculating TPR and FPR for a.csv

In [7]:
AUC = abs(np.trapz(tpr_array, fpr_array))       # Passing above calculated TPR and FPR values to calculate AUC score.

In [8]:
Confusion_Matrix, F1_score, Acc = basic_matrices(A.copy())      # Calculating remaining metrices for a.csv

### 1: Metrices For Dataset a.csv

In [9]:
print("F1_score =", F1_score)
print("Accuracy =", Acc)
print("AUC Score =", AUC)
print()
print("Confusion_Matrix:")
Confusion_Matrix

F1_score = 0.9950248756218906
Accuracy = 0.9900990099009901
AUC Score = 0.48829900000000004

Confusion_Matrix:


array([[    0,     0],
       [  100, 10000]], dtype=int64)

### 2: Metrices For Dataset b.csv

In [10]:
B = pd.read_csv("5_b.csv")
B

Unnamed: 0,y,proba
0,0.0,0.281035
1,0.0,0.465152
2,0.0,0.352793
3,0.0,0.157818
4,0.0,0.276648
...,...,...
10095,0.0,0.474401
10096,0.0,0.128403
10097,0.0,0.499331
10098,0.0,0.157616


In [11]:
b_Confusion_Matrix, b_F1_score, b_Acc = basic_matrices(B.copy())

In [12]:
b_tpr_array, b_fpr_array = auc_score(B.copy())

In [13]:
b_AUC = abs(np.trapz(b_tpr_array, b_fpr_array))
b_AUC

0.937657

In [14]:
print("F1_score =", b_F1_score)
print("Accuracy =", b_Acc)
print("AUC Score =", b_AUC)
print()
print("Confusion_Matrix:")
b_Confusion_Matrix

F1_score = 0.2791878172588833
Accuracy = 0.9718811881188119
AUC Score = 0.937657

Confusion_Matrix:


array([[9761,   45],
       [ 239,   55]], dtype=int64)

### 3: Best threshold of probability which gives lowest values of metric A

In [15]:
C = pd.read_csv("5_c.csv")
C

Unnamed: 0,y,prob
0,0,0.458521
1,0,0.505037
2,0,0.418652
3,0,0.412057
4,0,0.375579
...,...,...
2847,1,0.491663
2848,1,0.292109
2849,1,0.659161
2850,1,0.456265


In [16]:
def threshold_for_min_A(data):
    data.sort_values(by='prob', inplace=True)
    temp = data.copy()
    A_metric = 100000000
    N = temp.y.value_counts()[0]
    P = temp.y.value_counts()[1]
    
    for index, row in data.iterrows(): 
        threshold = row['prob']
        temp['y_pred'] = temp.prob.apply(lambda x: 0 if x<threshold else 1)
        
        TN = temp[ (temp.y == 0) & (temp.y_pred == 0) ].shape[0]
        FP = N - TN
        TP = temp[ (temp.y == 1) & (temp.y_pred == 1) ].shape[0]
        FN = P - TP
        
        temp_A = (500*FN) + (100*FP)      # Calculating Value of A for current threshold.
        if temp_A < A_metric:             # if this current A is smaller than previous smallest A then 
            A_metric = temp_A             # A = new smallest A
            best_threshold = threshold    # Also save the threshold for this new smallest A.
            
    return best_threshold

In [17]:
threshold = threshold_for_min_A(C.copy())

In [18]:
print("Best threshold value for Metric A to be Minimum =", threshold)

Best threshold value for Metric A to be Minimum = 0.2300390278970873


### 4: Metrices For Regression Dataset. 

In [19]:
D = pd.read_csv("5_d.csv")
D

Unnamed: 0,y,pred
0,101.0,100.0
1,120.0,100.0
2,131.0,113.0
3,164.0,125.0
4,154.0,152.0
...,...,...
157195,87.0,83.0
157196,97.0,86.0
157197,106.0,93.0
157198,105.0,101.0


In [20]:
D['ei_square'] = (D['y'] - D['pred'])**2

# For each row in dataframe D, calculate ei_square = (Y - Y')^2

In [21]:
D

Unnamed: 0,y,pred,ei_square
0,101.0,100.0,1.0
1,120.0,100.0,400.0
2,131.0,113.0,324.0
3,164.0,125.0,1521.0
4,154.0,152.0,4.0
...,...,...,...
157195,87.0,83.0,16.0
157196,97.0,86.0,121.0
157197,106.0,93.0,169.0
157198,105.0,101.0,16.0


In [22]:
print("Mean Square ERROR" , D.ei_square.mean())

Mean Square ERROR 177.16569974554707


#### MAPE

In [23]:
D.y.value_counts()[0]

5717

In [24]:
# Because we have 5717 ai as 0. So we have to use modified MADE.
D['abs_ei'] = abs(D['y'] - D['pred'])

# For each row in D, calculate abs_ei = |Y - Y'|

In [25]:
D

Unnamed: 0,y,pred,ei_square,abs_ei
0,101.0,100.0,1.0,1.0
1,120.0,100.0,400.0,20.0
2,131.0,113.0,324.0,18.0
3,164.0,125.0,1521.0,39.0
4,154.0,152.0,4.0,2.0
...,...,...,...,...
157195,87.0,83.0,16.0,4.0
157196,97.0,86.0,121.0,11.0
157197,106.0,93.0,169.0,13.0
157198,105.0,101.0,16.0,4.0


In [26]:
MAPE = sum(D['abs_ei']) / sum(D['y'])
print("MAPE =", MAPE)

MAPE = 0.1291202994009687


#### R Square

In [27]:
y_mean = D.y.mean()
y_mean

66.56208651399491

In [28]:
D["y_minus_ymean"] = (D['y'] - y_mean)**2

# For each row in Dataframe D, Calculate y_minus_ymean = (Y - avrg(Y))^2

In [29]:
D

Unnamed: 0,y,pred,ei_square,abs_ei,y_minus_ymean
0,101.0,100.0,1.0,1.0,1185.969885
1,120.0,100.0,400.0,20.0,2855.610598
2,131.0,113.0,324.0,18.0,4152.244694
3,164.0,125.0,1521.0,39.0,9494.146985
4,154.0,152.0,4.0,2.0,7645.388715
...,...,...,...,...,...
157195,87.0,83.0,16.0,4.0,417.708308
157196,97.0,86.0,121.0,11.0,926.466577
157197,106.0,93.0,169.0,13.0,1555.349020
157198,105.0,101.0,16.0,4.0,1477.473193


In [30]:
SS_total = sum(D['y_minus_ymean'])
SS_total

638161080.035662

In [31]:
SS_res = sum(D['ei_square'])
SS_res

27850448.0

In [32]:
R_square = 1 - (SS_res / SS_total)
print("R Square =", R_square)

R Square = 0.9563582786990964
