# Precision and Recall
### by: Kyla S. Ronquillo

1. Write a function that computes the precision and recall measures.
2. Use the following formulas for Precision and Recall: $\text{Precision} = \frac{TP}{TP + FP}$ and $\text{Recall} = \frac{TP}{TP + FN}$.
3. Check the LabeledTestData.csv for the correct labels.
4. Compare it to your SurnameResultData.csv.
5. Output a file SurnamePrecisionRecall.csv, the same as your SurnameResultData.csv but with two new columns at the beginning: measure (one of TP, TN, FP, FN) and correct_label (from LabeledTestData.csv).

In [22]:
import pandas as pd
import numpy as np

In [23]:
# Input locations: the correctly labelled test set and the two classifier
# result files (NB and RF) that will be scored against it.
LabeledDataSet = "/kaggle/input/dataset-of-messages/LabeledTestData.csv"
ResultData_NB = "/kaggle/input/new-results/RonquilloResultData.csv"
ResultData_RF = "/kaggle/input/new-results/random_forest_resultss.csv"

#-----------------------------
# The RF results store their predictions under 'predicted_label', while
# compute_precision_recall looks for a column called 'label'. Load the file
# and expose the predictions under the expected name.
df = pd.read_csv(ResultData_RF).rename(columns={'predicted_label': 'label'})

# Persist the renamed copy to a writable location, then make ResultData_RF
# refer to that copy so later cells read the renamed file.
updated_file_path = "/kaggle/working/random_forest_results_updated.csv"
df.to_csv(updated_file_path, index=False)
ResultData_RF = updated_file_path

In [24]:
def compute_precision_recall(labeled_file, result_file, output_file):
    """Score predicted labels against the correct labels and save the result.

    Both CSV files are read with ISO-8859-1 encoding and must contain a
    'label' column: the correct label in ``labeled_file`` and the predicted
    label in ``result_file``. Rows are paired by position (row i of one file
    with row i of the other). 'spam' is treated as the positive class, so each
    row is tagged as:

    * TP - correct 'spam', predicted 'spam'
    * TN - correct 'ham',  predicted 'ham'
    * FP - correct 'ham',  predicted 'spam'
    * FN - correct 'spam', predicted 'ham'
    * Unknown - anything else (unrecognised or missing label)

    Labels are compared case-insensitively and with surrounding whitespace
    ignored; the values written to the output file keep their original text.

    Precision (TP / (TP + FP)) and recall (TP / (TP + FN)) are printed; each
    is reported as 0 when its denominator is 0. 'Unknown' rows are excluded
    from both measures and a warning is printed when any are present.

    Args:
        labeled_file: Path to the CSV holding the correct labels.
        result_file: Path to the CSV holding the predicted labels.
        output_file: Path of the CSV to write, with the columns
            'measure', 'correct_label' and 'predicted'.

    Raises:
        ValueError: If either input file has no 'label' column.
    """
    labeled_df = pd.read_csv(labeled_file, encoding='ISO-8859-1')
    result_df = pd.read_csv(result_file, encoding='ISO-8859-1')

    print("Labeled Data Columns:", labeled_df.columns)
    print("Result Data Columns:", result_df.columns)

    if 'label' not in labeled_df.columns or 'label' not in result_df.columns:
        raise ValueError(f"Expected column 'label' not found.\n"
                         f"Labeled Data Columns: {labeled_df.columns}\n"
                         f"Result Data Columns: {result_df.columns}")

    # Rows are matched by position, so differing lengths mean some rows have
    # no counterpart. Report it instead of letting it pass unnoticed.
    if len(labeled_df) != len(result_df):
        print(f"Warning: row count mismatch (labeled: {len(labeled_df)}, "
              f"result: {len(result_df)}); rows are paired by position.")

    merged_df = labeled_df.copy()
    merged_df['predicted'] = result_df['label']

    # Normalised copies are used for comparison only, so values such as
    # 'Spam' or ' ham' are still recognised while the saved columns keep the
    # text exactly as it appeared in the input files.
    actual = merged_df['label'].astype(str).str.strip().str.lower()
    predicted = merged_df['predicted'].astype(str).str.strip().str.lower()

    conditions = [
        (actual == 'spam') & (predicted == 'spam'),
        (actual == 'ham') & (predicted == 'ham'),
        (actual == 'ham') & (predicted == 'spam'),
        (actual == 'spam') & (predicted == 'ham')
    ]
    values = ['TP', 'TN', 'FP', 'FN']

    merged_df['measure'] = np.select(conditions, values, default='Unknown')

    TP = (merged_df['measure'] == 'TP').sum()
    FP = (merged_df['measure'] == 'FP').sum()
    FN = (merged_df['measure'] == 'FN').sum()
    unknown = (merged_df['measure'] == 'Unknown').sum()

    if unknown:
        print(f"Warning: {unknown} row(s) have a label other than "
              f"'spam'/'ham' and are excluded from precision and recall.")

    # Guard against division by zero when there are no positive predictions
    # (precision) or no positive correct labels (recall).
    precision = TP / (TP + FP) if (TP + FP) > 0 else 0
    recall = TP / (TP + FN) if (TP + FN) > 0 else 0

    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")

    merged_df.rename(columns={'label': 'correct_label'}, inplace=True)
    merged_df[['measure', 'correct_label', 'predicted']].to_csv(output_file, index=False)
    print(f"Output saved to {output_file}")

In [25]:
# Score each classifier's results against the labelled test set, writing one
# output file per classifier (NB first, then RF).
for result_path, output_name in (
    (ResultData_NB, "RonquilloPrecisionRecall_NB.csv"),
    (ResultData_RF, "RonquilloPrecisionRecall_RF.csv"),
):
    compute_precision_recall(LabeledDataSet, result_path, output_name)


Labeled Data Columns: Index(['label', 'message'], dtype='object')
Result Data Columns: Index(['message', 'label'], dtype='object')
Precision: 0.8333
Recall: 0.9430
Output saved to RonquilloPrecisionRecall_NB.csv
Labeled Data Columns: Index(['label', 'message'], dtype='object')
Result Data Columns: Index(['message', 'actual_label', 'label'], dtype='object')
Precision: 1.0000
Recall: 0.7895
Output saved to RonquilloPrecisionRecall_RF.csv
