In [15]:
import pandas as pd
import plotly.graph_objects as go

df_all = pd.read_csv('data1.csv')
df_all.head()

Unnamed: 0,SL.NO,NAME,AGE,GENDER,UHID,PRESENTATION,CLINICAL DIAGNOSIS,DOS,FNAC,Stage_2_class,HPE,Stage_3,Unnamed: 12,CORRELATION YES/NO,Predicted,Unnamed: 15
0,1,RATHI SHEDTHI,67.0,F,1481122,SOLITARY NODULE WITH LYMPH NODES,CARCINOMA,27/3/2023,PAPILLARY CARCINOMA B-IV,4,HASHIMOTO'S THYROIDITIS,THYROIDITIS,0,N,TN,93.0
1,2,SHABEENA,22.0,F,1482969,SOLITARY NODULE - SUDDEN INCREASE IN SIZE,CARCINOMA,29/3/2024,FOLLICULAR NEOPLASM B- IV,4,MULTINODULAR GOITRE,BENIGN,1,N,TN,
2,3,VINAY K,31.0,M,1400945,SOLITARY NODULE,BENIGN,29/3/2023,NODULAR GOITRE B-II,2,GRAVE'S DISEASE,THYROIDITIS,1,N,TN,
3,4,REKHA,35.0,F,1484783,SOLITARY NODULE,BENIGN,8/4/2023,"NODULAR GOITRE, B- II",2,MULTINODULAR GOITRE,BENIGN,1,Y,TN,
4,5,SALMA,38.0,F,1485303,SOLITARY NODULE,BENIGN,15/4/2023,"NODULAR GOITRE, B - II",2,MULTINODULAR GOITRE,BENIGN,1,Y,TN,


In [16]:
print(df_all['Stage_2_class'].unique())
print(df_all['Stage_3'].unique())

[4 2 5 3 1]
['THYROIDITIS' 'BENIGN' 'CARCINOMA' 'BENIGN **' ' BENIGN']


In [17]:
df_all['pred'] = df_all['Stage_2_class'].map({1:0, 2:0, 3:0, 4:0, 5:1})
df_all['gt'] = df_all['Stage_3'].map({'THYROIDITIS':0, 'BENIGN':0, 'CARCINOMA':1, ' BENIGN':0, 'BENIGN **':0})

In [18]:
print(df_all['gt'].unique())
print(df_all['pred'].unique())

[0 1]
[0 1]


In [19]:
import pandas as pd
from sklearn.metrics import confusion_matrix

def calculate_confusion_matrix_df(ground_truth_col, predicted_col, data):
  """
  Calculates confusion matrix and returns TP, TN, FP, FN from a DataFrame.

  Args:
      ground_truth_col (str): Name of the column containing ground truth labels.
      predicted_col (str): Name of the column containing predicted labels.
      data (pandas.DataFrame): The DataFrame containing ground truth and prediction columns.

  Returns:
      dict: Dictionary containing TP, TN, FP, FN values.
  """
  # Ensure columns exist in DataFrame
  if ground_truth_col not in data.columns or predicted_col not in data.columns:
    raise ValueError("Ground truth and prediction columns don't exist in the DataFrame.")

  # Get ground truth and predicted labels as NumPy arrays
  ground_truth = data[ground_truth_col].to_numpy()
  predictions = data[predicted_col].to_numpy()

  # Create the confusion matrix
  cm = confusion_matrix(ground_truth, predictions)

  # Extract values from confusion matrix
  TP = cm[1, 1]
  TN = cm[0, 0]
  FP = cm[0, 1]
  FN = cm[1, 0]

  # Return results as a dictionary
  return {"TP": TP, "TN": TN, "FP": FP, "FN": FN}

In [20]:
# Example usage
#data = pd.DataFrame({'ground_truth': [0, 1, 0, 1, 0], 'prediction': [0, 0, 1, 1, 0]})
results = calculate_confusion_matrix_df('gt', 'pred', df_all)

print(f"True Positive (TP): {results['TP']}")
print(f"True Negative (TN): {results['TN']}")
print(f"False Positive (FP): {results['FP']}")
print(f"False Negative (FN): {results['FN']}")

True Positive (TP): 10
True Negative (TN): 90
False Positive (FP): 3
False Negative (FN): 3


In [21]:
contingency_table = pd.crosstab(df_all['gt'], df_all['pred'])
contingency_table

pred,0,1
gt,Unnamed: 1_level_1,Unnamed: 2_level_1
0,90,3
1,3,10


In [22]:
from scipy.stats import chi2_contingency

chi2_statistic, p_value, degrees_of_freedom, expected_frequencies = chi2_contingency(contingency_table)
print("Chi-squared statistic:", chi2_statistic)
print("p-value:", p_value)
print(degrees_of_freedom, expected_frequencies)

Chi-squared statistic: 50.926203460262535
p-value: 9.590469137412312e-13
1 [[81.59433962 11.40566038]
 [11.40566038  1.59433962]]


In [23]:
import pandas as pd
from sklearn.metrics import confusion_matrix

def calculate_accuracy(ground_truth_col, predicted_col, data):
  """
  Calculates overall accuracy from a DataFrame containing ground truth and predicted labels.

  Args:
      ground_truth_col (str): Name of the column containing ground truth labels.
      predicted_col (str): Name of the column containing predicted labels.
      data (pandas.DataFrame): The DataFrame containing ground truth and prediction columns.

  Returns:
      float: Overall accuracy as a percentage.
  """
  # Ensure columns exist in DataFrame
  if ground_truth_col not in data.columns or predicted_col not in data.columns:
    raise ValueError("Ground truth and prediction columns don't exist in the DataFrame.")

  # Get ground truth and predicted labels as NumPy arrays
  ground_truth = data[ground_truth_col].to_numpy()
  predictions = data[predicted_col].to_numpy()

  # Create confusion matrix
  cm = confusion_matrix(ground_truth, predictions)

  # Calculate total number of data points
  total = sum(sum(row) for row in cm)

  # Calculate number of correctly classified data points
  correct = cm[0, 0] + cm[1, 1]

  # Calculate overall accuracy
  accuracy = correct / total * 100

  return accuracy

# Example usage
#data = pd.DataFrame({'ground_truth': [0, 1, 0, 1, 0], 'prediction': [0, 0, 1, 1, 0]})
data = df_all
accuracy = calculate_accuracy('gt', 'pred', data)

print(f"Overall Accuracy: {accuracy:.2f}%")

Overall Accuracy: 94.34%


In [24]:
from sklearn.metrics import confusion_matrix

def calculate_weighted_accuracy(ground_truth_col, predicted_col, data, class_weights):
  """
  Calculates weighted accuracy from a DataFrame using class weights.

  Args:
      ground_truth_col (str): Name of the column containing ground truth labels.
      predicted_col (str): Name of the column containing predicted labels.
      data (pandas.DataFrame): The DataFrame containing ground truth and prediction columns.
      class_weights (dict): Dictionary containing weights for each class label.

  Returns:
      float: Weighted accuracy as a percentage.
  """
  # Ensure columns exist in DataFrame
  if ground_truth_col not in data.columns or predicted_col not in data.columns:
    raise ValueError("Ground truth and prediction columns don't exist in the DataFrame.")

  # Get ground truth and predicted labels as NumPy arrays
  ground_truth = data[ground_truth_col].to_numpy()
  predictions = data[predicted_col].to_numpy()

  # Create confusion matrix
  cm = confusion_matrix(ground_truth, predictions)

  # Ensure class weights have same number of elements as unique classes
  if len(class_weights) != len(set(ground_truth)):
    raise ValueError("Number of class weights must match the number of unique classes.")

  # Calculate weighted sum of correctly classified data points
  correct = 0
  for i in range(len(cm)):
    for j in range(len(cm[0])):
      correct += cm[i, j] * class_weights.get(ground_truth[i], 1)  # Use weight 1 for unseen classes

  # Calculate total number of data points (weighted by class weights)
  total = sum(data[ground_truth_col].value_counts() * class_weights.get(val, 1) for val in data[ground_truth_col].unique())

  # Calculate weighted accuracy
  accuracy = correct / total * 100

  return accuracy

# Example usage (assuming class labels 0 and 1)
#data = pd.DataFrame({'ground_truth': [0, 1, 0, 1, 0], 'prediction': [0, 0, 1, 1, 0]})
data = df_all
class_weights = {0: 77.36, 1: 22.64}  # Class 0 is twice as important

accuracy = calculate_weighted_accuracy('gt', 'pred', data, class_weights)

#print(f"Weighted Accuracy: {accuracy:.2f}%")
print(accuracy)

0     88.173763
1    630.781538
Name: gt, dtype: float64


In [25]:
from sklearn.metrics import confusion_matrix

def calculate_metrics(ground_truth_col, predicted_col, data):
  """
  Calculates sensitivity, specificity, PPV, and NPV from a DataFrame.

  Args:
      ground_truth_col (str): Name of the column containing ground truth labels.
      predicted_col (str): Name of the column containing predicted labels.
      data (pandas.DataFrame): The DataFrame containing ground truth and prediction columns.

  Returns:
      dict: Dictionary containing sensitivity, specificity, PPV, and NPV values.
  """
  # Ensure columns exist in DataFrame
  if ground_truth_col not in data.columns or predicted_col not in data.columns:
    raise ValueError("Ground truth and prediction columns don't exist in the DataFrame.")

  # Get ground truth and predicted labels as NumPy arrays
  ground_truth = data[ground_truth_col].to_numpy()
  predictions = data[predicted_col].to_numpy()

  # Create confusion matrix
  cm = confusion_matrix(ground_truth, predictions)

  # Calculate true positives (TP), true negatives (TN), false positives (FP), and false negatives (FN)
  TP = cm[1, 1]
  TN = cm[0, 0]
  FP = cm[0, 1]
  FN = cm[1, 0]

  # Ensure no division by zero (avoid NaN)
  if TP + FN == 0:
    sensitivity = 0
  else:
    sensitivity = TP / (TP + FN)  # Recall

  if TN + FP == 0:
    specificity = 0
  else:
    specificity = TN / (TN + FP)

  if TP + FP == 0:
    ppv = 0
  else:
    ppv = TP / (TP + FP)  # Precision

  if TN + FN == 0:
    npv = 0
  else:
    npv = TN / (TN + FN)

  # Return dictionary containing metrics
  return {
      "sensitivity": sensitivity,
      "specificity": specificity,
      "ppv": ppv,
      "npv": npv
  }

# Example usage
data = df_all
metrics = calculate_metrics('gt', 'pred', data)

print(f"Sensitivity (Recall): {metrics['sensitivity']*100:.4f}")
print(f"Specificity: {metrics['specificity']*100:.4f}")
print(f"Positive Predictive Value (PPV): {metrics['ppv']*100:.4f}")
print(f"Negative Predictive Value (NPV): {metrics['npv']*100:.4f}")

Sensitivity (Recall): 76.9231
Specificity: 96.7742
Positive Predictive Value (PPV): 76.9231
Negative Predictive Value (NPV): 96.7742
