In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import confusion_matrix, fowlkes_mallows_score, classification_report # for questions 22-25

# Load the raw survey file into a single frame
df = pd.read_csv('MBTI.csv')

# Personality types key:
#   E = Extroversion, I = Introversion
#   N = Intuition,    S = Sensing
#   F = Feeling,      T = Thinking
#   P = Perceiving,   J = Judging

# Columns 1-5 (four letter columns plus the combined type) go to df_mbti
df_mbti = df.iloc[:, :5]

# Columns 6-10 go to df_answered. They are relabelled to the same names as
# df_mbti so that the two frames can be compared element-wise later on.
df_answered = df.iloc[:, 5:10]
df_answered.columns = ['IE', 'NS', 'FT', 'PJ', 'Type']

# Preview the first five rows of each frame
for frame in (df_mbti, df_answered, df):
    print(frame.head(5))


  IE NS FT PJ  Type
0  E  S  F  J  ESFJ
1  I  S  T  J  ISTJ
2  I  S  T  J  ISTJ
3  E  S  T  J  ESTJ
4  I  S  T  J  ISTJ
  IE NS FT PJ  Type
0  E  S  F  J  ESFJ
1  I  S  T  J  ISTJ
2  I  N  T  J  INTJ
3  I  S  F  J  ISFJ
4  I  S  T  J  ISTJ
  IE NS FT PJ  Type IEAn NSAn FTAn PJAn AnsweredType
0  E  S  F  J  ESFJ    E    S    F    J         ESFJ
1  I  S  T  J  ISTJ    I    S    T    J         ISTJ
2  I  S  T  J  ISTJ    I    N    T    J         INTJ
3  E  S  T  J  ESTJ    I    S    F    J         ISFJ
4  I  S  T  J  ISTJ    I    S    T    J         ISTJ


### Question 1:
How many total predictions are missed (each disease mistake counts as 1)?

In [2]:
# Keep only the four single-letter columns; the combined 'Type' column is left out
# so that one wrong letter is counted exactly once.
df_mbti_partial = df_mbti.iloc[:, :4]
df_answered_partial = df_answered.iloc[:, :4]

# Element-wise comparison of the two frames (both carry the same column labels)
letter_matches = df_mbti_partial == df_answered_partial
letter_mismatches = df_mbti_partial != df_answered_partial

# N = total number of mismatching letters, P = total number of matching letters
N = letter_mismatches.sum().sum()
P = letter_matches.sum().sum()

print(N)


2705


Replication of the logic and formulae from the recommended Wikipedia article on the confusion matrix: https://en.wikipedia.org/wiki/Confusion_matrix

In [3]:
# Each dichotomy lives in its own column. The letter listed here is treated as the
# "positive" class for that column; the other letter of the pair is the negative class.
letter_columns = {'I': 'IE', 'N': 'NS', 'F': 'FT', 'P': 'PJ'}

# Initialize an empty list to store the metrics for each letter
metrics_list = []

# Calculate TP, FP, TN, FN, and other metrics for each letter
for letter, column in letter_columns.items():
    # Compare ONLY the column that can contain this letter. Comparing the whole
    # five-column frames (as an earlier version did) makes the four unrelated columns
    # count as "true negatives" on every row, which inflated TN by 4 * number_of_rows
    # and had to be patched with a hard-coded "- 40000".
    actual_positive = df_mbti[column] == letter
    predicted_positive = df_answered[column] == letter

    TP = (actual_positive & predicted_positive).sum()
    TN = (~actual_positive & ~predicted_positive).sum()
    FP = (~actual_positive & predicted_positive).sum()  # Type I error
    FN = (actual_positive & ~predicted_positive).sum()  # Type II error

    # The four cells of the confusion matrix must account for every row exactly once
    assert TP + TN + FP + FN == len(df_mbti), 'confusion-matrix cells do not add up to the row count'

    P = TP + FN # for each letter, P = TP + FN
    N = TN + FP # for each letter, N = TN + FP
    TPR = TP / (TP + FN) # sensitivity, recall, hit rate, or true positive rate
    TNR = TN / (TN + FP) # specificity, selectivity or true negative rate
    PPV = TP / (TP + FP) # precision or positive predictive value
    NPV = TN / (TN + FN) # negative predictive value
    FNR = FN / (FN + TP) # miss rate or false negative rate
    FPR = FP / (TN + FP) # fall-out or false positive rate
    FDR = FP / (FP + TP) # false discovery rate
    FOR = FN / (FN + TN)  # false omission rate
    LR_pos = TPR / FPR # positive likelihood ratio
    LR_neg = FNR / TNR # negative likelihood ratio
    prevalence = P / (P + N) # share of actual positives among all rows
    PT = np.sqrt(FPR) / (np.sqrt(TPR) + np.sqrt(FPR)) # prevalence threshold
    TS = TP / (TP + FP + FN) # threat score or critical success index or Jaccard coefficient or intersection-over-union
    Accuracy = (TP + TN) / (TP + FP + TN + FN) # accuracy
    BA = (TPR + TNR) / 2 # balanced accuracy
    F1 = 2 * TP / (2 * TP + FP + FN) # F1 score
    MCC = (TP * TN - FP * FN) / np.sqrt((TP + FP) * (TP + FN) * (TN + FP) * (TN + FN)) # Matthews correlation coefficient
    FM = np.sqrt(TPR * PPV) # Fowlkes–Mallows index
    BM = TPR + TNR - 1 # Bookmaker Informedness
    MK = PPV + NPV - 1 # Markedness
    DOR = LR_pos / LR_neg # Diagnostic odds ratio

    # One column per letter, one row per metric (rows keep the insertion order below)
    metrics = pd.Series(
        {
            'P': P, 'N': N, 'TP': TP, 'TN': TN, 'FP': FP, 'FN': FN,
            'TPR': TPR, 'TNR': TNR, 'PPV': PPV, 'NPV': NPV,
            'FNR': FNR, 'FPR': FPR, 'FDR': FDR, 'FOR': FOR,
            'LR_pos': LR_pos, 'LR_neg': LR_neg,
            'prevalence': prevalence, 'PT': PT, 'TS': TS,
            'Accuracy': Accuracy, 'BA': BA, 'F1': F1, 'MCC': MCC,
            'FM': FM, 'BM': BM, 'MK': MK, 'DOR': DOR,
        },
        name=letter,
        dtype=float,
    ).to_frame()

    # Append the dataframe to the metrics_list
    metrics_list.append(metrics)

# Concatenate all the dataframes in metrics_list to create a single dataframe that shows the values for all letters
metrics_df = pd.concat(metrics_list, axis=1)

# Print the dataframe
print(metrics_df)



                      I            N            F            P
P           4865.000000  2690.000000  5991.000000  4381.000000
N           5135.000000  7310.000000  4009.000000  5619.000000
TP          4580.000000  2513.000000  5529.000000  4107.000000
TN          4800.000000  6826.000000  3635.000000  5305.000000
FP           335.000000   484.000000   374.000000   314.000000
FN           285.000000   177.000000   462.000000   274.000000
TPR            0.941418     0.934201     0.922884     0.937457
TNR            0.934761     0.933789     0.906710     0.944118
PPV            0.931841     0.838505     0.936642     0.928975
NPV            0.943953     0.974725     0.887235     0.950887
FNR            0.058582     0.065799     0.077116     0.062543
FPR            0.065239     0.066211     0.093290     0.055882
FDR            0.068159     0.161495     0.063358     0.071025
FOR            0.056047     0.025275     0.112765     0.049113
LR_pos        14.430397    14.109519     9.892629    16

### Question 2:
Which disease is most prevalent in the population?

In [4]:
# Take the 'prevalence' row (one value per letter) and report the letter holding the maximum
prevalence_by_letter = metrics_df.loc['prevalence']
max_prevalence = prevalence_by_letter.idxmax()
max_prevalence

'F'

### Question 3:
How prevalent is that disease?

In [5]:
# Largest value in the 'prevalence' row, i.e. the prevalence of the most prevalent letter
prevalence_by_letter = metrics_df.loc['prevalence']
max_prevalence_value = prevalence_by_letter.max()
max_prevalence_value

0.5991

### Question 4:
Which disease test is the most accurate?

In [6]:
# Take the 'Accuracy' row (one value per letter) and report the letter holding the maximum
accuracy_by_letter = metrics_df.loc['Accuracy']
max_accuracy = accuracy_by_letter.idxmax()
max_accuracy

'P'

### Question 5
How accurate is that test under that assumption?

In [7]:
# Largest value in the 'Accuracy' row, i.e. the accuracy of the most accurate letter test
accuracy_by_letter = metrics_df.loc['Accuracy']
max_accuracy_value = accuracy_by_letter.max()
max_accuracy_value


0.9412

### Question 6
Which disease test has the highest sensitivity?


In [8]:
# Take the 'TPR' (sensitivity) row and report the letter holding the maximum
sensitivity_by_letter = metrics_df.loc['TPR']
max_TPR = sensitivity_by_letter.idxmax()
max_TPR

'I'

### Question 7
How sensitive is that test?

In [9]:
# Largest value in the 'TPR' (sensitivity) row
sensitivity_by_letter = metrics_df.loc['TPR']
max_TPR_value = sensitivity_by_letter.max()
max_TPR_value

0.9414182939362795

### Question 8 
Which disease test has the highest specificity?

In [10]:
# Take the 'TNR' (specificity) row and report the letter holding the maximum
specificity_by_letter = metrics_df.loc['TNR']
max_TNR = specificity_by_letter.idxmax()
max_TNR

'P'

### Question 9
What is the specificity of that test?

In [11]:
# Largest value in the 'TNR' (specificity) row
specificity_by_letter = metrics_df.loc['TNR']
max_TNR_value = specificity_by_letter.max()
max_TNR_value

0.9441181704929703

### Question 10
Which disease has the lowest correlation coefficient?

In [12]:
# Take the 'MCC' (Matthews correlation coefficient) row and report the letter holding the minimum
mcc_by_letter = metrics_df.loc['MCC']
min_MCC = mcc_by_letter.idxmin()
min_MCC

'F'

### Question 11
What is its correlation coefficient?

In [13]:
# Smallest value in the 'MCC' (Matthews correlation coefficient) row
mcc_by_letter = metrics_df.loc['MCC']
min_MCC_value = mcc_by_letter.min()
min_MCC_value

0.8267306459144708

### Question 12
Which disease's prevalence is closest to its prevalence threshold?
### Question 13
How close are they (prevalence and prevalence threshold) for the smallest difference?

In [14]:
# Show the two rows being compared: prevalence threshold and prevalence, one value per letter
pt_by_letter = metrics_df.loc['PT']
prevalence_by_letter = metrics_df.loc['prevalence']
print(pt_by_letter)
print(prevalence_by_letter)

# Absolute gap between prevalence threshold and prevalence for each letter
diff = (pt_by_letter - prevalence_by_letter).abs()

# Letter with the smallest gap, and the size of that gap
letter = diff.idxmin()
value = diff.min()
print(letter)
print(value)


I    0.208388
N    0.210249
F    0.241240
P    0.196239
Name: PT, dtype: float64
I    0.4865
N    0.2690
F    0.5991
P    0.4381
Name: prevalence, dtype: float64
N
0.058750948888511206


### Question 14
Which disease has the highest Jaccard Index, namely that any positive actual and any positive prediction are paired?

In [15]:
# Take the 'TS' (threat score / Jaccard coefficient) row and report the letter holding the maximum
jaccard_by_letter = metrics_df.loc['TS']
max_TS = jaccard_by_letter.idxmax()
max_TS

'I'

### Question 15
What is that Jaccard Index for that disease?

In [16]:
# Largest value in the 'TS' (threat score / Jaccard coefficient) row
jaccard_by_letter = metrics_df.loc['TS']
max_TS_value = jaccard_by_letter.max()
max_TS_value

0.8807692307692307

### Question 16
Which diseases have a higher specificity than sensitivity?

In [17]:
# Per-letter flag: True where specificity (TNR) exceeds sensitivity (TPR).
# Despite its name, letter_list is a boolean Series indexed by letter, not a Python list.
sensitivity_by_letter = metrics_df.loc['TPR']
specificity_by_letter = metrics_df.loc['TNR']
letter_list = specificity_by_letter > sensitivity_by_letter
letter_list

I    False
N    False
F    False
P     True
dtype: bool

### Question 17
Which disease test has the worst precision?

In [18]:
# Take the 'PPV' (precision) row and report the letter holding the minimum
precision_by_letter = metrics_df.loc['PPV']
min_PPV = precision_by_letter.idxmin()
min_PPV

'N'

### Question 18
What is the precision of the worst performing disease test?

In [19]:
# Smallest value in the 'PPV' (precision) row
precision_by_letter = metrics_df.loc['PPV']
min_PPV_value = precision_by_letter.min()
min_PPV_value

0.8385051718385051

### Question 19
Many of the measures in the confusion matrix world are about modeling the health of a classification system.  Of those, the ones that incorporate both positive and negative in some way are Jaccard, Matthews, F1, informedness, Fowlkes–Mallows, accuracy, and balanced accuracy.  For each disease, choose the one that is the most conservative (i.e., its value is the smallest).

In [20]:
# Rows for Jaccard (TS), MCC, Fowlkes–Mallows (FM), F1, balanced accuracy (BA),
# informedness (BM) and accuracy; one column per letter
health_measures = metrics_df.loc[['TS', 'MCC', 'FM', 'F1', 'BA', 'BM', 'Accuracy']]

# idxmin on the frame works column by column: for every letter it returns the row
# label (measure name) of that letter's smallest value
smallest_measure = health_measures.idxmin()

# Keep the per-letter results under their individual names as well
min_I, min_P, min_N, min_F = (smallest_measure[col] for col in ['I', 'P', 'N', 'F'])

# Report in the order I, P, N, F
for dichotomy_letter in ['I', 'P', 'N', 'F']:
    print(f'The smallest measure for {dichotomy_letter} is {smallest_measure[dichotomy_letter]}')


health_measures


The smallest measure for I is MCC
The smallest measure for P is TS
The smallest measure for N is TS
The smallest measure for F is MCC


Unnamed: 0,I,N,F,P
TS,0.880769,0.791745,0.868657,0.87476
MCC,0.875987,0.840164,0.826731,0.880719
FM,0.936618,0.885061,0.929738,0.933207
F1,0.936605,0.88377,0.929712,0.933197
BA,0.93809,0.933995,0.914797,0.940788
BM,0.87618,0.86799,0.829594,0.881575
Accuracy,0.938,0.9339,0.9164,0.9412


### Question 20
What is the overall accuracy of the entire MBTI based on this data, namely it is able to replicate the person's self-assigned type?

In [21]:
# Fraction of rows whose full four-letter 'Type' is identical in df_mbti and df_answered
type_matches = df_mbti['Type'] == df_answered['Type']
accuracy_mbti = type_matches.sum() / len(type_matches)
accuracy_mbti

0.755

### Question 21
What is the three-factor accuracy of the MBTI, namely that a respondent has three of their letters maintained?

In [22]:
# Three-factor accuracy: share of rows where at least three of the four letters of the
# predicted type (df_answered) agree with the actual type (df_mbti).
#
# Letters are compared position by position (first with first, second with second, ...).
# For valid four-letter types this counts the same matches as a "letter in string"
# membership test, because each position draws from its own pair of letters.
#
# Iterating over the two 'Type' columns with zip avoids a positional .iloc[i] row
# lookup on every iteration.
rows_with_three_matches = 0
for actual_type, predicted_type in zip(df_mbti['Type'], df_answered['Type']):
    matches = sum(
        actual_letter == predicted_letter
        for actual_letter, predicted_letter in zip(actual_type, predicted_type)
    )
    if matches >= 3:
        rows_with_three_matches += 1
accuracy_mbti_3 = rows_with_three_matches / len(df_mbti)
accuracy_mbti_3

0.9753

### Question 22
Which types have a higher sensitivity than the 4-factor accuracy?  Note: to establish this, you need to be able to figure out what sensitivity means in a non-dichotomous state.

In [23]:
# The sixteen MBTI personality types, in alphabetical order
mbti_types = [
    'ENFJ', 'ENFP', 'ENTJ', 'ENTP', 'ESFJ', 'ESFP', 'ESTJ', 'ESTP',
    'INFJ', 'INFP', 'INTJ', 'INTP', 'ISFJ', 'ISFP', 'ISTJ', 'ISTP',
]

# Per-type precision/recall report as a nested dict, plus an overall 'accuracy' entry.
# NOTE(review): target_names is passed without labels, so the names are presumably
# matched to the sorted labels found in the data - this relies on mbti_types being
# alphabetical and on every type occurring; confirm against the scikit-learn docs.
classification_rep = classification_report(
    df_mbti['Type'],
    df_answered['Type'],
    target_names=mbti_types,
    output_dict=True,
)

# Confusion matrix with rows and columns ordered as in mbti_types
confusion = confusion_matrix(df_mbti['Type'], df_answered['Type'], labels=mbti_types)

# Sensitivity per type = diagonal entry (correct predictions) / row total (all actual cases)
row_totals = confusion.sum(axis=1)
sensitivity_dict = {
    mbti_type: confusion[i, i] / row_totals[i]
    for i, mbti_type in enumerate(mbti_types)
}

# Types whose sensitivity exceeds the overall four-letter accuracy
accuracy_4_factors = classification_rep['accuracy']
high_sensitivity_types = [
    type_name for type_name in mbti_types
    if sensitivity_dict[type_name] > accuracy_4_factors
]
print(high_sensitivity_types)

['ENFJ', 'ENFP', 'ENTP', 'ESFJ', 'INFJ', 'INFP', 'ISFJ', 'ISFP']


### Question 23
Which types have a higher precision than the overall 4-factor accuracy?  Again, you have to reason through what precision means in a non-dichotomous context.

In [24]:
# Look up each type's precision in the classification report, then keep the types
# whose precision exceeds the overall four-letter accuracy
precision_by_type = {
    type_name: classification_rep[type_name]['precision'] for type_name in mbti_types
}
high_precision_types = [
    type_name
    for type_name, precision in precision_by_type.items()
    if precision > accuracy_4_factors
]
print(high_precision_types)

['ESFJ', 'ESFP', 'ESTJ', 'ISFJ', 'ISFP', 'ISTJ']


### Question 24
Using the calculations resulting in Questions 22 and 23, rank the Fowlkes–Mallows values for each type from largest to smallest (i.e., from best prediction to worst prediction).

In [25]:
# Fowlkes–Mallows score per MBTI type, from the confusion matrix built for Question 22:
# diagonal entry / sqrt(row total * column total)
row_totals = confusion.sum(axis=1)
column_totals = confusion.sum(axis=0)
FM_dict = {
    mbti_type: confusion[i, i] / np.sqrt(row_totals[i] * column_totals[i])
    for i, mbti_type in enumerate(mbti_types)
}

# Rank the (type, score) pairs from largest to smallest score
sorted_FM = sorted(FM_dict.items(), key=lambda item: item[1], reverse=True)

# Display the ranking
sorted_FM

[('ESFJ', 0.8089240593395641),
 ('ISFJ', 0.790316546565913),
 ('ISFP', 0.7863666831531646),
 ('ESFP', 0.7651807229033911),
 ('ISTJ', 0.7610897449891129),
 ('ESTJ', 0.7574864763213359),
 ('ENTP', 0.7469075230222172),
 ('INFP', 0.7425627593686751),
 ('ENFJ', 0.7329695696632516),
 ('ENFP', 0.7326786624011068),
 ('INFJ', 0.7304075272140408),
 ('ISTP', 0.7257338190921739),
 ('ESTP', 0.7170872792956382),
 ('INTP', 0.6882285688617229),
 ('INTJ', 0.672995525764103),
 ('ENTJ', 0.6718142450899716)]

### Question 25
The most common misclassification in the dataset is a person who self-identifies as  ________ and assesses to  ________ .  This occurs a total of  ________  times.

In [26]:
# Rows where the two type columns disagree
misclassifications = df[df['Type'] != df['AnsweredType']]

# Number of occurrences of every (Type, AnsweredType) pair among those rows
pair_counts = misclassifications.groupby(['Type', 'AnsweredType']).size()

# The most frequent pair and how often it occurs (both read from the same counts)
most_common_misclassification = pair_counts.idxmax()
misclassification_count = int(pair_counts.max())

# Unpack the pair into its two types
self_identified_type, assessed_type = most_common_misclassification

print(f"The most common misclassification in the dataset is a person who self-identifies as {self_identified_type} and assesses to {assessed_type}. This occurs a total of {misclassification_count} times.")


The most common misclassification in the dataset is a person who self-identifies as ISFJ and assesses to ISTJ. This occurs a total of 84 times.
