# Ability of Artificial Intelligence to Identify Self-Reported Race in Chest X-Ray Using Pixel Intensity Counts
## Transform images into Pixel Counts and Visualize Data

In [None]:
#imports
from PIL import Image
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics.pairwise import cosine_similarity
import pydicom
import scipy.stats as ss


# Show every column when a DataFrame is displayed in the notebook.
pd.set_option("display.max_columns", None)

#path of images in png format
images_path = 'H:/path/to_image_folder/'

#path of CSV
#the first CSV column is used as the row index (index_col=0)
#expects at least columns named 'File' and 'race' (the exact names the later cells look up)
#will keep all other labels
images_csv = pd.read_csv('H:/path/images_and_metadata.csv', index_col=0, header=0)
# Preview the first rows; head has to be called, the bare attribute is only the bound method.
images_csv.head()


In [None]:
#add pixels to the other metadata in the csv
pixels = list(range(0,256))

# One zero-initialised count column per grey level 0..255, appended to the right of
# the metadata columns. Building the block of zeros in one go keeps the count
# columns integer-typed from the start.
zero_counts = pd.DataFrame(0, index=images_csv.index, columns=pixels)
freq_df = pd.concat([images_csv, zero_counts], axis=1)

#loop through each item of CSV, open image, store pixel values
#assumes image is grayscale.
#confirm images are grayscale by opening with Image.open, then print img.mode. Should return 'L'
#images that can't be opened are dropped
unreadable = []
for index, file in images_csv['File'].items():
    try:
        # The context manager closes the file even when decoding fails part-way.
        with Image.open(images_path+'/'+file) as img:
            #load the image and convert into a flat numpy array
            values = np.asarray(img).flatten()
    except (OSError, ValueError):
        # Missing, unidentifiable or truncated image files end up here. Anything
        # else (including Ctrl-C) is allowed to propagate instead of being hidden.
        print('dropping: ' + file)
        unreadable.append(index)
        continue

    #get the count of every grey value present in this image
    unique, counts = np.unique(values, return_counts=True)
    for level, count in zip(unique, counts):
        freq_df.at[index, level] = count

# Remove the unreadable rows once, after the loop, rather than mutating the frame
# while it is being walked.
freq_df.drop(unreadable, inplace=True)

freq_df.head(6)

### Save/load a CSV
Recommended to save a CSV after the initial run, as it's much faster to load it again later

In [None]:
# Write the pixel-count table (row index included) to frequencies.csv in the working directory
freq_df.to_csv(path_or_buf='frequencies.csv')

In [None]:
# Reload the pixel-count table from frequencies.csv; the first column is the row
# index and the first line holds the column names.
freq_df = pd.read_csv(
    'frequencies.csv',
    header=0,
    index_col=0,
)
# Distinct values of the 'race' column, in order of first appearance.
races = pd.unique(freq_df['race'])
freq_df

### Pixel Percent Per Image
This normalizes each image's pixel counts by the total number of pixels in that image, so that images of different resolutions can be compared directly.

In [None]:
# Keep original, copy to new
freq_percent_df = freq_df.copy(deep=True)

# Pixel-count columns are the ones labelled 0..255. The labels are ints when the
# table comes straight from counting and strings after a round trip through CSV,
# so they are matched on their text form. Selecting by label (instead of by dtype
# or position) keeps numeric metadata columns out of the arithmetic below.
count_cols = [c for c in freq_df.columns if str(c).isdecimal() and int(c) <= 255]
zero_cols = [c for c in count_cols if int(c) == 0]
percent_cols = [c for c in count_cols if int(c) != 0]

# Drop the 0 column to remove all pure non-image black space often caused by image rotations
freq_percent_df.drop(columns=zero_cols, inplace=True)

# Total number of pixels per image. The 0 bin is still part of this total, so each
# value below is a fraction of the whole image, exactly as before; only non-pixel
# numeric columns are now excluded from the total.
total_pixels = freq_df[count_cols].sum(axis=1)

# Convert the remaining counts to a fraction of the image's pixels
freq_percent_df_num = freq_percent_df[percent_cols].div(total_pixels, axis=0)

# Write the fractions back next to the untouched metadata columns
freq_percent_df[percent_cols] = freq_percent_df_num


#calculate the percent frequency average distributions by race
temp_dfs = []
for race in races:
    in_race = freq_percent_df['race'] == race
    # Mean over the images of this race, one value per intensity column.
    temp = freq_percent_df.loc[in_race, percent_cols].mean().to_frame().T
    temp.index = [race]
    temp_dfs.append(temp)

# One row per race, one column per intensity
freq_percent_df_race = pd.concat(temp_dfs)
print(freq_percent_df_race)

In [None]:
# Draw one line per race on a single, very large figure
fig, ax = plt.subplots(figsize=(100,50))
for race_label, distribution in freq_percent_df_race.iterrows():
    # x axis: the column labels of the row, y axis: the averaged value in each column
    ax.plot(distribution, label=race_label)

ax.legend(loc='upper right', prop={'size': 65})

### ANOVA Tests
Note: ANOVA is not appropriate for these data because the values are highly correlated.
MANOVA with Bonferroni corrections is used for the actual tests.
The ANOVA is displayed on the web site, so it remains here to demonstrate how that was accomplished.

In [None]:
#p value significance = 0.05
# Thresholds used for the counts printed below. The printed labels are built from
# these same constants so the text and the test cannot disagree.
P_THRESHOLD = 0.05
F_THRESHOLD = 2

#change display so pd can display all columns
pd.set_option('display.max_columns', None)

#check for significance
# Pick the pixel-intensity columns by label (0..255, int or str) rather than by
# position, so metadata columns are never passed to the test, wherever they sit.
is_intensity = [str(c).isdecimal() and int(c) <= 255 for c in freq_percent_df.columns]
cols = freq_percent_df.columns[is_intensity]

#samples is a list with one table per race holding that race's image pixel percentages
samples = [d[cols] for _, d in freq_percent_df.groupby('race')]

#one way ANOVA, computed independently for every intensity column
f_vals, p_vals = ss.f_oneway(*samples)

#reorganize F and P for count and display
# Each frame is a single row; column i holds the result for cols[i].
f_df = pd.DataFrame(f_vals).transpose()
f_df.index = ['f']
print('Count f > %s:' % F_THRESHOLD, int((f_df.loc['f'] > F_THRESHOLD).sum()))
p_df = pd.DataFrame(p_vals).transpose()
p_df.index = ['p']
print('Count p < %s:' % P_THRESHOLD, int((p_df.loc['p'] < P_THRESHOLD).sum()))
f_p_df = pd.concat([f_df, p_df])

#set p-value and f-value of interest
def color_by_number(val, stat=None, p_max=0.05, f_min=2):
    """Return the CSS colour rule for one cell of the F/p table.

    val   -- the number shown in the cell
    stat  -- 'f' to judge val as an F value (green when val > f_min),
             'p' to judge it as a p value (green when val < p_max);
             with any other value the cell is green when either test passes
    p_max -- p values below this are highlighted
    f_min -- F values above this are highlighted

    Returns 'color: green' for a highlighted cell and 'color: red' otherwise.
    """
    if stat == 'f':
        of_interest = val > f_min
    elif stat == 'p':
        of_interest = val < p_max
    else:
        of_interest = val < p_max or val > f_min
    return 'color: %s' % ('green' if of_interest else 'red')


def _color_row(row):
    """Colour a whole table row, judging each cell by the row's own label ('f' or 'p')."""
    return [color_by_number(val, row.name) for val in row]


# Style row by row so that a tiny F value is not mistaken for a small p value.
f_p_df.style.apply(_color_row, axis=1)