In [1]:
import pandas as pd
import re
import requests
import warnings
import numpy as np
from scipy.stats import ttest_ind
import scipy.stats as stats
warnings.filterwarnings("ignore")

In [2]:
# Subset of columns to load: the ID, state and ACSTOTPOP fields, the two
# demographic indicators (MINORPCT, LOWINCPCT) and the twelve environmental
# indicators.
cols = [
    'ID', 'STATE_NAME', 'ST_ABBREV', 'ACSTOTPOP',
    'MINORPCT', 'LOWINCPCT',
    'CANCER', 'RESP', 'DSLPM', 'PM25', 'OZONE', 'PTRAF', 'PRE1960PCT', 'PRMP',
    'PTSDF', 'PNPL', 'PWDIS', 'UST',
]

# Pass the list to read_csv so only these columns are parsed and kept in
# memory; without usecols the list above is unused and the whole file is loaded.
df = pd.read_csv('ejscreen_2021.csv', usecols=cols)

In [3]:
# Pearson correlation between the low-income percentage and the minority
# percentage. Rows missing either value are dropped first; the cell displays
# the result returned by pearsonr (coefficient and p-value).
dff = df.loc[:, ['LOWINCPCT', 'MINORPCT']].dropna()
stats.pearsonr(dff.LOWINCPCT, dff.MINORPCT)

(0.46949110738161276, 0.0)

In [4]:
def diff_of_means(df, env_ind, dem_ind, threshold=0.5):
    """Compare an environmental indicator between two demographic groups.

    Rows are split on the demographic indicator: those strictly above
    ``threshold`` form the first group and those strictly below it form the
    second. Rows exactly at the threshold, or with a missing demographic
    value, fall in neither group. Missing environmental values are dropped
    from each group before comparing.

    Parameters
    ----------
    df : pandas.DataFrame
        Table containing both the ``env_ind`` and ``dem_ind`` columns.
    env_ind
        Column label of the environmental indicator to compare.
    dem_ind
        Column label of the demographic indicator used to split the rows.
    threshold : float, optional
        Cut-off applied to ``dem_ind``. Defaults to 0.5, i.e. a
        majority/minority split.

    Returns
    -------
    tuple
        ``(mean_diff, p_val)``: the mean of the first group minus the mean
        of the second, and the p-value of the unequal-variance (Welch)
        t-test between them. Both values are also printed.
    """
    dem_values = df[dem_ind]
    group_1 = df.loc[dem_values > threshold, env_ind].dropna()
    group_2 = df.loc[dem_values < threshold, env_ind].dropna()
    mean_diff = group_1.mean() - group_2.mean()
    # equal_var=False: do not assume the two groups share a variance.
    p_val = ttest_ind(group_1, group_2, equal_var=False).pvalue
    # !s forces str() so the printed text matches plain string concatenation.
    print(f"Difference of means ({env_ind!s}): {mean_diff!s}")
    print(f"P-value: {p_val!s}")
    # Return the values as well so callers can collect them programmatically.
    return mean_diff, p_val

In [5]:
# Environmental indicator columns that the analysis cells iterate over.
indicators = [
    'CANCER', 'RESP', 'DSLPM', 'PM25', 'OZONE', 'PTRAF',
    'PRE1960PCT', 'PRMP', 'PTSDF', 'PNPL', 'PWDIS', 'UST',
]

In [6]:
# For each environmental indicator, report the difference of means between
# minority communities (MINORPCT above 0.5) and white communities
# (MINORPCT below 0.5), across all rows.
for env_ind in indicators:
    diff_of_means(df, env_ind=env_ind, dem_ind='MINORPCT')

Difference of means (CANCER): 4.206672557282616
P-value: 0.0
Difference of means (RESP): 0.05951614466761945
P-value: 0.0
Difference of means (DSLPM): 0.1493445633348242
P-value: 0.0
Difference of means (PM25): 1.1072028284683562
P-value: 0.0
Difference of means (OZONE): 0.9504947379416322
P-value: 4.240643327418604e-167
Difference of means (PTRAF): 642.6489770271464
P-value: 0.0
Difference of means (PRE1960PCT): 0.05753586555973561
P-value: 0.0
Difference of means (PRMP): 0.5470560318960787
P-value: 0.0
Difference of means (PTSDF): 1.9601661255343528
P-value: 0.0
Difference of means (PNPL): 0.06779827883037769
P-value: 0.0
Difference of means (PWDIS): 16.319801095956436
P-value: 5.2781838735606706e-08
Difference of means (UST): 3.826914566288587
P-value: 0.0


In [7]:
# For each environmental indicator, report the difference of means between
# low-income communities (LOWINCPCT above 0.5) and middle/high-income
# communities (LOWINCPCT below 0.5), across all rows.
for env_ind in indicators:
    diff_of_means(df, env_ind=env_ind, dem_ind='LOWINCPCT')

Difference of means (CANCER): 1.7092429136932736
P-value: 9.093845445245539e-87
Difference of means (RESP): 0.020853071276029422
P-value: 4.946405683379998e-219
Difference of means (DSLPM): 0.030563847141384604
P-value: 1.1279971692241339e-124
Difference of means (PM25): 0.3003181802173991
P-value: 7.237948636984571e-219
Difference of means (OZONE): 0.1999555444522514
P-value: 6.189807558343677e-08
Difference of means (PTRAF): 257.68416537088376
P-value: 2.7182504615071557e-147
Difference of means (PRE1960PCT): 0.08652388223683605
P-value: 0.0
Difference of means (PRMP): 0.51960191051062
P-value: 0.0
Difference of means (PTSDF): 0.5304268188324484
P-value: 1.031818029692184e-148
Difference of means (PNPL): 0.018159745569498287
P-value: 9.202855243970865e-32
Difference of means (PWDIS): 1.6617927871957328
P-value: 0.4794570881728756
Difference of means (UST): 2.936890227114938
P-value: 0.0


In [8]:
# Pearson correlation of the low-income percentage with each environmental
# indicator, computed over the rows where both values are present.
for env_ind in indicators:
    dff = df.loc[:, ['LOWINCPCT', env_ind]].dropna()
    print(f"{env_ind!s}{stats.pearsonr(dff.LOWINCPCT, dff[env_ind])!s}")

CANCER(0.04851550492241332, 1.2627832843238108e-114)
RESP(0.04756852203158905, 2.8942171098385647e-110)
DSLPM(0.01383443889108369, 8.851725228621837e-11)
PM25(0.04899108044096308, 6.240902486152196e-115)
OZONE(0.004072405747565967, 0.05835672360946967)
PTRAF(0.06565908134310502, 4.787919424397308e-197)
PRE1960PCT(0.15010984183847137, 0.0)
PRMP(0.21813728019880801, 0.0)
PTSDF(0.040174046188724596, 2.2101226876545808e-79)
PNPL(0.013335321288064032, 3.8548416014762033e-10)
PWDIS(0.0028889289211215608, 0.26433472086117166)
UST(0.17168383559033182, 0.0)


In [9]:
# Pearson correlation of the minority percentage with each environmental
# indicator, computed over the rows where both values are present.
for env_ind in indicators:
    dff = df.loc[:, ['MINORPCT', env_ind]].dropna()
    print(f"{env_ind!s}{stats.pearsonr(dff.MINORPCT, dff[env_ind])!s}")

CANCER(0.20321498107655106, 0.0)
RESP(0.25697610115074343, 0.0)
DSLPM(0.3537475344433501, 0.0)
PM25(0.3462687474961802, 0.0)
OZONE(0.07696236124392096, 4.045242231899512e-281)
PTRAF(0.20903531709609308, 0.0)
PRE1960PCT(0.09949596274985359, 0.0)
PRMP(0.26535771662917174, 0.0)
PTSDF(0.2833776785094173, 0.0)
PNPL(0.12603834993092677, 0.0)
PWDIS(0.021047206716230532, 4.1894325987397515e-16)
UST(0.2683823153678841, 0.0)


In [10]:
# Split communities by income: low-income rows have LOWINCPCT above 0.5,
# middle/high-income rows have it below 0.5. Rows at exactly 0.5 (or with a
# missing value) end up in neither frame.
low_inc = df.loc[df['LOWINCPCT'].gt(0.5)]
aff = df.loc[df['LOWINCPCT'].lt(0.5)]

In [11]:
# Within low-income communities only: difference of means between minority
# (MINORPCT above 0.5) and white (MINORPCT below 0.5) communities, per
# environmental indicator.
for env_ind in indicators:
    diff_of_means(low_inc, env_ind=env_ind, dem_ind='MINORPCT')

Difference of means (CANCER): 3.9252263946159403
P-value: 1.4121702108818805e-178
Difference of means (RESP): 0.045895181423669495
P-value: 0.0
Difference of means (DSLPM): 0.1509254998115185
P-value: 0.0
Difference of means (PM25): 1.001323627962229
P-value: 0.0
Difference of means (OZONE): 1.2031153412529676
P-value: 5.810189114671534e-82
Difference of means (PTRAF): 540.8350722486608
P-value: 4.361924471605786e-238
Difference of means (PRE1960PCT): 0.013991671031559405
P-value: 6.128936237908461e-07
Difference of means (PRMP): 0.4889533736519781
P-value: 0.0
Difference of means (PTSDF): 1.65588548562404
P-value: 0.0
Difference of means (PNPL): 0.05627302146147467
P-value: 1.3098839509670829e-92
Difference of means (PWDIS): 6.431086188747303
P-value: 0.19455644939356626
Difference of means (UST): 2.744576321362943
P-value: 7.729410443658936e-228


In [12]:
# Within middle/high-income communities only: difference of means between
# minority (MINORPCT above 0.5) and white (MINORPCT below 0.5) communities,
# per environmental indicator.
for env_ind in indicators:
    diff_of_means(aff, env_ind=env_ind, dem_ind='MINORPCT')

Difference of means (CANCER): 4.353052842559201
P-value: 0.0
Difference of means (RESP): 0.06663570792994034
P-value: 0.0
Difference of means (DSLPM): 0.16612851267379292
P-value: 0.0
Difference of means (PM25): 1.2233414783873773
P-value: 0.0
Difference of means (OZONE): 0.9579391716247088
P-value: 1.0763915149462219e-101
Difference of means (PTRAF): 686.7345503441784
P-value: 0.0
Difference of means (PRE1960PCT): 0.04045580070836763
P-value: 2.9548630234897167e-114
Difference of means (PRMP): 0.4194904732571203
P-value: 0.0
Difference of means (PTSDF): 2.226916414650155
P-value: 0.0
Difference of means (PNPL): 0.07744345728736203
P-value: 7.96536164999477e-293
Difference of means (PWDIS): 23.56330976050491
P-value: 9.120407528603095e-06
Difference of means (UST): 3.5306766350849346
P-value: 0.0


In [13]:
# Split communities by demographic majority: majority-POC rows have MINORPCT
# above 0.5, majority-white rows have it below 0.5. Rows at exactly 0.5 (or
# with a missing value) end up in neither frame.
poc = df.loc[df['MINORPCT'].gt(0.5)]
white = df.loc[df['MINORPCT'].lt(0.5)]

In [14]:
# Within communities of color only: difference of means between low-income
# (LOWINCPCT above 0.5) and middle/high-income (LOWINCPCT below 0.5)
# communities, per environmental indicator.
for env_ind in indicators:
    diff_of_means(poc, env_ind=env_ind, dem_ind='LOWINCPCT')

Difference of means (CANCER): -0.30094948497315244
P-value: 0.022798874390397558
Difference of means (RESP): -0.014712724598087923
P-value: 7.508600503419394e-54
Difference of means (DSLPM): -0.0457741163991115
P-value: 1.448198869946535e-109
Difference of means (PM25): -0.28780216043054097
P-value: 6.049037616044955e-80
Difference of means (OZONE): -0.11243619632666224
P-value: 0.06944199395030964
Difference of means (PTRAF): -86.78675532004331
P-value: 2.8848609523048066e-07
Difference of means (PRE1960PCT): 0.0602011383987599
P-value: 7.860298311217755e-146
Difference of means (PRMP): 0.36291356453618295
P-value: 5.701595012034249e-243
Difference of means (PTSDF): -0.6192074656050033
P-value: 5.621323229539969e-76
Difference of means (PNPL): -0.02225511838485844
P-value: 1.7760810140524555e-16
Difference of means (PWDIS): -14.65199414997177
P-value: 0.008416352393030157
Difference of means (UST): 1.1539550777433583
P-value: 1.377985747155353e-49


In [15]:
# Within white communities only: difference of means between low-income
# (LOWINCPCT above 0.5) and middle/high-income (LOWINCPCT below 0.5)
# communities, per environmental indicator.
for env_ind in indicators:
    diff_of_means(white, env_ind=env_ind, dem_ind='LOWINCPCT')

Difference of means (CANCER): 0.1268769629701083
P-value: 0.07194336083788039
Difference of means (RESP): 0.006027801908182917
P-value: 2.507610605684012e-10
Difference of means (DSLPM): -0.03057110353683709
P-value: 4.167945468097786e-95
Difference of means (PM25): -0.06578431000539275
P-value: 9.554001649277625e-09
Difference of means (OZONE): -0.3576123659549211
P-value: 4.476963190484823e-15
Difference of means (PTRAF): 59.11272277547425
P-value: 2.98504933257921e-07
Difference of means (PRE1960PCT): 0.08666526807556812
P-value: 1.157884189514382e-285
Difference of means (PRMP): 0.29345066414132515
P-value: 4.235357148760728e-199
Difference of means (PTSDF): -0.048176536578887985
P-value: 0.04931120766226121
Difference of means (PNPL): -0.0010846825589710812
P-value: 0.6162365745944862
Difference of means (PWDIS): 2.480229421785837
P-value: 0.5956925889180602
Difference of means (UST): 1.94005539146535
P-value: 1.7848526712741728e-211
