In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import math
import statistics
from scipy.stats import mannwhitneyu

%matplotlib inline

In [2]:
d = pd.read_csv('../data/lasso-KA+TN-bulletin.csv')
d = d[d['District'] != 'Erode.pdf']
d.head()

Unnamed: 0,District,Sex Ratio (females every 1000 males),Total_Positives,total_deaths,male_deaths,female_deaths,Male cases (in data),Female cases (in data),Estimated Male cases,Estimated Female Cases,...,Women whose Body Mass Index (BMI) is below normal (BMI < 18.5 kg/m2)14 (%),Men whose Body Mass Index (BMI) is below normal (BMI < 18.5 kg/m2) (%),Women who are overweight or obese (BMI ≥ 25.0 kg/m2)14 (%),Men who are overweight or obese (BMI ≥ 25.0 kg/m2) (%),All women age 15-49 years who are anaemic (%),Men age 15-49 years who are anaemic (<13.0 g/dl) (%),Women Blood sugar level - high (>140 mg/dl) (%),Men Blood sugar level - high (>140 mg/dl) (%),Women Very high (Systolic ≥180 mm of Hg and/or Diastolic ≥110 mm of Hg) (%),Men Very high (Systolic ≥180 mm of Hg and/or Diastolic ≥110 mm of Hg) (%)
0,Bangalore,898,94106,299.0,196,103,14096,8401,39172.03197,23345.9308,...,14.0,8.7,32.0,26.1,39.6,20.5,8.3,10.9,0.4,0.7
1,Bellary,952,15180,31.0,23,8,1396,634,6935.034971,3149.57892,...,23.6,18.4,18.9,20.9,49.9,20.0,4.0,6.5,0.6,2.5
2,Mysore,1008,11489,55.0,37,18,822,445,4951.820981,2680.730336,...,19.1,10.2,29.3,20.2,45.6,9.1,5.9,7.3,0.5,1.2
3,Dakshina Kannada,1032,9296,43.0,32,11,1686,899,4027.918193,2147.745228,...,25.6,21.8,26.0,20.4,45.4,15.7,5.5,9.0,1.4,1.0
4,Gulbarga,989,9265,27.0,16,11,1219,774,3764.690985,2390.378033,...,22.5,20.5,14.8,18.8,43.1,14.9,3.6,8.5,0.4,0.4


In [3]:
df = d[(d['male_deaths'] >= 5) & (d['female_deaths'] >= 5)]

df['Male Mortality'] = pd.to_numeric(df["Male Mortality"])
df['Female Mortality'] = pd.to_numeric(df["Female Mortality"])

df['Mortality'] = (df['male_deaths'] + df['female_deaths']) / (df['Estimated Female Cases'] + df['Estimated Male cases'])

m = ['Men whose Body Mass Index (BMI) is below normal (BMI < 18.5 kg/m2) (%)',
     'Men who are overweight or obese (BMI ≥ 25.0 kg/m2) (%)',
     'Men age 15-49 years who are anaemic (<13.0 g/dl) (%)',
     'Men Blood sugar level - high (>140 mg/dl) (%)',
     'Men Very high (Systolic ≥180 mm of Hg and/or Diastolic ≥110 mm of Hg) (%)']

f = ['Women whose Body Mass Index (BMI) is below normal (BMI < 18.5 kg/m2)14 (%)',
     'Women who are overweight or obese (BMI ≥ 25.0 kg/m2)14 (%)',
     'All women age 15-49 years who are anaemic (%)',
     'Women Blood sugar level - high (>140 mg/dl) (%)',
     'Women Very high (Systolic ≥180 mm of Hg and/or Diastolic ≥110 mm of Hg) (%)',
    ]

a = ['Avg_whose Body Mass Index (BMI) is below normal (BMI < 18.5 kg/m2)14 (%)',
     'Avg_who are overweight or obese (BMI ≥ 25.0 kg/m2)14 (%)',
     'Avg_age 15-49 years who are anaemic (%)',
     'Avg_Blood sugar level - high (>140 mg/dl) (%)',
     'Avg_Very high (Systolic ≥180 mm of Hg and/or Diastolic ≥110 mm of Hg) (%)'
    ]

for i in range(len(m)):
    df[a[i]] = (df[m[i]] * 1000 + df[f[i]] * df['Sex Ratio (females every 1000 males)']) / (1000 + df['Sex Ratio (females every 1000 males)'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_

In [4]:
q2 = np.quantile(df['Mortality'], .5)
q1 = df[df['Mortality'] < q2]
q3 = df[df['Mortality'] > q2]

In [5]:
def calculate_u_crit(q1, q3, z=1.96):
    m = len(q1)
    n = len(q3)
    u_crit = m*n/2 - z*(m*n*(m+n+1)/12)**0.5
    return u_crit

In [6]:
def calc_diff_matrix(q1,q3, u_crit):
    q1_len = len(q1)
    q3_len = len(q3)
    diff_list = list()
    
    for _ in range(q3_len):
        diff_list.append([0]*q1_len)
        
    for i in range(q1_len):
        for j in range(q3_len):
            diff_list[i][j] = round(q1[i] - q3[j], 4)
            
    sorted_diff_list = list()
    for i in range(len(diff_list)):
        sorted_diff_list.extend(diff_list[i])
    
    sorted_diff_list = sorted(sorted_diff_list)
    print("Median: ", statistics.median(sorted_diff_list),"CI: ", end = " ")
    print("(",sorted_diff_list[u_crit-1],",",sorted_diff_list[-u_crit],")", "U_Crit: ", u_crit)

In [7]:
def run_mann_whitney_tests(q1, q3, covariate_names):
    for covariate in covariate_names:
        print("\n"+covariate+"\n")
        m_q1 = list(q1[covariate])
        m_q3 = list(q3[covariate])
        
        n1 = len(q1)
        n2 = len(q3)
        
        u_crit = calculate_u_crit(m_q1, m_q3)
        
        if(u_crit - int(u_crit) == 0):
            calc_diff_matrix(m_q1,m_q3,int(u_crit))
            
            stat, pr = mannwhitneyu(m_q1, m_q3,alternative='two-sided')
            print('stat=%.10f, p=%.10f' % (stat, pr))
            if pr > 0.05:
                print('Not significant')
            else:
                print('significant')
            Z = (stat - ((n1*n2)/2))/math.sqrt((n1*n2*(n1 + n2 + 1)) / 12)
            E = math.sqrt(Z**2/(n1 + n2 - 1))
            print('Z-score', Z)
            print('Effective size', E)
            
        else:
            calc_diff_matrix(m_q1,m_q3,int(u_crit))
            calc_diff_matrix(m_q1,m_q3,int(u_crit)+1)
            
            stat, pr = mannwhitneyu(m_q1, m_q3,alternative='two-sided')
            print('stat=%.10f, p=%.10f' % (stat, pr))
            if pr > 0.05:
                print('Not significant')
            else:
                print('significant')
            Z = (stat - ((n1*n2)/2))/math.sqrt((n1*n2*(n1 + n2 + 1)) / 12)
            E = math.sqrt(Z**2/(n1 + n2 - 1))
            print('Z-score', Z)
            print('Effective size', E)

In [8]:
covariates = ['Avg_whose Body Mass Index (BMI) is below normal (BMI < 18.5 kg/m2)14 (%)', 
              'Avg_who are overweight or obese (BMI ≥ 25.0 kg/m2)14 (%)',
              'Avg_age 15-49 years who are anaemic (%)',
              'Avg_Blood sugar level - high (>140 mg/dl) (%)',
              'Avg_Very high (Systolic ≥180 mm of Hg and/or Diastolic ≥110 mm of Hg) (%)'
             ]

run_mann_whitney_tests(q1, q3, covariates)


Avg_whose Body Mass Index (BMI) is below normal (BMI < 18.5 kg/m2)14 (%)

Median:  -6.3854500000000005 CI:  ( -9.9196 , -2.4006 ) U_Crit:  55
Median:  -6.3854500000000005 CI:  ( -9.8253 , -2.5172 ) U_Crit:  56
stat=32.0000000000, p=0.0026161516
significant
Z-score -3.032546884255477
Effective size 0.5836139199850202

Avg_who are overweight or obese (BMI ≥ 25.0 kg/m2)14 (%)

Median:  7.38725 CI:  ( 2.7104 , 11.6316 ) U_Crit:  55
Median:  7.38725 CI:  ( 2.738 , 11.3565 ) U_Crit:  56
stat=164.0000000000, p=0.0026161516
significant
Z-score 3.032546884255477
Effective size 0.5836139199850202

Avg_age 15-49 years who are anaemic (%)

Median:  5.420999999999999 CI:  ( 1.8262 , 8.7881 ) U_Crit:  55
Median:  5.420999999999999 CI:  ( 1.8914 , 8.7258 ) U_Crit:  56
stat=161.0000000000, p=0.0040823159
significant
Z-score 2.894703844062046
Effective size 0.5570860145311556

Avg_Blood sugar level - high (>140 mg/dl) (%)

Median:  0.6037 CI:  ( -0.6775 , 1.8809 ) U_Crit:  55
Median:  0.6037 CI:  ( -0