In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import math
import statistics
from scipy.stats import mannwhitneyu

%matplotlib inline

In [2]:
# Load the district-level COVID-19 table (case counts plus health-survey indicators).
df = pd.read_csv('../data/lasso-covid19India.csv')

# Keep only districts with at least 5 recorded deaths. The explicit .copy()
# makes the filtered frame independent of the raw table, so the column
# assignment below does not trigger pandas' SettingWithCopyWarning.
df = df[df['Deceased'] >= 5].copy()

# Mortality here is deaths divided by confirmed cases for each district.
df['Mortality'] = df['Deceased'] / df['Confirmed']
df.head()

Unnamed: 0,State_Code,State,District,Confirmed,Active,Recovered,Deceased,"Sex ratio of the total population (females per 1,000 males)",Women whose Body Mass Index (BMI) is below normal (BMI < 18.5 kg/m2)14 (%),Men whose Body Mass Index (BMI) is below normal (BMI < 18.5 kg/m2) (%),Women who are overweight or obese (BMI ≥ 25.0 kg/m2)14 (%),Men who are overweight or obese (BMI ≥ 25.0 kg/m2) (%),All women age 15-49 years who are anaemic (%),Men age 15-49 years who are anaemic (<13.0 g/dl) (%),Women Blood sugar level - high (>140 mg/dl) (%),Men Blood sugar level - high (>140 mg/dl) (%),Women Very high (Systolic ≥180 mm of Hg and/or Diastolic ≥110 mm of Hg) (%),Men Very high (Systolic ≥180 mm of Hg and/or Diastolic ≥110 mm of Hg) (%),Mortality
0,MH,Maharashtra,Pune,134913,39971,91606,3336,924,17.8,10.7,30.2,33.4,50.0,17.4,4.6,4.0,0.2,1.1,0.024727
1,MH,Maharashtra,Mumbai,130410,17693,105193,7222,906,17.8,14.9,34.0,37.9,49.4,23.0,11.8,14.0,0.2,0.0,0.055379
2,TN,Tamil Nadu,Chennai,119059,12106,104455,2498,1032,9.4,4.3,33.6,31.3,53.9,10.1,9.7,9.3,0.4,3.0,0.020981
3,MH,Maharashtra,Thane,115923,19541,92981,3400,922,21.4,13.1,29.0,27.4,48.4,17.1,4.1,4.5,0.6,0.0,0.02933
4,KA,Karnataka,Bangalore,94106,33081,59492,1532,898,14.0,8.7,32.0,26.1,39.6,20.5,8.3,10.9,0.4,0.7,0.01628


In [3]:
# Health indicators reported for men, one column per indicator.
m = [
    'Men whose Body Mass Index (BMI) is below normal (BMI < 18.5 kg/m2) (%)',
    'Men who are overweight or obese (BMI ≥ 25.0 kg/m2) (%)',
    'Men age 15-49 years who are anaemic (<13.0 g/dl) (%)',
    'Men Blood sugar level - high (>140 mg/dl) (%)',
    'Men Very high (Systolic ≥180 mm of Hg and/or Diastolic ≥110 mm of Hg) (%)',
]

# The matching indicators reported for women, in the same order as `m`.
f = [
    'Women whose Body Mass Index (BMI) is below normal (BMI < 18.5 kg/m2)14 (%)',
    'Women who are overweight or obese (BMI ≥ 25.0 kg/m2)14 (%)',
    'All women age 15-49 years who are anaemic (%)',
    'Women Blood sugar level - high (>140 mg/dl) (%)',
    'Women Very high (Systolic ≥180 mm of Hg and/or Diastolic ≥110 mm of Hg) (%)',
]

# Names of the combined (both sexes) columns created below, same order again.
a = [
    'Avg_whose Body Mass Index (BMI) is below normal (BMI < 18.5 kg/m2)14 (%)',
    'Avg_who are overweight or obese (BMI ≥ 25.0 kg/m2)14 (%)',
    'Avg_age 15-49 years who are anaemic (%)',
    'Avg_Blood sugar level - high (>140 mg/dl) (%)',
    'Avg_Very high (Systolic ≥180 mm of Hg and/or Diastolic ≥110 mm of Hg) (%)',
]

# The sex ratio is females per 1,000 males, so the men's value is weighted by
# 1000 and the women's value by the ratio, giving a weighted mean of the two.
sex_ratio_col = 'Sex ratio of the total population (females per 1,000 males)'
for i in range(len(m)):
    weighted_sum = df[m[i]] * 1000 + df[f[i]] * df[sex_ratio_col]
    df[a[i]] = weighted_sum / (1000 + df[sex_ratio_col])

In [5]:
# Split the districts at the median mortality. Despite the quartile-style
# names, q1 is the below-median half and q3 the above-median half; rows whose
# mortality equals the median exactly fall into neither group.
q2 = np.quantile(df['Mortality'], .5)
q1 = df.loc[df['Mortality'].lt(q2)]
q3 = df.loc[df['Mortality'].gt(q2)]
print('m :', len(q1), 'n :', len(q3))

m : 205 n : 205


In [6]:
def calculate_u_crit(q1,q3, z=1.96):
    """Return the lower critical value of U for two samples.

    Uses the formula ``m*n/2 - z * sqrt(m*n*(m+n+1)/12)`` where ``m`` and
    ``n`` are the sizes of the two samples.

    Parameters
    ----------
    q1, q3 : sized collections
        The two samples; only their lengths are used.
    z : float, default 1.96
        Multiplier applied to the square-root term.

    Returns
    -------
    float
        The critical value; it is generally not an integer.
    """
    m, n = len(q1), len(q3)
    pair_count = m * n
    spread = (pair_count * (m + n + 1) / 12) ** 0.5
    return pair_count / 2 - z * spread

In [7]:
def calc_diff_matrix(q1,q3, u_crit):
    """Print the median of all pairwise differences and an interval around it.

    Every difference ``x - y`` with ``x`` from ``q1`` and ``y`` from ``q3`` is
    rounded to 4 decimals and the full set is sorted. The function prints the
    median of that set together with the ``u_crit``-th smallest and
    ``u_crit``-th largest differences, labelled as the "CI" bounds.

    The samples may have different sizes. An earlier version allocated a
    ``len(q3)`` x ``len(q1)`` scratch matrix but indexed it as
    ``[i][j]`` with ``i < len(q1)`` and ``j < len(q3)``, which raised
    ``IndexError`` unless both samples had the same length.

    Parameters
    ----------
    q1, q3 : iterables of numbers
        The two samples.
    u_crit : int
        1-based rank used to pick the interval bounds from each end of the
        sorted differences. It is expected to satisfy
        ``1 <= u_crit <= len(q1) * len(q3)``.

    Returns
    -------
    None
        Results are printed, not returned.
    """
    # Build the flat list directly; no intermediate matrix is needed because
    # only the sorted set of differences is ever used.
    sorted_diff_list = sorted(round(x - y, 4) for x in q1 for y in q3)

    print("Median: ", statistics.median(sorted_diff_list),"CI: ", end = " ")
    print("(",sorted_diff_list[u_crit-1],",",sorted_diff_list[-u_crit],")", "U_Crit: ", u_crit)

In [8]:
def run_mann_whitney_tests(q1, q3, covariate_names):
    """Run and print a two-sided Mann-Whitney U test for each covariate.

    For every name in ``covariate_names`` the function prints, in order:

    1. the covariate name;
    2. the pairwise-difference median and interval from ``calc_diff_matrix``,
       once when the critical U value is a whole number, otherwise twice
       (for the value rounded down and for that value plus one);
    3. the U statistic and p-value from ``scipy.stats.mannwhitneyu``;
    4. ``significant`` / ``Not significant`` at the 0.05 level;
    5. a Z-score computed from U without tie or continuity correction, and
       an effect size ``sqrt(Z**2 / (n1 + n2 - 1))``.

    Parameters
    ----------
    q1, q3 : mapping-like tables (e.g. ``pandas.DataFrame``)
        The two groups; ``q1[name]`` and ``q3[name]`` must yield the values of
        a covariate, and ``len()`` must give the group size.
    covariate_names : iterable of str
        Columns to test.

    Returns
    -------
    None
        Everything is printed.
    """
    # Group sizes and the derived U mean / standard deviation do not depend
    # on the covariate, so compute them once.
    n1 = len(q1)
    n2 = len(q3)

    for covariate in covariate_names:
        print("\n"+covariate+"\n")
        m_q1 = list(q1[covariate])
        m_q3 = list(q3[covariate])

        u_crit = calculate_u_crit(m_q1, m_q3)
        u_floor = int(u_crit)

        # The interval bounds are picked by integer rank. When the critical
        # value is fractional, report both neighbouring ranks.
        calc_diff_matrix(m_q1,m_q3,u_floor)
        if u_crit - u_floor != 0:
            calc_diff_matrix(m_q1,m_q3,u_floor+1)

        stat, pr = mannwhitneyu(m_q1, m_q3,alternative='two-sided')
        print('stat=%.10f, p=%.10f' % (stat, pr))
        if pr > 0.05:
            print('Not significant')
        else:
            print('significant')

        Z = (stat - ((n1*n2)/2))/math.sqrt((n1*n2*(n1 + n2 + 1)) / 12)
        E = math.sqrt(Z**2/(n1 + n2 - 1))
        print('Z-score', Z)
        print('Effective size', E)

In [9]:
# Combined (both sexes) indicator columns to compare between the
# below-median and above-median mortality groups.
cols = [
    'Avg_whose Body Mass Index (BMI) is below normal (BMI < 18.5 kg/m2)14 (%)',
    'Avg_who are overweight or obese (BMI ≥ 25.0 kg/m2)14 (%)',
    'Avg_age 15-49 years who are anaemic (%)',
    'Avg_Blood sugar level - high (>140 mg/dl) (%)',
    'Avg_Very high (Systolic ≥180 mm of Hg and/or Diastolic ≥110 mm of Hg) (%)',
]

run_mann_whitney_tests(q1, q3, cols)


Avg_whose Body Mass Index (BMI) is below normal (BMI < 18.5 kg/m2)14 (%)

Median:  1.9194 CI:  ( 0.3878 , 3.3999 ) U_Crit:  18661
Median:  1.9194 CI:  ( 0.3886 , 3.3998 ) U_Crit:  18662
stat=23975.0000000000, p=0.0135534672
significant
Z-score 2.4693020107179553
Effective size 0.12209912623048842

Avg_who are overweight or obese (BMI ≥ 25.0 kg/m2)14 (%)

Median:  -3.2382 CI:  ( -4.7681 , -1.695 ) U_Crit:  18661
Median:  -3.2382 CI:  ( -4.7675 , -1.695 ) U_Crit:  18662
stat=16096.5000000000, p=0.0000418239
significant
Z-score -4.097582678376192
Effective size 0.20261242347648306

Avg_age 15-49 years who are anaemic (%)

Median:  4.0033 CI:  ( 2.3141 , 5.6205 ) U_Crit:  18661
Median:  4.0033 CI:  ( 2.3179 , 5.6177 ) U_Crit:  18662
stat=26607.0000000000, p=0.0000031207
significant
Z-score 4.6631257718013845
Effective size 0.2305767296865713

Avg_Blood sugar level - high (>140 mg/dl) (%)

Median:  0.2264 CI:  ( -0.2553 , 0.7028 ) U_Crit:  18661
Median:  0.2264 CI:  ( -0.2552 , 0.7027 ) U_