In [243]:
import pandas as pd
import numpy as np
from functools import reduce
from datetime import datetime
from itertools import product
from scipy.stats import shapiro
from scipy.stats import normaltest
from scipy.stats import anderson
from scipy.stats import spearmanr
from scipy.stats import kendalltau

# 3. Hypothesis test of correlation between conflict and climate variables
- Design a hypothesis test for whether conflict and climate variables are correlated/associated in each district
- What statistical metrics do you use to reject or fail to reject your hypothesis?
- What is your conclusion on the correlation/association between conflict and climate variables in each district, given your statistical metrics?

## 3.1 Reinstate Conflict Data from previous step

In [262]:
def load_conflict_csv(path, date_col='date_start', date_format='%Y-%m-%d'):
    """Read one exported conflict CSV and index it by its parsed date column.

    Dates are parsed with ``pd.to_datetime`` and an explicit format instead of
    ``read_csv(..., date_parser=...)``: ``date_parser`` is deprecated since
    pandas 2.0, whereas ``to_datetime(format=...)`` is available on old and new
    pandas alike and is still strict about the expected layout.

    Parameters
    ----------
    path : str
        Location of the CSV file.
    date_col : str
        Name of the column holding the dates; it becomes the index.
    date_format : str
        ``strptime``-style format every date value must match.

    Returns
    -------
    pandas.DataFrame
        A new frame indexed by ``date_col`` (parsed to datetimes).

    Raises
    ------
    ValueError
        If a value in ``date_col`` does not match ``date_format``.
    """
    frame = pd.read_csv(path)
    frame[date_col] = pd.to_datetime(frame[date_col], format=date_format)
    # set_index returns a new frame, so re-running the cell is idempotent
    return frame.set_index(date_col)


# Conflict data per district, indexed by date_start
kd = load_conflict_csv('exported_df/kunduz.csv')
gn = load_conflict_csv('exported_df/ghazni.csv')
kb = load_conflict_csv('exported_df/kabul.csv')

In [263]:
# Preview the first rows of the Kunduz conflict frame read from kunduz.csv
kd.head()

Unnamed: 0_level_0,event_sum,event_count
date_start,Unnamed: 1_level_1,Unnamed: 2_level_1
1989-01-31,4,1
1989-02-28,0,0
1989-03-31,0,0
1989-04-30,0,0
1989-05-31,0,0


## 3.2 Reinstate Min, Max, Avg, and Pre from previous step

In [264]:
def load_climate_csv(path, date_col='date', date_format='%Y-%m-%d'):
    """Read one exported climate CSV and index it by its parsed date column.

    Dates are parsed with ``pd.to_datetime`` and an explicit format instead of
    ``read_csv(..., date_parser=...)``: ``date_parser`` is deprecated since
    pandas 2.0, whereas ``to_datetime(format=...)`` is available on old and new
    pandas alike and is still strict about the expected layout.

    Parameters
    ----------
    path : str
        Location of the CSV file.
    date_col : str
        Name of the column holding the dates; it becomes the index.
    date_format : str
        ``strptime``-style format every date value must match.

    Returns
    -------
    pandas.DataFrame
        A new frame indexed by ``date_col`` (parsed to datetimes).

    Raises
    ------
    ValueError
        If a value in ``date_col`` does not match ``date_format``.
    """
    frame = pd.read_csv(path)
    frame[date_col] = pd.to_datetime(frame[date_col], format=date_format)
    # set_index returns a new frame, so re-running the cell is idempotent
    return frame.set_index(date_col)


# One frame per climate variable, each indexed by date
df_min = load_climate_csv('exported_df/df_min.csv')
df_max = load_climate_csv('exported_df/df_max.csv')
df_avg = load_climate_csv('exported_df/df_avg.csv')
df_pre = load_climate_csv('exported_df/df_pre.csv')

In [265]:
# Preview the first rows of the frame read from df_min.csv
df_min.head()

Unnamed: 0_level_0,min_kunduz,min_ghazni,min_kabul
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1989-01-16,-2.7,-10.5,-8.6
1989-02-15,-1.5,-9.8,-8.0
1989-03-16,6.1,-4.4,-1.3
1989-04-16,9.6,-0.5,2.8
1989-05-16,14.8,4.0,7.6


## 3.3 Merge Conflict data with Min, Max, Avg, and Pre data
- All DataFrames have the same monthly frequency
  - The DataFrame merge code is adapted from [StackOverflow](https://stackoverflow.com/questions/44327999/python-pandas-merge-multiple-dataframes)
- Merge each district (Kunduz, Ghazni, and Kabul) with their related columns (`min`, `max`, `avg`, `pre`)
- Rename the columns
- Sample contents of the 3 DataFrames are displayed below

In [129]:
# Merge min, max, avg, and pre DataFrame
climate_data_frames = [df_min, df_max, df_avg, df_pre]
climate_df = reduce(lambda left,right: pd.merge(left, right, on=['date'], how='outer'), climate_data_frames)

# Merge districts with their related columns
col_names = ['min_kunduz', 'max_kunduz', 'avg_kunduz', 'pre_kunduz']
kunduz_df = kd.merge(climate_df[col_names].resample('M').sum(), left_index=True, right_index=True, how='inner')

# Rename columns
kunduz_df.rename(columns={'event_sum': 'sm', 'event_count': 'ct', 
                          'min_kunduz': 'mn', 'max_kunduz': 'mx', 'avg_kunduz': 'av', 'pre_kunduz': 'pr'},
                inplace=True)

col_names = ['min_ghazni', 'max_ghazni', 'avg_ghazni', 'pre_ghazni']
ghazni_df = gn.merge(climate_df[col_names].resample('M').sum(), left_index=True, right_index=True, how='inner')

ghazni_df.rename(columns={'event_sum': 'sm', 'event_count': 'ct', 
                          'min_ghazni': 'mn', 'max_ghazni': 'mx', 'avg_ghazni': 'av', 'pre_ghazni': 'pr'},
                inplace=True)

col_names = ['min_kabul', 'max_kabul', 'avg_kabul', 'pre_kabul']
kabul_df = kb.merge(climate_df[col_names].resample('M').sum(), left_index=True, right_index=True, how='inner')

kabul_df.rename(columns={'event_sum': 'sm', 'event_count': 'ct', 
                          'min_kabul': 'mn', 'max_kabul': 'mx', 'avg_kabul': 'av', 'pre_kabul': 'pr'},
                inplace=True)

In [136]:
# Preview the merged Kunduz frame (conflict columns sm/ct plus climate columns mn/mx/av/pr)
kunduz_df.head()

Unnamed: 0_level_0,sm,ct,mn,mx,av,pr
date_start,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1989-01-31,4,1,-2.7,7.5,2.4,34.600002
1989-02-28,0,0,-1.5,8.7,3.6,54.2
1989-03-31,0,0,6.1,19.0,12.5,82.4
1989-04-30,0,0,9.6,24.300001,16.9,22.5
1989-05-31,0,0,14.8,29.6,22.2,58.4


In [138]:
# Preview the merged Ghazni frame (conflict columns sm/ct plus climate columns mn/mx/av/pr)
ghazni_df.head()

Unnamed: 0,sm,ct,mn,mx,av,pr
1989-02-28,6,1,-9.8,0.0,-4.9,37.4
1989-03-31,0,0,-4.4,7.2,1.4,127.3
1989-04-30,0,0,-0.5,11.5,5.5,31.300001
1989-05-31,0,0,4.0,15.5,9.7,35.5
1989-06-30,0,0,9.8,22.7,16.2,18.800001


In [140]:
# Preview the merged Kabul frame (conflict columns sm/ct plus climate columns mn/mx/av/pr)
kabul_df.head()

Unnamed: 0_level_0,sm,ct,mn,mx,av,pr
date_start,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1989-01-31,602,2,-8.6,1.0,-3.8,40.100002
1989-02-28,34,7,-8.0,1.4,-3.3,62.4
1989-03-31,289,5,-1.3,9.900001,4.3,101.6
1989-04-30,67,10,2.8,14.400001,8.6,34.4
1989-05-31,18,2,7.6,18.4,13.0,62.0


## 3.4 Hypothesis Test
- Pearson’s Correlation Coefficient assumes that the observations are normally distributed
- If the assumption of normality is violated, use Spearman’s Rank Correlation test
  - We first tested normality
  - If the data is normally distributed, use Pearson’s Correlation Coefficient
  - Else, use Spearman’s Rank Correlation test


- This Hypothesis Test section is based on the following sources: 
  - [Spearman's rank correlation coefficient in Wikipedia](https://en.wikipedia.org/wiki/Spearman%27s_rank_correlation_coefficient)
  - [scipy.stats.spearmanr](https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.spearmanr.html)
  - [Spearman's Rank Correlation](https://www.youtube.com/watch?v=3Grc9nVymm0)
  - [Testing the Correlation between Time Series Variables](https://www.datasciencecentral.com/profiles/blogs/testing-the-correlation-between-time-series-variables)
  - [Statistical Hypothesis Tests in Python](https://machinelearningmastery.com/statistical-hypothesis-tests-in-python-cheat-sheet/)

### 3.4.1 Shapiro-Wilk Test for Normality
```
H0: The sample has a Gaussian distribution
H1: The sample does not have a Gaussian distribution
```
- If p-value > alpha (0.05), fail to reject H0 (no evidence against normality, so we treat the sample as normally distributed)
- If p-value <= alpha (0.05), reject H0 (the sample is treated as not normally distributed)


- As displayed in the test results below, **none** of the data is normally distributed
  - We sanity checked the `shapiro_test` function using manually-created normally distributed data  
  - We also confirmed the results using D’Agostino’s K-Squared Test in Appendix A.1
- Based on the results of Normality test, we continue hypothesis testing of correlation using **Spearman’s Rank Correlation test**

In [206]:
def shapiro_test(df, alpha=0.05):
    """Run the Shapiro-Wilk normality test on a one-column DataFrame and print the verdict.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose values are tested; the first column name is used as the
        label in the printed message.
    alpha : float, optional
        Significance level the p-value is compared against (default 0.05).

    Returns
    -------
    None
        The verdict is printed, not returned.
    """
    _, p = shapiro(df)
    label = df.columns[0]

    if np.isnan(p):
        # NaN compares False against alpha, which would otherwise be reported
        # as "not normally distributed"
        print(f'p-value ({p:0.3f}) is undefined, cannot assess normality of {label}')
    elif p > alpha:
        print(f'p-value ({p:0.3f}) > {alpha}, {label} is normally distributed')
    else:
        print(f'p-value ({p:0.3f}) <= {alpha}, {label} is not normally distributed')

### Kunduz

In [207]:
# Test every Kunduz column on its own; double brackets keep each slice a one-column DataFrame
for column in ['sm', 'ct', 'mn', 'mx', 'av', 'pr']:
    shapiro_test(kunduz_df[[column]])

p-value (0.000) <= 0.05, sm is not normally distributed
p-value (0.000) <= 0.05, ct is not normally distributed
p-value (0.000) <= 0.05, mn is not normally distributed
p-value (0.000) <= 0.05, mx is not normally distributed
p-value (0.000) <= 0.05, av is not normally distributed
p-value (0.000) <= 0.05, pr is not normally distributed


### Ghazni

In [208]:
# Test every Ghazni column on its own; double brackets keep each slice a one-column DataFrame
for column in ['sm', 'ct', 'mn', 'mx', 'av', 'pr']:
    shapiro_test(ghazni_df[[column]])

p-value (0.000) <= 0.05, sm is not normally distributed
p-value (0.000) <= 0.05, ct is not normally distributed
p-value (0.000) <= 0.05, mn is not normally distributed
p-value (0.000) <= 0.05, mx is not normally distributed
p-value (0.000) <= 0.05, av is not normally distributed
p-value (0.000) <= 0.05, pr is not normally distributed


### Kabul

In [209]:
# Test every Kabul column on its own; double brackets keep each slice a one-column DataFrame
for column in ['sm', 'ct', 'mn', 'mx', 'av', 'pr']:
    shapiro_test(kabul_df[[column]])

p-value (0.000) <= 0.05, sm is not normally distributed
p-value (0.000) <= 0.05, ct is not normally distributed
p-value (0.000) <= 0.05, mn is not normally distributed
p-value (0.000) <= 0.05, mx is not normally distributed
p-value (0.000) <= 0.05, av is not normally distributed
p-value (0.000) <= 0.05, pr is not normally distributed


#### Sanity Check of Shapiro-Wilk Test for Normality
- We manually created normally distributed data and ran Shapiro-Wilk Test for Normality to verify that the created data is indeed normally distributed

In [210]:
# Seed the generator so Restart & Run All reproduces the same sample and p-value.
# A truly normal sample is still rejected about 5% of the time at alpha = 0.05,
# so the printed verdict is tied to this particular seed.
rng = np.random.default_rng(42)
normal_df = pd.DataFrame({'my_sample': rng.normal(size=1000)})
shapiro_test(normal_df)

p-value (0.598) > 0.05, my_sample is normally distributed


### 3.4.2 Spearman’s Rank Correlation Test
```
H0: No correlation between datasets
H1: There is correlation between datasets
```
- If p-value > alpha (0.05), fail to reject H0 (no statistically significant correlation detected between the datasets)
- If p-value <= alpha (0.05), reject H0 (a statistically significant correlation exists between the datasets)


- The results of correlation test are displayed in the next Section "3.4.3 Summary of correlation between conflict and climate variables by districts"
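  - In Ghazni and Kabul, both sum of people and event count are correlated with minimum, maximum, and average temperature; in Kunduz, only event count is correlated, and only with minimum and average temperature
  - We did not find correlation between sum of people/event count and precipitation in any district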
  
- We sanity checked the `spearman_test` function using sum of people and event count, which we believe are correlated according to how we curated the data
- We also confirmed the results using Kendall’s Rank Correlation Test in Appendix A.2

In [189]:
def spearman_test(df1, df2, alpha=0.05):
    """Run Spearman's rank correlation test on two one-column DataFrames and print the verdict.

    Parameters
    ----------
    df1, df2 : pandas.DataFrame
        Frames holding the two samples; the first column name of each is used
        as its label in the printed message.
    alpha : float, optional
        Significance level the p-value is compared against (default 0.05).

    Returns
    -------
    None
        The verdict is printed, not returned.
    """
    _, p = spearmanr(df1, df2)
    first, second = df1.columns[0], df2.columns[0]

    if np.isnan(p):
        # spearmanr yields NaN when a sample is constant; NaN compares False
        # against alpha and would otherwise be reported as "correlation exists"
        print(f'p-value ({p:0.3f}) is undefined, cannot assess correlation between {first} and {second}')
    elif p > alpha:
        print(f'p-value ({p:0.3f}) > {alpha}, no correlation between {first} and {second}')
    else:
        print(f'p-value ({p:0.3f}) <= {alpha}, correlation exists between {first} and {second}')

### Kunduz

In [190]:
# Pair each Kunduz conflict column with each climate column (conflict column varies slowest)
for conflict_col, climate_col in product(['sm', 'ct'], ['mn', 'mx', 'av', 'pr']):
    spearman_test(kunduz_df[[conflict_col]], kunduz_df[[climate_col]])

p-value (0.063) > 0.05, no correlation between sm and mn
p-value (0.116) > 0.05, no correlation between sm and mx
p-value (0.075) > 0.05, no correlation between sm and av
p-value (0.334) > 0.05, no correlation between sm and pr
p-value (0.026) <= 0.05, correlation exists between ct and mn
p-value (0.062) > 0.05, no correlation between ct and mx
p-value (0.036) <= 0.05, correlation exists between ct and av
p-value (0.437) > 0.05, no correlation between ct and pr


### Ghazni

In [191]:
# Pair each Ghazni conflict column with each climate column (conflict column varies slowest)
for conflict_col, climate_col in product(['sm', 'ct'], ['mn', 'mx', 'av', 'pr']):
    spearman_test(ghazni_df[[conflict_col]], ghazni_df[[climate_col]])

p-value (0.019) <= 0.05, correlation exists between sm and mn
p-value (0.026) <= 0.05, correlation exists between sm and mx
p-value (0.022) <= 0.05, correlation exists between sm and av
p-value (0.976) > 0.05, no correlation between sm and pr
p-value (0.015) <= 0.05, correlation exists between ct and mn
p-value (0.021) <= 0.05, correlation exists between ct and mx
p-value (0.018) <= 0.05, correlation exists between ct and av
p-value (0.760) > 0.05, no correlation between ct and pr


### Kabul

In [192]:
# Pair each Kabul conflict column with each climate column (conflict column varies slowest)
for conflict_col, climate_col in product(['sm', 'ct'], ['mn', 'mx', 'av', 'pr']):
    spearman_test(kabul_df[[conflict_col]], kabul_df[[climate_col]])

p-value (0.007) <= 0.05, correlation exists between sm and mn
p-value (0.006) <= 0.05, correlation exists between sm and mx
p-value (0.006) <= 0.05, correlation exists between sm and av
p-value (0.077) > 0.05, no correlation between sm and pr
p-value (0.001) <= 0.05, correlation exists between ct and mn
p-value (0.002) <= 0.05, correlation exists between ct and mx
p-value (0.001) <= 0.05, correlation exists between ct and av
p-value (0.120) > 0.05, no correlation between ct and pr


#### Sanity Check of Spearman’s Rank Correlation Test
- Based on how we curated the sum of people and the event count, they should be correlated
- We ran Spearman’s Rank Correlation Test and found that they are indeed correlated

In [196]:
# Sanity check (Kunduz): the notebook expects sm and ct to be correlated by construction
spearman_test(kunduz_df[['sm']], kunduz_df[['ct']])

p-value (0.000) <= 0.05, correlation exists between sm and ct


In [195]:
# Sanity check (Ghazni): the notebook expects sm and ct to be correlated by construction
spearman_test(ghazni_df[['sm']], ghazni_df[['ct']])

p-value (0.000) <= 0.05, correlation exists between sm and ct


In [194]:
# Sanity check (Kabul): the notebook expects sm and ct to be correlated by construction
spearman_test(kabul_df[['sm']], kabul_df[['ct']])

p-value (0.000) <= 0.05, correlation exists between sm and ct


### 3.4.3 Summary of correlation between conflict and climate variables by districts
<br>

|Kunduz        | Min Temperature | Max Temperature | Average Temperature | Precipitation |
|:-------------|:---------------:|:---------------:|:-------------------:|:-------------:|
|Sum of People |        ---      |       ---       |        ---          |     ---       |
|Event Count   |         Y       |       ---       |         Y           |     ---       |

|Ghazni        | Min Temperature | Max Temperature | Average Temperature | Precipitation |
|:-------------|:---------------:|:---------------:|:-------------------:|:-------------:|
|Sum of People |         Y       |        Y        |         Y           |     ---       |
|Event Count   |         Y       |        Y        |         Y           |     ---       |

|Kabul         | Min Temperature | Max Temperature | Average Temperature | Precipitation |
|:-------------|:---------------:|:---------------:|:-------------------:|:-------------:|
|Sum of People |         Y       |        Y        |         Y           |     ---       |
|Event Count   |         Y       |        Y        |         Y           |     ---       |

Notes:
- 'Y' indicates a statistically significant correlation (p-value <= 0.05)
  - For example, in Kabul, a correlation exists between sum of people and minimum temperature
- '---' indicates that no statistically significant correlation was found (p-value > 0.05)
  - For example, in Kabul, no correlation was found between sum of people and precipitation

# Appendix

## A.1 D’Agostino’s K-Squared Test for Normality
- We obtained the same results as Shapiro-Wilk Test above

In [237]:
def dagostino_test(df, alpha=0.05):
    """Run D'Agostino's K-squared normality test on a one-column DataFrame and print the verdict.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose first column is tested; that column's name is used as the
        label in the printed message.
    alpha : float, optional
        Significance level the p-value is compared against (default 0.05).

    Returns
    -------
    None
        The verdict is printed, not returned.
    """
    # normaltest works along axis 0, so a 2-D one-column frame yields length-1
    # arrays (printed as "[...]"); the 1-D column yields plain scalars instead.
    _, p = normaltest(df.iloc[:, 0])
    label = df.columns[0]

    if np.isnan(p):
        # NaN compares False against alpha, which would otherwise be reported
        # as "not normally distributed"
        print(f'p-value is undefined, cannot assess normality of {label}; p-value = {p}')
    elif p > alpha:
        print(f'p-value > {alpha}, {label} is normally distributed; p-value = {p}')
    else:
        print(f'p-value <= {alpha}, {label} is not normally distributed; p-value = {p}')

In [238]:
# Run D'Agostino's K-squared test on every column, district by district.
columns_to_test = ['sm', 'ct', 'mn', 'mx', 'av', 'pr']
districts = [('Kunduz', kunduz_df), ('Ghazni', ghazni_df), ('Kabul', kabul_df)]

for position, (label, district_df) in enumerate(districts):
    # Every header after the first is preceded by a blank line
    separator = '' if position == 0 else '\n'
    print(f'{separator}{label}----------')
    for column in columns_to_test:
        dagostino_test(district_df[[column]])

Kunduz----------
p-value <= 0.05, sm is not normally distributed; p-value = [2.12245497e-59]
p-value <= 0.05, ct is not normally distributed; p-value = [1.45367686e-65]
p-value <= 0.05, mn is not normally distributed; p-value = [4.38970668e-228]
p-value <= 0.05, mx is not normally distributed; p-value = [0.]
p-value <= 0.05, av is not normally distributed; p-value = [0.]
p-value <= 0.05, pr is not normally distributed; p-value = [8.16658065e-10]

Ghazni----------
p-value <= 0.05, sm is not normally distributed; p-value = [3.14449226e-62]
p-value <= 0.05, ct is not normally distributed; p-value = [8.86451781e-50]
p-value <= 0.05, mn is not normally distributed; p-value = [0.]
p-value <= 0.05, mx is not normally distributed; p-value = [0.]
p-value <= 0.05, av is not normally distributed; p-value = [0.]
p-value <= 0.05, pr is not normally distributed; p-value = [4.7204836e-20]

Kabul----------
p-value <= 0.05, sm is not normally distributed; p-value = [4.16374918e-166]
p-value <= 0.05, ct

#### Sanity Check of D’Agostino’s K-Squared Test for Normality

In [244]:
# Seed the generator so Restart & Run All reproduces the same sample and p-value.
# A truly normal sample is still rejected about 5% of the time at alpha = 0.05,
# so the printed verdict is tied to this particular seed.
rng = np.random.default_rng(42)
normal_df = pd.DataFrame({'my_sample': rng.normal(size=1000)})
dagostino_test(normal_df)

p-value > 0.05, my_sample is normally distributed; p-value = [0.2034792]


## A.2 Kendall’s Rank Correlation Test
- We obtained the same results as Spearman’s Rank Correlation Test above

In [246]:
def kendall_test(df1, df2, alpha=0.05):
    """Run Kendall's rank correlation test on two one-column DataFrames and print the verdict.

    Parameters
    ----------
    df1, df2 : pandas.DataFrame
        Frames holding the two samples; the first column name of each is used
        as its label in the printed message.
    alpha : float, optional
        Significance level the p-value is compared against (default 0.05).

    Returns
    -------
    None
        The verdict is printed, not returned.
    """
    _, p = kendalltau(df1, df2)
    first, second = df1.columns[0], df2.columns[0]

    if np.isnan(p):
        # kendalltau yields NaN when a sample is constant; NaN compares False
        # against alpha and would otherwise be reported as "correlation exists"
        print(f'p-value ({p:0.3f}) is undefined, cannot assess correlation between {first} and {second}')
    elif p > alpha:
        print(f'p-value ({p:0.3f}) > {alpha}, no correlation between {first} and {second}')
    else:
        print(f'p-value ({p:0.3f}) <= {alpha}, correlation exists between {first} and {second}')

In [247]:
# Run Kendall's test for every conflict/climate column pair, district by district.
conflict_cols = ['sm', 'ct']
climate_cols = ['mn', 'mx', 'av', 'pr']
districts = [('Kunduz', kunduz_df), ('Ghazni', ghazni_df), ('Kabul', kabul_df)]

for position, (label, district_df) in enumerate(districts):
    # Every header after the first is preceded by a blank line
    separator = '' if position == 0 else '\n'
    print(f'{separator}{label}----------')
    # product() keeps the conflict column in the outer position, as before
    for conflict_col, climate_col in product(conflict_cols, climate_cols):
        kendall_test(district_df[[conflict_col]], district_df[[climate_col]])

Kunduz----------
p-value (0.064) > 0.05, no correlation between sm and mn
p-value (0.116) > 0.05, no correlation between sm and mx
p-value (0.074) > 0.05, no correlation between sm and av
p-value (0.327) > 0.05, no correlation between sm and pr
p-value (0.023) <= 0.05, correlation exists between ct and mn
p-value (0.058) > 0.05, no correlation between ct and mx
p-value (0.033) <= 0.05, correlation exists between ct and av
p-value (0.434) > 0.05, no correlation between ct and pr

Ghazni----------
p-value (0.025) <= 0.05, correlation exists between sm and mn
p-value (0.034) <= 0.05, correlation exists between sm and mx
p-value (0.029) <= 0.05, correlation exists between sm and av
p-value (0.953) > 0.05, no correlation between sm and pr
p-value (0.018) <= 0.05, correlation exists between ct and mn
p-value (0.026) <= 0.05, correlation exists between ct and mx
p-value (0.021) <= 0.05, correlation exists between ct and av
p-value (0.767) > 0.05, no correlation between ct and pr

Kabul-------

#### Sanity Check of Kendall’s Rank Correlation Test

In [249]:
# Sanity check: run Kendall's test on sm against ct within each district.
districts = [('Kunduz', kunduz_df), ('Ghazni', ghazni_df), ('Kabul', kabul_df)]

for position, (label, district_df) in enumerate(districts):
    # Every header after the first is preceded by a blank line
    separator = '' if position == 0 else '\n'
    print(f'{separator}{label}----------')
    kendall_test(district_df[['sm']], district_df[['ct']])

Kunduz----------
p-value (0.000) <= 0.05, correlation exists between sm and ct

Ghazni----------
p-value (0.000) <= 0.05, correlation exists between sm and ct

Kabul----------
p-value (0.000) <= 0.05, correlation exists between sm and ct


## References
- [DataFrame merge code in StackOverflow](https://stackoverflow.com/questions/44327999/python-pandas-merge-multiple-dataframes)
- [Spearman's rank correlation coefficient in Wikipedia](https://en.wikipedia.org/wiki/Spearman%27s_rank_correlation_coefficient)
- [scipy.stats.spearmanr document](https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.spearmanr.html)
- [Spearman's Rank Correlation](https://www.youtube.com/watch?v=3Grc9nVymm0)
- [Testing the Correlation between Time Series Variables](https://www.datasciencecentral.com/profiles/blogs/testing-the-correlation-between-time-series-variables)
- [Statistical Hypothesis Tests in Python](https://machinelearningmastery.com/statistical-hypothesis-tests-in-python-cheat-sheet/)