# Statistical Analysis

### Reference cell: all distances and all past/future model names analyzed so far

In [None]:
# Buffer distances around land used to select landfalling points; 'na_basin'
# is the label used for files covering the entire NA basin.
distance = [
    '100km',
    '200km',
    '300km',
    'na_basin',
]

# Past/reference simulations. NOTE(review): entries appear to be paired
# index-wise with model_f (REF.COMB is listed twice, once per RCP scenario) --
# confirm before reordering either list.
model_p = [
    'REF.COMB',
    'REF.COMB',
    'HadGEM3-GC31-HH(19852014)',
    'CNRM-CM6-1-HR(19852014)',
    'EC-Earth3P-HR(19852014)',
]

# Future simulations, in the same order as model_p.
model_f = [
    'RCP45.COMB',
    'RCP85.COMB',
    'HadGEM3-GC31-HH(20212050)',
    'CNRM-CM6-1-HR(20212050)',
    'EC-Earth3P-HR(20212050)',
]

### Program to print all the relevant stats about intensity distributions and translation speed

In [4]:
import xarray as xr
import numpy as np
from tc_functions.calculations import *

KNOTS_TO_MS = 0.5144444444444444   #factor applied to IBTrACS winds to convert them to m/s
INTENSITY_THRESHOLD = 50.0         #wind speed (m/s) used for the exceedance percentage

distance = ['100km', '200km', '300km', 'na_basin']
general_model = ['EC-Earth3P-HR', 'EC-Earth3P-HR']
model = ['EC-Earth3P-HR(19852014)', 'EC-Earth3P-HR(20212050)']


def rounded_median(values, ndigits=2):
    """Return the median of `values` as a built-in float rounded to `ndigits`.

    Rounding a NumPy float32 scalar keeps it float32, and formatting that
    scalar widens it to a double first, which prints representation noise
    (e.g. 16.600000381469727 instead of 16.6). Converting to a Python float
    before rounding avoids that.
    """
    return round(float(np.median(values)), ndigits)


#general_model[i] is the directory name and model[i] the file prefix, so walk them together
for general, name in zip(general_model, model):
    print(f'{name}')
    for dist in distance:
        print(f'{dist}')
        start = f'HighResMIP/{general}/{dist}' #get name of input track file

        if dist == 'na_basin':
            track_file = f'{start}/{name}.NA.storms.nc'
        else:
            track_file = f'{start}/{name}.NA.landfalling.storms.{dist}.buffer.pts.nc'

        #open file and extract intensity values; the with block closes the
        #dataset even if reading vmax_2D fails
        with xr.open_dataset(track_file) as DS:
            max_w = DS.vmax_2D.values

        #convert to m/s if ibtracs. A membership test is used because the tag
        #can occur more than once in the path (directory and file name), in
        #which case an exact count of 1 would silently skip the conversion.
        if 'IBTrACS' in track_file:
            max_w = max_w * KNOTS_TO_MS

        #calculate values, round them, and print results

        print(f'{track_file}:')
        percent = intensity_prob(max_w, INTENSITY_THRESHOLD, direction='greater')*100
        percent = round(float(percent), 2)
        print(f'\t percent above 50.0m/s = {percent}')

        allintdist_med = rounded_median(get_all_intdist(track_file))
        print(f'\t All int dist median = {allintdist_med}')

        maxintdist_med = rounded_median(get_max_intdist(track_file))
        print(f'\t Max int dist median = {maxintdist_med}')

        avgintdist_med = rounded_median(get_avg_intdist(track_file))
        print(f'\t Avg int dist median = {avgintdist_med}')

        tsdist_med = rounded_median(get_6hr_tsdist(track_file))
        print(f'\t Ts dist median = {tsdist_med}')

EC-Earth3P-HR(19852014)
100km
HighResMIP/EC-Earth3P-HR/100km/EC-Earth3P-HR(19852014).NA.landfalling.storms.100km.buffer.pts.nc:
	 percent above 50.0m/s = 0.0
	 All int dist median = 16.600000381469727
	 Max int dist median = 18.420000076293945
	 Avg int dist median = 17.06999969482422
	 Ts dist median = 13.02
200km
HighResMIP/EC-Earth3P-HR/200km/EC-Earth3P-HR(19852014).NA.landfalling.storms.200km.buffer.pts.nc:
	 percent above 50.0m/s = 0.0
	 All int dist median = 16.360000610351562
	 Max int dist median = 19.200000762939453
	 Avg int dist median = 17.229999542236328
	 Ts dist median = 11.8
300km
HighResMIP/EC-Earth3P-HR/300km/EC-Earth3P-HR(19852014).NA.landfalling.storms.300km.buffer.pts.nc:
	 percent above 50.0m/s = 0.0
	 All int dist median = 16.40999984741211
	 Max int dist median = 19.1200008392334
	 Avg int dist median = 16.989999771118164
	 Ts dist median = 13.21
na_basin
HighResMIP/EC-Earth3P-HR/na_basin/EC-Earth3P-HR(19852014).NA.storms.nc:
	 percent above 50.0m/s = 0.0
	 All 

  all_intdist = all_intdist[all_intdist > 0] #remove 0 entries
  storm = storm[storm > 0]


	 Max int dist median = 20.229999542236328
	 Avg int dist median = 18.489999771118164
	 Ts dist median = 17.59
300km
HighResMIP/EC-Earth3P-HR/300km/EC-Earth3P-HR(20212050).NA.landfalling.storms.300km.buffer.pts.nc:
	 percent above 50.0m/s = 0.0
	 All int dist median = 18.31999969482422
	 Max int dist median = 21.290000915527344
	 Avg int dist median = 18.729999542236328
	 Ts dist median = 17.33
na_basin
HighResMIP/EC-Earth3P-HR/na_basin/EC-Earth3P-HR(20212050).NA.storms.nc:
	 percent above 50.0m/s = 0.0
	 All int dist median = 17.200000762939453
	 Max int dist median = 23.25
	 Avg int dist median = 17.31999969482422
	 Ts dist median = 20.87


### Program to print all the relevant stats about TC count/frequency

In [1]:
import xarray as xr
import numpy as np
from tc_functions.calculations import get_total_storms, get_total_points

distance = ['100km', '200km', '300km', 'na_basin']
general_model = ['EC-Earth3P-HR', 'EC-Earth3P-HR']
model = ['EC-Earth3P-HR(19852014)', 'EC-Earth3P-HR(20212050)']

# Report the storm and track-point counts for every simulation/distance pair.
for idx, model_name in enumerate(model):
    print(f'{model_name}')
    model_dir = general_model[idx]          # directory name shared by the simulation's files

    for dist in distance:
        print(f'{dist}')
        start = f'HighResMIP/{model_dir}/{dist}'

        # Whole-basin files follow a different naming scheme than the
        # landfalling buffer files.
        track_file = (
            f'{start}/{model_name}.NA.storms.nc'
            if dist == 'na_basin'
            else f'{start}/{model_name}.NA.landfalling.storms.{dist}.buffer.pts.nc'
        )

        # Counts come from the project helpers in tc_functions.calculations.
        total_storms = get_total_storms(track_file)
        total_points = get_total_points(track_file)

        print(f'{track_file}:')
        print(f' \t num_storms = {total_storms}')
        print(f' \t num_pts = {total_points}')

EC-Earth3P-HR(19852014)
100km
HighResMIP/EC-Earth3P-HR/100km/EC-Earth3P-HR(19852014).NA.landfalling.storms.100km.buffer.pts.nc:
 	 num_storms = 18
 	 num_pts = 84
200km
HighResMIP/EC-Earth3P-HR/200km/EC-Earth3P-HR(19852014).NA.landfalling.storms.200km.buffer.pts.nc:
 	 num_storms = 23
 	 num_pts = 175
300km
HighResMIP/EC-Earth3P-HR/300km/EC-Earth3P-HR(19852014).NA.landfalling.storms.300km.buffer.pts.nc:
 	 num_storms = 25
 	 num_pts = 251
na_basin
HighResMIP/EC-Earth3P-HR/na_basin/EC-Earth3P-HR(19852014).NA.storms.nc:
 	 num_storms = 102
 	 num_pts = 3753
EC-Earth3P-HR(20212050)
100km
HighResMIP/EC-Earth3P-HR/100km/EC-Earth3P-HR(20212050).NA.landfalling.storms.100km.buffer.pts.nc:
 	 num_storms = 19
 	 num_pts = 76
200km
HighResMIP/EC-Earth3P-HR/200km/EC-Earth3P-HR(20212050).NA.landfalling.storms.200km.buffer.pts.nc:
 	 num_storms = 24
 	 num_pts = 132
300km
HighResMIP/EC-Earth3P-HR/300km/EC-Earth3P-HR(20212050).NA.landfalling.storms.300km.buffer.pts.nc:
 	 num_storms = 28
 	 num_pts =

  storm = storm[storm > 0]     #remove 0's and nans
  total_array = total_array[total_array > 0]  # remove 0 entries


### Statistical analysis program (intensity and translation speed)

It conducts two-sample KS tests, once for each one-sided alternative (`'less'` and `'greater'`), on all 3 intensity distributions (all, max, avg) and on the 6 hr consecutive translation speed distribution, comparing the past and future simulations of an arbitrary number of models.

In [6]:
import xarray as xr
import numpy as np
from scipy import stats
import math
from tc_functions.calculations import *

#specify the models and distances to be analyzed--------------------------------------------------------------------------

alt_hyp = ['less', 'greater']                      #one-sided alternatives passed to stats.ks_2samp
distance = ['100km', '200km', '300km', 'na_basin']
general_model = ['HadGEM3-GC31-HH', 'CNRM-CM6-1-HR']
model_p = ['HadGEM3-GC31-HH(19852014)', 'CNRM-CM6-1-HR(19852014)']
model_f = ['HadGEM3-GC31-HH(20212050)', 'CNRM-CM6-1-HR(20212050)']
num_models = len(model_p)
ALPHA = 0.05                                       #p value below which a difference is flagged as significant

#helpers shared by all four KS tests----------------------------------------------------------------------------------------

def get_track_files(model_idx, dist):
    """Return the (past, future) track file paths for one model and distance.

    model_idx indexes general_model, model_p and model_f; dist is one of the
    entries of `distance`. Whole-basin ('na_basin') files use a different
    file name pattern than the landfalling buffer files.
    """
    start = f'HighResMIP/{general_model[model_idx]}/{dist}'
    if dist == 'na_basin':
        suffix = 'NA.storms.nc'
    else:
        suffix = f'NA.landfalling.storms.{dist}.buffer.pts.nc'
    return f'{start}/{model_p[model_idx]}.{suffix}', f'{start}/{model_f[model_idx]}.{suffix}'


def count_from_file(track_file, sample):
    """Sample size taken from the track file itself (number of storms)."""
    return get_total_storms(track_file)


def count_from_sample(track_file, sample):
    """Sample size taken as the length of the distribution."""
    return len(sample)


def run_ks_tests(label, output_name, get_dist, count_label, get_count):
    """Run the past-vs-future 2 sample KS tests for one kind of distribution.

    label       -- distribution name used in the report header (e.g. 'max intensity')
    output_name -- report file name, created inside HighResMIP/<general model>/
    get_dist    -- function mapping a track file path to the distribution to test
    count_label -- text describing the reported sample size (e.g. 'num storms')
    get_count   -- function (track_file, sample) -> sample size to report

    For every model and distance, one test is run per entry of alt_hyp and the
    result is appended to the report file. The file is opened in append mode
    because all distances of a model share one report; note that re-running
    the cell therefore appends a second copy of the results.
    """
    for i in range(num_models):
        output_file = f'HighResMIP/{general_model[i]}/{output_name}'
        for dist in distance:
            tf_past, tf_future = get_track_files(i, dist)
            sample_p = get_dist(tf_past)
            sample_f = get_dist(tf_future)
            n_past = get_count(tf_past, sample_p)
            n_future = get_count(tf_future, sample_f)

            with open(output_file, 'a') as f:
                for alt in alt_hyp:
                    statistic, pvalue = stats.ks_2samp(sample_p, sample_f, alternative=alt)

                    f.write(f'2 Sample KS test on the {label} distributions from the files (alt = {alt}): \n')
                    f.write(f'{tf_past}: {count_label} = {n_past} \n')
                    f.write(f'{tf_future}: {count_label} = {n_future} \n')
                    f.write(f'statistic = {statistic} \n')
                    f.write(f'p value = {pvalue} \n')
                    if pvalue < ALPHA:
                        f.write('STATISTICALLY SIGNIFICANT DIFFERENCE \n')
                    f.write('\n')

#KS test on distribution of all intensities-------------------------------------------------------------------------------
#(sample size is reported as the number of TCs in each file, not the number of intensity values)

run_ks_tests('all intensity', 'kstest_allintdist.txt', get_all_intdist, 'num storms', count_from_file)

#KS test for the distribution of maximum intensities----------------------------------------------------------------------

run_ks_tests('max intensity', 'kstest_maxintdist.txt', get_max_intdist, 'num storms', count_from_sample)

#KS test for distribution of average intensities--------------------------------------------------------------------------

run_ks_tests('avg intensity', 'kstest_avgintdist.txt', get_avg_intdist, 'num storms', count_from_sample)

#KS test on distribution of 6 hr consecutive translation speeds--------------------------------------------------------------

run_ks_tests('ts', 'kstest_tsdist.txt', get_6hr_tsdist, 'num data points', count_from_sample)

  term = B[j] * bin


### Program to plot distributions from two files to confirm results from a KS test

In [None]:
import numpy as np
import xarray as xr
import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt
from tc_functions.calculations import get_max_intdist, get_all_intdist, get_avg_intdist

#track files whose distributions are compared
tf1 = 'REF.COMB.NA.storms.nc'
tf2 = 'RCP85.COMB.NA.storms.nc'

#distribution of all intensities from each file
res1 = get_all_intdist(tf1)
res2 = get_all_intdist(tf2)

fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(12,7)) #initialize figure and axes

bin_intervals = np.arange(0, 110, 5)       #set bin intervals (0 to 105 in steps of 5)

#both histograms share the same bins and normalization so they are directly comparable
hist_kwargs = dict(bins=bin_intervals, cumulative=False, density=True,
                   histtype='step', linewidth=2)
ax.hist(res2, color='orange', label='RCP8.5', **hist_kwargs)
ax.hist(res1, color='b', label='REF', **hist_kwargs)

fontsize = 12
ax.set_title('REF/RCP8.5 all int dist (entire NA basin)', fontsize=fontsize)  #set specific titles for each axis
#density=True normalizes the bars so their areas sum to 1, i.e. the heights
#are probability densities rather than per-bin probabilities
ax.set_ylabel('Probability density', fontsize=fontsize)
ax.set_xlabel('Wind Speed (m/s)', fontsize=fontsize)
ax.legend(loc='upper left')

fig.savefig("A_NAtest_refrcp85.png")           #save and close figure
plt.close(fig)