# Upwelling Comparison - Season Duration

## This notebook looks at the length of the upwelling season in the Historical (1965-2005) and Future (2010-2050) time frames.

### Import the necessary libraries

In [None]:
import numpy as np
import pandas as pd
import xarray as xr
import matplotlib.pyplot as plt
import seaborn as sns
# Apply seaborn's default theme to every matplotlib figure in this notebook.
sns.set()
# Turn off pandas' chained-assignment (SettingWithCopy) warning globally; the
# Option 2 calculation below assigns new columns on column slices of a DataFrame.
pd.options.mode.chained_assignment = None

## Import necessary Data

### Coastal locations and model names

In [None]:
# Spreadsheet of the coastal mask points used in the study.
maskpoints = pd.read_excel(
    '../Data Samples/Excels/maskpointsfile.xlsx'
)

# Model names; each one is substituted into the per-model daily-average file names.
Models = [
    'ACCESS',
    'BCC-CSM',
    'CCSM4',
    'FGOALS',
    'GFDL',
    'IPSL',
    'MIROC5',
    'MPI',
    'MRI',
    'NorESM',
]

### Upwelling Intensity Data
I have done some preprocessing of the data to speed up calculation times. You may use either Option 1 (preprocessing complete - no calculations needed, much faster) or Option 2 (minor preprocessing completed - calculations still required, much slower) below to import the data.

##### Option 1: Import Pre-Calculated data

In [None]:
# Load the pre-computed season start/end dates. The cells below read the columns
# Model, Lat, TimeFrame, StartDate and EndDate from this frame.
StartEndDates = pd.read_excel("../Data Samples/Excels/StartEndDates.xlsx")

##### Option 2: Calculations still required. 
This takes much longer than Option 1, but it is helpful to read through the code to understand the calculations. The daily averages of upwelling intensity follow a curve with a single peak in the middle of the year (see "Upwelling Comparisons - Daily Averages.ipynb" for a visual). We define the upwelling season start date as the first point of inflection of upwelling intensity in the year, and the season end date as the second point of inflection in the year. We make some assumptions (for example, that the start date is not in January) to rule out data inconsistencies at the beginning of the year.

In [None]:
def _season_bounds(values, earliest_start=30, latest_end=305, masked_rows=(55, 65)):
    """Return ``(start_day, end_day)`` of the upwelling season for one series.

    Parameters
    ----------
    values : pandas.Series
        Daily-average upwelling intensity, one value per day, in calendar order.
    earliest_start : int
        First row label searched for the season start (keeps the start out of
        January, as described in the text above this cell).
    latest_end : int
        Last row label searched for the season end.
    masked_rows : tuple of int
        Inclusive row-label range whose slope is forced to zero.
        NOTE(review): presumably this hides a leap-day artifact around day 60 -
        confirm against the preprocessing.

    Returns
    -------
    tuple of int
        1-based day of the steepest rise before the intensity peak and of the
        steepest decline after it.

    Raises
    ------
    ValueError
        If a search window is empty (for example the peak falls before
        ``earliest_start``) or the series holds no usable values.
    """
    values = values.reset_index(drop=True)

    # Halved backward difference; the constant factor does not move the extrema.
    slope = values.diff() / 2
    slope.iloc[0] = 0
    slope.loc[masked_rows[0]:masked_rows[1]] = 0

    # Row labels are 0-based, reported days are 1-based (label + 1). As in the
    # original calculation, the 1-based peak day is reused as a row label to
    # bound the two search windows.
    peak_day = values.idxmax() + 1
    start_day = slope.loc[earliest_start:peak_day].idxmax() + 1
    end_day = slope.loc[peak_day:latest_end].idxmin() + 1
    return int(start_day), int(end_day)


# Collect plain records and build the DataFrame once, instead of growing it row by row.
season_records = []
for Model in Models:
    print(Model)

    BS_df = pd.read_excel('../Data Samples/Excels/Daily Averages/{}_BS_DailyAverages.xlsx'.format(Model))
    RCP_df = pd.read_excel('../Data Samples/Excels/Daily Averages/{}_RCP_DailyAverages.xlsx'.format(Model))
    for lat in BS_df.columns:
        print('Calculating Dates....' + str(lat))
        try:
            BS_start, BS_end = _season_bounds(BS_df[lat])
            RCP_start, RCP_end = _season_bounds(RCP_df[lat])
        except Exception as err:
            # Best effort: report the failing model/latitude with its cause and move on.
            print('ERROR: {}, {}'.format(Model, str(lat)), repr(err))
            continue

        # Only store a latitude once both time frames succeeded, so the
        # Historical-vs-Future comparisons further down always find a pair.
        season_records.append([Model, lat, 'Historical', BS_start, BS_end])
        season_records.append([Model, lat, 'Future', RCP_start, RCP_end])

# DataFrame.astype returns properly typed columns; assigning through
# ``df.loc[:, col] = ...`` sets in place on pandas 2.x and would leave them as object.
StartEndDates = pd.DataFrame(
    season_records,
    columns=['Model', 'Lat', 'TimeFrame', 'StartDate', 'EndDate'],
).astype({'Lat': float, 'StartDate': int, 'EndDate': int})


### Plotting
The following graphs show the start date, end date, and duration of the upwelling seasons across all models. These plots show the minimum, maximum, median and 1st and 3rd quartiles at each selected Coastal location.

In [None]:
# Box colour for each time frame, shared by the plots below.
palette = dict(Historical='cornflowerblue', Future='firebrick')

StartEndDates_models = StartEndDates.set_index('Model')

# Same rows ordered by latitude, then time frame, with every field back as a column.
StartEndDates_models_grouped = (
    StartEndDates_models
    .reset_index()
    .set_index(['Lat', 'TimeFrame'])
    .sort_index()
    .reset_index()
)

### Start Date

In [None]:
# Box plot of season start day per latitude, Historical vs Future.
fig, start_ax = plt.subplots(figsize=(20, 10))
startplot = sns.boxplot(
    data=StartEndDates_models_grouped.round(1),
    x='Lat',
    y='StartDate',
    hue='TimeFrame',
    palette=palette,
    ax=start_ax,
)
start_ax.set_title('Start Date', size=24)
start_ax.set_xlabel('', fontsize=24)
start_ax.set_ylabel('Day of Year', fontsize=24.0)
start_ax.tick_params(axis='both', which='major', labelsize=24)

### End Date

In [None]:
# Box plot of season end day per latitude, Historical vs Future.
fig, end_ax = plt.subplots(figsize=(20, 10))
endplot = sns.boxplot(
    data=StartEndDates_models_grouped.round(1),
    x='Lat',
    y='EndDate',
    hue='TimeFrame',
    palette=palette,
    ax=end_ax,
)
end_ax.set_title('End Date', size=24)
end_ax.set_xlabel('', fontsize=24)
end_ax.set_ylabel('Day of Year', fontsize=24.0)
end_ax.tick_params(axis='both', which='major', labelsize=24)

### Season Duration

In [None]:
# Season length in days: end day minus start day.
# (The original 365 - (365 - EndDate) - StartDate reduces to exactly this.)
StartEndDates['Duration'] = StartEndDates['EndDate'] - StartEndDates['StartDate']

# Rebuild the sorted frame so that it includes the new Duration column.
palette = {'Historical': 'cornflowerblue', 'Future': 'firebrick'}
StartEndDates_models = StartEndDates.set_index('Model')
StartEndDates_models_grouped = (
    StartEndDates_models
    .reset_index()
    .set_index(['Lat', 'TimeFrame'])
    .sort_index()
    .reset_index()
)

fig = plt.figure(figsize=(20, 10))
dur_ax = fig.add_subplot(111)
durplot = sns.boxplot(x='Lat', y='Duration', hue='TimeFrame',
                      data=StartEndDates_models_grouped.round(1),
                      ax=dur_ax, palette=palette)
# The y axis is a length of time, not a calendar day.
dur_ax.set_ylabel('Duration (days)', fontsize=24.0)
dur_ax.set_xlabel('', fontsize=24)
dur_ax.set_title('Season Duration', size=24)
dur_ax.tick_params(axis='both', which='major', labelsize=24)

In [None]:
import scipy.stats as stats

# Display label -> column that holds the metric.
metric_columns = {'Start Date': 'StartDate', 'End Date': 'EndDate', 'Duration': 'Duration'}

# pvals[label][lat] = {'Pvalue': two-sample t-test p-value across models,
#                      'Difference': Historical mean minus Future mean}
pvals = {label: {} for label in metric_columns}

# reset_index() returns a new frame, so the frame used by the plotting cells is
# left untouched, and it works whether Lat/TimeFrame are columns or index levels.
# Grouping by latitude runs each test once per latitude rather than once per row.
for lat, lat_rows in StartEndDates_models_grouped.reset_index().groupby('Lat'):
    historical = lat_rows[lat_rows['TimeFrame'] == 'Historical']
    future = lat_rows[lat_rows['TimeFrame'] == 'Future']
    if historical.empty or future.empty:
        # A comparison needs both time frames; say so instead of failing on lookup.
        print('Skipping latitude {}: missing Historical or Future rows'.format(lat))
        continue

    for label, column in metric_columns.items():
        # Cast to float so the test also works if the column was stored as object dtype.
        historical_values = historical[column].astype(float)
        future_values = future[column].astype(float)
        _, p_value = stats.ttest_ind(historical_values, future_values)
        difference = historical_values.mean() - future_values.mean()
        pvals[label][lat] = {'Pvalue': p_value, 'Difference': difference}

StartStats = pd.DataFrame.from_dict(pvals['Start Date'], orient='index')
EndStats = pd.DataFrame.from_dict(pvals['End Date'], orient='index')
DurationStats = pd.DataFrame.from_dict(pvals['Duration'], orient='index')

## Latitudes with Statistically Significant Difference in Start Date

In [None]:
# Keep only latitudes significant at the 5% level. Selecting with `<=` also
# removes NaN p-values (e.g. identical dates in every model), which a
# "drop if > .05" test would leave in the table as if they were significant.
StartStats = StartStats[StartStats['Pvalue'] <= .05]
StartStats

## Latitudes with Statistically Significant Difference in End Date

In [None]:
# Keep only latitudes significant at the 5% level. Selecting with `<=` also
# removes NaN p-values (e.g. identical dates in every model), which a
# "drop if > .05" test would leave in the table as if they were significant.
EndStats = EndStats[EndStats['Pvalue'] <= .05]
EndStats

## Latitudes with Statistically Significant Difference in Season Duration

In [None]:
# Keep only latitudes significant at the 5% level. Selecting with `<=` also
# removes NaN p-values (e.g. identical durations in every model), which a
# "drop if > .05" test would leave in the table as if they were significant.
DurationStats = DurationStats[DurationStats['Pvalue'] <= .05]
DurationStats

From these, we can see that the change in season overall tends to be at the higher latitudes in the study region (greater than 40 degrees Latitude). We also see changes in the start and end date along the coastline at 32.67 degrees latitude. This is in the region of the Mexican/American border, and is one of the lower latitudinal locations in our study.