In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy import stats
import statsmodels.api as sm
import datetime

In [2]:
def pivot_table_hour(column_titles, df_to_pivot):
    """ Pivot subdaily obs dataframe so there is one row per day and a column for each time of observation
    
    Args:
        column_values (datetime list): time of observation that we are interested in
        df_to_pivot (dataframe): df containing the subdaily obs with a row for each observation.
    
    """
    
    appended_data = []
    
    #convert column titles from the historical climate data to datetimes (+30 mins to accounts for solar time)
    tobs = (pd.to_datetime(surveyo.columns,format="%H%M") + datetime.timedelta(minutes=30)).time
    
    for tstamp in tobs:
        hourly = ktown[ktown.index.time == tstamp]
        hourly.index = pd.DatetimeIndex(hourly.index).normalize()
        appended_data.append(hourly)
    pivoted = pd.concat(appended_data,axis=1)   
    pivoted.columns = column_titles
    
    return pivoted
    

In [3]:
def mlr(predictand,predictor,estimate):
    """Estimate tmax/tmin based on subdaily observations
    
    Args:
        predictand(dataframe): dataframe of subdaily observations
        predictor(dataframe): dataframe of tmax or tmin
        estimate(str): State whether you want to estimate tmax or tmin
    
    """
    
    predictand[estimate] = predictor[estimate] #build dataframe from subdaily obs and max/min daily temp
    
    predictand.dropna(inplace=True) #remove all that contain a single nan

    predictand = sm.add_constant(predictand) # adding a constant

    model = sm.OLS(predictand[estimate], predictand.loc[:, predictand.columns != estimate]).fit()

    return model


In [4]:
def mlr_qa_plots(MLRmodel,season):
    """ Produce a qq plot and residual vs fitted value plot to assess the quality of a MLR
    
    Args:
        MLRmodel (sm.OLS model): multiple linear regression model
        season: integer denoting the season
    """
    
    season_names = ['summer','autumn','winter','spring']
    
    f = plt.figure(figsize=(15,10))
    
    ax1 = f.add_subplot(223)
    
    sm.qqplot(MLRmodel.resid, stats.norm, fit=True, line="45",ax=ax1)
    ax1.tick_params(axis='both', which='major', labelsize=14)
    
    ax1.set_xlabel('Theoretical quantiles (normal dist.)', fontsize=16)
    ax1.set_ylabel('Quantiles of residuals', fontsize=16)
    
    ax2 = f.add_subplot(211)

    ax2.scatter(MLRmodel.fittedvalues,MLRmodel.resid)
    ax2.set_xlabel('Fitted value', fontsize=16)
    ax2.set_ylabel('Residual', fontsize=16)
    ax2.tick_params(axis='both', which='major', labelsize=14)
    ax2.set_ylim(-10,10)
    
    plt.title(season_names[season],fontsize=16)
    
    #plt.savefig('wyatt_' + season_names[season] + '_MLR_tmin_model.png',dpi=300)

In [5]:
def outlier(residuals):
    """ Calculates the threshold of an outlier according to the equation Q3 + 1.5*IQR
    
    Args:
        residuals: dataframe
        
    """
    outlier = residuals.quantile(0.75) + (stats.iqr(residuals) * 1.5)
    
    return outlier

In [13]:
#read in max min data
inpath = '/home/561/zb8411/Documents/data/observational/adelaide/temperature/'

#format sub hourly data
ktownMaxMin = pd.read_csv(inpath + 'kenttown_daily_1977-2020.csv', parse_dates=True, index_col=0)
ktown = pd.read_csv(inpath + 'kenttown_subhourly_1993-2020.csv', parse_dates=True, index_col=0)

surveyo = pd.read_csv(inpath + 'surveyoffice_subdaily_qc_1843-1856.csv', 
                      parse_dates=True, index_col=0, header=0)

surveyo = (surveyo - 32)*5/9

In [14]:
vble = ['max','min']; vcode=1

#run regression annually so I can find instances when the residuals are outliers
ktownPivot = pivot_table_hour(surveyo.columns, ktown)
mlrModel = mlr(ktownPivot,ktownMaxMin,vble[vcode])
ktownPivot = ktownPivot[mlrModel.resid < outlier(mlrModel.resid)]
ktownMaxMin['season'] = ktownMaxMin.index.month%12 // 3 + 1

#time of observation changes three times
vol = [['1843-04-01','1843-11-30'],['1843-12-01','1851-02-28'],['1852-03-01','1856-12-31']]

tmaxmaster = []
rsqmaster = []

for slice in vol:
    surveySubset = surveyo.loc[slice[0]:slice[1]]
    surveySubset = surveySubset.dropna(how='all',axis=1)
    ktownSubset = ktownPivot[surveySubset.columns.intersection(ktownPivot.columns)]
    
    surveySubset['season'] = surveySubset.index.month%12 // 3 + 1
    ktownSubset['season'] = ktownSubset.index.month%12 // 3 + 1
    
    tmaxpredict = []
    rsq = []
    
    for season in np.sort(surveySubset['season'].unique()):
        
        sdaily = ktownSubset[ktownSubset['season'] == season].loc[:, ktownSubset.columns != 'season']
        tmax = ktownMaxMin[ktownMaxMin['season'] == season].loc[:, ktownMaxMin.columns != 'season']
        
        #build regression model
        mlrModel = mlr(sdaily,tmax,vble[vcode])
        
        #produce regression QA plots
        #mlr_qa_plots(mlrModel,season-1)
        
        #store the rsquared from each regression model
        rsq.append(mlrModel.rsquared)
    
        #predict tmax from each seasonal regression
        tmaxpredict.append(mlrModel.predict(sm.add_constant(surveySubset[surveySubset['season'] == season].loc[:, surveySubset.columns != 'season'])))
        
             
        del sdaily,tmax,mlrModel
        
    tmaxpredict = pd.concat(tmaxpredict)
    tmaxmaster.append(tmaxpredict)
    rsqmaster.append(rsq)

surveyo[vble[vcode] + '_est'] = pd.concat(tmaxmaster)
surveyo[vble[vcode]] = surveyo.min(axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ktownSubset['season'] = ktownSubset.index.month%12 // 3 + 1
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ktownSubset['season'] = ktownSubset.index.month%12 // 3 + 1
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ktownSubset['season'] = ktownSubset.index.month%12 // 3 + 1


In [15]:
surveyo

Unnamed: 0_level_0,1000,1200,1400,1600,1030,1530,1500,min_est,min
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1843-04-01,21.111111,23.333333,,24.444444,,,24.444444,13.118313,13.118313
1843-04-02,,,,,,,,,
1843-04-03,16.666667,19.166667,,19.722222,,,18.611111,10.804878,10.804878
1843-04-04,18.888889,21.666667,,23.888889,,,23.333333,11.577033,11.577033
1843-04-05,21.666667,22.222222,,26.666667,,,25.555556,14.221650,14.221650
...,...,...,...,...,...,...,...,...,...
1856-12-27,,,,,,,,,
1856-12-28,,,,,,,,,
1856-12-29,,,,,,,,,
1856-12-30,,,,,,,,,


In [9]:
mmax = surveyo.copy()

In [None]:
surveyo.to_csv('surveyoffice_subdaily_max_1843-1856.csv')

In [17]:
mmax['min_est'] = surveyo['min_est']
mmax['min'] = surveyo['min']

In [21]:
mmax.to_csv(inpath + 'surveyoffice_daily_qc_1843-1856.csv')

In [20]:
infile

NameError: name 'infile' is not defined