In [1]:
import os
import numpy as np
import pandas as pd
import scipy.stats
from sklearn.linear_model import LinearRegression as lr
import datetime as dt
%matplotlib inline
import matplotlib.pyplot as plt
plt.style.use('seaborn-whitegrid')
os.chdir('C:\\Users\\VanBuren\\Desktop\\Coursework\\Capstone\\Coding_analysis\\Test_Data')

In [13]:
#NYDEC CALIBRATION SCHEME
#----USEFUL FOR NON-SUBWAY CALIBRATIONS: INDOOR/OUTDOOR TIME.
#designate timespan we want
start = dt.datetime.strptime('2020-07-10', '%Y-%m-%d')
end = dt.datetime.strptime('2021-01-27', '%Y-%m-%d')

#Read in the NY DEC data, combine date/time columns into one datetime field for easier analysis
refdata = pd.read_csv('nydec.csv',header = 1,parse_dates=[['Date','Time']]).rename(columns={"PM25C_ug/m3LC":"PM2.5_ref"})

#Localize refdata to UTC (then remove timezone awareness)
refdata['Date_Time'] = refdata['Date_Time'].dt.tz_localize('US/Eastern', ambiguous = 'NaT').dt.tz_convert('UTC').dt.tz_localize(None)
refdata = refdata.loc[(refdata['Date_Time'] >= start) & ((refdata['Date_Time'] < end))]
refdata = refdata.set_index(refdata.Date_Time)

#read in purpleair datasets: hour averages (for best comparison to refdata data)
#then limit range of PA sensor data to desired timespan
#then set index to timestamp

sensA = pd.read_csv('school_4_A_hr.csv',header = 0).rename(columns={"Temperature_F":"Temp","PM2.5_ATM_ug/m3":"PM2.5A"})
#remove localization
sensA['created_at'] = pd.to_datetime(sensA['created_at']).dt.tz_localize(None)
sensA = sensA.loc[(sensA['created_at'] >= start) & ((sensA['created_at'] < end))]
sensA = sensA.set_index(sensA.created_at)

sensB = pd.read_csv('school_4_B_hr.csv',header = 0).rename(columns={"PM2.5_ATM_ug/m3":"PM2.5B"})
sensB['created_at'] = pd.to_datetime(sensB['created_at']).dt.tz_localize(None)
sensB = sensB.loc[(sensB['created_at'] >= start) & ((sensB['created_at'] < end))]
sensB = sensB.set_index(sensB.created_at)

#Get all info onto a single dataframe
df = pd.concat([sensA[['Temp','Humidity_%','PM2.5A']], sensB['PM2.5B'], refdata['PM2.5_ref']], axis = 1)
df['PM2.5_test'] = df[['PM2.5A','PM2.5B']].mean(axis = 1)
#Null values gonna screw up statswork later on, so let's ditch now
df = df.loc[df['PM2.5_ref'].notnull() & df['PM2.5_test'].notnull() & df['Temp'].notnull() & df['Humidity_%'].notnull()]

#statswork for multiple regression
#Parameters for regression in X, assume they're predictors for nydec data as y

#trained model: nydec = x0 + x1[temp] + x2[humidity] + x3[PA data]

X = np.array(df[['Temp','Humidity_%','PM2.5_test']])
y = np.array(df['PM2.5_ref'])
reg = lr(fit_intercept = False).fit(X,y)

reg_out = lambda o: np.dot(o,reg.coef_)

0.5328656567210689


In [7]:
#CALIBRATION SCHEME B
#For calibrating data collected underground, based on gravimetric filters
filt = pd.read_csv('Gravcal.csv',header = 0)


X = np.column_stack([np.zeros(len(filt['PA PM2.5 avg'])),filt['PA PM2.5 avg']])
y = np.array(filt['Grav(average)'])
calibs = lr(fit_intercept = False).fit(X,y)
reg_plat = lambda p: calibs.coef_[1]*p

In [8]:
# Calibration Scheme C
# For dealing with PA and aboveground trains:
reg_sir = lambda q: q*(30.9/36.2)


In [5]:
# How's about offloading calibrations to just one function
def calibrate(frame):
    #Establish which code gets calibrated how
    codes_nydec = ['O','I','F']
    codes_luglio = ['PU','T','E']
    codes_sir = ['PA']
    #then calibrate based on Loc_code
    frame['PM2.5_calib'] = np.nan
    
    #nydec calibration
    frame['PM2.5_calib'].loc[frame['Loc_code'].isin(codes_nydec)] = reg_out(
        frame[['Temp','Humidity_%','PM2.5']].loc[frame['Loc_code'].isin(codes_nydec)])
    #Filter calibration
    frame['PM2.5_calib'].loc[frame['Loc_code'].isin(codes_luglio)] = reg_plat(
        frame['PM2.5'].loc[frame['Loc_code'].isin(codes_luglio)])
    #SIR calibration for aboveground plats
    frame['PM2.5_calib'].loc[frame['Loc_code'].isin(codes_sir)] = reg_sir(
        frame['PM2.5'].loc[frame['Loc_code'].isin(codes_sir)])
    #Special aboveground train calibration using SIR
    frame['PM2.5_calib'].loc[(frame['Loc_code']=='T') & (frame['Outside? (Y/N)'] == 'Y')] = reg_sir(
        frame['PM2.5'].loc[(frame['Loc_code']=='T') & (frame['Outside? (Y/N)'] == 'Y')])
    return frame


In [8]:
THen to run the calibration on the raw data
dfsub = pd.read_csv('Master_compiled_BB.csv',header = 0, parse_dates = [['Date', 'Time_NYC']]).rename(
    columns = {'Temperature (F)': 'Temp','Humidity (%)':'Humidity_%','PA PM2.5 (um/m3)':'PM2.5'})

dfsub = dfsub.loc[dfsub['PM2.5'].notnull() & dfsub['Temp'].notnull() & dfsub['Humidity_%'].notnull() & 
                  dfsub['Location'].notnull()]
dfsub = calibrate(dfsub)
# dfsub.loc[(dfsub['Loc_code']=='T') & (dfsub['Outside? (Y/N)'] == 'Y')]
dfsub.to_csv('Masterlist.csv')

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_with_indexer(indexer, value)
