In [294]:
import requests
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import cross_val_score
import matplotlib.pyplot as plt

In [147]:
url = 'https://www.ncei.noaa.gov/access/services/data/v1?\
dataset=daily-summaries&\
dataTypes=TAVG,TMAX,TMIN,PRCP,SNOW,TAVG,AWND&\
stations=USC00288878,USW00013724,USC00287865,USC00285728,\
USC00286177,USC00286775,USC00281280,USC00283181,USC00283951,USC00280907,\
USC00282644,USW00014734,USC00286053,USC00282340,USC00281028,USC00284653,\
USC00280729,USC00288194,USC00287079,USC00288816,USC00283662,\
USC00288644,USC00289135,USC00285576,USC00281582,USC00281351,USC00286062,\
USC00281708,USC00283029,USC00288899,USC00287131,USC00284735,USC00289910,\
USC00284229,USC00287587,USC00286843,USC00282130,USC00281343,USC00286974,\
USC00281590,USC00284887,USC00287869,USC00284339,USC00287825,\
USC00281211,USC00284987,USC00286320,USW00014773,USC00287545,USC00286146&\
startDate=1880-01-01&\
endDate=1930-12-31&\
format=json&\
boundingbox=41.357423,-75.559614,38.928519,-73.893979'
resp = requests.get(url)


In [148]:
len(resp.json())

453990

In [149]:
resp.json()

[{'DATE': '1880-01-01',
  'STATION': 'USC00288878',
  'SNOW': '    0',
  'TMAX': '   67',
  'TMIN': '  -22',
  'PRCP': '    0'},
 {'DATE': '1880-01-02',
  'STATION': 'USC00288878',
  'SNOW': '    0',
  'TMAX': '   72',
  'TMIN': '   44',
  'PRCP': '    0'},
 {'DATE': '1880-01-03',
  'STATION': 'USC00288878',
  'SNOW': '    0',
  'TMAX': '   72',
  'TMIN': '   11',
  'PRCP': '    0'},
 {'DATE': '1880-01-04',
  'STATION': 'USC00288878',
  'SNOW': '    0',
  'TMAX': '  100',
  'TMIN': '   17',
  'PRCP': '    0'},
 {'DATE': '1880-01-05',
  'STATION': 'USC00288878',
  'SNOW': '    0',
  'TMAX': '  100',
  'TMIN': '   72',
  'PRCP': '    0'},
 {'DATE': '1880-01-06',
  'STATION': 'USC00288878',
  'SNOW': '    0',
  'TMAX': '   89',
  'TMIN': '   44',
  'PRCP': '  178'},
 {'DATE': '1880-01-07',
  'STATION': 'USC00288878',
  'SNOW': '    0',
  'TMAX': '  117',
  'TMIN': '   44',
  'PRCP': '    0'},
 {'DATE': '1880-01-08',
  'STATION': 'USC00288878',
  'SNOW': '    0',
  'TMAX': '   72',
  'TMIN

In [221]:
#Response into dataframe
df = pd.DataFrame(resp.json())
#Alter object to float
df = df.astype({'TMAX': float,
                'TMIN': float,
                'PRCP': float,
                'SNOW': float})

#Remove values outside of max recorded high/low
high_cutoff = df[(df['TMAX'] > 110) | (df['TMAX'] < -34)].index
df.drop(index = high_cutoff, inplace = True)
low_cutoff = df[(df['TMIN'] < -34) | (df['TMIN'] > 110)].index
df.drop(index = low_cutoff, inplace = True)
#Remove rows missing max or min temperatures
temp_missing = df[(df['TMAX'].isna()) | (df['TMIN'].isna())].index
df.drop(index = temp_missing, inplace = True)

#Drop snow and station columns
df.drop(columns = ['SNOW', 'STATION'], inplace = True)

#Dummy precipitaation column
df['PRCP'] = np.where(df['PRCP'] > 0, 1, 0)

#Convert date to datetime type
df['DATE'] = pd.to_datetime(df['DATE'])
#Create month/day columns
df['month'] = pd.DatetimeIndex(df['DATE']).month
df['day'] = pd.DatetimeIndex(df['DATE']).day

#Dummy seasons
df['winter'] = np.where(df['month'].isin([12,1,2]), 1, 0)
df['spring'] = np.where(df['month'].isin([3,4,5]), 1, 0)
df['summer'] = np.where(df['month'].isin([6,7,8]), 1, 0)
df['fall'] = np.where(df['month'].isin([9,10,11]), 1, 0)

In [285]:
#Split data dependent/independent
X = df.drop(['DATE', 'TMAX'], axis = 1)
y = df['TMAX']
#Train/test split
X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                    random_state=100)

In [293]:
#Run basic model
baseline_model = LinearRegression()
baseline_model.fit(X_train, y_train)
y_pred_baseline = baseline_model.predict(X_train)
baseline_rmse = mean_squared_error(y_train, y_pred_baseline, squared = False)
#Run cross validation
baseline_cv = cross_val_score(baseline_model,
                              X_train,
                              y_train,
                              scoring="neg_root_mean_squared_error")
baseline_cv_rmse = -(baseline_cv.mean())

baseline_rmse, baseline_cv_rmse

(22.582978152766756, 22.58659940753167)