# Recent Weather Extract (Version 1.0)

**Ying Zhou**

In [117]:
%matplotlib inline
import numpy as np
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
import requests
import datetime
import calendar
import pickle
import json
import re
from bs4 import BeautifulSoup
from selenium import webdriver
import holidays

In [2]:
def intize(string):
    """Parse ``string`` as an integer, or return ``None`` if it is not one.

    Only ``ValueError`` is absorbed; any other exception raised by ``int``
    propagates to the caller.
    """
    try:
        return int(string)
    except ValueError:
        return None
def floatize(string):
    """Parse ``string`` as a float, or return ``None`` if it is not one.

    Only ``ValueError`` is absorbed; any other exception raised by ``float``
    propagates to the caller.
    """
    try:
        return float(string)
    except ValueError:
        return None

In [3]:
def sum_with_null(string_list):
    """Add up the entries of ``string_list`` that ``floatize`` can parse.

    Entries that do not parse are skipped.  The result is always a float and
    is 0.0 when nothing parses.
    """
    total = 0.0
    for raw in string_list:
        value = floatize(raw)
        if not isinstance(value, float):
            continue  # unparseable entry: contributes nothing
        total = total + value
    return total
def max_with_null(string_list):
    """Return the largest integer that ``intize`` can parse from ``string_list``.

    Entries that do not parse are ignored; ``None`` is returned when no entry
    parses at all.
    """
    parsed = [intize(text) for text in string_list]
    numbers = [value for value in parsed if isinstance(value, int)]
    if not numbers:
        return None
    return max(numbers)
def min_with_null(string_list):
    """Return the smallest integer that ``intize`` can parse from ``string_list``.

    Entries that do not parse are ignored; ``None`` is returned when no entry
    parses at all.
    """
    parsed = [intize(text) for text in string_list]
    numbers = [value for value in parsed if isinstance(value, int)]
    if not numbers:
        return None
    return min(numbers)
def wind_speed_extractor(string):
    """Return the first whitespace-separated token of ``string`` that is an integer.

    For example ``'N 8'`` gives 8.  When no token parses as an integer the
    result is 0.
    """
    for token in string.split():
        try:
            return int(token)
        except ValueError:
            pass  # not a number (e.g. a direction); try the next token
    return 0#No number
def ave_wind_speed_extractor(string_list):
    """Average the wind speeds that ``wind_speed_extractor`` finds in ``string_list``.

    Entries without a number count as 0 in the average.  An empty list raises
    ``ZeroDivisionError``.
    """
    speeds = [wind_speed_extractor(entry) for entry in string_list]
    return sum(speeds) / len(string_list)
def snow_amount_extractor(desc_list, prec_list):
    """Total the precipitation reported alongside snowy weather descriptions.

    Parameters:
        desc_list: weather description strings, one per observation.
        prec_list: precipitation strings, parallel to ``desc_list``.

    Returns:
        The float sum of the precipitation values whose description mentions
        snow (matched case-insensitively).  Blank or non-numeric precipitation
        entries are skipped instead of aborting the whole sum.

    Raises:
        ValueError: if the two lists have different lengths.
    """
    if len(desc_list) != len(prec_list):
        raise ValueError(f'{desc_list} and {prec_list} have different sizes!')
    snow_total = 0.0
    for description, precipitation in zip(desc_list, prec_list):
        if 'snow' not in description.lower():
            continue
        try:
            snow_total = snow_total + float(precipitation)
        except ValueError:
            # A snowy observation often has an empty precipitation cell;
            # treat it as "nothing recorded" rather than crashing.
            continue
    return snow_total

In [97]:
def newest_day_weather(loc = 'KBOS'):
    """Summarise the first-listed day of an NWS observation-history page.

    Downloads ``https://w1.weather.gov/data/obhistory/<loc>.html``, reads the
    fourth ``<table>`` on the page, and aggregates every observation row that
    shares the day-of-month of the first data row.

    Parameters:
        loc: station identifier inserted into the URL (default ``'KBOS'``).

    Returns:
        A dict with keys ``day`` (int), ``PRCP`` (sum of 1-hour precipitation),
        ``TMAX`` / ``TMIN`` (max / min of the 6-hour extremes, ``None`` when no
        row reports one), ``AWND`` (mean wind speed), ``SNOW`` (precipitation
        on rows whose description mentions snow) and ``last_hour`` (hour of the
        first data row).  ``None`` is returned for a non-error response whose
        status is not 200.

    Raises:
        requests.HTTPError: for a 4xx/5xx response.
        ValueError: if the table holds no observation rows.
    """
    recent_weather_url = 'https://w1.weather.gov/data/obhistory/' + loc + '.html'
    # Bound the wait so a stalled connection cannot hang the caller forever.
    rw_req = requests.get(recent_weather_url, timeout = 30)
    rw_req.raise_for_status()
    if rw_req.status_code != requests.codes.ok:
        return None
    soup = BeautifulSoup(rw_req.text, 'html.parser')
    table = soup.find_all('table')[3]
    observations = []
    for row in table.find_all('tr')[3:]:#Skip the first three rows
        cells = [cell.text for cell in row.find_all('td')]
        if not cells:
            break  # first row without <td> cells marks the end of the data
        observations.append({
            'day': cells[0],
            'hour': cells[1][:2],  # leading two characters of the time cell, e.g. '23' of '23:54'
            'wind': cells[2],
            'description': cells[4],
            'six_max': cells[8],
            'six_min': cells[9],
            'one_prec': cells[15],
        })
    if not observations:
        raise ValueError(f'No observation rows found at {recent_weather_url}')
    # NOTE(review): the first data row is taken as the latest observation;
    # presumably the page lists newest first -- confirm.
    day = observations[0]['day']
    last_hour = observations[0]['hour']
    same_day = [obs for obs in observations if obs['day'] == day]
    one_prec = [obs['one_prec'] for obs in same_day]
    temp_dic = {'day': int(day)}
    temp_dic['PRCP'] = sum_with_null(one_prec)
    temp_dic['TMAX'] = max_with_null([obs['six_max'] for obs in same_day])
    temp_dic['TMIN'] = min_with_null([obs['six_min'] for obs in same_day])
    temp_dic['AWND'] = ave_wind_speed_extractor([obs['wind'] for obs in same_day])
    temp_dic['SNOW'] = snow_amount_extractor([obs['description'] for obs in same_day], one_prec)
    temp_dic['last_hour'] = int(last_hour)
    return temp_dic

In [135]:
newest_day_weather()

{'day': 3,
 'PRCP': 0.0,
 'TMAX': 76,
 'TMIN': 67,
 'AWND': 7.166666666666667,
 'SNOW': 0.0,
 'last_hour': 11}

# Extract Forecast

In [105]:
def earliest_day_forecast():
    """Summarise the leading part of the NWS tabular (digital) hourly forecast.

    Downloads the forecast page for the hard-coded coordinates, reads the
    eighth ``<table>``, and keeps the hourly rows that come before the first
    occurrence of the smallest hour value (the whole table when that row is
    the very first one).

    Returns:
        A dict with keys ``month`` and ``day`` (parsed from the first row's
        ``MM/DD`` date), ``first_hour`` (hour of the first row), ``TMAX`` /
        ``TMIN`` (max / min temperature), ``AWND`` (mean surface wind), and
        ``PRCP`` / ``SNOW`` (always 0.0 -- not yet derived from the forecast).
        ``None`` is returned for a non-error response whose status is not 200.

    Raises:
        requests.HTTPError: for a 4xx/5xx response.
        KeyError: if an expected column is missing from the table.
    """
    weather_forecast_url = 'https://forecast.weather.gov/MapClick.php?lat=42.3587&lon=-71.0567&unit=0&lg=english&FcstType=digital'
    # Bound the wait so a stalled connection cannot hang the caller forever.
    req = requests.get(weather_forecast_url, timeout = 30)
    req.raise_for_status()
    if req.status_code != requests.codes.ok:
        return None
    soup = BeautifulSoup(req.text, 'html.parser')
    table = soup.find_all('table')[7]
    list_list = [[ele.text for ele in row.find_all('td')] for row in table.find_all('tr')]
    del list_list[0]
    # Each parsed table row holds one field (Date, Hour, ...); transpose so
    # that every DataFrame row is one forecast hour.
    df_weather_forecast = pd.DataFrame(list_list).transpose()
    headers = df_weather_forecast.iloc[0]
    # .copy() detaches the slice so the column assignment below does not
    # raise pandas' SettingWithCopyWarning.
    df_weather_forecast = df_weather_forecast[1:].copy()
    df_weather_forecast.columns = headers
    # The hour header carries the current time-zone abbreviation (the
    # notebook saw 'Hour (EDT)'), so match on the prefix instead of a
    # hard-coded zone.
    hour_cols = [col for col in df_weather_forecast.columns if str(col).startswith('Hour')]
    if not hour_cols:
        raise KeyError("no 'Hour (...)' column in the forecast table")
    hour_col = hour_cols[0]
    df_weather_forecast[hour_col] = df_weather_forecast[hour_col].astype(int)
    min_ind = df_weather_forecast[hour_col].idxmin()
    min_loc = df_weather_forecast.index.get_loc(min_ind)
    if min_loc == 0:
        df_weather_needed = df_weather_forecast
    else:
        df_weather_needed = df_weather_forecast[:min_loc]
    first_date = df_weather_needed['Date'].iat[0]
    temperature_list = df_weather_needed['Temperature (°F)'].astype(float).tolist()
    wind_speeds = df_weather_needed['Surface Wind (mph)'].astype(float)
    forecast_dic = {}
    forecast_dic['month'] = int(first_date[:2])
    forecast_dic['day'] = int(first_date[3:])
    forecast_dic['first_hour'] = df_weather_needed[hour_col].iat[0]
    forecast_dic['TMAX'] = max(temperature_list)
    forecast_dic['TMIN'] = min(temperature_list)
    forecast_dic['AWND'] = wind_speeds.mean()
    forecast_dic['PRCP'] = 0.0 #Need to figure out how to make it nonzero
    forecast_dic['SNOW'] = 0.0 #Need to figure out how to make it nonzero
    return forecast_dic

In [134]:
earliest_day_forecast()

{'month': 8,
 'day': 3,
 'first_hour': 12,
 'TMAX': 86.0,
 'TMIN': 75.0,
 'AWND': 10.166666666666666,
 'PRCP': 0.0,
 'SNOW': 0.0}

# Testing zone

In [118]:
#Extract data from Fred and return a dataframe
def extract_fred_data(series_id = 'MAURN', api_key = '007198ec987cc488277fcc2b0984d47d', start_time = '2012-07-01'):
    """Download a FRED series and return it as a DataFrame.

    Parameters:
        series_id: FRED series identifier; also the name of the value column.
        api_key: FRED API key sent with the request.
        start_time: value passed as ``observation_start``.

    Returns:
        A DataFrame with columns ``<series_id>`` (the observation values
        exactly as returned by the API), ``year`` and ``month`` (both
        categorical, parsed from each observation's ``date``).  ``None`` is
        returned for a non-error response whose status is not 200.

    Raises:
        requests.HTTPError: for a 4xx/5xx response.
    """
    # NOTE(review): the default api_key is a credential committed to source;
    # consider rotating it and reading it from the environment instead.
    url = 'https://api.stlouisfed.org/fred/series/observations'
    # Let requests build the query string so every value is URL-encoded.
    params = {
        'series_id': series_id,
        'api_key': api_key,
        'file_type': 'json',
        'observation_start': start_time,
    }
    # Bound the wait so a stalled connection cannot hang the caller forever.
    req = requests.get(url, params = params, timeout = 30)
    req.raise_for_status()
    if req.status_code != requests.codes.ok:
        return None
    df = pd.DataFrame(req.json()['observations'])
    df = df.drop(columns = ['realtime_start', 'realtime_end'])
    df = df.rename(columns = {'value': series_id})
    # Dates are sliced as 'YYYY-MM-...': characters 0-3 are the year, 5-6 the month.
    df['year'] = df['date'].apply(lambda x: int(x[:4]))
    df['month'] = df['date'].apply(lambda x: int(x[5:7]))
    df = df.drop(columns = ['date'])
    df['year'] = df['year'].astype('category')
    df['month'] = df['month'].astype('category')
    return df

In [139]:
def get_holiday(year, month, day):
    """Label a date by its relation to a fixed set of Massachusetts/US holidays.

    Returns, in order of precedence:
      * ``'Holiday Season'`` for December 27-30;
      * the holiday name when the date itself is a tracked holiday;
      * ``'Post-<name>'`` when the previous day is a tracked holiday;
      * ``'<name> Eve'`` when the next day is a tracked holiday;
      * the string ``'None'`` otherwise.
    """
    dt_obj = datetime.datetime(year, month, day)
    ma_holidays = holidays.CountryHoliday('US', prov=None, state='MA')
    tracked = ["New Year's Day", 'Memorial Day', 'Independence Day', 'Labor Day', 'Thanksgiving', 'Christmas Day']
    if month == 12 and 27 <= day <= 30:
        return 'Holiday Season'
    # (day offset to inspect, label template) pairs, checked in priority order.
    checks = ((0, '{}'), (-1, 'Post-{}'), (1, '{} Eve'))
    for offset, template in checks:
        name = ma_holidays.get(dt_obj + datetime.timedelta(days=offset))
        if name in tracked:
            return template.format(name)
    return 'None'

In [196]:
def make_data(ser_id = 'MAURN', crime_list = ['AGGRAVATED ASSAULT','AUTO THEFT','COMMERCIAL BURGLARY','HOMICIDE','LARCENY','OTHER BURGLARY','RESIDENTIAL BURGLARY','ROBBERY']):
    """Build the one-hot feature table for the current day, one row per crime.

    Combines newest_day_weather() and earliest_day_forecast() into one weather
    record, adds year, weekday, the latest value of the FRED series `ser_id`
    and a holiday label, repeats that record once per entry of `crime_list`,
    and dummy-encodes the result.

    Parameters:
        ser_id: FRED series id passed to extract_fred_data; also the name of
            the resulting feature column.
        crime_list: crime names; each one becomes a row of the output.

    Returns:
        A DataFrame with exactly the columns of `col_list`, in that order.
        Columns that get_dummies did not produce are filled with zeros;
        produced columns that are not in `col_list` are dropped.
    """
    ndw = newest_day_weather()
    edf = earliest_day_forecast()
    res = {}
    if ndw['day'] != edf['day']:#Only edf counts then
        # res aliases edf here, so it also carries 'first_hour'; nothing named
        # after that key is in col_list, so it does not reach the output.
        res = edf
        res['SNOW'] = 0.0 #We haven't figured out how to get snow info from forecast data yet
    else:
        # Same day in both sources: merge observed and forecast values.
        res['month'] = edf['month']
        res['day'] = edf['day']
        if ndw['TMAX'] is None:
            res['TMAX'] = edf['TMAX']
        else:
            res['TMAX'] = max(edf['TMAX'], ndw['TMAX'])
        if ndw['TMIN'] is None:
            res['TMIN'] = edf['TMIN']
        else:
            res['TMIN'] = min(edf['TMIN'], ndw['TMIN'])
        res['PRCP'] = edf['PRCP'] + ndw['PRCP']
        res['SNOW'] = edf['SNOW'] + ndw['SNOW']
        # Weighted mean of the two wind averages: the forecast is weighted by
        # (24 - first_hour) and the observations by (1 + last_hour), over 24.
        res['AWND'] = ((24 - edf['first_hour']) * edf['AWND'] + (1 + ndw['last_hour']) * ndw['AWND']) / 24
    #Now process year and dayw
    # The weather sources supply only month and day, so the year comes from
    # the local clock, adjusted when the two months straddle New Year.
    now = datetime.datetime.now()
    local_month = now.month
    local_year = now.year
    if local_month == 1 and res['month'] == 12:#Need to move one year back
        res['year'] = local_year - 1
    elif local_month == 12 and res['month'] == 1:#Need to move one year forward
        res['year'] = local_year + 1
    else:
        res['year'] = local_year
    day = datetime.datetime(res['year'],res['month'],res['day'])
    res['dayw'] = calendar.day_name[day.weekday()]
    series_id = ser_id
    #Now process unemployment rate
    # Take the value in the last row after sorting by (year, month).
    df_ue = extract_fred_data(series_id = series_id)
    series_id_index = df_ue.columns.get_loc(series_id)
    df_ue_temp = df_ue.sort_values(by = ['year', 'month'])
    size = df_ue_temp.shape[0]
    res[series_id] = float(df_ue_temp.iat[size - 1, series_id_index])
    #Now process holiday
    res['HOLIDAY'] = get_holiday(res['year'], res['month'], res['day'])
    #Time to convert everything to a dataframe
    df_res = pd.DataFrame(res, index = [0])
    df_res['year'] = df_res['year'].astype('category')
    df_res['month'] = df_res['month'].astype('category')
    df_res['day'] = df_res['day'].astype('category')
    df_res['dayw'] = df_res['dayw'].astype('category')
    df_res['HOLIDAY'] = df_res['HOLIDAY'].astype('category')
    #Expand according to crime
    # Repeat the single row once per crime, then attach the crime names
    # (1 * crime_list is a shallow copy of the list).
    short_col_list = df_res.columns
    df_final = pd.DataFrame(np.repeat(df_res.values, len(crime_list), axis = 0), columns = short_col_list)
    df_final['crime'] = pd.Series(1 * crime_list)
    #Time to get dummies
    # Fixed output schema: the result is restricted to, and ordered by, this list.
    # NOTE(review): only year_2012..year_2019 are listed, so for any other
    # year every year_* column in the result is zero -- confirm this is intended.
    col_list = ['AWND',
 'PRCP',
 'SNOW',
 'TMAX',
 'TMIN',
 ser_id,
 'crime_AGGRAVATED ASSAULT',
 'crime_AUTO THEFT',
 'crime_COMMERCIAL BURGLARY',
 'crime_HOMICIDE',
 'crime_LARCENY',
 'crime_OTHER BURGLARY',
 'crime_RESIDENTIAL BURGLARY',
 'crime_ROBBERY',
 'year_2012',
 'year_2013',
 'year_2014',
 'year_2015',
 'year_2016',
 'year_2017',
 'year_2018',
 'year_2019',
 'month_1',
 'month_2',
 'month_3',
 'month_4',
 'month_5',
 'month_6',
 'month_7',
 'month_8',
 'month_9',
 'month_10',
 'month_11',
 'month_12',
 'day_1',
 'day_2',
 'day_3',
 'day_4',
 'day_5',
 'day_6',
 'day_7',
 'day_8',
 'day_9',
 'day_10',
 'day_11',
 'day_12',
 'day_13',
 'day_14',
 'day_15',
 'day_16',
 'day_17',
 'day_18',
 'day_19',
 'day_20',
 'day_21',
 'day_22',
 'day_23',
 'day_24',
 'day_25',
 'day_26',
 'day_27',
 'day_28',
 'day_29',
 'day_30',
 'day_31',
 'dayw_Friday',
 'dayw_Monday',
 'dayw_Saturday',
 'dayw_Sunday',
 'dayw_Thursday',
 'dayw_Tuesday',
 'dayw_Wednesday',
 'HOLIDAY_Christmas Day',
 'HOLIDAY_Christmas Day Eve',
 'HOLIDAY_Holiday Season',
 'HOLIDAY_Independence Day',
 'HOLIDAY_Independence Day Eve',
 'HOLIDAY_Labor Day',
 'HOLIDAY_Labor Day Eve',
 'HOLIDAY_Memorial Day',
 'HOLIDAY_Memorial Day Eve',
 "HOLIDAY_New Year's Day",
 "HOLIDAY_New Year's Day Eve",
 'HOLIDAY_Post-Christmas Day',
 'HOLIDAY_Post-Independence Day',
 'HOLIDAY_Post-Labor Day',
 'HOLIDAY_Post-Memorial Day',
 "HOLIDAY_Post-New Year's Day",
 'HOLIDAY_Post-Thanksgiving',
 'HOLIDAY_Thanksgiving',
 'HOLIDAY_Thanksgiving Eve','HOLIDAY_None']
    # Cast the numeric features to float64 before get_dummies so they stay
    # numeric columns instead of being dummy-encoded (the earlier output with
    # columns such as 'TMAX_86.0' shows what happens without the casts).
    df_final['TMAX'] = df_final['TMAX'].astype('float64')
    df_final['TMIN'] = df_final['TMIN'].astype('float64')
    df_final['PRCP'] = df_final['PRCP'].astype('float64')
    df_final['SNOW'] = df_final['SNOW'].astype('float64')
    df_final['AWND'] = df_final['AWND'].astype('float64')
    df_final[ser_id] = df_final[ser_id].astype('float64')
    df_final_dummies = pd.get_dummies(df_final)
    num_rows = df_final_dummies.shape[0]
    #print(df_final_dummies)
    # Add every expected column that get_dummies did not create, as zeros.
    for col in col_list:
        if col not in df_final_dummies.columns:
            df_final_dummies[col] = np.zeros(num_rows)
    df_final_dummies = df_final_dummies[col_list]    
    return df_final_dummies

In [203]:
def predict_today_crimes(crime_list = ['AGGRAVATED ASSAULT','AUTO THEFT','COMMERCIAL BURGLARY','HOMICIDE','LARCENY','OTHER BURGLARY','RESIDENTIAL BURGLARY','ROBBERY']):
    """Predict a value per crime type from the current weather-based features.

    Parameters:
        crime_list: crime names to predict for; each becomes a key of the result.

    Returns:
        A dict mapping every entry of ``crime_list`` to the model's prediction
        for the corresponding feature row.
    """
    # Build the features for the same crimes that label the output, so the
    # prediction rows and the names always line up one-to-one.
    final_X = make_data(crime_list = crime_list).values
    # NOTE(review): pickle.load can execute arbitrary code from the file; only
    # load model files from a trusted location.
    with open('/Users/CatLover/Documents/DataScience/BostonCrime/lgbm_reg.p','rb') as model_file:
        lgbm_final = pickle.load(model_file)
    with open('/Users/CatLover/Documents/DataScience/BostonCrime/lgbm_scaler.p','rb') as scaler_file:
        scaler_final = pickle.load(scaler_file)
    final_X_scaled = scaler_final.transform(final_X)
    final_y = lgbm_final.predict(final_X_scaled, num_iteration=lgbm_final.best_iteration_)
    return dict(zip(crime_list, final_y))
    

In [204]:
predict_today_crimes()

{'AGGRAVATED ASSAULT': 7.500128216822966,
 'AUTO THEFT': 3.9602153115141325,
 'COMMERCIAL BURGLARY': 1.2051105942094407,
 'HOMICIDE': 0.29366682226790214,
 'LARCENY': 30.674761526340387,
 'OTHER BURGLARY': 0.3852131857111051,
 'RESIDENTIAL BURGLARY': 3.672264873645193,
 'ROBBERY': 3.1820221119315506}

In [197]:
test_X = make_data().values

In [199]:
# Load the pickled regressor and scaler; the with-blocks close each file
# handle as soon as it has been read.
# NOTE(review): pickle.load can execute arbitrary code from the file; only
# load model files from a trusted location.
with open('/Users/CatLover/Documents/DataScience/BostonCrime/lgbm_reg.p','rb') as model_file:
    lgbm_final = pickle.load(model_file)
with open('/Users/CatLover/Documents/DataScience/BostonCrime/lgbm_scaler.p','rb') as scaler_file:
    scaler_final = pickle.load(scaler_file)

This means that in case of installing LightGBM from PyPI via the ``pip install lightgbm`` command, you don't need to install the gcc compiler anymore.
Instead of that, you need to install the OpenMP library, which is required for running LightGBM on the system with the Apple Clang compiler.
You can install the OpenMP library by the following command: ``brew install libomp``.


In [200]:
# Step-by-step version of predict_today_crimes: scale the feature matrix with
# the loaded scaler, then predict with the loaded model.
test_X_scaled = scaler_final.transform(test_X)
test_y = lgbm_final.predict(test_X_scaled, num_iteration=lgbm_final.best_iteration_)

In [202]:
test_y

array([ 7.50012822,  3.96021531,  1.20511059,  0.29366682, 30.67476153,
        0.38521319,  3.67226487,  3.18202211])

# Trash

In [193]:
res_data = make_data()

   TMAX  TMIN  PRCP  SNOW   AWND  MAURN  month_8  day_3  year_2019  \
0  86.0  67.0   0.0   0.0  8.875    3.1        1      1          1   
1  86.0  67.0   0.0   0.0  8.875    3.1        1      1          1   
2  86.0  67.0   0.0   0.0  8.875    3.1        1      1          1   
3  86.0  67.0   0.0   0.0  8.875    3.1        1      1          1   
4  86.0  67.0   0.0   0.0  8.875    3.1        1      1          1   
5  86.0  67.0   0.0   0.0  8.875    3.1        1      1          1   
6  86.0  67.0   0.0   0.0  8.875    3.1        1      1          1   
7  86.0  67.0   0.0   0.0  8.875    3.1        1      1          1   

   dayw_Saturday  HOLIDAY_None  crime_AGGRAVATED ASSAULT  crime_AUTO THEFT  \
0              1             1                         1                 0   
1              1             1                         0                 1   
2              1             1                         0                 0   
3              1             1                         0 

In [194]:
res_data.dtypes

AWND                             float64
PRCP                             float64
SNOW                             float64
TMAX                             float64
TMIN                             float64
MAURN                            float64
crime_AGGRAVATED ASSAULT           uint8
crime_AUTO THEFT                   uint8
crime_COMMERCIAL BURGLARY          uint8
crime_HOMICIDE                     uint8
crime_LARCENY                      uint8
crime_OTHER BURGLARY               uint8
crime_RESIDENTIAL BURGLARY         uint8
crime_ROBBERY                      uint8
year_2012                        float64
year_2013                        float64
year_2014                        float64
year_2015                        float64
year_2016                        float64
year_2017                        float64
year_2018                        float64
year_2019                          uint8
month_1                          float64
month_2                          float64
month_3         

In [182]:
res_data

Unnamed: 0,month_8,day_3,TMAX_86.0,TMIN_67,PRCP_0.0,SNOW_0.0,AWND_8.875,year_2019,dayw_Saturday,MAURN_3.1,HOLIDAY_None,crime_AGGRAVATED ASSAULT,crime_AUTO THEFT,crime_COMMERCIAL BURGLARY,crime_HOMICIDE,crime_LARCENY,crime_OTHER BURGLARY,crime_RESIDENTIAL BURGLARY,crime_ROBBERY
0,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0
1,1,1,1,1,1,1,1,1,1,1,1,0,1,0,0,0,0,0,0
2,1,1,1,1,1,1,1,1,1,1,1,0,0,1,0,0,0,0,0
3,1,1,1,1,1,1,1,1,1,1,1,0,0,0,1,0,0,0,0
4,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,1,0,0,0
5,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,1,0,0
6,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,1,0
7,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,1


In [138]:
datetime.datetime(2019,8,3) - datetime.timedelta(days = 1)

datetime.datetime(2019, 8, 2, 0, 0)

In [149]:
get_holiday(2019,7,5)

'Post-Independence Day'

# Trash

In [115]:
# Scratch version of the year / weekday step of make_data, applied to the
# res_data dict: the year comes from the local clock, adjusted when the data
# month and the clock month straddle New Year.
now = datetime.datetime.now()
local_month = now.month
local_year = now.year
if local_month == 1 and res_data['month'] == 12:#Need to move one year back
    res_data['year'] = local_year - 1
elif local_month == 12 and res_data['month'] == 1:#Need to move one year forward
    res_data['year'] = local_year + 1
else:
    res_data['year'] = local_year
day = datetime.datetime(res_data['year'],res_data['month'],res_data['day'])
res_data['dayw'] = calendar.day_name[day.weekday()]

In [116]:
res_data

{'month': 8,
 'day': 3,
 'TMAX': 86.0,
 'TMIN': 67,
 'PRCP': 0.0,
 'SNOW': 0.0,
 'AWND': 8.708333333333334,
 'year': 2019,
 'dayw': 'Saturday'}

In [121]:
# Scratch: download the FRED series named by series_id into df_ue.
series_id = 'MAURN'
df_ue = extract_fred_data(series_id = series_id)

In [124]:
# Scratch: store the series value from the last row, after sorting by
# (year, month), in res_data under the series id.
series_id_index = df_ue.columns.get_loc(series_id)
df_ue_temp = df_ue.sort_values(by = ['year', 'month'])
size = df_ue_temp.shape[0]
res_data[series_id] = float(df_ue_temp.iat[size - 1, series_id_index])

In [125]:
res_data

{'month': 8,
 'day': 3,
 'TMAX': 86.0,
 'TMIN': 67,
 'PRCP': 0.0,
 'SNOW': 0.0,
 'AWND': 8.708333333333334,
 'year': 2019,
 'dayw': 'Saturday',
 'MAURN': 3.1}

In [None]:
import holidays

# Trash

In [62]:
# Scratch: fetch the tabular forecast page and parse it into soup.
# raise_for_status() raises on 4xx/5xx; soup is only assigned for a 200 response.
weather_forecast_url = 'https://forecast.weather.gov/MapClick.php?lat=42.3587&lon=-71.0567&unit=0&lg=english&FcstType=digital'
req = requests.get(weather_forecast_url)
req.raise_for_status()
if req.status_code == requests.codes.ok:
    soup = BeautifulSoup(req.text, 'html.parser')

In [63]:
table = soup.find_all('table')[7]

In [64]:
# Scratch: collect the <td> texts of every table row into a list of lists,
# then discard the first row's list.
rows = table.find_all('tr')
list_list = []
for row in rows:
    eles = row.find_all('td')
    eles = [ele.text for ele in eles]
    list_list.append(eles)
del list_list[0]

In [65]:
df_weather_forecast = pd.DataFrame(list_list)

In [66]:
df_weather_forecast = df_weather_forecast.transpose()

In [67]:
# Scratch: promote the first row of the transposed frame to column headers
# and keep the remaining rows as data.
headers = df_weather_forecast.iloc[0]
df_weather_forecast = df_weather_forecast[1:]
df_weather_forecast.columns = headers

In [68]:
headers

0                            Date
1                      Hour (EDT)
2                Temperature (°F)
3                   Dewpoint (°F)
4                 Heat Index (°F)
5              Surface Wind (mph)
6                        Wind Dir
7                            Gust
8                   Sky Cover (%)
9     Precipitation Potential (%)
10          Relative Humidity (%)
11                           Rain
12                        Thunder
Name: 0, dtype: object

In [69]:
df_weather_forecast['Hour (EDT)'] = df_weather_forecast['Hour (EDT)'].astype(int)

In [70]:
min_ind = df_weather_forecast['Hour (EDT)'].idxmin()

In [71]:
min_loc = df_weather_forecast.index.get_loc(min_ind)

In [73]:
df_weather_forecast

Unnamed: 0,Date,Hour (EDT),Temperature (°F),Dewpoint (°F),Heat Index (°F),Surface Wind (mph),Wind Dir,Gust,Sky Cover (%),Precipitation Potential (%),Relative Humidity (%),Rain,Thunder
1,08/03,0,73,61,,6,SW,,13,2,66,--,--
2,,1,73,61,,6,SW,,16,2,66,--,--
3,,2,72,62,,5,SW,,20,3,71,--,--
4,,3,71,63,,5,SW,,29,3,76,--,--
5,,4,70,64,,5,SW,,21,3,81,--,--
6,,5,70,63,,5,SW,,30,3,78,--,--
7,,6,71,64,,6,SW,,57,5,79,--,--
8,,7,73,64,,6,SW,,58,7,73,--,--
9,,8,75,65,75.0,7,SW,,65,8,71,--,--
10,,9,77,65,77.0,7,SW,,62,12,67,--,--


In [74]:
# Scratch: keep the rows before the first occurrence of the smallest hour,
# or the whole table when that row is the very first one.
if min_loc == 0:
    df_weather_needed = df_weather_forecast
else:
    df_weather_needed = df_weather_forecast[:min_loc]

In [75]:
df_weather_needed.columns

Index(['Date', 'Hour (EDT)', 'Temperature (°F)', 'Dewpoint (°F)',
       'Heat Index (°F)', 'Surface Wind (mph)', 'Wind Dir', 'Gust',
       'Sky Cover (%)', 'Precipitation Potential (%)', 'Relative Humidity (%)',
       'Rain', 'Thunder'],
      dtype='object', name=0)

In [76]:
df_weather_needed = df_weather_needed[['Date', 'Hour (EDT)', 'Temperature (°F)', 'Surface Wind (mph)', 'Rain']]

In [77]:
df_weather_needed

Unnamed: 0,Date,Hour (EDT),Temperature (°F),Surface Wind (mph),Rain
1,08/03,0,73,6,--
2,,1,73,6,--
3,,2,72,5,--
4,,3,71,5,--
5,,4,70,5,--
6,,5,70,5,--
7,,6,71,6,--
8,,7,73,6,--
9,,8,75,7,--
10,,9,77,7,--


In [85]:
# Scratch: reduce the selected forecast rows to a summary dict.
# df_weather_needed is a slice of df_weather_forecast, so the two column
# assignments below trigger pandas' SettingWithCopyWarning (see the output).
forecast_dic = {}
forecast_dic['day'] = int(df_weather_needed.iat[0,df_weather_needed.columns.get_loc('Date')][3:])
forecast_dic['first_hour'] = df_weather_needed.iat[0,df_weather_needed.columns.get_loc('Hour (EDT)')]
df_weather_needed['Temperature (°F)'] = df_weather_needed['Temperature (°F)'].astype(float)
temperature_list = df_weather_needed['Temperature (°F)'].tolist()
forecast_dic['TMAX'] = max(temperature_list)
forecast_dic['TMIN'] = min(temperature_list)
df_weather_needed['Surface Wind (mph)'] = df_weather_needed['Surface Wind (mph)'].astype(float)
forecast_dic['AWND'] = df_weather_needed['Surface Wind (mph)'].mean()
forecast_dic['PRCP'] = 0.0 #Need to figure out how to make it nonzero

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [82]:
df_weather_needed['Surface Wind (mph)']

1      6
2      6
3      5
4      5
5      5
6      5
7      6
8      6
9      7
10     7
11     8
12     9
13    10
14    10
15    11
16    11
17    11
18    11
19    13
20    11
21    11
22    10
23     9
24     8
Name: Surface Wind (mph), dtype: object

In [87]:
df_weather_needed.dtypes

0
Date                   object
Hour (EDT)              int64
Temperature (°F)      float64
Surface Wind (mph)    float64
Rain                   object
dtype: object

In [86]:
forecast_dic

{'day': 3,
 'first_hour': 0,
 'TMAX': 86.0,
 'TMIN': 70.0,
 'AWND': 8.375,
 'PRCP': 0.0}

In [31]:
df_weather_forecast

Unnamed: 0,Date,Hour (EDT),Temperature (°F),Dewpoint (°F),Heat Index (°F),Surface Wind (mph),Wind Dir,Gust,Sky Cover (%),Precipitation Potential (%),Relative Humidity (%),Rain,Thunder
1,08/02,23,73,60,,6,S,,6,1,64,--,--
2,08/03,0,73,61,,6,SW,,13,2,66,--,--
3,,1,73,61,,6,SW,,16,2,66,--,--
4,,2,72,62,,5,SW,,20,3,71,--,--
5,,3,71,63,,5,SW,,29,3,76,--,--
6,,4,70,64,,5,SW,,21,3,81,--,--
7,,5,70,63,,5,SW,,30,3,78,--,--
8,,6,71,64,,6,SW,,57,5,79,--,--
9,,7,73,64,,6,SW,,58,7,73,--,--
10,,8,75,65,75.0,7,SW,,65,8,71,--,--


# Trash

In [2]:

recent_weather_url = 'https://w1.weather.gov/data/obhistory/KBOS.html'

In [4]:
# Scratch: fetch the observation-history page and parse it into soup.
# raise_for_status() raises on 4xx/5xx; soup is only assigned for a 200 response.
rw_req = requests.get(recent_weather_url)
rw_req.raise_for_status()
if rw_req.status_code == requests.codes.ok:
    soup = BeautifulSoup(rw_req.text, 'html.parser')

In [11]:
table = soup.find_all('table')[3]

In [16]:
rows = table.find_all('tr')

In [15]:
table

<table border="0" cellpadding="2" cellspacing="3" width="670"><tr align="center" bgcolor="#b0c4de"><th rowspan="3" width="17">D<br/>a<br/>t<br/>e</th><th rowspan="3" width="32">Time<br/>(edt)</th>
<th rowspan="3" width="80">Wind<br/>(mph)</th><th rowspan="3" width="40">Vis.<br/>(mi.)</th><th rowspan="3" width="80">Weather</th><th rowspan="3" width="65">Sky Cond.</th>
<th colspan="4">Temperature (ºF)</th><th rowspan="3" width="65">Relative<br/>Humidity</th><th rowspan="3" width="80">Wind<br/>Chill<br/>(°F)</th><th rowspan="3" width="80">Heat<br/>Index<br/>(°F)</th><th colspan="2">Pressure</th><th colspan="3">Precipitation (in.)</th></tr>
<tr align="center" bgcolor="#b0c4de"><th rowspan="2" width="45">Air</th><th rowspan="2" width="26">Dwpt</th><th colspan="2">6 hour</th>
<th rowspan="2" width="40">altimeter<br/>(in)</th><th rowspan="2" width="40">sea level<br/>(mb)</th><th rowspan="2" width="24">1 hr</th>
<th rowspan="2" width="24">3 hr</th><th rowspan="2" width="30">6 hr</th></tr>
<tr 

In [59]:
# Scratch: turn the observation rows into a DataFrame, one record per row.
# Cell indexes used: 0 day, 1 time (first two characters kept as the hour),
# 2 wind, 4 weather description, 8 / 9 six-hour max / min temperature,
# 15 one-hour precipitation.
num_rows = len(rows)
data_dic_list = []
for ind in range(3, num_rows):#Skip the first three rows
    row = rows[ind]
    pre_data_list = row.find_all('td')
    data_list = [pre_data.text for pre_data in pre_data_list]
    # A row without <td> cells ends the data section.
    if len(data_list) == 0:
        break
    data_dic = {'day':data_list[0],'hour':data_list[1][:2],'wind':data_list[2], 'description': data_list[4], 'six_max':data_list[8], 'six_min':data_list[9], 'one_prec': data_list[15]}
    data_dic_list.append(data_dic)
df_recent_weather = pd.DataFrame(data_dic_list)
        

In [110]:
day = df_recent_weather.iat[0,0]

In [103]:
ave_wind_speed_extractor(['calm 31 w 1', 'cakm','2'])

11.0

In [105]:
temp_dic

{'day': '01', 'PREC': 0.0, 'TMAX': 89, 'TMIN': 73, 'AVGW': 7.708333333333333}

In [116]:
# Scratch: aggregate the rows that share the first row's day into temp_dic.
# Here the wind average is stored under 'WAVG' and 'day' is kept as a string;
# newest_day_weather uses 'AWND' and int(day) instead.
temp_dic = {'day':day}
df_temp_day = df_recent_weather[df_recent_weather.day == day]
temp_dic['PRCP'] = sum_with_null(df_temp_day['one_prec'].tolist())
temp_dic['TMAX'] = max_with_null(df_temp_day['six_max'].tolist())
temp_dic['TMIN'] = min_with_null(df_temp_day['six_min'].tolist())
temp_dic['WAVG'] = ave_wind_speed_extractor(df_temp_day['wind'].tolist())
temp_dic['SNOW'] = snow_amount_extractor(df_temp_day['description'].tolist(),df_temp_day['one_prec'].tolist())

In [117]:
temp_dic

{'day': '01',
 'PRCP': 0.0,
 'TMAX': 89,
 'TMIN': 73,
 'WAVG': 7.708333333333333,
 'SNOW': 0.0}

['01',
 '23:54',
 'N 8',
 '10.00',
 'Fair',
 'CLR',
 '77',
 '53',
 '',
 '',
 '43%',
 'NA',
 '78',
 '30.10',
 '1019.1',
 '',
 '',
 '']