In [1]:
# Importing the dependencies
import pandas as pd

# London Weather Data

In [2]:
# Loading the London weather data
# Original source of data:  https://www.metoffice.gov.uk/pub/data/weather/uk/climate/stationdata/heathrowdata.txt
# Additional information available:  https://www.metoffice.gov.uk/research/climate/maps-and-data/historic-station-data
london_weather_df = pd.read_csv('Resources/Historical_london_weather.csv')
london_weather_df

Unnamed: 0,yyyy,mm,tmax degC,tmin degC,af days,rain mm,sun hours
0,1948,1,8.9,3.3,---,85.0,---
1,1948,2,7.9,2.2,---,26.0,---
2,1948,3,14.2,3.8,---,14.0,---
3,1948,4,15.4,5.1,---,35.0,---
4,1948,5,18.1,6.9,---,57.0,---
...,...,...,...,...,...,...,...
879,2021,4,13.1,2.9,5,7.2,202.6#
880,2021,5,16.5,7.2,0,84.6,131.9#
881,2021,6,22.5,13.3,0,88.2,159.6#
882,2021,7,24.2,14.9,0,61.2,171.1#


In [3]:
london_weather_df.sample(20)

Unnamed: 0,yyyy,mm,tmax degC,tmin degC,af days,rain mm,sun hours
378,1979,7,22.5,13.4,0,21.7,182.5
711,2007,4,18.9,7.7,0,3.6,224.6#
218,1966,3,10.9,2.6,4,12.2,122.6
659,2002,12,9.0,5.2,0,119.0,34.1
766,2011,11,13.6,7.3,0,29.0,52.5#
143,1959,12,9.3,3.0,1,75.7,30.2
474,1987,7,21.8,13.2,0,77.3,176.1
504,1990,1,10.2,4.4,0,71.1,57.6
540,1993,1,10.3,3.5,4,63.1,37.8
721,2008,2,11.0,2.0,7,15.4,130.0#


In [4]:
# Looking for null values
london_weather_df.isnull().sum()

yyyy         0
mm           0
tmax degC    0
tmin degC    0
af days      0
rain mm      0
sun hours    0
dtype: int64

In [5]:
# Dropping the unneeded columns
#london_weather_df = london_weather_df.drop(columns=['af days', 'sun hours'])

In [6]:
# Stripping the # from the sun hours column
london_weather_df['sun hours'] = london_weather_df['sun hours'].map(lambda x: x.rstrip('#'))
london_weather_df

Unnamed: 0,yyyy,mm,tmax degC,tmin degC,af days,rain mm,sun hours
0,1948,1,8.9,3.3,---,85.0,---
1,1948,2,7.9,2.2,---,26.0,---
2,1948,3,14.2,3.8,---,14.0,---
3,1948,4,15.4,5.1,---,35.0,---
4,1948,5,18.1,6.9,---,57.0,---
...,...,...,...,...,...,...,...
879,2021,4,13.1,2.9,5,7.2,202.6
880,2021,5,16.5,7.2,0,84.6,131.9
881,2021,6,22.5,13.3,0,88.2,159.6
882,2021,7,24.2,14.9,0,61.2,171.1


In [7]:
# Looking at the data types
london_weather_df.dtypes

yyyy           int64
mm             int64
tmax degC    float64
tmin degC    float64
af days       object
rain mm      float64
sun hours     object
dtype: object

In [8]:
# Looking at unique values in af days
london_weather_df['af days'].value_counts()

0      467
1       65
2       44
3       39
4       37
7       36
5       34
11      21
9       19
6       18
8       17
12      15
10      13
13      12
---     12
14       7
15       7
16       5
24       3
17       3
18       3
22       2
28       1
23       1
21       1
19       1
20       1
Name: af days, dtype: int64

In [9]:
# Dropping rows where af days = ---
london_weather_df = london_weather_df.loc[(london_weather_df['af days'] != '---')]
london_weather_df

Unnamed: 0,yyyy,mm,tmax degC,tmin degC,af days,rain mm,sun hours
12,1949,1,8.5,1.8,9,23.0,---
13,1949,2,10.4,0.6,11,27.0,---
14,1949,3,9.3,1.2,11,26.1,---
15,1949,4,16.2,6.0,1,34.2,---
16,1949,5,17.1,6.8,0,56.9,---
...,...,...,...,...,...,...,...
879,2021,4,13.1,2.9,5,7.2,202.6
880,2021,5,16.5,7.2,0,84.6,131.9
881,2021,6,22.5,13.3,0,88.2,159.6
882,2021,7,24.2,14.9,0,61.2,171.1


In [10]:
# Verifying by checking af days value counts
london_weather_df['af days'].value_counts()

0     467
1      65
2      44
3      39
4      37
7      36
5      34
11     21
9      19
6      18
8      17
12     15
10     13
13     12
14      7
15      7
16      5
17      3
18      3
24      3
22      2
28      1
23      1
21      1
19      1
20      1
Name: af days, dtype: int64

In [11]:
# Dropping rows where sun hours == ---
london_weather_df = london_weather_df.loc[(london_weather_df['sun hours'] != '---')]
london_weather_df

Unnamed: 0,yyyy,mm,tmax degC,tmin degC,af days,rain mm,sun hours
108,1957,1,8.7,2.7,5,39.5,53
109,1957,2,9.0,2.9,5,69.8,64.9
110,1957,3,13.9,5.7,2,25.4,96.7
111,1957,4,14.2,5.2,1,5.7,169.6
112,1957,5,16.2,6.5,0,21.3,195
...,...,...,...,...,...,...,...
879,2021,4,13.1,2.9,5,7.2,202.6
880,2021,5,16.5,7.2,0,84.6,131.9
881,2021,6,22.5,13.3,0,88.2,159.6
882,2021,7,24.2,14.9,0,61.2,171.1


In [12]:
# Converting sun hours and af days to numerics
#nyc_weather_df['monthly total snowfall'] = nyc_weather_df.loc[:, 'monthly total snowfall'].astype(float)
london_weather_df['af days'] = london_weather_df.loc[:, 'af days'].astype(int)
london_weather_df['sun hours'] = london_weather_df.loc[:, 'sun hours'].astype(float)
london_weather_df.dtypes

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.


yyyy           int64
mm             int64
tmax degC    float64
tmin degC    float64
af days        int32
rain mm      float64
sun hours    float64
dtype: object

In [13]:
# Converting temps to degF
london_weather_df['tmax degF'] = (london_weather_df['tmax degC'] * 9/5) + 32
london_weather_df['tmin degF'] = (london_weather_df['tmin degC'] * 9/5) + 32

# Converting mm to inches
london_weather_df['rain inches'] = london_weather_df['rain mm'] / 25.4

london_weather_df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


Unnamed: 0,yyyy,mm,tmax degC,tmin degC,af days,rain mm,sun hours,tmax degF,tmin degF,rain inches
108,1957,1,8.7,2.7,5,39.5,53.0,47.66,36.86,1.555118
109,1957,2,9.0,2.9,5,69.8,64.9,48.20,37.22,2.748031
110,1957,3,13.9,5.7,2,25.4,96.7,57.02,42.26,1.000000
111,1957,4,14.2,5.2,1,5.7,169.6,57.56,41.36,0.224409
112,1957,5,16.2,6.5,0,21.3,195.0,61.16,43.70,0.838583
...,...,...,...,...,...,...,...,...,...,...
879,2021,4,13.1,2.9,5,7.2,202.6,55.58,37.22,0.283465
880,2021,5,16.5,7.2,0,84.6,131.9,61.70,44.96,3.330709
881,2021,6,22.5,13.3,0,88.2,159.6,72.50,55.94,3.472441
882,2021,7,24.2,14.9,0,61.2,171.1,75.56,58.82,2.409449


In [14]:
# Looking at the summary statistics
london_weather_df.describe()

Unnamed: 0,yyyy,mm,tmax degC,tmin degC,af days,rain mm,sun hours,tmax degF,tmin degF,rain inches
count,776.0,776.0,776.0,776.0,776.0,776.0,776.0,776.0,776.0,776.0
mean,1988.835052,6.479381,15.028737,7.229124,2.809278,50.67384,129.15451,59.051727,45.012423,1.995033
std,18.680433,3.45023,5.80043,4.457594,4.540253,29.791852,64.907869,10.440775,8.023669,1.172908
min,1957.0,1.0,0.8,-4.6,0.0,0.3,18.5,33.44,23.72,0.011811
25%,1973.0,3.0,9.9,3.5,0.0,28.175,68.475,49.82,38.3,1.109252
50%,1989.0,6.0,14.9,6.9,0.0,46.65,126.15,58.82,44.42,1.836614
75%,2005.0,9.0,20.3,11.3,4.0,69.15,176.375,68.54,52.34,2.722441
max,2021.0,12.0,28.3,16.7,28.0,174.8,310.1,82.94,62.06,6.88189


In [15]:
# Grouping by months
grouped_london_weather = london_weather_df.groupby(['mm'])

In [16]:
# Creating a DataFrame with the monthly average values
monthly_mean_london_weather_df = grouped_london_weather.mean()

# Removing the index name from the DataFrame
monthly_mean_london_weather_df.index.name = None

# Dropping the unneeded columns
monthly_mean_london_weather_df = monthly_mean_london_weather_df.drop(columns=['yyyy', 'tmax degC', 'tmin degC', 'rain mm'])

In [17]:
# Adding the month names to the DataFrame
# Creating the month dictionary
month_dict = {
    1: 'Jan',
    2: 'Feb',
    3: 'Mar',
    4: 'Apr',
    5: 'May',
    6: 'Jun',
    7: 'Jul',
    8: 'Aug',
    9: 'Sep',
    10: 'Oct',
    11: 'Nov',
    12: 'Dec'
}

# Adding the month name to the DataFrames
monthly_mean_london_weather_df['Month'] = monthly_mean_london_weather_df.index.map(month_dict)

# Reordering the columns
monthly_mean_london_weather_df = monthly_mean_london_weather_df[['Month', 'tmax degF', 'tmin degF', 'rain inches', 'af days', 'sun hours']]

# Formatting the values
monthly_mean_london_weather_df['tmax degF'] = monthly_mean_london_weather_df['tmax degF'].map('{:.2f}'.format)
monthly_mean_london_weather_df['tmin degF'] = monthly_mean_london_weather_df['tmin degF'].map('{:.2f}'.format)
monthly_mean_london_weather_df['rain inches'] = monthly_mean_london_weather_df['rain inches'].map('{:.2f}'.format)
monthly_mean_london_weather_df['af days'] = monthly_mean_london_weather_df['af days'].map('{:.2f}'.format)
monthly_mean_london_weather_df['sun hours'] = monthly_mean_london_weather_df['sun hours'].map('{:.2f}'.format)


# Renaming the columns
monthly_mean_london_weather_df = monthly_mean_london_weather_df.rename(columns={
    'tmax degF': 'avg high temp (F)',
    'tmin degF': 'avg low temp (F)',
    'rain inches': 'avg total rainfall (inches)',
    'af days': 'avg days of air frost',
    'sun hours': 'avg total sunshine duration (hours)'
})

# Displaying the updated DataFrame
monthly_mean_london_weather_df

Unnamed: 0,Month,avg high temp (F),avg low temp (F),avg total rainfall (inches),avg days of air frost,avg total sunshine duration (hours)
1,Jan,45.89,35.56,2.19,8.92,54.7
2,Feb,46.91,35.73,1.57,7.65,70.76
3,Mar,51.87,38.25,1.66,4.2,113.16
4,Apr,57.26,41.59,1.64,1.38,158.56
5,May,63.83,47.24,1.9,0.06,194.47
6,Jun,69.89,52.71,1.95,0.0,198.94
7,Jul,73.66,56.5,1.87,0.0,199.05
8,Aug,72.78,56.11,2.08,0.0,186.43
9,Sep,67.59,52.11,1.98,0.0,146.77
10,Oct,60.08,47.08,2.45,0.27,108.42


In [18]:
# Saving the DataFrame as a CSV file
monthly_mean_london_weather_df.to_csv('Output/london_mean_monthly_weather.csv')

In [42]:
# Creating DataFrame of each year and month to use with our ML model
london_weather_output_df = london_weather_df.drop(columns=['tmax degC', 'tmin degC', 'rain mm'], axis=1)

# Reordering columns
london_weather_output_df = london_weather_output_df[['yyyy', 'mm', 'tmax degF', 'tmin degF', 'rain inches', 'af days', 'sun hours']]

london_weather_output_df

Unnamed: 0,yyyy,mm,tmax degF,tmin degF,rain inches,af days,sun hours
108,1957,1,47.66,36.86,1.555118,5,53.0
109,1957,2,48.20,37.22,2.748031,5,64.9
110,1957,3,57.02,42.26,1.000000,2,96.7
111,1957,4,57.56,41.36,0.224409,1,169.6
112,1957,5,61.16,43.70,0.838583,0,195.0
...,...,...,...,...,...,...,...
879,2021,4,55.58,37.22,0.283465,5,202.6
880,2021,5,61.70,44.96,3.330709,0,131.9
881,2021,6,72.50,55.94,3.472441,0,159.6
882,2021,7,75.56,58.82,2.409449,0,171.1


In [47]:
# Saving the DataFrame to a CSV
london_weather_output_df.to_csv('Output/london_yyyy_mm_weather.csv')

In [19]:
# Capturing the monthly extreme values
# @ TO DO?  Would it be helpful to add af days and sun hours?
monthly_london_max_temps = london_weather_df.groupby(['mm']).max()['tmax degF']
monthly_london_min_temps = london_weather_df.groupby(['mm']).min()['tmin degF']
monthly_london_max_rain = london_weather_df.groupby(['mm']).max()['rain inches']
monthly_london_min_rain = london_weather_df.groupby(['mm']).min()['rain inches']

In [20]:
# Creating a DataFrame with the extremes
monthly_extremes_london_weather_df = pd.DataFrame({
    'max monthly avg high temp (F)': monthly_london_max_temps,
    'min monthly avg low temp (F)': monthly_london_min_temps,
    'max total rainfall (inches)': monthly_london_max_rain,
    'min total rainfall (inches)': monthly_london_min_rain})

# Removing the index name from the DataFrame
monthly_extremes_london_weather_df.index.name = None

# Adding the month names to the DataFrame
monthly_extremes_london_weather_df['Month'] = monthly_extremes_london_weather_df.index.map(month_dict)

# Reordering the columns
monthly_extremes_london_weather_df = monthly_extremes_london_weather_df[['Month', 'max monthly avg high temp (F)', 
                                            'min monthly avg low temp (F)', 'max total rainfall (inches)', 'min total rainfall (inches)']]

# Formatting the values
monthly_extremes_london_weather_df['max monthly avg high temp (F)'] = monthly_extremes_london_weather_df['max monthly avg high temp (F)']\
    .map('{:.2f}'.format)
monthly_extremes_london_weather_df['min monthly avg low temp (F)'] = monthly_extremes_london_weather_df['min monthly avg low temp (F)']\
    .map('{:.2f}'.format)
monthly_extremes_london_weather_df['max total rainfall (inches)'] = monthly_extremes_london_weather_df['max total rainfall (inches)']\
    .map('{:.2f}'.format)
monthly_extremes_london_weather_df['min total rainfall (inches)'] = monthly_extremes_london_weather_df['min total rainfall (inches)']\
    .map('{:.2f}'.format)

# Displaying the updated DataFrame
monthly_extremes_london_weather_df

Unnamed: 0,Month,max monthly avg high temp (F),min monthly avg low temp (F),max total rainfall (inches),min total rainfall (inches)
1,Jan,50.9,23.72,6.39,0.41
2,Feb,54.32,27.14,3.95,0.09
3,Mar,58.46,30.92,3.75,0.22
4,Apr,67.46,37.22,4.0,0.09
5,May,69.98,42.62,4.02,0.08
6,Jun,77.9,48.2,5.03,0.02
7,Jul,82.94,53.06,4.54,0.27
8,Aug,80.6,52.7,5.92,0.01
9,Sep,72.86,46.58,5.51,0.12
10,Oct,64.94,41.0,6.88,0.06


# NYC Weather Data

In [21]:
# Loading in the DataFrame
# Source:  Source:  https://www.weather.gov/wrh/Climate?wfo=okx, copied into CSV file to mimic format of London weather data
nyc_weather_df = pd.read_csv('Resources/Flatened_historical_nyc_weather.csv')
nyc_weather_df.head(10)

Unnamed: 0,yyyy,mm,tmax degC,tmin degC,af days,rain mm,sun hours,tmax degF,tmin degF,monthly total precip,monthly total snowfall
0,1948,1,,,,,,31.2,19.6,4.74,15.3
1,1948,2,,,,,,37.8,23.6,2.52,13.6
2,1948,3,,,,,,50.6,33.5,3.51,4.8
3,1948,4,,,,,,58.9,43.1,3.26,0.0
4,1948,5,,,,,,67.6,52.8,7.58,0.0
5,1948,6,,,,,,77.5,61.3,5.52,0.0
6,1948,7,,,,,,85.3,69.0,4.51,0.0
7,1948,8,,,,,,82.7,67.8,2.99,0.0
8,1948,9,,,,,,79.4,61.1,1.28,0.0
9,1948,10,,,,,,64.4,48.9,1.51,0.0


In [22]:
# Dropping unused columns
nyc_weather_df = nyc_weather_df.drop(columns=['tmax degC', 'tmin degC', 'af days', 'rain mm', 'sun hours'], axis=1)
nyc_weather_df

Unnamed: 0,yyyy,mm,tmax degF,tmin degF,monthly total precip,monthly total snowfall
0,1948,1,31.2,19.6,4.74,15.3
1,1948,2,37.8,23.6,2.52,13.6
2,1948,3,50.6,33.5,3.51,4.8
3,1948,4,58.9,43.1,3.26,0
4,1948,5,67.6,52.8,7.58,0
...,...,...,...,...,...,...
879,2021,4,63.7,45.5,2.69,0
880,2021,5,71.7,54.0,4.36,0
881,2021,6,82.5,66.0,2.62,0
882,2021,7,83.0,69.0,11.09,


In [23]:
# Looking at the data types
nyc_weather_df.dtypes

yyyy                        int64
mm                          int64
tmax degF                 float64
tmin degF                 float64
monthly total precip      float64
monthly total snowfall     object
dtype: object

In [24]:
# Replacing T (trace) and N/A (for Jul and Aug 2021) with 0 for snowfall
nyc_weather_df['monthly total snowfall'] = nyc_weather_df['monthly total snowfall'].replace('T', 0)
nyc_weather_df['monthly total snowfall'] = nyc_weather_df['monthly total snowfall'].fillna(0)
nyc_weather_df

Unnamed: 0,yyyy,mm,tmax degF,tmin degF,monthly total precip,monthly total snowfall
0,1948,1,31.2,19.6,4.74,15.3
1,1948,2,37.8,23.6,2.52,13.6
2,1948,3,50.6,33.5,3.51,4.8
3,1948,4,58.9,43.1,3.26,0
4,1948,5,67.6,52.8,7.58,0
...,...,...,...,...,...,...
879,2021,4,63.7,45.5,2.69,0
880,2021,5,71.7,54.0,4.36,0
881,2021,6,82.5,66.0,2.62,0
882,2021,7,83.0,69.0,11.09,0


In [25]:
# Converting monthly total snowfall to float
nyc_weather_df['monthly total snowfall'] = nyc_weather_df.loc[:, 'monthly total snowfall'].astype(float)
nyc_weather_df.dtypes

yyyy                        int64
mm                          int64
tmax degF                 float64
tmin degF                 float64
monthly total precip      float64
monthly total snowfall    float64
dtype: object

In [26]:
# Looking for null values
nyc_weather_df.isnull().sum()

yyyy                      0
mm                        0
tmax degF                 0
tmin degF                 0
monthly total precip      0
monthly total snowfall    0
dtype: int64

In [27]:
# Looking at the summary statistics
nyc_weather_df.describe()

Unnamed: 0,yyyy,mm,tmax degF,tmin degF,monthly total precip,monthly total snowfall
count,884.0,884.0,884.0,884.0,884.0,884.0
mean,1984.334842,6.4819,62.597398,47.637896,3.944231,2.220136
std,21.278419,3.45046,16.542393,15.099799,2.25396,5.039028
min,1948.0,1.0,27.7,15.8,0.02,0.0
25%,1966.0,3.0,47.275,34.0,2.3475,0.0
50%,1984.0,6.0,63.35,46.8,3.58,0.0
75%,2003.0,9.0,78.325,62.3,4.97,1.7
max,2021.0,12.0,90.3,73.4,18.95,36.9


In [28]:
# Sampling 10 rows to validate data
nyc_weather_df.sample(10)
# All validated correctly

Unnamed: 0,yyyy,mm,tmax degF,tmin degF,monthly total precip,monthly total snowfall
167,1961,12,41.0,30.0,3.04,7.7
72,1954,1,38.1,23.5,1.65,12.7
173,1962,6,82.1,62.8,3.73,0.0
833,2017,6,79.3,64.7,4.76,0.0
201,1964,10,63.7,46.3,1.73,0.0
172,1962,5,74.9,53.8,1.26,0.0
255,1969,4,65.0,46.8,3.99,0.0
10,1948,11,58.7,46.1,3.17,0.0
325,1975,2,41.7,29.9,3.33,10.6
726,2008,7,86.2,70.6,2.84,0.0


In [29]:
# Looking at a few more total snowfalls since most validated were 0
#nyc_weather_df[['yyyy', 'mm', 'monthly total snowfall']].sample(5)
# All validated correctly

In [30]:
# Grouping by months
grouped_nyc_weather = nyc_weather_df.groupby(['mm'])

In [31]:
# Creating a DataFrame with the monthly average values
monthly_mean_nyc_weather_df = grouped_nyc_weather.mean()

# Removing the index name from the DataFrame
monthly_mean_nyc_weather_df.index.name = None

# Dropping the unneeded column
monthly_mean_nyc_weather_df = monthly_mean_nyc_weather_df.drop(columns='yyyy', axis=1)

# Viewing the new DataFrame
monthly_mean_nyc_weather_df

Unnamed: 0,tmax degF,tmin degF,monthly total precip,monthly total snowfall
1,38.854054,26.540541,3.464865,7.566216
2,41.477027,28.005405,3.206892,8.785135
3,49.659459,34.701351,4.14027,4.556757
4,61.494595,44.371622,4.052432,0.460811
5,71.512162,53.894595,4.100811,0.0
6,80.087838,63.308108,3.819459,0.0
7,85.213514,68.904054,4.402973,0.0
8,83.563514,67.758108,4.44027,0.0
9,76.341096,60.749315,3.856986,0.0
10,65.279452,50.391781,3.854247,0.046575


In [32]:
# Adding the month name to the DataFrames
monthly_mean_nyc_weather_df['Month'] = monthly_mean_nyc_weather_df.index.map(month_dict)

# Formatting the values
monthly_mean_nyc_weather_df['tmax degF'] = monthly_mean_nyc_weather_df['tmax degF'].map('{:.2f}'.format)
monthly_mean_nyc_weather_df['tmin degF'] = monthly_mean_nyc_weather_df['tmin degF'].map('{:.2f}'.format)
monthly_mean_nyc_weather_df['monthly total precip'] = monthly_mean_nyc_weather_df['monthly total precip'].map('{:.2f}'.format)
monthly_mean_nyc_weather_df['monthly total snowfall'] = monthly_mean_nyc_weather_df['monthly total snowfall'].map('{:.2f}'.format)



# Renaming the columns
monthly_mean_nyc_weather_df = monthly_mean_nyc_weather_df.rename(columns={
    'tmax degF': 'avg high temp (F)',
    'tmin degF': 'avg low temp (F)',
    'monthly total precip': 'avg total precipitation (inches)',
    'monthly total snowfall': 'avg total snowfall (inches)'
})

# Reordering the columns
monthly_mean_nyc_weather_df = monthly_mean_nyc_weather_df[['Month', 'avg high temp (F)', 'avg low temp (F)', 
                                                   'avg total precipitation (inches)', 'avg total snowfall (inches)']]

# Displaying the updated DataFrame
monthly_mean_nyc_weather_df
# All validate with original dataset

Unnamed: 0,Month,avg high temp (F),avg low temp (F),avg total precipitation (inches),avg total snowfall (inches)
1,Jan,38.85,26.54,3.46,7.57
2,Feb,41.48,28.01,3.21,8.79
3,Mar,49.66,34.7,4.14,4.56
4,Apr,61.49,44.37,4.05,0.46
5,May,71.51,53.89,4.1,0.0
6,Jun,80.09,63.31,3.82,0.0
7,Jul,85.21,68.9,4.4,0.0
8,Aug,83.56,67.76,4.44,0.0
9,Sep,76.34,60.75,3.86,0.0
10,Oct,65.28,50.39,3.85,0.05


In [35]:
# Saving the DataFrame as a CSV
monthly_mean_nyc_weather_df.to_csv('Output/nyc_mean_monthly_weather.csv')

In [33]:
# Capturing the monthly extreme values
monthly_nyc_max_temps = nyc_weather_df.groupby(['mm']).max()['tmax degF']
monthly_nyc_min_temps = nyc_weather_df.groupby(['mm']).min()['tmin degF']
monthly_nyc_max_rain = nyc_weather_df.groupby(['mm']).max()['monthly total precip']
monthly_nyc_min_rain = nyc_weather_df.groupby(['mm']).min()['monthly total precip']
monthly_nyc_max_snow = nyc_weather_df.groupby(['mm']).max()['monthly total snowfall']
monthly_nyc_min_snow = nyc_weather_df.groupby(['mm']).min()['monthly total snowfall']

In [34]:
# Creating a DataFrame with the extremes
monthly_extremes_nyc_weather_df = pd.DataFrame({
    'max monthly avg high temp (F)': monthly_nyc_max_temps,
    'min monthly avg low temp (F)': monthly_nyc_min_temps,
    'max total rainfall (inches)': monthly_nyc_max_rain,
    'min total rainfall (inches)': monthly_nyc_min_rain,
    'max total snowfall (inches)': monthly_nyc_max_snow,
    'min total snowfall (inches)': monthly_nyc_min_snow})

# Removing the index name from the DataFrame
monthly_extremes_nyc_weather_df.index.name = None

# Adding the month names to the DataFrame
monthly_extremes_nyc_weather_df['Month'] = monthly_extremes_nyc_weather_df.index.map(month_dict)

# Reordering the columns
monthly_extremes_nyc_weather_df = monthly_extremes_nyc_weather_df[['Month', 'max monthly avg high temp (F)', 
                                            'min monthly avg low temp (F)', 'max total rainfall (inches)', 
                                            'min total rainfall (inches)', 'max total snowfall (inches)',
                                            'min total snowfall (inches)']]

# Formatting the values
monthly_extremes_nyc_weather_df['max monthly avg high temp (F)'] = monthly_extremes_nyc_weather_df['max monthly avg high temp (F)']\
    .map('{:.2f}'.format)
monthly_extremes_nyc_weather_df['min monthly avg low temp (F)'] = monthly_extremes_nyc_weather_df['min monthly avg low temp (F)']\
    .map('{:.2f}'.format)
monthly_extremes_nyc_weather_df['max total rainfall (inches)'] = monthly_extremes_nyc_weather_df['max total rainfall (inches)']\
    .map('{:.2f}'.format)
monthly_extremes_nyc_weather_df['min total rainfall (inches)'] = monthly_extremes_nyc_weather_df['min total rainfall (inches)']\
    .map('{:.2f}'.format)

# Displaying the updated DataFrame
monthly_extremes_nyc_weather_df

Unnamed: 0,Month,max monthly avg high temp (F),min monthly avg low temp (F),max total rainfall (inches),min total rainfall (inches),max total snowfall (inches),min total snowfall (inches)
1,Jan,49.1,16.4,10.52,0.58,36.0,0.0
2,Feb,49.6,15.8,6.69,0.71,36.9,0.0
3,Mar,59.2,26.3,10.69,0.8,21.1,0.0
4,Apr,67.3,39.1,14.01,1.27,9.6,0.0
5,May,78.9,46.9,10.24,0.57,0.0,0.0
6,Jun,85.6,58.8,10.26,0.02,0.0,0.0
7,Jul,90.3,64.3,11.77,0.44,0.0,0.0
8,Aug,88.3,62.7,18.95,0.18,0.0,0.0
9,Sep,82.6,54.0,11.51,0.48,0.0,0.0
10,Oct,72.1,45.1,16.73,0.14,2.9,0.0


In [46]:
# Outputting the NYC weather DataFrame as CSV with each month and year for our ML model to take in
nyc_weather_df.to_csv('Output/nyc_yyyy_mm_weather.csv')