In [1]:
# Importing the dependencies
import pandas as pd

# London Weather Data

In [27]:
# Reading the London (Heathrow station) monthly weather history from the local CSV copy
# Original source of data:  https://www.metoffice.gov.uk/pub/data/weather/uk/climate/stationdata/heathrowdata.txt
# Additional information available:  https://www.metoffice.gov.uk/research/climate/maps-and-data/historic-station-data
london_weather_df = pd.read_csv(filepath_or_buffer='Historical_london_weather.csv')

# Previewing the first 20 rows
london_weather_df.iloc[:20]

Unnamed: 0,yyyy,mm,tmax degC,tmin degC,af days,rain mm,sun hours
0,1948,1,8.9,3.3,---,85.0,---
1,1948,2,7.9,2.2,---,26.0,---
2,1948,3,14.2,3.8,---,14.0,---
3,1948,4,15.4,5.1,---,35.0,---
4,1948,5,18.1,6.9,---,57.0,---
5,1948,6,19.1,10.3,---,67.0,---
6,1948,7,21.7,12.0,---,21.0,---
7,1948,8,20.8,11.7,---,67.0,---
8,1948,9,19.6,10.2,---,35.0,---
9,1948,10,14.9,6.0,---,50.0,---


In [28]:
# Drawing 20 random rows to spot-check the loaded data
# (no seed is set, so a different set of rows is shown on every run)
london_weather_df.sample(n=20)

Unnamed: 0,yyyy,mm,tmax degC,tmin degC,af days,rain mm,sun hours
33,1950,10,14.0,6.1,3,10.9,---
540,1993,1,10.3,3.5,4,63.1,37.8
789,2013,10,17.0,10.6,0,81.4,89.6#
217,1966,2,9.3,4.4,1,73.4,27
368,1978,9,20.1,10.8,0,17.1,166.7
842,2018,3,9.8,3.0,6,81.2,70.3#
466,1986,11,12.1,5.2,0,66.1,73.1
703,2006,8,22.2,13.5,0,68.8,140.2#
200,1964,9,21.1,10.6,0,11.1,213.7
643,2001,8,23.5,14.5,0,79.0,210.9


In [3]:
# Counting the missing (NaN) entries in each column
# NOTE: text placeholders such as '---' are ordinary strings, so they are not counted here
london_weather_df.isna().sum()

yyyy         0
mm           0
tmax degC    0
tmin degC    0
af days      0
rain mm      0
sun hours    0
dtype: int64

In [4]:
# Removing the two columns that the rest of the analysis does not use
london_weather_df = london_weather_df.drop(['af days', 'sun hours'], axis='columns')

In [5]:
# Looking at the data types
# (confirms the temperature and rainfall columns are numeric before they are used in arithmetic)
london_weather_df.dtypes

yyyy           int64
mm             int64
tmax degC    float64
tmin degC    float64
rain mm      float64
dtype: object

In [6]:
# Deriving imperial-unit columns from the metric ones:
#   degC -> degF  (C * 9/5 + 32)
#   mm   -> inches (25.4 mm per inch)
london_weather_df = london_weather_df.assign(**{
    'tmax degF': lambda frame: frame['tmax degC'] * 9/5 + 32,
    'tmin degF': lambda frame: frame['tmin degC'] * 9/5 + 32,
    'rain inches': lambda frame: frame['rain mm'] / 25.4,
})

# Showing the frame with the three new columns appended
london_weather_df

Unnamed: 0,yyyy,mm,tmax degC,tmin degC,rain mm,tmax degF,tmin degF,rain inches
0,1948,1,8.9,3.3,85.0,48.02,37.94,3.346457
1,1948,2,7.9,2.2,26.0,46.22,35.96,1.023622
2,1948,3,14.2,3.8,14.0,57.56,38.84,0.551181
3,1948,4,15.4,5.1,35.0,59.72,41.18,1.377953
4,1948,5,18.1,6.9,57.0,64.58,44.42,2.244094
...,...,...,...,...,...,...,...,...
879,2021,4,13.1,2.9,7.2,55.58,37.22,0.283465
880,2021,5,16.5,7.2,84.6,61.70,44.96,3.330709
881,2021,6,22.5,13.3,88.2,72.50,55.94,3.472441
882,2021,7,24.2,14.9,61.2,75.56,58.82,2.409449


In [7]:
# Looking at the summary statistics
# (count, mean, std, min, quartiles and max for every numeric column)
london_weather_df.describe()

Unnamed: 0,yyyy,mm,tmax degC,tmin degC,rain mm,tmax degF,tmin degF,rain inches
count,884.0,884.0,884.0,884.0,884.0,884.0,884.0,884.0
mean,1984.334842,6.4819,14.947172,7.105317,50.573529,58.90491,44.78957,1.991084
std,21.278419,3.45046,5.794542,4.465187,29.805211,10.430176,8.037337,1.173434
min,1948.0,1.0,0.8,-4.6,0.3,33.44,23.72,0.011811
25%,1966.0,3.0,9.8,3.375,27.95,49.64,38.075,1.100394
50%,1984.0,6.0,14.75,6.65,46.8,58.55,43.97,1.84252
75%,2003.0,9.0,20.1,11.2,68.8,68.18,52.16,2.708661
max,2021.0,12.0,28.3,16.7,174.8,82.94,62.06,6.88189


In [8]:
# Bucketing the rows by the month column so per-month statistics can be computed
grouped_london_weather = london_weather_df.groupby(by=['mm'])

In [9]:
# Building the monthly averages table in one chain:
#   1. average every column within each month
#   2. discard the year and metric-unit averages
#   3. clear the index name so no 'mm' label is shown above the rows
monthly_mean_london_weather_df = (
    grouped_london_weather
    .mean()
    .drop(columns=['yyyy', 'tmax degC', 'tmin degC', 'rain mm'])
    .rename_axis(None)
)

In [10]:
# Mapping month numbers (1-12) to three-letter month names
month_dict = dict(zip(
    range(1, 13),
    ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun',
     'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec'],
))

# Labelling each row (indexed by month number) with its month name
monthly_mean_london_weather_df['Month'] = monthly_mean_london_weather_df.index.map(month_dict)

# Putting the month name first
monthly_mean_london_weather_df = monthly_mean_london_weather_df[['Month', 'tmax degF', 'tmin degF', 'rain inches']]

# Rendering every average as a two-decimal string (the columns stop being numeric here)
for value_column in ('tmax degF', 'tmin degF', 'rain inches'):
    monthly_mean_london_weather_df[value_column] = (
        monthly_mean_london_weather_df[value_column].map('{:.2f}'.format)
    )

# Giving the columns descriptive display names
monthly_mean_london_weather_df = monthly_mean_london_weather_df.rename(columns={
    'tmax degF': 'avg high temp (F)',
    'tmin degF': 'avg low temp (F)',
    'rain inches': 'avg total rainfall (inches)'
})

# Showing the finished table
monthly_mean_london_weather_df

Unnamed: 0,Month,avg high temp (F),avg low temp (F),avg total rainfall (inches)
1,Jan,45.71,35.39,2.15
2,Feb,46.64,35.34,1.59
3,Mar,51.77,38.0,1.62
4,Apr,57.25,41.37,1.62
5,May,63.78,47.04,1.91
6,Jun,69.73,52.53,1.93
7,Jul,73.46,56.29,1.9
8,Aug,72.54,55.86,2.13
9,Sep,67.45,51.94,1.98
10,Oct,59.87,46.69,2.46


In [11]:
# Capturing the monthly extreme values
# The rows are grouped by month once, and each column is selected *before*
# aggregating, so only the needed column is reduced (instead of computing
# max/min over every column of the frame four separate times).
london_by_month = london_weather_df.groupby(['mm'])
monthly_london_max_temps = london_by_month['tmax degF'].max()    # highest monthly average high per calendar month
monthly_london_min_temps = london_by_month['tmin degF'].min()    # lowest monthly average low per calendar month
monthly_london_max_rain = london_by_month['rain inches'].max()   # wettest month total per calendar month
monthly_london_min_rain = london_by_month['rain inches'].min()   # driest month total per calendar month

In [12]:
# Assembling the per-month extremes into one table (rows indexed by month number)
monthly_extremes_london_weather_df = pd.DataFrame({
    'max monthly avg high temp (F)': monthly_london_max_temps,
    'min monthly avg low temp (F)': monthly_london_min_temps,
    'max total rainfall (inches)': monthly_london_max_rain,
    'min total rainfall (inches)': monthly_london_min_rain,
})

# Clearing the index name so no 'mm' label is shown above the rows
monthly_extremes_london_weather_df.index.name = None

# Placing the month name as the first column
monthly_extremes_london_weather_df.insert(
    0, 'Month', monthly_extremes_london_weather_df.index.map(month_dict)
)

# Rendering every extreme as a two-decimal string
for value_column in (
    'max monthly avg high temp (F)',
    'min monthly avg low temp (F)',
    'max total rainfall (inches)',
    'min total rainfall (inches)',
):
    monthly_extremes_london_weather_df[value_column] = (
        monthly_extremes_london_weather_df[value_column].map('{:.2f}'.format)
    )

# Showing the finished table
monthly_extremes_london_weather_df

Unnamed: 0,Month,max monthly avg high temp (F),min monthly avg low temp (F),max total rainfall (inches),min total rainfall (inches)
1,Jan,50.9,23.72,6.39,0.41
2,Feb,54.32,25.52,4.78,0.09
3,Mar,58.46,30.92,3.75,0.22
4,Apr,67.46,36.32,4.0,0.09
5,May,69.98,42.62,4.06,0.08
6,Jun,77.9,48.2,5.03,0.02
7,Jul,82.94,53.06,5.14,0.27
8,Aug,80.6,51.62,5.92,0.01
9,Sep,73.04,45.68,5.51,0.12
10,Oct,64.94,41.0,6.88,0.06


# NYC Weather Data

In [13]:
# Reading the NYC weather history from the local CSV
# Source:  https://www.weather.gov/wrh/Climate?wfo=okx, copied into a CSV file to mimic the format of the London weather data
nyc_weather_df = pd.read_csv(filepath_or_buffer='Flatened_historical_nyc_weather.csv')

# Previewing the first 10 rows
nyc_weather_df.iloc[:10]

Unnamed: 0,yyyy,mm,tmax degC,tmin degC,af days,rain mm,sun hours,tmax degF,tmin degF,monthly total precip,monthly total snowfall
0,1948,1,,,,,,31.2,19.6,4.74,15.3
1,1948,2,,,,,,37.8,23.6,2.52,13.6
2,1948,3,,,,,,50.6,33.5,3.51,4.8
3,1948,4,,,,,,58.9,43.1,3.26,0.0
4,1948,5,,,,,,67.6,52.8,7.58,0.0
5,1948,6,,,,,,77.5,61.3,5.52,0.0
6,1948,7,,,,,,85.3,69.0,4.51,0.0
7,1948,8,,,,,,82.7,67.8,2.99,0.0
8,1948,9,,,,,,79.4,61.1,1.28,0.0
9,1948,10,,,,,,64.4,48.9,1.51,0.0


In [14]:
# Removing the London-format columns that are not used for the NYC data
nyc_weather_df = nyc_weather_df.drop(
    columns=['tmax degC', 'tmin degC', 'af days', 'rain mm', 'sun hours']
)
nyc_weather_df

Unnamed: 0,yyyy,mm,tmax degF,tmin degF,monthly total precip,monthly total snowfall
0,1948,1,31.2,19.6,4.74,15.3
1,1948,2,37.8,23.6,2.52,13.6
2,1948,3,50.6,33.5,3.51,4.8
3,1948,4,58.9,43.1,3.26,0
4,1948,5,67.6,52.8,7.58,0
...,...,...,...,...,...,...
879,2021,4,63.7,45.5,2.69,0
880,2021,5,71.7,54.0,4.36,0
881,2021,6,82.5,66.0,2.62,0
882,2021,7,83.0,69.0,11.09,


In [15]:
# Looking at the data types
# (a column reported as `object` holds non-numeric values and needs cleaning before numeric use)
nyc_weather_df.dtypes

yyyy                        int64
mm                          int64
tmax degF                 float64
tmin degF                 float64
monthly total precip      float64
monthly total snowfall     object
dtype: object

In [16]:
# Cleaning the snowfall column: 'T' (trace) and missing entries
# (N/A for Jul and Aug 2021) are both treated as 0
nyc_weather_df['monthly total snowfall'] = (
    nyc_weather_df['monthly total snowfall']
    .replace('T', 0)
    .fillna(0)
)
nyc_weather_df

Unnamed: 0,yyyy,mm,tmax degF,tmin degF,monthly total precip,monthly total snowfall
0,1948,1,31.2,19.6,4.74,15.3
1,1948,2,37.8,23.6,2.52,13.6
2,1948,3,50.6,33.5,3.51,4.8
3,1948,4,58.9,43.1,3.26,0
4,1948,5,67.6,52.8,7.58,0
...,...,...,...,...,...,...
879,2021,4,63.7,45.5,2.69,0
880,2021,5,71.7,54.0,4.36,0
881,2021,6,82.5,66.0,2.62,0
882,2021,7,83.0,69.0,11.09,0


In [17]:
# Casting the cleaned snowfall column to float, then re-checking the dtypes
nyc_weather_df = nyc_weather_df.astype({'monthly total snowfall': float})
nyc_weather_df.dtypes

yyyy                        int64
mm                          int64
tmax degF                 float64
tmin degF                 float64
monthly total precip      float64
monthly total snowfall    float64
dtype: object

In [18]:
# Counting the missing (NaN) entries in each column
nyc_weather_df.isna().sum()

yyyy                      0
mm                        0
tmax degF                 0
tmin degF                 0
monthly total precip      0
monthly total snowfall    0
dtype: int64

In [19]:
# Looking at the summary statistics
# (count, mean, std, min, quartiles and max for every numeric column)
nyc_weather_df.describe()

Unnamed: 0,yyyy,mm,tmax degF,tmin degF,monthly total precip,monthly total snowfall
count,884.0,884.0,884.0,884.0,884.0,884.0
mean,1984.334842,6.4819,62.597398,47.637896,3.944231,2.220136
std,21.278419,3.45046,16.542393,15.099799,2.25396,5.039028
min,1948.0,1.0,27.7,15.8,0.02,0.0
25%,1966.0,3.0,47.275,34.0,2.3475,0.0
50%,1984.0,6.0,63.35,46.8,3.58,0.0
75%,2003.0,9.0,78.325,62.3,4.97,1.7
max,2021.0,12.0,90.3,73.4,18.95,36.9


In [20]:
# Drawing 10 random rows to validate the data
# (no seed is set, so a different set of rows is shown on every run)
nyc_weather_df.sample(n=10)
# All validated correctly

Unnamed: 0,yyyy,mm,tmax degF,tmin degF,monthly total precip,monthly total snowfall
151,1960,8,82.5,67.3,6.26,0.0
755,2010,12,37.7,27.9,4.24,20.1
399,1981,4,65.1,47.2,3.42,0.0
243,1968,4,65.3,44.6,2.82,0.0
146,1960,3,40.5,26.3,2.96,18.5
574,1995,11,50.1,37.1,5.78,2.9
87,1955,4,62.0,44.9,1.97,0.0
451,1985,8,82.4,68.4,2.58,0.0
446,1985,3,54.9,36.6,1.91,0.2
545,1993,6,83.2,63.3,1.49,0.0


In [21]:
# Looking at a few more total snowfalls since most validated were 0
#nyc_weather_df[['yyyy', 'mm', 'monthly total snowfall']].sample(5)
# All validated correctly

In [22]:
# Bucketing the rows by the month column so per-month statistics can be computed
grouped_nyc_weather = nyc_weather_df.groupby(by=['mm'])

In [23]:
# Building the monthly averages table in one chain:
#   1. average every column within each month
#   2. discard the (meaningless) average of the year column
#   3. clear the index name so no 'mm' label is shown above the rows
monthly_mean_nyc_weather_df = (
    grouped_nyc_weather
    .mean()
    .drop(columns='yyyy')
    .rename_axis(None)
)

# Viewing the new DataFrame
monthly_mean_nyc_weather_df

Unnamed: 0,tmax degF,tmin degF,monthly total precip,monthly total snowfall
1,38.854054,26.540541,3.464865,7.566216
2,41.477027,28.005405,3.206892,8.785135
3,49.659459,34.701351,4.14027,4.556757
4,61.494595,44.371622,4.052432,0.460811
5,71.512162,53.894595,4.100811,0.0
6,80.087838,63.308108,3.819459,0.0
7,85.213514,68.904054,4.402973,0.0
8,83.563514,67.758108,4.44027,0.0
9,76.341096,60.749315,3.856986,0.0
10,65.279452,50.391781,3.854247,0.046575


In [24]:
# Labelling each row (indexed by month number) with its month name
monthly_mean_nyc_weather_df['Month'] = monthly_mean_nyc_weather_df.index.map(month_dict)

# Rendering every average as a two-decimal string (the columns stop being numeric here)
for value_column in ('tmax degF', 'tmin degF', 'monthly total precip', 'monthly total snowfall'):
    monthly_mean_nyc_weather_df[value_column] = (
        monthly_mean_nyc_weather_df[value_column].map('{:.2f}'.format)
    )

# Giving the columns descriptive display names, then putting the month name first
monthly_mean_nyc_weather_df = monthly_mean_nyc_weather_df.rename(columns={
    'tmax degF': 'avg high temp (F)',
    'tmin degF': 'avg low temp (F)',
    'monthly total precip': 'avg total precipitation (inches)',
    'monthly total snowfall': 'avg total snowfall (inches)'
})[[
    'Month',
    'avg high temp (F)',
    'avg low temp (F)',
    'avg total precipitation (inches)',
    'avg total snowfall (inches)',
]]

# Showing the finished table
monthly_mean_nyc_weather_df
# All validate with original dataset

Unnamed: 0,Month,avg high temp (F),avg low temp (F),avg total precipitation (inches),avg total snowfall (inches)
1,Jan,38.85,26.54,3.46,7.57
2,Feb,41.48,28.01,3.21,8.79
3,Mar,49.66,34.7,4.14,4.56
4,Apr,61.49,44.37,4.05,0.46
5,May,71.51,53.89,4.1,0.0
6,Jun,80.09,63.31,3.82,0.0
7,Jul,85.21,68.9,4.4,0.0
8,Aug,83.56,67.76,4.44,0.0
9,Sep,76.34,60.75,3.86,0.0
10,Oct,65.28,50.39,3.85,0.05


In [25]:
# Capturing the monthly extreme values
# The rows are grouped by month once, and each column is selected *before*
# aggregating, so only the needed column is reduced (instead of computing
# max/min over every column of the frame six separate times).
nyc_by_month = nyc_weather_df.groupby(['mm'])
monthly_nyc_max_temps = nyc_by_month['tmax degF'].max()               # highest monthly average high per calendar month
monthly_nyc_min_temps = nyc_by_month['tmin degF'].min()               # lowest monthly average low per calendar month
monthly_nyc_max_rain = nyc_by_month['monthly total precip'].max()     # largest monthly precipitation total
monthly_nyc_min_rain = nyc_by_month['monthly total precip'].min()     # smallest monthly precipitation total
monthly_nyc_max_snow = nyc_by_month['monthly total snowfall'].max()   # largest monthly snowfall total
monthly_nyc_min_snow = nyc_by_month['monthly total snowfall'].min()   # smallest monthly snowfall total

In [26]:
# Creating a DataFrame with the extremes (rows indexed by month number)
monthly_extremes_nyc_weather_df = pd.DataFrame({
    'max monthly avg high temp (F)': monthly_nyc_max_temps,
    'min monthly avg low temp (F)': monthly_nyc_min_temps,
    'max total rainfall (inches)': monthly_nyc_max_rain,
    'min total rainfall (inches)': monthly_nyc_min_rain,
    'max total snowfall (inches)': monthly_nyc_max_snow,
    'min total snowfall (inches)': monthly_nyc_min_snow})

# Removing the index name from the DataFrame
monthly_extremes_nyc_weather_df.index.name = None

# Adding the month names to the DataFrame
monthly_extremes_nyc_weather_df['Month'] = monthly_extremes_nyc_weather_df.index.map(month_dict)

# Reordering the columns so the month name comes first
nyc_extreme_value_columns = [
    'max monthly avg high temp (F)',
    'min monthly avg low temp (F)',
    'max total rainfall (inches)',
    'min total rainfall (inches)',
    'max total snowfall (inches)',
    'min total snowfall (inches)',
]
monthly_extremes_nyc_weather_df = monthly_extremes_nyc_weather_df[['Month'] + nyc_extreme_value_columns]

# Formatting the values as two-decimal strings.
# The two snowfall columns are included so that every value column in the
# table uses the same format (they were previously left unformatted).
for value_column in nyc_extreme_value_columns:
    monthly_extremes_nyc_weather_df[value_column] = (
        monthly_extremes_nyc_weather_df[value_column].map('{:.2f}'.format)
    )

# Displaying the updated DataFrame
monthly_extremes_nyc_weather_df

Unnamed: 0,Month,max monthly avg high temp (F),min monthly avg low temp (F),max total rainfall (inches),min total rainfall (inches),max total snowfall (inches),min total snowfall (inches)
1,Jan,49.1,16.4,10.52,0.58,36.0,0.0
2,Feb,49.6,15.8,6.69,0.71,36.9,0.0
3,Mar,59.2,26.3,10.69,0.8,21.1,0.0
4,Apr,67.3,39.1,14.01,1.27,9.6,0.0
5,May,78.9,46.9,10.24,0.57,0.0,0.0
6,Jun,85.6,58.8,10.26,0.02,0.0,0.0
7,Jul,90.3,64.3,11.77,0.44,0.0,0.0
8,Aug,88.3,62.7,18.95,0.18,0.0,0.0
9,Sep,82.6,54.0,11.51,0.48,0.0,0.0
10,Oct,72.1,45.1,16.73,0.14,2.9,0.0
