# Data

[data source](https://www.kaggle.com/datasets/emmanuelfwerr/london-weather-data)

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os
from matplotlib import dates as mdates

# set visualization
fav_style = ('bmh','tableau-colorblind10')
plt.style.use(fav_style)

In [2]:
def format_xdate(ax,freq_y,freq_m):
    
    # Create a year locator
    loc_major_yr = mdates.YearLocator(freq_y)
    # Create a year formatter using 4-digit years
    fmt_major_yr = mdates.DateFormatter("%Y")


    # Create a month locator that will add months at 1,4,7,10
    loc_minor_mo = mdates.MonthLocator(interval=freq_m)
    

    
    # Set xaxis major locator/formatter
    ax.xaxis.set_major_locator(loc_major_yr)
    ax.xaxis.set_major_formatter(fmt_major_yr)


    # Set xaxis minor locator/formatter
    ax.xaxis.set_minor_locator(loc_minor_mo)
#     ax.xaxis.set_minor_formatter(fmt_minor_3m)

    

    # Add gridlines for major xaxis ticks
    ax.grid(which='major',axis='x',color='k',ls=':',lw=1)

    ## Rotate the major tick years using fig.autofmt_xdate
    fig = ax.get_figure()
    fig.autofmt_xdate(which='major', rotation=90,ha='center')
    return fig

# Load and Convert Data 

In [3]:
print(pd. __version__)

1.5.2


In [4]:
df = pd.read_csv('1-london_weather_MODIFIED - london_weather_MODIFIED.csv', quoting=csv.QUOTE_NONE, quotechar='"')
df

NameError: name 'csv' is not defined

In [None]:
#url = "https://docs.google.com/spreadsheets/d/1J2hEGA1-ZOdXOc1Go2AjB7xShq-nAvIDpBz_XRZscHU/edit?usp=sharing"
#df = pd.read_csv(url, quoting=csv.QUOTE_NONE, quotechar='"')
#df

In [None]:
# convert date to datetime
df['date'] = pd.to_datetime(df['date'], format='%Y%m%d')
df['date']

In [None]:
# create a datetime index
df = df.set_index('date')
print(df.info())
df.head()

# Filter out only the required data

In [None]:
# Convert the date column to datetime dtype.
# Keep only data from the year 2000 or later & only have the selected features
df = df.loc['2000':, ['precipitation', 'mean_temp', 'min_temp', 
                      'max_temp', 'snow_depth']]
print(df.info())
df.head()

# Impute missing values

In [None]:
# check the missing values
df.isna().sum()

Plan on imputing 
- Interpolate the temperature columns
- Fill the precipitation and snow_depth columns with 0

In [None]:
## interpolating temperature measures
df['mean_temp'] = df['mean_temp'].interpolate()
df['max_temp'] = df['max_temp'].interpolate()
df['min_temp'] = df['min_temp'].interpolate()

# filling preciptation/snow with 0
df['precipitation'] = df['precipitation'].fillna(0)
df['snow_depth'] = df['snow_depth'].fillna(0)

In [None]:
# check the missing values again
df.isna().sum()

# Answer questions with visualizations

## 1. What month had the most precipitation between 2000 through 2010?

In [None]:
# resample precipitation to month
df_rain = df.loc['2000':'2010', ['precipitation']].copy()
df_rain.head()

In [None]:
# Resample the precipitation as Monthly frequency.
df_rain_mo = df_rain.resample("MS").sum()
df_rain_mo.sort_values(by='precipitation', ascending=False)

November, 2009 has the highest precipitation from 2000 to 2010



In [None]:
plt.rcParams['figure.figsize'] = (12,3)
fig, ax = plt.subplots()
ax.plot(df_rain_mo.index, df_rain_mo.values)

ax.axvspan('2009-11-01', '2009-11-30', color = '#f7797d', alpha = 0.5, 
           zorder = 0, label = 'Max monthly precipitation: November 2009')

ax.set_title('Precipitation for 2000-2010')
ax.set_ylabel('Inches')
fig.autofmt_xdate(which='major', rotation=90,ha='center')
ax.legend();
sns.move_legend(ax, "lower center", bbox_to_anchor=(.5, -0.4), ncol=3, title=None, frameon=False)
format_xdate(ax,1,3);

In [None]:
## Save to PNG
FOLDER = "PNG/"
isExist = os.path.exists(FOLDER)
    
if isExist == False:  
      os.makedirs(FOLDER, exist_ok=True)
        
fig.savefig(f'{FOLDER}/precipitation_monthly.png',bbox_inches='tight')

## 2: Which year between 2000-2020 had the coolest average temperature?

In [None]:
# slice avg temperature to 2000-2020
df_temp = df.loc['2000':'2020', ['mean_temp']].copy()
df_temp.head()

In [None]:
# Resample the avg temp as yearly frequency.
df_temp_yr = df_temp.resample("AS").mean()
df_temp_yr.sort_values(by='mean_temp', ascending=True)

The year of 2010 has the lowest yearly average temperature of 10.66 from 2000 to 2020.



In [None]:
plt.rcParams['figure.figsize'] = (12,3)
fig, ax = plt.subplots()
ax.plot(df_temp_yr.index, df_temp_yr.values)

ax.axvspan('2010-01-01', '2010-12-31', color = '#f7797d', alpha = 0.5, 
           zorder = 0, label = 'Min average temperature: 2010')

ax.set_title('Average Temperature')
ax.set_ylabel('Degrees')
ax.legend();
sns.move_legend(ax, "lower center", bbox_to_anchor=(.5, -0.4), ncol=3, title=None, frameon=False)

format_xdate(ax,5,1)

loc_minor_year = mdates.YearLocator(1)
ax.xaxis.set_minor_locator(loc_minor_year)

In [None]:
## Save to PNG
fig.savefig(f'{FOLDER}/avg_temp_yearly.png',bbox_inches='tight')