# **Madrid Daily Weather**

In [1]:
import pandas as pd
import os

In [2]:
# load the raw daily weather observations from the bundled CSV

weather = pd.read_csv(
    filepath_or_buffer='./madrid-daily-weather-data/madrid-weather-data.csv'
)

## **Data Manipulation**

In [3]:
# parse the 'CET' date strings into proper datetime values

weather['CET'] = weather['CET'].pipe(pd.to_datetime)

In [4]:
# Map raw CSV headers to the shorter names used in the rest of the notebook.
# Several raw headers start with a space; the keys reproduce them exactly.

rename_cols = {
    # date
    'CET': 'Date',
    # temperature
    'Max TemperatureC': 'Max Temp',
    'Mean TemperatureC': 'Mean Temp',
    'Min TemperatureC': 'Min Temp',
    # humidity
    ' Mean Humidity': 'Mean Humidity',
    ' Min Humidity': 'Min Humidity',
    # sea level pressure
    ' Max Sea Level PressurehPa': 'Max Sea Level PressurehPa',
    ' Mean Sea Level PressurehPa': 'Mean Sea Level PressurehPa',
    ' Min Sea Level PressurehPa': 'Min Sea Level PressurehPa',
    # visibility
    ' Max VisibilityKm': 'Max Visibility',
    ' Mean VisibilityKm': 'Mean Visibility',
    ' Min VisibilitykM': 'Min Visibility',
    # wind
    ' Max Wind SpeedKm/h': 'Max Wind Speed',
    ' Mean Wind SpeedKm/h': 'Mean Wind Speed',
    ' Max Gust SpeedKm/h': 'Max Gust Speed',
    # precipitation, clouds and events
    'Precipitationmm': 'Precipitation',
    ' CloudCover': 'CloudCover',
    ' Events': 'Events',
}

# columns that are not listed in the mapping keep their original names
weather = weather.rename(columns=rename_cols)

In [5]:
# remove every 29 February row so that each year contributes the same set of
# calendar days

weather.drop(
    index=weather.index[
        (weather['Date'].dt.month.eq(2) & weather['Date'].dt.day.eq(29)).values
    ],
    inplace=True,
)

In [6]:
# store the current row labels in an 'ID' column; it is used later as the key
# for joining the other dataframe

weather['ID'] = weather.index.values

In [7]:
# we need this for our points placement in tableau
#
# Build a lookup from 'MM-DD' to its 0-based position inside a 365-day year
# (2021 is used as the template because it is not a leap year) and attach that
# position to every weather row as the 'Index' column.
#
# NOTE: the lookup frame used to be called `range`, which shadowed the builtin
# `range` for the rest of the kernel session; it now has its own name.

day_positions = pd.DataFrame(
    pd.date_range('2021-01-01', '2021-12-31').strftime('%m-%d'),
    columns=['tmp'],
)
day_positions['Index'] = day_positions.index

# temporary join key in the same 'MM-DD' format as the lookup
weather['tmp'] = weather['Date'].dt.strftime('%m-%d')

weather = (
    weather.set_index('tmp')
    .join(day_positions.set_index('tmp'))
    .reset_index()
    .sort_values('ID')       # keep the rows in their original ID order
    .drop(columns=['tmp'])   # the join key is no longer needed
)

In [8]:
# Turn the within-year day position stored in 'Index' into a position that
# keeps growing across years: every year after BASE_YEAR is shifted by a
# further 365 slots.

BASE_YEAR = 1997  # rows from this year keep their within-year position

# Plain subtraction replaces the earlier `year % 1997`, which only matched
# "years since 1997" for years from 1997 onwards.
# Reducing the current 'Index' modulo 365 first strips any offset added by a
# previous run of this cell, so re-running it does not shift the rows again.
weather['Index'] = (
    (weather['Date'].dt.year - BASE_YEAR) * 365
    + (weather['Index'] % 365)
)

## **Exporting files**

In [9]:
# outputting the required columns as a separate csv file

# Create the output folder if it is missing. exist_ok=True tolerates an
# already existing directory, while real failures (e.g. missing permissions)
# are raised instead of being silently swallowed by a bare `except`.
os.makedirs('./clean-data', exist_ok=True)

weather[[
    'ID',
    'Index',
    'Date',
    'Max Temp',
    'Mean Temp',
    'Min Temp',
    'Mean Humidity',
    'Mean Visibility',
    'Max Wind Speed',
    'Mean Wind Speed',
    'Max Gust Speed',
    'Precipitation'
]].to_csv('./clean-data/madrid-weather.csv', index=False)

In [10]:
# extract events from the string and unpivot/melt so that all the values are in one column
#
# 'Events' holds '-'-separated event names. Splitting with expand=True yields
# one column per position; melting every split column (instead of a fixed
# [0, 1, 2]) keeps this working however many events the busiest row contains.
# Rows without an event in a given position are removed by dropna().

(
    weather[['ID']]
    .join(
        weather['Events']
        .str.split('-', expand=True))
    .melt(
        id_vars=['ID'],       # every other column is unpivoted
        value_name='Event')
    [['ID', 'Event']]
    .dropna()
).to_csv('./clean-data/events.csv', index=False)