## Global Historical Climatology Network Dataset
Variables are stored in both rows and columns
This dataset represents the daily weather records for a weather station (MX17004) in Mexico for five months in 2010.

In [1]:
import pandas as pd

In [2]:
weather = pd.read_csv('weather-raw.csv')
weather.head()

Unnamed: 0,id,year,month,element,d1,d2,d3,d4,d5,d6,...,d22,d23,d24,d25,d26,d27,d28,d29,d30,d31
0,MX17004,2010,1,tmax,,,,,,,...,,,,,,,,,27.8,
1,MX17004,2010,1,tmin,,,,,,,...,,,,,,,,,14.5,
2,MX17004,2010,2,tmax,,27.3,24.1,,,,...,,29.9,,,,,,,,
3,MX17004,2010,2,tmin,,14.4,14.4,,,,...,,10.7,,,,,,,,
4,MX17004,2010,3,tmax,,,,,32.1,,...,,,,,,,,,,


In [3]:
# weather.dtypes

In [4]:
weather.shape

(22, 35)

In [5]:
# Total missing values in the database
weather.isnull().sum().sum()

616

In [6]:
# Replace NaN with '---'
weather.fillna(value = '---', inplace = True)

In [7]:
weather.head()

Unnamed: 0,id,year,month,element,d1,d2,d3,d4,d5,d6,...,d22,d23,d24,d25,d26,d27,d28,d29,d30,d31
0,MX17004,2010,1,tmax,---,---,---,---,---,---,...,---,---,---,---,---,---,---,---,27.8,---
1,MX17004,2010,1,tmin,---,---,---,---,---,---,...,---,---,---,---,---,---,---,---,14.5,---
2,MX17004,2010,2,tmax,---,27.3,24.1,---,---,---,...,---,29.9,---,---,---,---,---,---,---,---
3,MX17004,2010,2,tmin,---,14.4,14.4,---,---,---,...,---,10.7,---,---,---,---,---,---,---,---
4,MX17004,2010,3,tmax,---,---,---,---,32.1,---,...,---,---,---,---,---,---,---,---,---,---


In [8]:
# Tidy Data
# Variables stored both in rows and columns
# melt function > to create a row for each record for the day variable
weather=weather.melt(id_vars=["id","year","month","element"],var_name=["day"],value_name="temp")
weather.head()

Unnamed: 0,id,year,month,element,day,temp
0,MX17004,2010,1,tmax,d1,---
1,MX17004,2010,1,tmin,d1,---
2,MX17004,2010,2,tmax,d1,---
3,MX17004,2010,2,tmin,d1,---
4,MX17004,2010,3,tmax,d1,---


In [9]:
# Convert 'day' column from string(d1,d2...) to int(1,2...)
weather.update(pd.DataFrame({"day":[day[1:] for day in weather["day"]]}))
weather.head(3)

Unnamed: 0,id,year,month,element,day,temp
0,MX17004,2010,1,tmax,1,---
1,MX17004,2010,1,tmin,1,---
2,MX17004,2010,2,tmax,1,---


In [10]:
weather.tail(3)

Unnamed: 0,id,year,month,element,day,temp
679,MX17004,2010,11,tmin,31,---
680,MX17004,2010,12,tmax,31,---
681,MX17004,2010,12,tmin,31,---


In [11]:
# PIVOT table (rows-->columns)
weather=(weather.pivot_table(index = ["year","month","day","id"], columns = "element", values = "temp", aggfunc='first')
       .reset_index().rename_axis(None, axis = 1))
weather.head() 

Unnamed: 0,year,month,day,id,tmax,tmin
0,2010,1,1,MX17004,---,---
1,2010,1,10,MX17004,---,---
2,2010,1,11,MX17004,---,---
3,2010,1,12,MX17004,---,---
4,2010,1,13,MX17004,---,---


In [12]:
# DataError: No numeric types to aggregate
# Pivot function > aggfunc ='first'

In [13]:
# creating a date column
weather=(weather.assign(date = lambda x: x.year.astype("str") +"/"+ x.month.astype("str").str.zfill(2) +"/"+ x.day.astype("str").str.zfill(2))
            .drop(["year", "month","day"],axis=1)) 

weather.head()

Unnamed: 0,id,tmax,tmin,date
0,MX17004,---,---,2010/01/01
1,MX17004,---,---,2010/01/10
2,MX17004,---,---,2010/01/11
3,MX17004,---,---,2010/01/12
4,MX17004,---,---,2010/01/13


In [14]:
weather.tail()

Unnamed: 0,id,tmax,tmin,date
336,MX17004,---,---,2010/12/05
337,MX17004,27.8,10.5,2010/12/06
338,MX17004,---,---,2010/12/07
339,MX17004,---,---,2010/12/08
340,MX17004,---,---,2010/12/09


In [15]:
weather.shape

(341, 4)

In [16]:
# Eliminate null rows
wm = weather.mask(weather.eq('---')).dropna(subset=['tmin','tmax'])
wm

Unnamed: 0,id,tmax,tmin,date
23,MX17004,27.8,14.5,2010/01/30
33,MX17004,29.7,13.4,2010/02/11
42,MX17004,27.3,14.4,2010/02/02
46,MX17004,29.9,10.7,2010/02/23
53,MX17004,24.1,14.4,2010/02/03
63,MX17004,34.5,16.8,2010/03/10
69,MX17004,31.1,17.6,2010/03/16
88,MX17004,32.1,14.2,2010/03/05
112,MX17004,36.3,16.7,2010/04/27
143,MX17004,33.2,18.2,2010/05/27


In [17]:
wm.shape

(33, 4)

In [18]:
wm.to_csv('tidy_weather.csv', index=False)