## Global Historical Climatology Network Dataset
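Variables are stored in both rows and columns: the `element` column holds variable names (`tmax`, `tmin`), while the columns `d1`–`d31` hold the values for each day of the month.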
This dataset represents the daily weather records for a weather station (MX17004) in Mexico for five months in 2010.

In [6]:
import pandas as pd
import numpy as np
import datetime

In [7]:
# Load the raw wide-format weather table (one row per id/year/month/element).
df = pd.read_csv(filepath_or_buffer='../weather-raw.csv')

In [8]:
# Preview the first five rows of the raw table.
print(df.iloc[:5])

        id  year  month element  d1    d2    d3  d4    d5  d6 ...   d22   d23  \
0  MX17004  2010      1    tmax NaN   NaN   NaN NaN   NaN NaN ...   NaN   NaN   
1  MX17004  2010      1    tmin NaN   NaN   NaN NaN   NaN NaN ...   NaN   NaN   
2  MX17004  2010      2    tmax NaN  27.3  24.1 NaN   NaN NaN ...   NaN  29.9   
3  MX17004  2010      2    tmin NaN  14.4  14.4 NaN   NaN NaN ...   NaN  10.7   
4  MX17004  2010      3    tmax NaN   NaN   NaN NaN  32.1 NaN ...   NaN   NaN   

   d24  d25  d26  d27  d28  d29   d30  d31  
0  NaN  NaN  NaN  NaN  NaN  NaN  27.8  NaN  
1  NaN  NaN  NaN  NaN  NaN  NaN  14.5  NaN  
2  NaN  NaN  NaN  NaN  NaN  NaN   NaN  NaN  
3  NaN  NaN  NaN  NaN  NaN  NaN   NaN  NaN  
4  NaN  NaN  NaN  NaN  NaN  NaN   NaN  NaN  

[5 rows x 35 columns]


In [9]:
# Turn the day column names ("d1" ... "d31") into plain day numbers.
# The pattern is a raw string because "\d" in a normal literal is an invalid escape
# sequence, and it is anchored so only names of the exact form d<digits> match;
# every other column name yields NaN.
day_raw = df.columns.str.extract(r"^d(\d+)$", expand=False)
# Discard the NaN entries left by the non-day columns and keep the day numbers
# as a list of strings, in column order.
day_raw = day_raw.dropna().unique().tolist()

In [10]:
# Show the extracted day labels and confirm they are held in a plain list.
print(day_raw, type(day_raw), sep="\n")

['1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12', '13', '14', '15', '16', '17', '18', '19', '20', '21', '22', '23', '24', '25', '26', '27', '28', '29', '30', '31']
<class 'list'>


In [11]:
# Put the four identifier column names in front of the day numbers.
# The list is rebuilt instead of being extended in place (day_raw[0:0] = ...), so the
# cell is idempotent: running it again does not prepend the identifiers a second time.
id_columns = ['id', 'year', 'month', 'element']
day_raw = id_columns + [name for name in day_raw if name not in id_columns]

In [12]:
# Show the complete list of column names: identifiers first, then the day numbers.
print(str(day_raw))

['id', 'year', 'month', 'element', '1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12', '13', '14', '15', '16', '17', '18', '19', '20', '21', '22', '23', '24', '25', '26', '27', '28', '29', '30', '31']


In [13]:
# Replace the column names of the df: strip the leading "d" from the day columns
# ("d1" -> "1") and leave every other column name untouched.
# Renaming by pattern does not depend on the column order of the CSV or on the
# length of a separately built list, and running the cell again is a no-op.
df.columns = df.columns.str.replace(r"^d(\d+)$", r"\1", regex=True)


In [14]:
# Inspect the column index after renaming; the day columns should now be bare numbers.
df.columns

Index(['id', 'year', 'month', 'element', '1', '2', '3', '4', '5', '6', '7',
       '8', '9', '10', '11', '12', '13', '14', '15', '16', '17', '18', '19',
       '20', '21', '22', '23', '24', '25', '26', '27', '28', '29', '30', '31'],
      dtype='object')

In [15]:
# Melt df into long format so that each row holds the temperature reading for one day of one month

In [16]:
# Reshape from wide to long: the four identifier columns stay fixed, and every
# remaining (day) column becomes rows with its label in "day" and its value in "Temp".
melted_df = df.melt(
    id_vars=["id", "year", "month", "element"],
    var_name="day",
    value_name="Temp",
)

In [17]:
# Preview the first five rows of the long-format table.
print(melted_df.iloc[:5])

        id  year  month element day  Temp
0  MX17004  2010      1    tmax   1   NaN
1  MX17004  2010      1    tmin   1   NaN
2  MX17004  2010      2    tmax   1   NaN
3  MX17004  2010      2    tmin   1   NaN
4  MX17004  2010      3    tmax   1   NaN


In [18]:
# To numeric values. "day" holds strings at this point because it was built from
# the former column labels.
# errors='ignore' is deprecated in pandas and would silently leave a column as
# strings when a value cannot be parsed; the default (errors='raise') reports
# bad values here instead of letting them reach the date construction.
date_parts = ["year", "month", "day"]
melted_df[date_parts] = melted_df[date_parts].apply(pd.to_numeric)

In [19]:
# Creating a date column from the different columns.
# Given a DataFrame with "year", "month" and "day" columns, pd.to_datetime assembles
# the date from those components; a strftime format string plays no part in that,
# so the former format='%d-%m-%Y' argument is dropped.
# errors='coerce' turns impossible calendar dates (e.g. 31 November) into NaT
# instead of raising.
melted_df['date'] = pd.to_datetime(melted_df[['year', 'month', 'day']], errors='coerce')

In [20]:
# Preview the last five rows, where the day-31 entries show which dates could not be built.
print(melted_df.iloc[-5:])

          id  year  month element  day  Temp       date
677  MX17004  2010     10    tmin   31   NaN 2010-10-31
678  MX17004  2010     11    tmax   31   NaN        NaT
679  MX17004  2010     11    tmin   31   NaN        NaT
680  MX17004  2010     12    tmax   31   NaN 2010-12-31
681  MX17004  2010     12    tmin   31   NaN 2010-12-31


In [21]:
# Drop the year, month and day columns (the date column replaces them),
# then discard every row that still contains a missing value.
melted_df = melted_df.drop(["year", "month", "day"], axis="columns").dropna()

In [22]:
# Spread "element" back out so each (id, date) row carries tmax and tmin as separate
# columns, then turn the (id, date) index back into ordinary columns.
# Note: pivot_table aggregates with its default function if an (id, date, element)
# combination occurs more than once.
melted_df = (
    melted_df
    .pivot_table(values="Temp", index=["id", "date"], columns="element")
    .reset_index()
)

In [23]:
# Show the first ten rows of the tidy result.
display(melted_df.iloc[:10])

element,id,date,tmax,tmin
0,MX17004,2010-01-30,27.8,14.5
1,MX17004,2010-02-02,27.3,14.4
2,MX17004,2010-02-03,24.1,14.4
3,MX17004,2010-02-11,29.7,13.4
4,MX17004,2010-02-23,29.9,10.7
5,MX17004,2010-03-05,32.1,14.2
6,MX17004,2010-03-10,34.5,16.8
7,MX17004,2010-03-16,31.1,17.6
8,MX17004,2010-04-27,36.3,16.7
9,MX17004,2010-05-27,33.2,18.2
