In [1]:
import numpy as np
import pandas as pd
import re
import sklearn
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

In [2]:
raw = pd.read_csv('data/weather_hk.csv')
df_hk=raw.copy()


In [3]:
#change 'Trace' to 0.05
#change Total rainfall to float type
df_hk['Total Rainfall(mm)']=df_hk['Total Rainfall(mm)'].map(lambda x: '0' if x=='Trace' else x)
df_hk['Total Rainfall(mm)']=df_hk['Total Rainfall(mm)'].astype(float)

In [4]:
#mark values with '#' as missing value
df_hk['PWD_missing']=df_hk['Prevailling Wind Direction(degrees)'].str.contains('#')
df_hk['MWS_missing']=df_hk['Mean Wind Speed(km/h)'].str.contains('#')

In [5]:
#remove # from numeric columns
df_hk['Prevailling Wind Direction(degrees)']=df_hk['Prevailling Wind Direction(degrees)'].str.extract(r'(\d+)')
df_hk['Prevailling Wind Direction(degrees)']=df_hk['Prevailling Wind Direction(degrees)'].astype(float)
df_hk['Mean Wind Speed(km/h)']=df_hk['Mean Wind Speed(km/h)'].str.extract(r'(\d+)')
df_hk['Mean Wind Speed(km/h)']=df_hk['Mean Wind Speed(km/h)'].astype(float)

In [6]:
#mark nan as missing value also
df_hk['PWD_missing']=(df_hk['PWD_missing'] | df_hk['Prevailling Wind Direction(degrees)'].isnull())

In [7]:
#create Date column in datetime format
df_hk['Date']=df_hk['Year'].map(str)+'-'+df_hk['Month'].map(str)+'-'+df_hk['Day'].map(str)
df_hk['Date']=pd.to_datetime(df_hk['Date'],format='%Y-%m-%d')

In [8]:
#Create weekly moving average for different feature
df_hk['humidity_sma_7']=df_hk.loc[:,'Mean Relative Humidity(%)'].rolling(window=7, min_periods=1).mean()
df_hk['rainfall_sma_7']=df_hk.loc[:,'Total Rainfall(mm)'].rolling(window=7, min_periods=1).mean()
df_hk['sunshine_sma_7']=df_hk.loc[:,'Total Bright Sunshine(hours)'].rolling(window=7, min_periods=1).mean()
df_hk['pressure_sma_7']=df_hk.loc[:,'Mean Pressure(hPa)'].rolling(window=7, min_periods=1).mean()
df_hk['cloud_sma_7']=df_hk.loc[:,'Mean Amount of Cloud(%)'].rolling(window=7, min_periods=1).mean()


In [9]:
def get_wind_direction(wind_degree):
    if wind_degree==np.nan:
        wind_degree='None'
    elif wind_degree%45==0:
        
        sector=wind_degree//45
        if sector==8 or sector ==0:
            direction='N'
        elif sector==1:
            direction='NE'
        elif sector==2:
            direction='E'
        elif sector==3:
            direction='SE'
        elif sector==4:
            direction='S'
        elif sector==5:
            direction='SW'
        elif sector==6:
            direction='W'
        else:
            direction='NW'
        
    else:
        sector=wind_degree//45
        if sector==0:
            direction='NNE'
        elif sector==1:
            direction='ENE'
        elif sector==2:
            direction='ESE'
        elif sector==3:
            direction='SSE'
        elif sector==4:
            direction='SSW'
        elif sector==5:
            direction='WSW'
        elif sector==6:
            direction='WNW'
        else:
            direction='NNW'
    return direction

In [10]:
#transform the wind direction degrees into categories
df_hk['wind_direction']=df_hk['Prevailling Wind Direction(degrees)'].apply(get_wind_direction)

In [11]:
#create columns for rainfall on next day
df_hk['rain_tmr_mm']=df_hk['Total Rainfall(mm)'].shift(-1)
df_hk['rain_tmr_class']=df_hk['rain_tmr_mm'].map(lambda x: 1 if x>1.0 else 0)

In [12]:
#drop year and day
df_hk.drop(columns=['Year','Day'],inplace=True)

In [13]:
def reorder_df_cols(df):
    reorder_columns=['Date','Month', 'Mean Pressure(hPa)', 'pressure_sma_7', 'Absolute Daily Max(deg. C)', 'Mean(deg. C)', 
                     'Absolute Daily min(deg. C)', 'Mean Dew Point(deg. C)','Mean Relative Humidity(%)','humidity_sma_7',
                     'Mean Amount of Cloud(%)','cloud_sma_7', 'Total Bright Sunshine(hours)','sunshine_sma_7', 
                     'Prevailling Wind Direction(degrees)','PWD_missing','wind_direction', 'Mean Wind Speed(km/h)',
                     'MWS_missing', 'Total Rainfall(mm)', 'rainfall_sma_7', 'rain_tmr_mm','rain_tmr_class']
    df=df[reorder_columns]
    return df

df_hk=reorder_df_cols(df_hk)

In [14]:
df_hk

Unnamed: 0,Date,Month,Mean Pressure(hPa),pressure_sma_7,Absolute Daily Max(deg. C),Mean(deg. C),Absolute Daily min(deg. C),Mean Dew Point(deg. C),Mean Relative Humidity(%),humidity_sma_7,...,sunshine_sma_7,Prevailling Wind Direction(degrees),PWD_missing,wind_direction,Mean Wind Speed(km/h),MWS_missing,Total Rainfall(mm),rainfall_sma_7,rain_tmr_mm,rain_tmr_class
0,2000-01-01,1,1017.1,1017.100000,22.7,19.2,17.2,13.3,69,69.000000,...,9.800000,30.0,False,NNE,12.0,False,0.0,0.000000,0.0,0
1,2000-01-02,1,1017.2,1017.150000,23.6,20.4,17.8,13.1,64,66.500000,...,9.550000,10.0,False,NNE,9.0,False,0.0,0.000000,0.0,0
2,2000-01-03,1,1016.8,1017.033333,20.5,19.2,18.2,12.8,67,66.666667,...,9.366667,40.0,False,NNE,26.0,False,0.0,0.000000,0.0,0
3,2000-01-04,1,1016.1,1016.800000,20.5,18.9,17.5,14.0,74,68.500000,...,8.000000,40.0,False,NNE,25.0,False,0.0,0.000000,0.0,0
4,2000-01-05,1,1013.9,1016.220000,23.2,20.6,18.3,15.8,75,69.800000,...,7.740000,30.0,False,NNE,16.0,False,0.0,0.000000,0.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7199,2019-12-27,12,1020.3,1017.314286,21.3,18.0,15.6,11.2,65,78.714286,...,6.271429,360.0,False,N,32.0,False,0.0,0.200000,0.0,0
7200,2019-12-28,12,1020.2,1017.657143,20.9,18.5,16.8,13.1,72,77.000000,...,6.714286,70.0,False,ENE,28.0,False,0.0,0.000000,9.3,1
7201,2019-12-29,12,1018.3,1018.014286,19.6,18.8,17.9,16.8,88,77.571429,...,5.928571,50.0,False,ENE,22.0,False,9.3,1.328571,0.3,0
7202,2019-12-30,12,1020.0,1018.485714,22.5,20.3,18.8,18.7,91,78.428571,...,5.885714,70.0,False,ENE,22.0,False,0.3,1.371429,0.0,0


In [15]:
#export weather data with data cleaning
# df_hk.to_csv('data/weather_hk_raw.csv')

## Data with train_test_split

In [16]:
train,test = train_test_split(df_hk, test_size=0.2)


In [17]:
imputer_1 = SimpleImputer(strategy='median')
train['Prevailling Wind Direction(degrees)']=imputer_1.fit_transform(train[['Prevailling Wind Direction(degrees)']])
test['Prevailling Wind Direction(degrees)']=imputer_1.transform(test[['Prevailling Wind Direction(degrees)']])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train['Prevailling Wind Direction(degrees)']=imputer_1.fit_transform(train[['Prevailling Wind Direction(degrees)']])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test['Prevailling Wind Direction(degrees)']=imputer_1.transform(test[['Prevailling Wind Direction(degrees)']])


In [18]:
imputer_2 = SimpleImputer(strategy='median')
train['Mean Wind Speed(km/h)']=imputer_2.fit_transform(train[['Mean Wind Speed(km/h)']])
test['Mean Wind Speed(km/h)']=imputer_2.transform(test[['Mean Wind Speed(km/h)']])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train['Mean Wind Speed(km/h)']=imputer_2.fit_transform(train[['Mean Wind Speed(km/h)']])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test['Mean Wind Speed(km/h)']=imputer_2.transform(test[['Mean Wind Speed(km/h)']])


In [19]:
train.columns

Index(['Date', 'Month', 'Mean Pressure(hPa)', 'pressure_sma_7',
       'Absolute Daily Max(deg. C)', 'Mean(deg. C)',
       'Absolute Daily min(deg. C)', 'Mean Dew Point(deg. C)',
       'Mean Relative Humidity(%)', 'humidity_sma_7',
       'Mean Amount of Cloud(%)', 'cloud_sma_7',
       'Total Bright Sunshine(hours)', 'sunshine_sma_7',
       'Prevailling Wind Direction(degrees)', 'PWD_missing', 'wind_direction',
       'Mean Wind Speed(km/h)', 'MWS_missing', 'Total Rainfall(mm)',
       'rainfall_sma_7', 'rain_tmr_mm', 'rain_tmr_class'],
      dtype='object')

In [20]:
scaler = StandardScaler()
num_cols = ['Mean Pressure(hPa)', 'pressure_sma_7',
       'Absolute Daily Max(deg. C)', 'Mean(deg. C)',
       'Absolute Daily min(deg. C)', 'Mean Dew Point(deg. C)',
       'Mean Relative Humidity(%)', 'humidity_sma_7',
       'Mean Amount of Cloud(%)', 'cloud_sma_7',
       'Total Bright Sunshine(hours)', 'sunshine_sma_7',
       'Prevailling Wind Direction(degrees)',
       'Mean Wind Speed(km/h)', 'Total Rainfall(mm)',
       'rainfall_sma_7']

scaler = StandardScaler().fit(train[num_cols].values)
train[num_cols] = scaler.transform(train[num_cols].values)
test[num_cols] = scaler.transform(test[num_cols].values)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train[num_cols] = scaler.transform(train[num_cols].values)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[item] = s
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test[num_cols] = scaler.transform(test[num_cols].values)
A value is trying to be set on a copy of a slice from a DataFrame.
Try

In [21]:
#set date as index
train.set_index('Date',drop=True,inplace=True)
test.set_index('Date',drop=True,inplace=True)

In [22]:
y_train=train[['rain_tmr_mm','rain_tmr_class']]
y_test=test[['rain_tmr_mm','rain_tmr_class']]
X_train=train.iloc[:,:-2]
X_test=test.iloc[:,:-2]

In [23]:
# X_train.to_csv('data/train/f_hk_train.csv')
# X_test.to_csv('data/test/f_hk_test.csv')
# y_train.to_csv('data/train/t_hk_train.csv')
# y_test.to_csv('data/test/t_hk_test.csv')

### Data without train_test_split

In [24]:
timeorder=pd.concat([train,test],axis=0).sort_index()
timeorder

Unnamed: 0_level_0,Month,Mean Pressure(hPa),pressure_sma_7,Absolute Daily Max(deg. C),Mean(deg. C),Absolute Daily min(deg. C),Mean Dew Point(deg. C),Mean Relative Humidity(%),humidity_sma_7,Mean Amount of Cloud(%),...,sunshine_sma_7,Prevailling Wind Direction(degrees),PWD_missing,wind_direction,Mean Wind Speed(km/h),MWS_missing,Total Rainfall(mm),rainfall_sma_7,rain_tmr_mm,rain_tmr_class
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2000-01-01,1,0.677665,0.724011,-0.630918,-0.869160,-0.888914,-1.050807,-0.927303,-1.225652,-2.644698,...,1.894685,-0.891924,False,NNE,-1.037496,False,-0.314893,-0.598658,0.0,0
2000-01-02,1,0.693173,0.732281,-0.462133,-0.632696,-0.771275,-1.085060,-1.422912,-1.552844,-2.814386,...,1.794921,-1.109953,False,NNE,-1.338896,False,-0.314893,-0.598658,0.0,0
2000-01-03,1,0.631144,0.712985,-1.043503,-0.869160,-0.692848,-1.136439,-1.125547,-1.531031,-2.475010,...,1.721761,-0.782909,False,NNE,0.369041,False,-0.314893,-0.598658,0.0,0
2000-01-04,1,0.522594,0.674395,-1.043503,-0.928276,-0.830094,-0.930923,-0.431695,-1.291090,-0.353907,...,1.176386,-0.782909,False,NNE,0.268574,False,-0.314893,-0.598658,0.0,0
2000-01-05,1,0.181438,0.578469,-0.537149,-0.593286,-0.673242,-0.622649,-0.332573,-1.120950,-1.202348,...,1.072632,-0.891924,False,NNE,-0.635628,False,-0.314893,-0.598658,0.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2019-12-27,12,1.173893,0.759452,-0.893472,-1.105623,-1.202620,-1.410460,-1.323790,0.045724,-1.923523,...,0.486591,2.705564,False,N,0.971843,False,-0.314893,-0.580539,0.0,0
2019-12-28,12,1.158386,0.816157,-0.968487,-1.007097,-0.967341,-1.085060,-0.629938,-0.178637,-0.184219,...,0.663315,-0.455864,False,ENE,0.569975,False,-0.314893,-0.598658,9.3,1
2019-12-29,12,0.863751,0.875224,-1.212288,-0.947981,-0.751668,-0.451386,0.956010,-0.103850,1.003598,...,0.349772,-0.673894,False,ENE,-0.032827,False,0.122549,-0.478291,0.3,0
2019-12-30,12,1.127371,0.953193,-0.668426,-0.652402,-0.575209,-0.125985,1.253375,0.008330,0.918754,...,0.332670,-0.455864,False,ENE,-0.032827,False,-0.300782,-0.474408,0.0,0


In [25]:
#test size =0.8
test_size=int(timeorder.shape[0]*0.8)
train_to=timeorder[:test_size]
test_to=timeorder[test_size:]

#split into features and columns
y_train_to=train_to[['rain_tmr_mm','rain_tmr_class']]
y_test_to=test_to[['rain_tmr_mm','rain_tmr_class']]
X_train_to=train_to.iloc[:,:-2]
X_test_to=test_to.iloc[:,:-2]

#export to csv
X_train_to.to_csv('data/train/f_hk_train_to.csv')
X_test_to.to_csv('data/test/f_hk_test_to.csv')
y_train_to.to_csv('data/train/t_hk_train_to.csv')
y_test_to.to_csv('data/test/t_hk_test_to.csv')

In [26]:
train_to

Unnamed: 0_level_0,Month,Mean Pressure(hPa),pressure_sma_7,Absolute Daily Max(deg. C),Mean(deg. C),Absolute Daily min(deg. C),Mean Dew Point(deg. C),Mean Relative Humidity(%),humidity_sma_7,Mean Amount of Cloud(%),...,sunshine_sma_7,Prevailling Wind Direction(degrees),PWD_missing,wind_direction,Mean Wind Speed(km/h),MWS_missing,Total Rainfall(mm),rainfall_sma_7,rain_tmr_mm,rain_tmr_class
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2000-01-01,1,0.677665,0.724011,-0.630918,-0.869160,-0.888914,-1.050807,-0.927303,-1.225652,-2.644698,...,1.894685,-0.891924,False,NNE,-1.037496,False,-0.314893,-0.598658,0.0,0
2000-01-02,1,0.693173,0.732281,-0.462133,-0.632696,-0.771275,-1.085060,-1.422912,-1.552844,-2.814386,...,1.794921,-1.109953,False,NNE,-1.338896,False,-0.314893,-0.598658,0.0,0
2000-01-03,1,0.631144,0.712985,-1.043503,-0.869160,-0.692848,-1.136439,-1.125547,-1.531031,-2.475010,...,1.721761,-0.782909,False,NNE,0.369041,False,-0.314893,-0.598658,0.0,0
2000-01-04,1,0.522594,0.674395,-1.043503,-0.928276,-0.830094,-0.930923,-0.431695,-1.291090,-0.353907,...,1.176386,-0.782909,False,NNE,0.268574,False,-0.314893,-0.598658,0.0,0
2000-01-05,1,0.181438,0.578469,-0.537149,-0.593286,-0.673242,-0.622649,-0.332573,-1.120950,-1.202348,...,1.072632,-0.891924,False,NNE,-0.635628,False,-0.314893,-0.598658,0.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2016-01-16,1,0.119410,0.799618,-1.681134,-1.420908,-1.222226,-0.656901,1.649862,0.999255,1.046021,...,-1.257850,-0.564879,False,ENE,1.775578,False,0.263660,0.563597,24.6,1
2016-01-17,1,-0.190732,0.657856,-1.024749,-1.145034,-1.359473,-0.571270,1.154253,1.092738,0.452112,...,-1.166637,-1.000938,False,NNE,0.067640,False,0.842212,0.792683,0.0,0
2016-01-18,1,0.677665,0.672032,-1.549857,-1.657372,-1.928064,-1.427586,-0.233451,0.830985,-0.820550,...,-1.064023,-0.673894,False,ENE,0.569975,False,-0.314893,0.395342,0.0,0
2016-01-19,1,1.142879,0.676757,-1.587365,-1.420908,-1.261440,-1.187818,-0.035208,0.812288,0.664222,...,-1.041220,-0.564879,False,ENE,1.574644,False,-0.314893,0.395342,3.3,1


### dataframe with common columns as AUS

In [27]:
df_hk

Unnamed: 0,Date,Month,Mean Pressure(hPa),pressure_sma_7,Absolute Daily Max(deg. C),Mean(deg. C),Absolute Daily min(deg. C),Mean Dew Point(deg. C),Mean Relative Humidity(%),humidity_sma_7,...,sunshine_sma_7,Prevailling Wind Direction(degrees),PWD_missing,wind_direction,Mean Wind Speed(km/h),MWS_missing,Total Rainfall(mm),rainfall_sma_7,rain_tmr_mm,rain_tmr_class
0,2000-01-01,1,1017.1,1017.100000,22.7,19.2,17.2,13.3,69,69.000000,...,9.800000,30.0,False,NNE,12.0,False,0.0,0.000000,0.0,0
1,2000-01-02,1,1017.2,1017.150000,23.6,20.4,17.8,13.1,64,66.500000,...,9.550000,10.0,False,NNE,9.0,False,0.0,0.000000,0.0,0
2,2000-01-03,1,1016.8,1017.033333,20.5,19.2,18.2,12.8,67,66.666667,...,9.366667,40.0,False,NNE,26.0,False,0.0,0.000000,0.0,0
3,2000-01-04,1,1016.1,1016.800000,20.5,18.9,17.5,14.0,74,68.500000,...,8.000000,40.0,False,NNE,25.0,False,0.0,0.000000,0.0,0
4,2000-01-05,1,1013.9,1016.220000,23.2,20.6,18.3,15.8,75,69.800000,...,7.740000,30.0,False,NNE,16.0,False,0.0,0.000000,0.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7199,2019-12-27,12,1020.3,1017.314286,21.3,18.0,15.6,11.2,65,78.714286,...,6.271429,360.0,False,N,32.0,False,0.0,0.200000,0.0,0
7200,2019-12-28,12,1020.2,1017.657143,20.9,18.5,16.8,13.1,72,77.000000,...,6.714286,70.0,False,ENE,28.0,False,0.0,0.000000,9.3,1
7201,2019-12-29,12,1018.3,1018.014286,19.6,18.8,17.9,16.8,88,77.571429,...,5.928571,50.0,False,ENE,22.0,False,9.3,1.328571,0.3,0
7202,2019-12-30,12,1020.0,1018.485714,22.5,20.3,18.8,18.7,91,78.428571,...,5.885714,70.0,False,ENE,22.0,False,0.3,1.371429,0.0,0


In [29]:
df_hk.columns

Index(['Date', 'Month', 'Mean Pressure(hPa)', 'pressure_sma_7',
       'Absolute Daily Max(deg. C)', 'Mean(deg. C)',
       'Absolute Daily min(deg. C)', 'Mean Dew Point(deg. C)',
       'Mean Relative Humidity(%)', 'humidity_sma_7',
       'Mean Amount of Cloud(%)', 'cloud_sma_7',
       'Total Bright Sunshine(hours)', 'sunshine_sma_7',
       'Prevailling Wind Direction(degrees)', 'PWD_missing', 'wind_direction',
       'Mean Wind Speed(km/h)', 'MWS_missing', 'Total Rainfall(mm)',
       'rainfall_sma_7', 'rain_tmr_mm', 'rain_tmr_class'],
      dtype='object')

In [30]:
common_cols=['Month', 'Mean Pressure(hPa)','Absolute Daily Max(deg. C)', 'Mean(deg. C)',
            'Absolute Daily min(deg. C)','Mean Relative Humidity(%)','Mean Amount of Cloud(%)',
            'Total Bright Sunshine(hours)','Mean Wind Speed(km/h)','Total Rainfall(mm)',
            'rain_tmr_mm', 'rain_tmr_class']

In [32]:
df_hk_common=df_hk[common_cols]
df_hk_common.to_csv('data/weather_hk_with_common.csv',index=None)