In [1]:
import numpy as np
import pandas as pd
import re
import sklearn
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE

In [2]:
raw = pd.read_csv('data/weather_hk.csv')
df_hk=raw.copy()


In [3]:
#change 'Trace' to 0.05
#change Total rainfall to float type
df_hk['Total Rainfall(mm)']=df_hk['Total Rainfall(mm)'].map(lambda x: '0' if x=='Trace' else x)
df_hk['Total Rainfall(mm)']=df_hk['Total Rainfall(mm)'].astype(float)

In [4]:
#mark values with '#' as missing value
df_hk['PWD_missing']=df_hk['Prevailling Wind Direction(degrees)'].str.contains('#')
df_hk['MWS_missing']=df_hk['Mean Wind Speed(km/h)'].str.contains('#')

In [5]:
#remove # from numeric columns
df_hk['Prevailling Wind Direction(degrees)']=df_hk['Prevailling Wind Direction(degrees)'].str.extract(r'(\d+)')
df_hk['Prevailling Wind Direction(degrees)']=df_hk['Prevailling Wind Direction(degrees)'].astype(float)
df_hk['Mean Wind Speed(km/h)']=df_hk['Mean Wind Speed(km/h)'].str.extract(r'(\d+)')
df_hk['Mean Wind Speed(km/h)']=df_hk['Mean Wind Speed(km/h)'].astype(float)

In [6]:
#mark nan as missing value also
df_hk['PWD_missing']=(df_hk['PWD_missing'] | df_hk['Prevailling Wind Direction(degrees)'].isnull())

In [7]:
#create Date column in datetime format
df_hk['Date']=df_hk['Year'].map(str)+'-'+df_hk['Month'].map(str)+'-'+df_hk['Day'].map(str)
df_hk['Date']=pd.to_datetime(df_hk['Date'],format='%Y-%m-%d')

In [8]:
#Create weekly moving average for different feature
df_hk['humidity_sma_7']=df_hk.loc[:,'Mean Relative Humidity(%)'].rolling(window=7, min_periods=1).mean()
df_hk['rainfall_sma_7']=df_hk.loc[:,'Total Rainfall(mm)'].rolling(window=7, min_periods=1).mean()
df_hk['sunshine_sma_7']=df_hk.loc[:,'Total Bright Sunshine(hours)'].rolling(window=7, min_periods=1).mean()
df_hk['pressure_sma_7']=df_hk.loc[:,'Mean Pressure(hPa)'].rolling(window=7, min_periods=1).mean()
df_hk['cloud_sma_7']=df_hk.loc[:,'Mean Amount of Cloud(%)'].rolling(window=7, min_periods=1).mean()


In [9]:
def get_season(month):
    if 3<=month<=5:
        season='spring'
    elif 6<=month<=9:
        season='summer'
    elif 10<=month<=11:
        season='autumn'
    else: season='winter'
    return season

df_season=pd.get_dummies(df_hk['Month'].apply(get_season),drop_first=True)
df_hk=pd.concat([df_hk,df_season],axis=1)

In [10]:
def get_wind_direction(wind_degree):
    if wind_degree==np.nan:
        wind_degree='None'
    elif wind_degree%45==0:
        
        sector=wind_degree//45
        if sector==8 or sector ==0:
            direction='N'
        elif sector==1:
            direction='NE'
        elif sector==2:
            direction='E'
        elif sector==3:
            direction='SE'
        elif sector==4:
            direction='S'
        elif sector==5:
            direction='SW'
        elif sector==6:
            direction='W'
        else:
            direction='NW'
        
    else:
        sector=wind_degree//45
        if sector==0:
            direction='NNE'
        elif sector==1:
            direction='ENE'
        elif sector==2:
            direction='ESE'
        elif sector==3:
            direction='SSE'
        elif sector==4:
            direction='SSW'
        elif sector==5:
            direction='WSW'
        elif sector==6:
            direction='WNW'
        else:
            direction='NNW'
    return direction

In [11]:
#transform the wind direction degrees into categories
df_hk['wind_direction']=df_hk['Prevailling Wind Direction(degrees)'].apply(get_wind_direction)

In [12]:
#create columns for rainfall on next day
df_hk['rain_tmr_mm']=df_hk['Total Rainfall(mm)'].shift(-1)
df_hk['rain_tmr_class']=df_hk['rain_tmr_mm'].map(lambda x: 1 if x>1.0 else 0)

In [13]:
#drop year and day
df_hk.drop(columns=['Year','Day'],inplace=True)

In [14]:
def reorder_df_cols(df):
    reorder_columns=['Date','Month', 'spring', 'summer', 'winter','Mean Pressure(hPa)', 'pressure_sma_7', 'Absolute Daily Max(deg. C)', 'Mean(deg. C)', 
                     'Absolute Daily min(deg. C)', 'Mean Dew Point(deg. C)','Mean Relative Humidity(%)','humidity_sma_7',
                     'Mean Amount of Cloud(%)','cloud_sma_7', 'Total Bright Sunshine(hours)','sunshine_sma_7', 
                     'Prevailling Wind Direction(degrees)','PWD_missing','wind_direction', 'Mean Wind Speed(km/h)',
                     'MWS_missing', 'Total Rainfall(mm)', 'rainfall_sma_7', 'rain_tmr_mm','rain_tmr_class']
    df=df[reorder_columns]
    return df

df_hk=reorder_df_cols(df_hk)

In [15]:
df_hk

Unnamed: 0,Date,Month,spring,summer,winter,Mean Pressure(hPa),pressure_sma_7,Absolute Daily Max(deg. C),Mean(deg. C),Absolute Daily min(deg. C),...,sunshine_sma_7,Prevailling Wind Direction(degrees),PWD_missing,wind_direction,Mean Wind Speed(km/h),MWS_missing,Total Rainfall(mm),rainfall_sma_7,rain_tmr_mm,rain_tmr_class
0,2000-01-01,1,0,0,1,1017.1,1017.100000,22.7,19.2,17.2,...,9.800000,30.0,False,NNE,12.0,False,0.0,0.000000,0.0,0
1,2000-01-02,1,0,0,1,1017.2,1017.150000,23.6,20.4,17.8,...,9.550000,10.0,False,NNE,9.0,False,0.0,0.000000,0.0,0
2,2000-01-03,1,0,0,1,1016.8,1017.033333,20.5,19.2,18.2,...,9.366667,40.0,False,NNE,26.0,False,0.0,0.000000,0.0,0
3,2000-01-04,1,0,0,1,1016.1,1016.800000,20.5,18.9,17.5,...,8.000000,40.0,False,NNE,25.0,False,0.0,0.000000,0.0,0
4,2000-01-05,1,0,0,1,1013.9,1016.220000,23.2,20.6,18.3,...,7.740000,30.0,False,NNE,16.0,False,0.0,0.000000,0.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7199,2019-12-27,12,0,0,1,1020.3,1017.314286,21.3,18.0,15.6,...,6.271429,360.0,False,N,32.0,False,0.0,0.200000,0.0,0
7200,2019-12-28,12,0,0,1,1020.2,1017.657143,20.9,18.5,16.8,...,6.714286,70.0,False,ENE,28.0,False,0.0,0.000000,9.3,1
7201,2019-12-29,12,0,0,1,1018.3,1018.014286,19.6,18.8,17.9,...,5.928571,50.0,False,ENE,22.0,False,9.3,1.328571,0.3,0
7202,2019-12-30,12,0,0,1,1020.0,1018.485714,22.5,20.3,18.8,...,5.885714,70.0,False,ENE,22.0,False,0.3,1.371429,0.0,0


In [16]:
df_hk['rain_tmr_class'].value_counts()

0    5210
1    1994
Name: rain_tmr_class, dtype: int64

In [17]:
#export weather data with data cleaning
df_hk.to_csv('data/weather_hk_raw.csv')

## Data with train_test_split

In [18]:
train,test = train_test_split(df_hk, test_size=0.2)

In [19]:
imputer_1 = SimpleImputer(strategy='median')
train['Prevailling Wind Direction(degrees)']=imputer_1.fit_transform(train[['Prevailling Wind Direction(degrees)']])
test['Prevailling Wind Direction(degrees)']=imputer_1.transform(test[['Prevailling Wind Direction(degrees)']])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train['Prevailling Wind Direction(degrees)']=imputer_1.fit_transform(train[['Prevailling Wind Direction(degrees)']])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test['Prevailling Wind Direction(degrees)']=imputer_1.transform(test[['Prevailling Wind Direction(degrees)']])


In [20]:
imputer_2 = SimpleImputer(strategy='median')
train['Mean Wind Speed(km/h)']=imputer_2.fit_transform(train[['Mean Wind Speed(km/h)']])
test['Mean Wind Speed(km/h)']=imputer_2.transform(test[['Mean Wind Speed(km/h)']])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train['Mean Wind Speed(km/h)']=imputer_2.fit_transform(train[['Mean Wind Speed(km/h)']])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test['Mean Wind Speed(km/h)']=imputer_2.transform(test[['Mean Wind Speed(km/h)']])


In [21]:
train.columns

Index(['Date', 'Month', 'spring', 'summer', 'winter', 'Mean Pressure(hPa)',
       'pressure_sma_7', 'Absolute Daily Max(deg. C)', 'Mean(deg. C)',
       'Absolute Daily min(deg. C)', 'Mean Dew Point(deg. C)',
       'Mean Relative Humidity(%)', 'humidity_sma_7',
       'Mean Amount of Cloud(%)', 'cloud_sma_7',
       'Total Bright Sunshine(hours)', 'sunshine_sma_7',
       'Prevailling Wind Direction(degrees)', 'PWD_missing', 'wind_direction',
       'Mean Wind Speed(km/h)', 'MWS_missing', 'Total Rainfall(mm)',
       'rainfall_sma_7', 'rain_tmr_mm', 'rain_tmr_class'],
      dtype='object')

In [22]:
#standardize numerical features
scaler = StandardScaler()
num_cols = ['Mean Pressure(hPa)', 'pressure_sma_7',
       'Absolute Daily Max(deg. C)', 'Mean(deg. C)',
       'Absolute Daily min(deg. C)', 'Mean Dew Point(deg. C)',
       'Mean Relative Humidity(%)', 'humidity_sma_7',
       'Mean Amount of Cloud(%)', 'cloud_sma_7',
       'Total Bright Sunshine(hours)', 'sunshine_sma_7',
       'Prevailling Wind Direction(degrees)',
       'Mean Wind Speed(km/h)', 'Total Rainfall(mm)',
       'rainfall_sma_7']

scaler = StandardScaler().fit(train[num_cols].values)
train[num_cols] = scaler.transform(train[num_cols].values)
test[num_cols] = scaler.transform(test[num_cols].values)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train[num_cols] = scaler.transform(train[num_cols].values)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[item] = s
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test[num_cols] = scaler.transform(test[num_cols].values)
A value is trying to be set on a copy of a slice from a DataFrame.
Try

In [23]:
#set date as index
train.set_index('Date',drop=True,inplace=True)
test.set_index('Date',drop=True,inplace=True)
#drop categorical columns for SMOTE
train.drop(columns='wind_direction',axis=1,inplace=True)
test.drop(columns='wind_direction',axis=1,inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(


In [24]:
#set features and target
y_train=train[['rain_tmr_class']]
y_test=test[['rain_tmr_class']]
X_train=train.iloc[:,:-2]
X_test=test.iloc[:,:-2]

In [25]:
X_train

Unnamed: 0_level_0,Month,spring,summer,winter,Mean Pressure(hPa),pressure_sma_7,Absolute Daily Max(deg. C),Mean(deg. C),Absolute Daily min(deg. C),Mean Dew Point(deg. C),...,Mean Amount of Cloud(%),cloud_sma_7,Total Bright Sunshine(hours),sunshine_sma_7,Prevailling Wind Direction(degrees),PWD_missing,Mean Wind Speed(km/h),MWS_missing,Total Rainfall(mm),rainfall_sma_7
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2014-01-11,1,0,0,1,1.693203,1.247211,-1.354285,-1.451737,-1.488002,-1.212529,...,-1.543324,-0.730363,1.202226,-0.287066,-0.460054,False,0.674647,False,-0.319495,-0.599118
2014-07-30,7,0,1,0,-1.174830,-1.114756,1.421236,1.342179,1.250697,1.121009,...,-1.458561,-0.171884,1.408536,0.856603,-0.896265,False,-1.444808,False,-0.319495,-0.011810
2007-07-18,7,0,1,0,-0.743084,-1.472559,1.196194,1.243801,1.250697,1.001777,...,0.660502,0.098894,0.196465,1.411283,0.957632,False,0.775573,False,-0.143554,-0.513013
2013-03-07,3,1,0,0,1.091841,1.329061,-0.210321,-0.625368,-0.881576,-1.127363,...,-2.814761,-1.043450,1.434325,0.319078,-0.787213,False,-1.343882,False,-0.319495,-0.587551
2015-08-24,8,0,1,0,-1.591158,-1.458527,1.346222,1.401205,1.289821,0.831446,...,-0.865223,-0.755749,0.712240,1.119647,-1.114371,False,-1.142029,False,-0.319495,-0.477030
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2009-10-28,10,0,0,0,0.798870,0.204204,0.352285,0.181326,0.272590,-0.020210,...,-1.204273,-1.381922,0.866973,1.274042,-0.132896,False,0.270941,False,-0.319495,-0.599118
2017-07-24,7,0,1,0,-1.113152,-0.714858,0.971151,0.850292,0.800768,1.086943,...,0.236689,0.835072,-0.113000,-0.086924,-0.241949,False,-0.334618,False,-0.162574,1.999429
2001-01-31,1,0,0,1,0.351703,0.583054,-1.410546,-1.314009,-1.155445,-0.939999,...,0.830027,1.258162,-1.247705,-1.968260,-0.569107,False,0.371867,False,-0.319495,-0.005385
2002-05-24,5,1,0,0,-0.218819,-1.042260,0.408546,0.594511,0.663833,0.388585,...,0.872408,1.249700,-0.809296,-1.413581,-0.241949,False,-0.031838,False,-0.319495,0.222085


In [26]:
X_train.to_csv('data/train/f_hk_train_no_smote.csv',index=False)
X_test.to_csv('data/test/f_hk_test_no_smote.csv',index=False)
y_train.to_csv('data/train/t_hk_train_no_smote.csv',index=False)
y_test.to_csv('data/test/t_hk_test_no_smote.csv',index=False)

In [27]:
#oversampling
oversample = SMOTE()
X_resampled, y_resampled = oversample.fit_resample(X_train, y_train)

In [28]:
X_resampled

Unnamed: 0,Month,spring,summer,winter,Mean Pressure(hPa),pressure_sma_7,Absolute Daily Max(deg. C),Mean(deg. C),Absolute Daily min(deg. C),Mean Dew Point(deg. C),...,Mean Amount of Cloud(%),cloud_sma_7,Total Bright Sunshine(hours),sunshine_sma_7,Prevailling Wind Direction(degrees),PWD_missing,Mean Wind Speed(km/h),MWS_missing,Total Rainfall(mm),rainfall_sma_7
0,1,0,0,1,1.693203,1.247211,-1.354285,-1.451737,-1.488002,-1.212529,...,-1.543324,-0.730363,1.202226,-0.287066,-0.460054,False,0.674647,False,-0.319495,-0.599118
1,7,0,1,0,-1.174830,-1.114756,1.421236,1.342179,1.250697,1.121009,...,-1.458561,-0.171884,1.408536,0.856603,-0.896265,False,-1.444808,False,-0.319495,-0.011810
2,7,0,1,0,-0.743084,-1.472559,1.196194,1.243801,1.250697,1.001777,...,0.660502,0.098894,0.196465,1.411283,0.957632,False,0.775573,False,-0.143554,-0.513013
3,3,1,0,0,1.091841,1.329061,-0.210321,-0.625368,-0.881576,-1.127363,...,-2.814761,-1.043450,1.434325,0.319078,-0.787213,False,-1.343882,False,-0.319495,-0.587551
4,8,0,1,0,-1.591158,-1.458527,1.346222,1.401205,1.289821,0.831446,...,-0.865223,-0.755749,0.712240,1.119647,-1.114371,False,-1.142029,False,-0.319495,-0.477030
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8331,6,0,1,0,-1.057077,-0.885451,1.054688,0.924346,0.961534,1.077344,...,0.416226,0.681683,0.436077,-0.289029,-0.351001,True,-0.031838,False,-0.290959,1.128760
8332,8,0,1,0,-0.617440,-1.161606,0.367077,0.598667,0.719486,0.944439,...,0.669454,-0.295768,-0.855600,-0.059631,-0.012325,False,-0.839250,False,0.158661,0.924035
8333,3,1,0,0,0.471794,0.551949,-0.243287,-0.590792,-0.956274,-0.617412,...,-0.143456,0.308647,0.534080,-0.023325,-0.688084,False,0.274003,False,-0.228861,-0.572676
8334,6,0,1,0,-0.565859,-0.642108,1.239572,1.283152,1.292883,1.023735,...,0.272437,0.628622,1.198807,0.507932,1.144207,False,0.082471,False,-0.282312,0.498488


In [29]:
X_resampled.to_csv('data/train/f_hk_train.csv',index=False)
X_test.to_csv('data/test/f_hk_test.csv',index=False)
y_resampled.to_csv('data/train/t_hk_train.csv',index=False)
y_test.to_csv('data/test/t_hk_test.csv',index=False)

### Data without train_test_split

In [30]:
timeorder=pd.concat([train,test],axis=0).sort_index()
timeorder

Unnamed: 0_level_0,Month,spring,summer,winter,Mean Pressure(hPa),pressure_sma_7,Absolute Daily Max(deg. C),Mean(deg. C),Absolute Daily min(deg. C),Mean Dew Point(deg. C),...,Total Bright Sunshine(hours),sunshine_sma_7,Prevailling Wind Direction(degrees),PWD_missing,Mean Wind Speed(km/h),MWS_missing,Total Rainfall(mm),rainfall_sma_7,rain_tmr_mm,rain_tmr_class
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2000-01-01,1,0,0,1,0.675514,0.716353,-0.622898,-0.861473,-0.881576,-1.042198,...,1.228015,1.908779,-0.896265,False,-1.041103,False,-0.319495,-0.599118,0.0,0
2000-01-02,1,0,0,1,0.690933,0.724538,-0.454116,-0.625368,-0.764203,-1.076264,...,1.099071,1.808708,-1.114371,False,-1.343882,False,-0.319495,-0.599118,0.0,0
2000-01-03,1,0,0,1,0.629255,0.705440,-1.035476,-0.861473,-0.685954,-1.127363,...,1.021705,1.735322,-0.787213,False,0.371867,False,-0.319495,-0.599118,0.0,0
2000-01-04,1,0,0,1,0.521318,0.667243,-1.035476,-0.920500,-0.822889,-0.922966,...,-0.293521,1.188267,-0.787213,False,0.270941,False,-0.319495,-0.599118,0.0,0
2000-01-05,1,0,0,1,0.182089,0.572296,-0.529130,-0.586017,-0.666392,-0.616370,...,0.428564,1.084193,-0.896265,False,-0.637397,False,-0.319495,-0.599118,0.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2019-12-27,12,0,0,1,1.168939,0.751432,-0.885447,-1.097579,-1.194570,-1.399893,...,1.124860,0.496347,2.702477,False,0.977426,False,-0.319495,-0.581126,0.0,0
2019-12-28,12,0,0,1,1.153519,0.807558,-0.960461,-0.999202,-0.959824,-1.076264,...,0.273831,0.673616,-0.460054,False,0.573720,False,-0.319495,-0.599118,9.3,1
2019-12-29,12,0,0,1,0.860548,0.866022,-1.204257,-0.940175,-0.744641,-0.446039,...,-1.299282,0.359107,-0.678160,False,-0.031838,False,0.122736,-0.479600,0.3,0
2019-12-30,12,0,0,1,1.122680,0.943195,-0.660405,-0.645043,-0.568581,-0.122409,...,-1.299282,0.341952,-0.460054,False,-0.031838,False,-0.305229,-0.475745,0.0,0


In [31]:
#test size =0.8
test_size=int(timeorder.shape[0]*0.8)
train_to=timeorder[:test_size]
test_to=timeorder[test_size:]

#split into features and columns
y_train_to=train_to[['rain_tmr_class']]
y_test_to=test_to[['rain_tmr_class']]
X_train_to=train_to.iloc[:,:-2]
X_test_to=test_to.iloc[:,:-2]

#oversampling
oversample = SMOTE()
X_resampled_to, y_resampled_to = oversample.fit_resample(X_train_to, y_train_to)

#export to csv
X_resampled_to.to_csv('data/train/f_hk_train_to.csv')
X_test_to.to_csv('data/test/f_hk_test_to.csv')
y_resampled_to.to_csv('data/train/t_hk_train_to.csv')
y_test_to.to_csv('data/test/t_hk_test_to.csv')

In [32]:
X_resampled_to.shape

(8422, 22)

In [33]:
y_resampled_to['rain_tmr_class'].value_counts()

1    4211
0    4211
Name: rain_tmr_class, dtype: int64

### dataframe with common columns as AUS

In [34]:
common_cols=['Month', 'spring', 'summer', 'winter', 'Mean Pressure(hPa)','Absolute Daily Max(deg. C)', 'Mean(deg. C)',
            'Absolute Daily min(deg. C)','Mean Relative Humidity(%)','Mean Amount of Cloud(%)',
            'Total Bright Sunshine(hours)','Mean Wind Speed(km/h)','Total Rainfall(mm)',
            'rain_tmr_mm', 'rain_tmr_class']