In [1]:
import pandas as pd
import numpy as np
import tqdm
import tpot
from tpot import TPOTClassifier
from sklearn.datasets import load_digits
from sklearn.model_selection import train_test_split
import dask_ml.model_selection

In [2]:
from dask.distributed import Client, Variable

In [3]:
df = pd.read_csv('assets/weatherAUS.csv') #membaca data

In [4]:
df.head()

Unnamed: 0,Date,Location,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustDir,WindGustSpeed,WindDir9am,...,Humidity9am,Humidity3pm,Pressure9am,Pressure3pm,Cloud9am,Cloud3pm,Temp9am,Temp3pm,RainToday,RainTomorrow
0,2008-12-01,Albury,13.4,22.9,0.6,,,W,44.0,W,...,71.0,22.0,1007.7,1007.1,8.0,,16.9,21.8,No,No
1,2008-12-02,Albury,7.4,25.1,0.0,,,WNW,44.0,NNW,...,44.0,25.0,1010.6,1007.8,,,17.2,24.3,No,No
2,2008-12-03,Albury,12.9,25.7,0.0,,,WSW,46.0,W,...,38.0,30.0,1007.6,1008.7,,2.0,21.0,23.2,No,No
3,2008-12-04,Albury,9.2,28.0,0.0,,,NE,24.0,SE,...,45.0,16.0,1017.6,1012.8,,,18.1,26.5,No,No
4,2008-12-05,Albury,17.5,32.3,1.0,,,W,41.0,ENE,...,82.0,33.0,1010.8,1006.0,7.0,8.0,17.8,29.7,No,No


In [5]:
df.dtypes #memastikan tipe data sudah tepat

Date              object
Location          object
MinTemp          float64
MaxTemp          float64
Rainfall         float64
Evaporation      float64
Sunshine         float64
WindGustDir       object
WindGustSpeed    float64
WindDir9am        object
WindDir3pm        object
WindSpeed9am     float64
WindSpeed3pm     float64
Humidity9am      float64
Humidity3pm      float64
Pressure9am      float64
Pressure3pm      float64
Cloud9am         float64
Cloud3pm         float64
Temp9am          float64
Temp3pm          float64
RainToday         object
RainTomorrow      object
dtype: object

In [6]:
df['Date'] = df.Date.astype('datetime64[ns]') #mengganti tipe data Date

In [7]:
df.shape

(145460, 23)

In [8]:
train,test=train_test_split(df,train_size=0.8,test_size=0.2)

### Data preparation

#### 1.  Missing value Imputation

In [9]:
# df.dropna(inplace=True) 

In [10]:
train.dtypes

Date             datetime64[ns]
Location                 object
MinTemp                 float64
MaxTemp                 float64
Rainfall                float64
Evaporation             float64
Sunshine                float64
WindGustDir              object
WindGustSpeed           float64
WindDir9am               object
WindDir3pm               object
WindSpeed9am            float64
WindSpeed3pm            float64
Humidity9am             float64
Humidity3pm             float64
Pressure9am             float64
Pressure3pm             float64
Cloud9am                float64
Cloud3pm                float64
Temp9am                 float64
Temp3pm                 float64
RainToday                object
RainTomorrow             object
dtype: object

In [11]:
numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']

#membagi dataframe menjadi yang numerical dan categorical, untuk missing value imputation
#imputation dengan knn
num_train_df = train.select_dtypes(include=numerics)
cat_train_df = train.select_dtypes(exclude=numerics)

num_test_df = test.select_dtypes(include=numerics)
cat_test_df = test.select_dtypes(exclude=numerics)

In [12]:
num_train_df.head(2)

Unnamed: 0,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustSpeed,WindSpeed9am,WindSpeed3pm,Humidity9am,Humidity3pm,Pressure9am,Pressure3pm,Cloud9am,Cloud3pm,Temp9am,Temp3pm
90275,21.7,28.8,0.0,,,41.0,28.0,30.0,64.0,64.0,1014.4,1012.4,,,27.1,26.9
51441,10.6,26.0,0.0,,,26.0,9.0,13.0,68.0,39.0,1026.4,1023.1,,,15.8,25.1


In [13]:
cat_train_df.head(2)

Unnamed: 0,Date,Location,WindGustDir,WindDir9am,WindDir3pm,RainToday,RainTomorrow
90275,2009-01-05,GoldCoast,SE,SSE,SE,No,No
51441,2015-10-14,Tuggeranong,N,NE,NNW,No,No


In [14]:
import sys
from impyute.imputation.cs import mice

In [15]:
#Numerical Data
#missing values imputation with mean median method to Train Data
from sklearn.impute import SimpleImputer

imp_mean = SimpleImputer( strategy='mean') #for median imputation replace 'mean' with 'median'
imp_mean.fit(num_train_df)
imputed_num_train_df = imp_mean.transform(num_train_df)

In [16]:
imputed_num_train_df

array([[21.7       , 28.8       ,  0.        , ...,  4.50771887,
        27.1       , 26.9       ],
       [10.6       , 26.        ,  0.        , ...,  4.50771887,
        15.8       , 25.1       ],
       [ 9.3       , 18.1       ,  0.        , ...,  4.50771887,
        10.4       , 16.5       ],
       ...,
       [15.2       , 20.3       ,  7.4       , ...,  8.        ,
        17.6       , 19.        ],
       [ 7.9       , 14.6       ,  4.        , ...,  4.50771887,
         9.6       , 12.        ],
       [13.2       , 20.        ,  0.2       , ...,  4.50771887,
        18.5       , 19.        ]])

In [17]:
num_train_df=pd.DataFrame(imputed_num_train_df,columns=num_test_df.columns.to_list())

In [18]:
num_train_df.shape

(116368, 16)

In [19]:
#Numerical Data
#missing values imputation with mean median method to Train Data

imp_mean = SimpleImputer( strategy='mean') #for median imputation replace 'mean' with 'median'
imp_mean.fit(num_test_df)
imputed_num_test_df = imp_mean.transform(num_test_df)

In [20]:
imputed_num_test_df

array([[12.        , 23.3       ,  0.        , ...,  6.        ,
        18.5       , 21.        ],
       [14.7       , 20.5       , 26.4       , ...,  8.        ,
        15.        , 19.6       ],
       [10.3       , 21.2       ,  0.        , ...,  4.51880093,
        12.5       , 20.3       ],
       ...,
       [11.6       , 17.9       ,  6.        , ...,  4.51880093,
        15.9       , 15.7       ],
       [15.4       , 35.5       ,  0.        , ...,  4.51880093,
        24.3       , 33.7       ],
       [16.8       , 24.6       ,  0.        , ...,  7.        ,
        18.4       , 23.6       ]])

In [21]:
num_test_df=pd.DataFrame(imputed_num_test_df,columns=num_test_df.columns.to_list())
num_test_df.shape

(29092, 16)

In [22]:
#categorical imputation using drop na

In [23]:
print(cat_train_df.isnull().sum().sum())
print(cat_test_df.isnull().sum().sum())

25210
6438


In [24]:
imp_most = SimpleImputer( strategy='most_frequent') #for median imputation replace 'mean' with 'median'
imp_most.fit(cat_train_df)
imputed_cat_train_df = imp_most.transform(cat_train_df)

In [25]:
cat_train_df=pd.DataFrame(imputed_cat_train_df,columns=cat_train_df.columns.to_list())

In [26]:
cat_train_df.isnull().sum().sum()

0

In [27]:
imp_most = SimpleImputer( strategy='most_frequent') #for median imputation replace 'mean' with 'median'
imp_most.fit(cat_test_df)
imputed_cat_test_df = imp_most.transform(cat_test_df)

In [28]:
cat_test_df=pd.DataFrame(imputed_cat_test_df,columns=cat_test_df.columns.to_list())

In [29]:
cat_test_df.isnull().sum().sum()

0

In [30]:
train = pd.concat([cat_train_df,num_train_df],axis=1)
train.shape

(116368, 23)

In [31]:
# train=train.reset_index(drop=True)
train=train.drop(columns=['Date'])

In [32]:
train.shape

(116368, 22)

In [33]:
test = pd.concat([cat_test_df,num_test_df],axis=1)
test.shape

(29092, 23)

In [34]:
test=test.drop(columns=['Date'])

In [35]:
test.shape

(29092, 22)

#### 2. Feature Encoding One Hot Encoder

In [36]:
# cat_df = df.select_dtypes(exclude=numerics)

In [37]:
from numpy import argmax
from sklearn.preprocessing import OneHotEncoder

In [38]:
def one_hot_encoder_two(data,feature,keep_first=True):

    one_hot_cols = pd.get_dummies(data[feature])
    
    for col in one_hot_cols.columns:
        one_hot_cols.rename({col:f'{feature}_'+col},axis=1,inplace=True)
    
    new_data = pd.concat([data,one_hot_cols],axis=1)
    new_data.drop(feature,axis=1,inplace=True)
    
    if keep_first == False:
        new_data=new_data.iloc[:,1:]
    
    return new_data

In [39]:
train_rain = train[["RainToday","RainTomorrow"]]
train=train.drop(columns=['RainToday','RainTomorrow'])

In [40]:
train_rain

Unnamed: 0,RainToday,RainTomorrow
0,No,No
1,No,No
2,No,Yes
3,No,Yes
4,Yes,No
...,...,...
116363,No,No
116364,No,No
116365,Yes,No
116366,Yes,Yes


In [41]:
for col in train.select_dtypes(include=['object']).columns:
    train=pd.concat([train,one_hot_encoder_two(train,col)],axis=1)

In [42]:
train=train.select_dtypes(exclude='object')
train=pd.concat([train,train_rain],axis=1)

In [43]:
train['RainTomorrow']=train['RainTomorrow'].map({'Yes':1,'No':0})
train['RainToday']=train['RainToday'].map({'Yes':1,'No':0})

In [44]:
train

Unnamed: 0,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustSpeed,WindSpeed9am,WindSpeed3pm,Humidity9am,Humidity3pm,...,WindDir3pm_WindDir3pm_S,WindDir3pm_WindDir3pm_SE,WindDir3pm_WindDir3pm_SSE,WindDir3pm_WindDir3pm_SSW,WindDir3pm_WindDir3pm_SW,WindDir3pm_WindDir3pm_W,WindDir3pm_WindDir3pm_WNW,WindDir3pm_WindDir3pm_WSW,RainToday,RainTomorrow
0,21.7,28.8,0.0,5.474748,7.618209,41.000000,28.00000,30.000000,64.0,64.000000,...,0,1,0,0,0,0,0,0,0,0
1,10.6,26.0,0.0,5.474748,7.618209,26.000000,9.00000,13.000000,68.0,39.000000,...,0,0,0,0,0,0,0,0,0,0
2,9.3,18.1,0.0,3.000000,5.900000,33.000000,0.00000,17.000000,93.0,59.000000,...,0,0,0,0,1,0,0,0,0,1
3,11.6,16.5,0.0,5.474748,7.618209,48.000000,22.00000,17.000000,56.0,68.000000,...,0,0,1,0,0,0,0,0,0,1
4,9.3,16.5,2.6,3.800000,12.200000,52.000000,37.00000,24.000000,51.0,41.000000,...,0,0,1,0,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
116363,19.2,31.8,0.0,5.474748,7.618209,40.023595,4.00000,18.648059,69.0,51.531568,...,0,1,0,0,0,0,0,0,0,0
116364,10.3,25.3,0.0,5.474748,7.618209,26.000000,9.00000,9.000000,52.0,21.000000,...,0,0,0,0,0,0,0,0,0,0
116365,15.2,20.3,7.4,2.800000,0.000000,40.023595,14.04607,18.648059,90.0,84.000000,...,0,1,0,0,0,0,0,0,1,0
116366,7.9,14.6,4.0,5.474748,7.618209,59.000000,20.00000,28.000000,76.0,61.000000,...,0,0,0,0,0,0,0,1,1,1


In [45]:
test_rain = test[["RainToday","RainTomorrow"]]
test=test.drop(columns=['RainToday','RainTomorrow'])
test

Unnamed: 0,Location,WindGustDir,WindDir9am,WindDir3pm,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustSpeed,WindSpeed9am,WindSpeed3pm,Humidity9am,Humidity3pm,Pressure9am,Pressure3pm,Cloud9am,Cloud3pm,Temp9am,Temp3pm
0,SydneyAirport,WSW,NW,WSW,12.0,23.3,0.0,5.000000,9.500000,54.000000,19.0,35.0,37.000000,16.000000,1010.400000,1009.000000,5.000000,6.000000,18.5,21.0
1,Ballarat,NNW,WSW,W,14.7,20.5,26.4,5.442128,7.583045,33.000000,13.0,20.0,100.000000,83.000000,1016.900000,1016.500000,8.000000,8.000000,15.0,19.6
2,Ballarat,SSE,S,S,10.3,21.2,0.0,5.442128,7.583045,44.000000,22.0,26.0,78.000000,46.000000,1009.000000,1008.800000,4.000000,4.518801,12.5,20.3
3,Wollongong,SSE,SSW,SSE,10.3,16.5,0.0,5.442128,7.583045,35.000000,19.0,22.0,69.000000,78.000000,1018.200000,1018.100000,1.000000,7.000000,15.5,15.3
4,NorahHead,NE,NW,NE,11.5,23.2,0.0,5.442128,7.583045,41.000000,6.0,24.0,54.000000,60.000000,1022.200000,1017.400000,4.453708,4.518801,18.5,19.4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
29087,Newcastle,W,NW,NW,11.2,21.0,6.4,5.442128,7.583045,40.081861,28.0,28.0,55.000000,33.000000,1017.689659,1015.278306,6.000000,3.000000,15.0,20.8
29088,Cairns,SE,SSE,ESE,19.9,30.3,0.0,4.600000,11.100000,41.000000,20.0,28.0,59.000000,45.000000,1012.000000,1008.700000,1.000000,1.000000,27.2,29.3
29089,Witchcliffe,WSW,W,WSW,11.6,17.9,6.0,5.442128,7.583045,48.000000,20.0,17.0,69.003848,51.569325,1014.600000,1015.700000,4.453708,4.518801,15.9,15.7
29090,Portland,W,NNE,WNW,15.4,35.5,0.0,6.200000,8.600000,63.000000,17.0,24.0,64.000000,36.000000,1005.600000,1004.200000,4.453708,4.518801,24.3,33.7


In [46]:
for col in test.select_dtypes(include=['object']).columns:
    test=pd.concat([test,one_hot_encoder_two(test,col)],axis=1)

In [47]:
test=test.select_dtypes(exclude='object')
test=pd.concat([test,test_rain],axis=1)

In [48]:
test['RainTomorrow']=test['RainTomorrow'].map({'Yes':1,'No':0})
test['RainToday']=test['RainToday'].map({'Yes':1,'No':0})
test

Unnamed: 0,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustSpeed,WindSpeed9am,WindSpeed3pm,Humidity9am,Humidity3pm,...,WindDir3pm_WindDir3pm_S,WindDir3pm_WindDir3pm_SE,WindDir3pm_WindDir3pm_SSE,WindDir3pm_WindDir3pm_SSW,WindDir3pm_WindDir3pm_SW,WindDir3pm_WindDir3pm_W,WindDir3pm_WindDir3pm_WNW,WindDir3pm_WindDir3pm_WSW,RainToday,RainTomorrow
0,12.0,23.3,0.0,5.000000,9.500000,54.000000,19.0,35.0,37.000000,16.000000,...,0,0,0,0,0,0,0,1,0,0
1,14.7,20.5,26.4,5.442128,7.583045,33.000000,13.0,20.0,100.000000,83.000000,...,0,0,0,0,0,1,0,0,1,1
2,10.3,21.2,0.0,5.442128,7.583045,44.000000,22.0,26.0,78.000000,46.000000,...,1,0,0,0,0,0,0,0,0,0
3,10.3,16.5,0.0,5.442128,7.583045,35.000000,19.0,22.0,69.000000,78.000000,...,0,0,1,0,0,0,0,0,0,0
4,11.5,23.2,0.0,5.442128,7.583045,41.000000,6.0,24.0,54.000000,60.000000,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
29087,11.2,21.0,6.4,5.442128,7.583045,40.081861,28.0,28.0,55.000000,33.000000,...,0,0,0,0,0,0,0,0,1,0
29088,19.9,30.3,0.0,4.600000,11.100000,41.000000,20.0,28.0,59.000000,45.000000,...,0,0,0,0,0,0,0,0,0,0
29089,11.6,17.9,6.0,5.442128,7.583045,48.000000,20.0,17.0,69.003848,51.569325,...,0,0,0,0,0,0,0,1,1,1
29090,15.4,35.5,0.0,6.200000,8.600000,63.000000,17.0,24.0,64.000000,36.000000,...,0,0,0,0,0,0,1,0,0,0


In [49]:
test.columns

Index(['MinTemp', 'MaxTemp', 'Rainfall', 'Evaporation', 'Sunshine',
       'WindGustSpeed', 'WindSpeed9am', 'WindSpeed3pm', 'Humidity9am',
       'Humidity3pm',
       ...
       'WindDir3pm_WindDir3pm_S', 'WindDir3pm_WindDir3pm_SE',
       'WindDir3pm_WindDir3pm_SSE', 'WindDir3pm_WindDir3pm_SSW',
       'WindDir3pm_WindDir3pm_SW', 'WindDir3pm_WindDir3pm_W',
       'WindDir3pm_WindDir3pm_WNW', 'WindDir3pm_WindDir3pm_WSW', 'RainToday',
       'RainTomorrow'],
      dtype='object', length=1034)

In [50]:
# import seaborn as sns

In [51]:
# train=label_encoded_train_df.copy()
# test=label_encoded_test_df.copy()

In [52]:
# TP = TPOTClassifier(generations=3,population_size=10,cv=5,n_jobs=-1,config_dict=tpot.config.classifier_config_dict_light,use_dask=True)

In [53]:
train.to_csv('train_one_hot2.csv', index=False)
test.to_csv('test_one_hot2.csv', index=False)