In [1]:
import sklearn as sk
import pandas as pd
import numpy as np

In [2]:
DATAFILE = "./dataset/weather_data.csv"
TESTDATASET = "./dataset/unknown_data.csv"
dataset = pd.read_csv(DATAFILE)
testDataset =pd.read_csv(TESTDATASET)

In [3]:
dataset.head()

Unnamed: 0,row ID,Location,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustDir,WindGustSpeed,WindDir9am,...,Humidity9am,Humidity3pm,Pressure9am,Pressure3pm,Cloud9am,Cloud3pm,Temp9am,Temp3pm,RainToday,RainTomorrow
0,Row0,Albury,13.4,22.9,0.6,,,W,44.0,W,...,71.0,22.0,1007.7,1007.1,8.0,,16.9,21.8,No,0
1,Row1,Albury,7.4,25.1,0.0,,,WNW,44.0,NNW,...,44.0,25.0,1010.6,1007.8,,,17.2,24.3,No,0
2,Row2,Albury,17.5,32.3,1.0,,,W,41.0,ENE,...,82.0,33.0,1010.8,1006.0,7.0,8.0,17.8,29.7,No,0
3,Row3,Albury,14.6,29.7,0.2,,,WNW,56.0,W,...,55.0,23.0,1009.2,1005.4,,,20.6,28.9,No,0
4,Row4,Albury,7.7,26.7,0.0,,,W,35.0,SSE,...,48.0,19.0,1013.4,1010.1,,,16.3,25.5,No,0


In [4]:
dataset.describe()

Unnamed: 0,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustSpeed,WindSpeed9am,WindSpeed3pm,Humidity9am,Humidity3pm,Pressure9am,Pressure3pm,Cloud9am,Cloud3pm,Temp9am,Temp3pm,RainTomorrow
count,99073.0,99286.0,98537.0,56985.0,52199.0,93036.0,98581.0,97681.0,98283.0,97010.0,89768.0,89780.0,61944.0,59514.0,98902.0,97612.0,99516.0
mean,12.176266,23.218513,2.353024,5.46132,7.61509,39.976966,14.004849,18.650464,68.866376,51.433296,1017.684638,1015.286204,4.447985,4.519122,16.970041,21.68134,0.224677
std,6.390882,7.115072,8.487866,4.16249,3.783008,13.581524,8.902323,8.801827,19.074951,20.777616,7.110166,7.045189,2.88658,2.716618,6.488961,6.931681,0.417372
min,-8.5,-4.1,0.0,0.0,0.0,6.0,0.0,0.0,0.0,0.0,980.5,978.2,0.0,0.0,-7.0,-5.1,0.0
25%,7.6,17.9,0.0,2.6,4.8,31.0,7.0,13.0,57.0,37.0,1013.0,1010.5,1.0,2.0,12.3,16.6,0.0
50%,12.0,22.6,0.0,4.8,8.4,39.0,13.0,19.0,70.0,52.0,1017.7,1015.3,5.0,5.0,16.7,21.1,0.0
75%,16.8,28.2,0.8,7.4,10.6,48.0,19.0,24.0,83.0,65.0,1022.4,1020.0,7.0,7.0,21.5,26.4,0.0
max,33.9,48.1,371.0,86.2,14.5,135.0,130.0,87.0,100.0,100.0,1041.0,1039.6,9.0,9.0,40.2,46.7,1.0


In [5]:
dataset.count().sort_values()

Sunshine         52199
Evaporation      56985
Cloud3pm         59514
Cloud9am         61944
Pressure9am      89768
Pressure3pm      89780
WindDir9am       92510
WindGustDir      92995
WindGustSpeed    93036
WindDir3pm       96868
Humidity3pm      97010
Temp3pm          97612
WindSpeed3pm     97681
Humidity9am      98283
Rainfall         98537
RainToday        98537
WindSpeed9am     98581
Temp9am          98902
MinTemp          99073
MaxTemp          99286
row ID           99516
Location         99516
RainTomorrow     99516
dtype: int64

In [6]:
dataset.shape

(99516, 23)

## First, the RainToday attribute is an binary value where it only store Yes/No for answer. So I will manually binarise the RainToday attribute using replace() function in pd

## Drop these columns (Sunshine, Evaporation, Cloud3pm/9am) as they have to much NA value

In [7]:
dataset = dataset.drop(columns=['Sunshine','Evaporation','row ID'],axis=1) 
testDataset = testDataset.drop(columns=['Sunshine','Evaporation','row ID'],axis=1) 
print(dataset.shape)
print(testDataset.shape)

(99516, 20)
(42677, 19)


# Impute the data for the testDataset

In [8]:
testDataset.shape

(42677, 19)

In [9]:
dataset.isna().sum()
testDataset.isna().sum()

Location             0
MinTemp            194
MaxTemp             92
Rainfall           427
WindGustDir       2809
WindGustSpeed     2790
WindDir9am        3007
WindDir3pm        1130
WindSpeed9am       413
WindSpeed3pm       795
Humidity9am        541
Humidity3pm       1104
Pressure9am       4266
Pressure3pm       4245
Cloud9am         16085
Cloud3pm         17092
Temp9am            290
Temp3pm            822
RainToday          427
dtype: int64

In [10]:
dataset.shape

(99516, 20)

# Performing standardization the data

In [11]:
from sklearn.preprocessing import StandardScaler

# create a scaler object
std_scaler = StandardScaler()


def returnListOfNumericCols(data):
    numericColumns = []

    for col in data.columns:
        if(data[col].dtype == 'float64'): # if the type of the col is numeric
            numericColumns.append(col)

    return numericColumns

In [12]:
# Importing the SimpleImputer class
from sklearn.impute import SimpleImputer
 
imputer = SimpleImputer(missing_values = np.nan,strategy ='median')
 
 
print(testDataset)
for col in returnListOfNumericCols(dataset):
        dataset[col] = imputer.fit_transform(dataset[col].values.reshape(-1, 1))
        testDataset[col] = imputer.fit_transform(testDataset[col].values.reshape(-1, 1))
# Imputing the data    
 
print("Imputed Data : \n", testDataset)

      Location  MinTemp  MaxTemp  Rainfall WindGustDir  WindGustSpeed  \
0       Albury     12.9     25.7       0.0         WSW           46.0   
1       Albury      9.2     28.0       0.0          NE           24.0   
2       Albury     14.3     25.0       0.0           W           50.0   
3       Albury      9.7     31.9       0.0         NNW           80.0   
4       Albury     15.9     18.6      15.6           W           61.0   
...        ...      ...      ...       ...         ...            ...   
42672    Uluru      2.4     19.1       0.0           E           33.0   
42673    Uluru      2.3     21.4       0.0          SE           22.0   
42674    Uluru      2.6     22.5       0.0           S           19.0   
42675    Uluru      7.4     20.6       0.0           E           35.0   
42676    Uluru      7.8     27.0       0.0          SE           28.0   

      WindDir9am WindDir3pm  WindSpeed9am  WindSpeed3pm  Humidity9am  \
0              W        WSW          19.0          

In [13]:
print(dataset.isna().sum())

Location            0
MinTemp             0
MaxTemp             0
Rainfall            0
WindGustDir      6521
WindGustSpeed       0
WindDir9am       7006
WindDir3pm       2648
WindSpeed9am        0
WindSpeed3pm        0
Humidity9am         0
Humidity3pm         0
Pressure9am         0
Pressure3pm         0
Cloud9am            0
Cloud3pm            0
Temp9am             0
Temp3pm             0
RainToday         979
RainTomorrow        0
dtype: int64


In [14]:
from scipy import stats
z = np.abs(stats.zscore(dataset._get_numeric_data()))
print(z)
dataset= dataset[(z < 3).all(axis=1)]
print(dataset.shape)

        MinTemp   MaxTemp  Rainfall  WindGustSpeed  WindSpeed9am  \
0      0.192033  0.044616  0.204739       0.311149      0.677651   
1      0.748905  0.264943  0.275752       0.311149      1.128037   
2      0.835006  1.278047  0.157397       0.082735      0.789470   
3      0.380220  0.912204  0.252081       1.224806      0.564796   
4      0.701858  0.490077  0.275752       0.374093      0.902326   
...         ...       ...       ...            ...           ...   
99511  0.654811  0.354176  0.275752       0.082735      0.564796   
99512  1.360514  0.199396  0.275752       0.678645      0.113374   
99513  1.470290  0.025738  0.275752       0.678645      0.112337   
99514  1.344832  0.293085  0.275752       1.363887      0.112337   
99515  1.062551  0.518219  0.275752       0.221817      0.563759   

       WindSpeed3pm  Humidity9am  Humidity3pm  Pressure9am  Pressure3pm  \
0          0.612713     0.111811     1.435457     1.478788     1.223547   
1          0.383365     1.312485 

In [15]:
dataset.isna().sum()

Location            0
MinTemp             0
MaxTemp             0
Rainfall            0
WindGustDir      6210
WindGustSpeed       0
WindDir9am       6841
WindDir3pm       2533
WindSpeed9am        0
WindSpeed3pm        0
Humidity9am         0
Humidity3pm         0
Pressure9am         0
Pressure3pm         0
Cloud9am            0
Cloud3pm            0
Temp9am             0
Temp3pm             0
RainToday         927
RainTomorrow        0
dtype: int64

In [16]:


for col in returnListOfNumericCols(dataset):
    scalerData = std_scaler.fit_transform(dataset[col].values.reshape(-1, 1))
    scalerTestData = std_scaler.fit_transform(testDataset[col].values.reshape(-1, 1))
    
    dataset[col] =  scalerData
    testDataset[col] = scalerTestData
print(dataset.shape)
print(testDataset.shape)

(94628, 20)
(42677, 19)


In [17]:
dataset.isna().sum()

Location            0
MinTemp             0
MaxTemp             0
Rainfall            0
WindGustDir      6210
WindGustSpeed       0
WindDir9am       6841
WindDir3pm       2533
WindSpeed9am        0
WindSpeed3pm        0
Humidity9am         0
Humidity3pm         0
Pressure9am         0
Pressure3pm         0
Cloud9am            0
Cloud3pm            0
Temp9am             0
Temp3pm             0
RainToday         927
RainTomorrow        0
dtype: int64

In [18]:
dataset.head()

Unnamed: 0,Location,MinTemp,MaxTemp,Rainfall,WindGustDir,WindGustSpeed,WindDir9am,WindDir3pm,WindSpeed9am,WindSpeed3pm,Humidity9am,Humidity3pm,Pressure9am,Pressure3pm,Cloud9am,Cloud3pm,Temp9am,Temp3pm,RainToday,RainTomorrow
0,Albury,0.204475,-0.051249,-0.221288,W,0.435522,W,WNW,0.796001,0.70974,0.113113,-1.450971,-1.604171,-1.325052,1.47387,0.151628,-0.004579,0.011567,No,0
1,Albury,-0.742799,0.265637,-0.377757,WNW,0.435522,NNW,WSW,-1.157264,0.464528,-1.348046,-1.301398,-1.152919,-1.215662,0.164671,0.151628,0.042473,0.384396,No,0
2,Albury,0.851779,1.302718,-0.116976,W,0.180056,ENE,NW,-0.791027,0.219316,0.7084,-0.902534,-1.121798,-1.496951,1.037471,1.57042,0.136578,1.189707,No,0
3,Albury,0.39393,0.928217,-0.325601,WNW,1.457385,W,W,0.673922,0.70974,-0.752759,-1.401114,-1.370765,-1.590714,0.164671,0.151628,0.575736,1.070402,No,0
4,Albury,-0.695435,0.496099,-0.377757,W,-0.330875,SSE,W,-0.913106,-0.148502,-1.131578,-1.600545,-0.717227,-0.856237,0.164671,0.151628,-0.098685,0.563354,No,0


# Binarise the categories data

In [19]:
dataset.head()

Unnamed: 0,Location,MinTemp,MaxTemp,Rainfall,WindGustDir,WindGustSpeed,WindDir9am,WindDir3pm,WindSpeed9am,WindSpeed3pm,Humidity9am,Humidity3pm,Pressure9am,Pressure3pm,Cloud9am,Cloud3pm,Temp9am,Temp3pm,RainToday,RainTomorrow
0,Albury,0.204475,-0.051249,-0.221288,W,0.435522,W,WNW,0.796001,0.70974,0.113113,-1.450971,-1.604171,-1.325052,1.47387,0.151628,-0.004579,0.011567,No,0
1,Albury,-0.742799,0.265637,-0.377757,WNW,0.435522,NNW,WSW,-1.157264,0.464528,-1.348046,-1.301398,-1.152919,-1.215662,0.164671,0.151628,0.042473,0.384396,No,0
2,Albury,0.851779,1.302718,-0.116976,W,0.180056,ENE,NW,-0.791027,0.219316,0.7084,-0.902534,-1.121798,-1.496951,1.037471,1.57042,0.136578,1.189707,No,0
3,Albury,0.39393,0.928217,-0.325601,WNW,1.457385,W,W,0.673922,0.70974,-0.752759,-1.401114,-1.370765,-1.590714,0.164671,0.151628,0.575736,1.070402,No,0
4,Albury,-0.695435,0.496099,-0.377757,W,-0.330875,SSE,W,-0.913106,-0.148502,-1.131578,-1.600545,-0.717227,-0.856237,0.164671,0.151628,-0.098685,0.563354,No,0


In [20]:
testDataset.head()

Unnamed: 0,Location,MinTemp,MaxTemp,Rainfall,WindGustDir,WindGustSpeed,WindDir9am,WindDir3pm,WindSpeed9am,WindSpeed3pm,Humidity9am,Humidity3pm,Pressure9am,Pressure3pm,Cloud9am,Cloud3pm,Temp9am,Temp3pm,RainToday
0,Albury,0.107664,0.345048,-0.27701,WSW,0.460943,W,WSW,0.567883,0.846481,-1.632082,-1.050313,-1.483403,-0.975261,0.159471,-1.258412,0.6132,0.219417,No
1,Albury,-0.468892,0.668268,-0.27701,NE,-1.21132,SE,E,-0.338135,-1.102042,-1.261232,-1.730816,0.002462,-0.359481,0.159471,0.150312,0.165641,0.69875,No
2,Albury,0.32582,0.246677,-0.27701,W,0.764991,SW,W,0.681135,0.617243,-1.049318,-1.584994,-1.18623,-1.050356,-1.581683,0.150312,0.165641,0.422771,No
3,Albury,-0.390979,1.216337,-0.27701,NNW,3.045351,SE,NW,-0.791145,1.075719,-1.420168,-2.071068,-1.29024,-1.74123,0.159471,0.150312,0.196507,1.236184,No
4,Albury,0.575141,-0.652718,1.586107,W,1.601123,NNW,NNW,1.587154,1.075719,0.381101,2.011954,-3.459602,-3.333245,1.465336,1.559036,0.057609,-0.85545,Yes


In [21]:
dataset.isna().sum()

Location            0
MinTemp             0
MaxTemp             0
Rainfall            0
WindGustDir      6210
WindGustSpeed       0
WindDir9am       6841
WindDir3pm       2533
WindSpeed9am        0
WindSpeed3pm        0
Humidity9am         0
Humidity3pm         0
Pressure9am         0
Pressure3pm         0
Cloud9am            0
Cloud3pm            0
Temp9am             0
Temp3pm             0
RainToday         927
RainTomorrow        0
dtype: int64

In [22]:
def return_categorical_columns(df):
    catCols = []
    for col in df.columns :
        if(df[col].dtypes == 'object'):
            catCols.append(col)
    return catCols

In [23]:
print(return_categorical_columns(dataset))

['Location', 'WindGustDir', 'WindDir9am', 'WindDir3pm', 'RainToday']


In [24]:
catImputer = SimpleImputer(missing_values = np.nan,strategy ='most_frequent')

for col in return_categorical_columns(dataset):
    dataset[col] = catImputer.fit_transform(dataset[col].values.reshape(-1, 1))
    testDataset[col] = catImputer.fit_transform(testDataset[col].values.reshape(-1, 1))
# Imputing the data    
 

In [25]:
dataset.isna().sum()

Location         0
MinTemp          0
MaxTemp          0
Rainfall         0
WindGustDir      0
WindGustSpeed    0
WindDir9am       0
WindDir3pm       0
WindSpeed9am     0
WindSpeed3pm     0
Humidity9am      0
Humidity3pm      0
Pressure9am      0
Pressure3pm      0
Cloud9am         0
Cloud3pm         0
Temp9am          0
Temp3pm          0
RainToday        0
RainTomorrow     0
dtype: int64

In [26]:
dataset["RainToday"].replace({"No":0,"Yes":1},inplace = True)
testDataset["RainToday"].replace({"No":0,"Yes":1},inplace = True)



In [27]:
dataset.isna().sum()

Location         0
MinTemp          0
MaxTemp          0
Rainfall         0
WindGustDir      0
WindGustSpeed    0
WindDir9am       0
WindDir3pm       0
WindSpeed9am     0
WindSpeed3pm     0
Humidity9am      0
Humidity3pm      0
Pressure9am      0
Pressure3pm      0
Cloud9am         0
Cloud3pm         0
Temp9am          0
Temp3pm          0
RainToday        0
RainTomorrow     0
dtype: int64

In [28]:
from sklearn.preprocessing import LabelEncoder
label_encoder = LabelEncoder()
dataset["Location"] = label_encoder.fit_transform(dataset["Location"])
testDataset["Location"] = label_encoder.fit_transform(testDataset["Location"])


In [29]:
dataset = pd.get_dummies(dataset,columns=return_categorical_columns(dataset))
testDataset = pd.get_dummies(testDataset,columns=return_categorical_columns(testDataset))



In [30]:
dataset.head()

Unnamed: 0,Location,MinTemp,MaxTemp,Rainfall,WindGustSpeed,WindSpeed9am,WindSpeed3pm,Humidity9am,Humidity3pm,Pressure9am,...,WindDir3pm_NNW,WindDir3pm_NW,WindDir3pm_S,WindDir3pm_SE,WindDir3pm_SSE,WindDir3pm_SSW,WindDir3pm_SW,WindDir3pm_W,WindDir3pm_WNW,WindDir3pm_WSW
0,2,0.204475,-0.051249,-0.221288,0.435522,0.796001,0.70974,0.113113,-1.450971,-1.604171,...,0,0,0,0,0,0,0,0,1,0
1,2,-0.742799,0.265637,-0.377757,0.435522,-1.157264,0.464528,-1.348046,-1.301398,-1.152919,...,0,0,0,0,0,0,0,0,0,1
2,2,0.851779,1.302718,-0.116976,0.180056,-0.791027,0.219316,0.7084,-0.902534,-1.121798,...,0,1,0,0,0,0,0,0,0,0
3,2,0.39393,0.928217,-0.325601,1.457385,0.673922,0.70974,-0.752759,-1.401114,-1.370765,...,0,0,0,0,0,0,0,1,0,0
4,2,-0.695435,0.496099,-0.377757,-0.330875,-0.913106,-0.148502,-1.131578,-1.600545,-0.717227,...,0,0,0,0,0,0,0,1,0,0


In [31]:
testDataset.head()

Unnamed: 0,Location,MinTemp,MaxTemp,Rainfall,WindGustSpeed,WindSpeed9am,WindSpeed3pm,Humidity9am,Humidity3pm,Pressure9am,...,WindDir3pm_NNW,WindDir3pm_NW,WindDir3pm_S,WindDir3pm_SE,WindDir3pm_SSE,WindDir3pm_SSW,WindDir3pm_SW,WindDir3pm_W,WindDir3pm_WNW,WindDir3pm_WSW
0,2,0.107664,0.345048,-0.27701,0.460943,0.567883,0.846481,-1.632082,-1.050313,-1.483403,...,0,0,0,0,0,0,0,0,0,1
1,2,-0.468892,0.668268,-0.27701,-1.21132,-0.338135,-1.102042,-1.261232,-1.730816,0.002462,...,0,0,0,0,0,0,0,0,0,0
2,2,0.32582,0.246677,-0.27701,0.764991,0.681135,0.617243,-1.049318,-1.584994,-1.18623,...,0,0,0,0,0,0,0,1,0,0
3,2,-0.390979,1.216337,-0.27701,3.045351,-0.791145,1.075719,-1.420168,-2.071068,-1.29024,...,0,1,0,0,0,0,0,0,0,0
4,2,0.575141,-0.652718,1.586107,1.601123,1.587154,1.075719,0.381101,2.011954,-3.459602,...,1,0,0,0,0,0,0,0,0,0


In [32]:
dataset.shape

(94628, 65)

In [33]:
dataset.head()

Unnamed: 0,Location,MinTemp,MaxTemp,Rainfall,WindGustSpeed,WindSpeed9am,WindSpeed3pm,Humidity9am,Humidity3pm,Pressure9am,...,WindDir3pm_NNW,WindDir3pm_NW,WindDir3pm_S,WindDir3pm_SE,WindDir3pm_SSE,WindDir3pm_SSW,WindDir3pm_SW,WindDir3pm_W,WindDir3pm_WNW,WindDir3pm_WSW
0,2,0.204475,-0.051249,-0.221288,0.435522,0.796001,0.70974,0.113113,-1.450971,-1.604171,...,0,0,0,0,0,0,0,0,1,0
1,2,-0.742799,0.265637,-0.377757,0.435522,-1.157264,0.464528,-1.348046,-1.301398,-1.152919,...,0,0,0,0,0,0,0,0,0,1
2,2,0.851779,1.302718,-0.116976,0.180056,-0.791027,0.219316,0.7084,-0.902534,-1.121798,...,0,1,0,0,0,0,0,0,0,0
3,2,0.39393,0.928217,-0.325601,1.457385,0.673922,0.70974,-0.752759,-1.401114,-1.370765,...,0,0,0,0,0,0,0,1,0,0
4,2,-0.695435,0.496099,-0.377757,-0.330875,-0.913106,-0.148502,-1.131578,-1.600545,-0.717227,...,0,0,0,0,0,0,0,1,0,0


In [34]:
# datasetCols = []
# testDatasetCols = []
# for col in dataset.columns:
#     datasetCols.append(col)
    

# for col in testDataset.columns:
#     testDatasetCols.append(col)

# unknownColumns = set(datasetCols)
# for col in testDatasetCols:
#     if(col not in unknownColumns):
#         testDataset = testDataset.drop(columns=[col],axis=1)


In [35]:
X = dataset.loc[:,dataset.columns!='RainTomorrow']
y = dataset.RainTomorrow

In [36]:
X.isna().sum()

Location          0
MinTemp           0
MaxTemp           0
Rainfall          0
WindGustSpeed     0
                 ..
WindDir3pm_SSW    0
WindDir3pm_SW     0
WindDir3pm_W      0
WindDir3pm_WNW    0
WindDir3pm_WSW    0
Length: 64, dtype: int64

In [37]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

In [38]:
from sklearn import tree
clf = tree.DecisionTreeClassifier(max_depth=5,max_leaf_nodes=3) # max_depth=5,max_leaf_nodes=3
clf.fit(X_train, y_train)

DecisionTreeClassifier(max_depth=5, max_leaf_nodes=3)

In [39]:
y_pred = clf.predict(X_test)

In [40]:
print(accuracy_score(y_test,y_pred) * 100)

82.8296064589762


In [41]:
from sklearn.neighbors import KNeighborsClassifier

Ksclf = KNeighborsClassifier(n_neighbors = 5)
Ksclf.fit(X_train, y_train)

Ksclf.classes_


array([0, 1])

In [42]:
Ksclf.score(X_test,y_test)

0.837257471361542

In [43]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV
rf = RandomForestRegressor(random_state = 42)
# n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]
# # Number of features to consider at every split
# max_features = ['auto', 'sqrt']
# # Maximum number of levels in tree
# max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
# max_depth.append(None)
# # Minimum number of samples required to split a node
# min_samples_split = [2, 5, 10]
# # Minimum number of samples required at each leaf node
# min_samples_leaf = [1, 2, 4]
# # Method of selecting samples for training each tree
# bootstrap = [True, False]
# # Create the random grid
random_grid = {    'bootstrap': [True],
    'max_depth': [80, 90, 100, 110],
    'max_features': [2, 3],
    'min_samples_leaf': [3, 4, 5],
    'min_samples_split': [8, 10, 12],
    'n_estimators': [100, 200, 300, 1000]}

rf_random = GridSearchCV(estimator = rf, param_grid = random_grid, cv = 3, verbose=2,n_jobs=4)
# Construct a random forest classifier.

rf_random.fit(X_train, y_train)

Fitting 3 folds for each of 288 candidates, totalling 864 fits
[CV] END bootstrap=True, max_depth=80, max_features=2, min_samples_leaf=3, min_samples_split=8, n_estimators=100; total time=   2.1s
[CV] END bootstrap=True, max_depth=80, max_features=2, min_samples_leaf=3, min_samples_split=8, n_estimators=100; total time=   2.1s
[CV] END bootstrap=True, max_depth=80, max_features=2, min_samples_leaf=3, min_samples_split=8, n_estimators=100; total time=   2.1s
[CV] END bootstrap=True, max_depth=80, max_features=2, min_samples_leaf=3, min_samples_split=8, n_estimators=200; total time=   4.2s
[CV] END bootstrap=True, max_depth=80, max_features=2, min_samples_leaf=3, min_samples_split=8, n_estimators=200; total time=   4.2s
[CV] END bootstrap=True, max_depth=80, max_features=2, min_samples_leaf=3, min_samples_split=8, n_estimators=200; total time=   4.2s
[CV] END bootstrap=True, max_depth=80, max_features=2, min_samples_leaf=3, min_samples_split=8, n_estimators=300; total time=   6.3s
[CV] E

In [None]:
y_pred = rf_random.predict(X_test)

In [None]:
print(accuracy_score(y_test,y_pred) * 100)

85.29399332121571


In [None]:
from sklearn.svm import SVC

SvmClf = SVC()
SvmClf.fit(X_train, y_train)

SVC()

In [None]:
y_pred = SvmClf.predict(X_test)

In [None]:
SvmClf.score(X_test,y_test)

0.845838441053388

In [None]:
print(accuracy_score(y_test,y_pred) * 100)

84.5838441053388


In [None]:
testDataset.head()

Unnamed: 0,Location,MinTemp,MaxTemp,Rainfall,WindGustSpeed,WindSpeed9am,WindSpeed3pm,Humidity9am,Humidity3pm,Pressure9am,...,WindDir3pm_NNW,WindDir3pm_NW,WindDir3pm_S,WindDir3pm_SE,WindDir3pm_SSE,WindDir3pm_SSW,WindDir3pm_SW,WindDir3pm_W,WindDir3pm_WNW,WindDir3pm_WSW
0,2,0.107664,0.345048,-0.27701,0.460943,0.567883,0.846481,-1.632082,-1.050313,-1.483403,...,0,0,0,0,0,0,0,0,0,1
1,2,-0.468892,0.668268,-0.27701,-1.21132,-0.338135,-1.102042,-1.261232,-1.730816,0.002462,...,0,0,0,0,0,0,0,0,0,0
2,2,0.32582,0.246677,-0.27701,0.764991,0.681135,0.617243,-1.049318,-1.584994,-1.18623,...,0,0,0,0,0,0,0,1,0,0
3,2,-0.390979,1.216337,-0.27701,3.045351,-0.791145,1.075719,-1.420168,-2.071068,-1.29024,...,0,1,0,0,0,0,0,0,0,0
4,2,0.575141,-0.652718,1.586107,1.601123,1.587154,1.075719,0.381101,2.011954,-3.459602,...,1,0,0,0,0,0,0,0,0,0


In [None]:
testDataset.shape

(42677, 64)

In [None]:
prediction = rf_random.predict(testDataset)
predicted_values=pd.DataFrame(prediction,columns=["Predict-RainTomorrow"])

In [None]:
testDataset =pd.read_csv(TESTDATASET)
predicted_values["row ID"] = testDataset["row ID"]

In [None]:
predicted_values = predicted_values.set_index("row ID")

In [None]:
predicted_values

Unnamed: 0_level_0,Predict-RainTomorrow
row ID,Unnamed: 1_level_1
Row0,0
Row1,0
Row2,0
Row3,0
Row4,1
...,...
Row43633,0
Row43634,0
Row43635,0
Row43636,0


In [None]:
predicted_values.to_csv("./13717335.csv",index=True)

In [None]:
# from numpy import mean
# from sklearn.model_selection import KFold
# from sklearn.model_selection import cross_val_score
# # split data into 5 folds and shuffle to avoid bias
# cv = KFold(n_splits=5, random_state=1, shuffle=True)
# scores = cross_val_score(SvmClf, X, y, scoring='accuracy', cv=cv)
# average_score = mean(scores)
# print('Overall Accuracy:', average_score)
