In [1]:
# Import necessary libiaries
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LinearRegression
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn import svm
from sklearn.metrics import jaccard_score
from sklearn.metrics import f1_score
from sklearn.metrics import log_loss
from sklearn.metrics import confusion_matrix, accuracy_score
import sklearn.metrics as metrics
from sklearn.tree import DecisionTreeClassifier

In [2]:
# Define download function
import requests

def download(url, filename):
  response = requests.get(url)
  with open(filename, 'wb') as file:
    file.write(response.content)

In [3]:
# Download the target file
URL = 'https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/IBMDeveloperSkillsNetwork-ML0101EN-SkillUp/labs/ML-FinalAssignment/Weather_Data.csv'
download(URL, 'Weather_Data.csv')

In [4]:
# Show the data
df = pd.read_csv("Weather_Data.csv")
df.head()

Unnamed: 0,Date,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustDir,WindGustSpeed,WindDir9am,WindDir3pm,...,Humidity9am,Humidity3pm,Pressure9am,Pressure3pm,Cloud9am,Cloud3pm,Temp9am,Temp3pm,RainToday,RainTomorrow
0,2/1/2008,19.5,22.4,15.6,6.2,0.0,W,41,S,SSW,...,92,84,1017.6,1017.4,8,8,20.7,20.9,Yes,Yes
1,2/2/2008,19.5,25.6,6.0,3.4,2.7,W,41,W,E,...,83,73,1017.9,1016.4,7,7,22.4,24.8,Yes,Yes
2,2/3/2008,21.6,24.5,6.6,2.4,0.1,W,41,ESE,ESE,...,88,86,1016.7,1015.6,7,8,23.5,23.0,Yes,Yes
3,2/4/2008,20.2,22.8,18.8,2.2,0.0,W,41,NNE,E,...,83,90,1014.2,1011.8,8,8,21.4,20.9,Yes,Yes
4,2/5/2008,19.7,25.7,77.4,4.8,0.0,W,41,NNE,W,...,88,74,1008.3,1004.8,8,8,22.5,25.5,Yes,Yes


In [5]:
######### Data argumentation

# get_dummies() convert categorical feature to one hot code columns.
# For example, wind feature has 4 categories, pass the function it
# will become 4 columns instead of 1 column.
df_sydney_processed = pd.get_dummies(data=df, columns=['RainToday', 'WindGustDir', 'WindDir9am', 'WindDir3pm'])
df_sydney_processed.head()

Unnamed: 0,Date,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustSpeed,WindSpeed9am,WindSpeed3pm,Humidity9am,...,WindDir3pm_NNW,WindDir3pm_NW,WindDir3pm_S,WindDir3pm_SE,WindDir3pm_SSE,WindDir3pm_SSW,WindDir3pm_SW,WindDir3pm_W,WindDir3pm_WNW,WindDir3pm_WSW
0,2/1/2008,19.5,22.4,15.6,6.2,0.0,41,17,20,92,...,False,False,False,False,False,True,False,False,False,False
1,2/2/2008,19.5,25.6,6.0,3.4,2.7,41,9,13,83,...,False,False,False,False,False,False,False,False,False,False
2,2/3/2008,21.6,24.5,6.6,2.4,0.1,41,17,2,88,...,False,False,False,False,False,False,False,False,False,False
3,2/4/2008,20.2,22.8,18.8,2.2,0.0,41,22,20,83,...,False,False,False,False,False,False,False,False,False,False
4,2/5/2008,19.7,25.7,77.4,4.8,0.0,41,11,6,88,...,False,False,False,False,False,False,False,True,False,False


In [6]:
# Raintoday feature in the dataset will be converted from 'Yes', 'No' to '1', '0'.
df_sydney_processed.replace(['No', 'Yes'], [0,1], inplace=True)
df_sydney_processed.head()

  df_sydney_processed.replace(['No', 'Yes'], [0,1], inplace=True)


Unnamed: 0,Date,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustSpeed,WindSpeed9am,WindSpeed3pm,Humidity9am,...,WindDir3pm_NNW,WindDir3pm_NW,WindDir3pm_S,WindDir3pm_SE,WindDir3pm_SSE,WindDir3pm_SSW,WindDir3pm_SW,WindDir3pm_W,WindDir3pm_WNW,WindDir3pm_WSW
0,2/1/2008,19.5,22.4,15.6,6.2,0.0,41,17,20,92,...,False,False,False,False,False,True,False,False,False,False
1,2/2/2008,19.5,25.6,6.0,3.4,2.7,41,9,13,83,...,False,False,False,False,False,False,False,False,False,False
2,2/3/2008,21.6,24.5,6.6,2.4,0.1,41,17,2,88,...,False,False,False,False,False,False,False,False,False,False
3,2/4/2008,20.2,22.8,18.8,2.2,0.0,41,22,20,83,...,False,False,False,False,False,False,False,False,False,False
4,2/5/2008,19.7,25.7,77.4,4.8,0.0,41,11,6,88,...,False,False,False,False,False,False,False,True,False,False


In [7]:
# Drop the unnecessary column 'Data'
df_sydney_processed.drop('Date',axis=1,inplace=True)
# assign the float data type
df_sydney_processed = df_sydney_processed.astype(float)
df_sydney_processed.head()

Unnamed: 0,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustSpeed,WindSpeed9am,WindSpeed3pm,Humidity9am,Humidity3pm,...,WindDir3pm_NNW,WindDir3pm_NW,WindDir3pm_S,WindDir3pm_SE,WindDir3pm_SSE,WindDir3pm_SSW,WindDir3pm_SW,WindDir3pm_W,WindDir3pm_WNW,WindDir3pm_WSW
0,19.5,22.4,15.6,6.2,0.0,41.0,17.0,20.0,92.0,84.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
1,19.5,25.6,6.0,3.4,2.7,41.0,9.0,13.0,83.0,73.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,21.6,24.5,6.6,2.4,0.1,41.0,17.0,2.0,88.0,86.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,20.2,22.8,18.8,2.2,0.0,41.0,22.0,20.0,83.0,90.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,19.7,25.7,77.4,4.8,0.0,41.0,11.0,6.0,88.0,74.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0


In [8]:
# Get input and output
features = df_sydney_processed.drop(columns='RainTomorrow', axis=1)
Y = df_sydney_processed['RainTomorrow']

In [9]:
####### Performing linear regression

# Split the dataset into train and test
x_train, x_test, y_train, y_test = train_test_split(features, Y, test_size = 0.2, random_state = 10)

In [10]:
# Using linear model from sklearn
LinearReg = LinearRegression()
LinearReg.fit(x_train, y_train)
print('Linear Regression coefficients:', LinearReg.coef_)

Linear Regression coefficients: [-0.02369173  0.01300554  0.00072981  0.00649077 -0.03516427  0.00423762
  0.0018292   0.00078986  0.00095609  0.00856061  0.00769793 -0.00924424
 -0.00887454  0.01004774  0.01446555 -0.00348065 -0.05402493  0.05402493
  0.05039419 -0.07898527  0.06640003 -0.0721012  -0.05945626 -0.08239011
 -0.0789619   0.06418738 -0.00838878  0.11105128  0.01414852  0.03851666
  0.03625722 -0.02133122  0.00395909  0.01670037  0.04350405  0.05317842
 -0.00692976 -0.01911823 -0.01461142 -0.00594829 -0.07546046  0.04176858
 -0.00758587 -0.00980346 -0.01874997  0.00302978  0.01914623 -0.0012425
 -0.01756641  0.01638932 -0.09330032 -0.08339081 -0.01838672 -0.05191842
 -0.04092463  0.03423083  0.06883841  0.01862747  0.06892422  0.00033817
 -0.04820507  0.0755034   0.03967488  0.02636872 -0.02236214  0.02598199]


In [11]:
# Predict if it will rain tomorrow
predictions = LinearReg.predict(x_test)
LinearRegression_MAE = metrics.mean_absolute_error(y_test, predictions)
LinearRegression_MSE = metrics.mean_squared_error(y_test, predictions)
LinearRegression_R2 = metrics.r2_score(y_test, predictions)

In [12]:
# Create a dictionary
errors_linearReg = {
    'Model': ['Linear Reg'],
    'Linear Reg MAE': [LinearRegression_MAE],
    'Linear Reg MSE': [LinearRegression_MSE],
    'Linear R2': [LinearRegression_R2]
}
# Create a panda data frame
Report_LinearReg = pd.DataFrame(errors_linearReg)
print(Report_LinearReg)

        Model  Linear Reg MAE  Linear Reg MSE  Linear R2
0  Linear Reg        0.256318        0.115721   0.427132


In [13]:
####### Performing KNN

######### Normalize the data
scaler = preprocessing.StandardScaler().fit(x_train)
x_train_norm = scaler.transform(x_train)
x_test_norm = scaler.transform(x_test)

# Let's start with k = 4
k = 4
KNN = KNeighborsClassifier(n_neighbors = k).fit(x_train_norm, y_train)
KNN

In [14]:
# Get the predictions
predictions = KNN.predict(x_test_norm)
KNN_Accuracy_Score = metrics.accuracy_score(y_test, predictions)
KNN_JaccardIndex = metrics.jaccard_score(y_test, predictions, average='macro')  # or average='micro' depending on your problem
KNN_F1_Score = metrics.f1_score(y_test, predictions, average='macro')  # or average='micro' or 'weighted' depending on your problem

In [15]:
# Create a dictionary
errors_KNN = {
    'Model': ['KNN'],
    'Accuracy': [KNN_Accuracy_Score],
    'Jaccard': [KNN_JaccardIndex],
    'F1 Score': [KNN_F1_Score]
}

# Create a panda data frame
Report_KNN = pd.DataFrame(errors_KNN)
print(Report_KNN)

  Model  Accuracy   Jaccard  F1 Score
0   KNN  0.760305  0.489394  0.617749


In [16]:
####### Decision Tree
Tree = DecisionTreeClassifier(criterion="entropy", max_depth = 4)
Tree.fit(x_train, y_train)
predictions = Tree.predict(x_test)

In [17]:
Tree_Accuracy_Score = accuracy_score(y_test, predictions)
Tree_JaccardIndex = jaccard_score(y_test, predictions)
Tree_F1_Score = f1_score(y_test, predictions)

In [18]:
# Create a dictionary
errors_Tree = {
    'Model': ['Tree'],
    'Accuracy': [Tree_Accuracy_Score],
    'Jaccard': [Tree_JaccardIndex],
    'F1 Score': [Tree_F1_Score]
}

# Create a panda data frame
Report_Tree = pd.DataFrame(errors_Tree)
print(Report_Tree)

  Model  Accuracy   Jaccard  F1 Score
0  Tree  0.818321  0.480349  0.648968


In [19]:
####### Performing linear regression

# Split the dataset into train and test
x_train, x_test, y_train, y_test = train_test_split(features, Y, test_size = 0.2, random_state = 1)

In [20]:
######## Logistic Regression
LR = LogisticRegression(solver = 'liblinear').fit(x_train, y_train)

In [21]:
predictions = LR.predict(x_test)
predict_proba = LR.predict_proba(x_test)

In [22]:
LR_Accuracy_Score = metrics.accuracy_score(y_test, predictions)
LR_JaccardIndex = metrics.jaccard_score(y_test, predictions)
LR_F1_Score = metrics.f1_score(y_test, predictions)
LR_Log_Loss = metrics.log_loss(y_test, predict_proba)

In [23]:
# Create and train the SVM model
SVM = svm.SVC(probability=True)  # Enable probability estimates with probability=True
SVM.fit(x_train, y_train)

# Use the predict method on the testing data
predictions = SVM.predict(x_test)

# Calculate the evaluation metrics
SVM_Accuracy_Score = metrics.accuracy_score(y_test, predictions)
SVM_JaccardIndex = metrics.jaccard_score(y_test, predictions)
SVM_F1_Score = metrics.f1_score(y_test, predictions)

In [24]:
errors_SVM = {
    'Model': ['SVM'],
    'Accuracy': [Tree_Accuracy_Score],
    'Jaccard': [Tree_JaccardIndex],
    'F1 Score': [Tree_F1_Score]
}

# Create a panda data frame
Report_SVM = pd.DataFrame(errors_SVM)
print(Report_SVM)

  Model  Accuracy   Jaccard  F1 Score
0   SVM  0.818321  0.480349  0.648968


In [25]:
result = pd.concat([Report_LinearReg, Report_KNN, Report_Tree, Report_SVM], axis = 0)

In [26]:
print(result)

        Model  Linear Reg MAE  Linear Reg MSE  Linear R2  Accuracy   Jaccard  \
0  Linear Reg        0.256318        0.115721   0.427132       NaN       NaN   
0         KNN             NaN             NaN        NaN  0.760305  0.489394   
0        Tree             NaN             NaN        NaN  0.818321  0.480349   
0         SVM             NaN             NaN        NaN  0.818321  0.480349   

   F1 Score  
0       NaN  
0  0.617749  
0  0.648968  
0  0.648968  
