In [1]:
import itertools
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import re
import scipy
import seaborn as sns
from scipy import stats
from scipy.stats import pearsonr, ttest_ind
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler

In [2]:
data=pd.read_csv("dataset.csv")

In [3]:
print(data.shape)

(1461, 6)


In [4]:
df_date = pd.read_csv("dataset.csv")

# Display the first five rows of the DataFrame
print(data.head())

         date  precipitation  temp_max  temp_min  wind  weather
0  2012-01-01            0.0      12.8       5.0   4.7  drizzle
1  2012-01-02           10.9      10.6       2.8   4.5     rain
2  2012-01-03            0.8      11.7       7.2   2.3     rain
3  2012-01-04           20.3      12.2       5.6   4.7     rain
4  2012-01-05            1.3       8.9       2.8   6.1     rain


In [5]:
df_date.date = pd.to_datetime(df_date.date).dt.month
df_date.date

0        1
1        1
2        1
3        1
4        1
        ..
1456    12
1457    12
1458    12
1459    12
1460    12
Name: date, Length: 1461, dtype: int32

In [6]:
df_date = df_date.rename(columns = {'date':'month'})
df_date.head()

Unnamed: 0,month,precipitation,temp_max,temp_min,wind,weather
0,1,0.0,12.8,5.0,4.7,drizzle
1,1,10.9,10.6,2.8,4.5,rain
2,1,0.8,11.7,7.2,2.3,rain
3,1,20.3,12.2,5.6,4.7,rain
4,1,1.3,8.9,2.8,6.1,rain


In [7]:
df_date.precipitation=np.sqrt(df_date.precipitation)
df_date.wind=np.sqrt(df_date.wind)

In [8]:
# Create a LabelEncoder object
lc_date = LabelEncoder()

# Encode the "weather" column of the DataFrame and replace it with the encoded values
df_date["weather"] = lc_date.fit_transform(df_date["weather"])

# Display the first few rows of the DataFrame to confirm the encoding
df_date.head()

Unnamed: 0,month,precipitation,temp_max,temp_min,wind,weather
0,1,0.0,12.8,5.0,2.167948,0
1,1,3.301515,10.6,2.8,2.12132,2
2,1,0.894427,11.7,7.2,1.516575,2
3,1,4.505552,12.2,5.6,2.167948,2
4,1,1.140175,8.9,2.8,2.469818,2


In [9]:
# Extract the feature and target variables from the DataFrame
# Convert the features to integers and exclude the "weather" column
x_date = df_date.loc[:, df_date.columns != "weather"].astype(int).values

# Get the target variable as an array of values
y_date = df_date["weather"].values

In [10]:
x_train_date, x_test_date, y_train_date, y_test_date = train_test_split(x_date, y_date, test_size=0.1, random_state=2)

In [11]:
from sklearn.neighbors import KNeighborsClassifier
# Create a KNeighborsClassifier object
knn_date = KNeighborsClassifier()

# Fit the model to the training data
knn_date.fit(x_train_date, y_train_date)

# Compute the accuracy score on the test data
knn_date_score = knn_date.score(x_test_date, y_test_date)

# Print the accuracy score
print("KNN Accuracy (with month column):", knn_date_score)

KNN Accuracy (with month column): 0.7755102040816326


In [12]:
y_pred_knn_date = knn_date.predict(x_test_date)

# Compute the confusion matrix for the KNN model predictions
conf_matrix_knn_date = confusion_matrix(y_test_date, y_pred_knn_date)

# Print the confusion matrix to the console
print("Confusion Matrix (with month column)")
print(conf_matrix_knn_date)

Confusion Matrix (with month column)
[[ 0  1  0  0  4]
 [ 1  0  0  0  4]
 [ 1  0 67  0 11]
 [ 0  0  4  2  0]
 [ 1  2  4  0 45]]


In [13]:
from sklearn.tree import DecisionTreeClassifier

# Import the DecisionTreeClassifier model from sklearn.tree
# Create a list of values for the "max_depth" parameter to test
max_depth_range_date = list(range(1, 8))

# Loop through each value of "max_depth" in the list
for depth in max_depth_range_date:
  # Create a DecisionTreeClassifier model with the current value of "max_depth",
  # a fixed "max_leaf_nodes" value of 15, and a fixed "random_state" value of 0
  dec_date = DecisionTreeClassifier(max_depth=depth, max_leaf_nodes=15, random_state=0)
  
  # Fit the model to the training data
  dec_date.fit(x_train_date, y_train_date)
  
  # Evaluate the model's accuracy on the test data
  dec_date_score = dec_date.score(x_test_date, y_test_date)
  
  # Print the accuracy score to the console, along with the current value of "max_depth"
  print("Decision Tree Accuracy (with month column) for max_depth=", depth, ": ", dec_date_score)

Decision Tree Accuracy (with month column) for max_depth= 1 :  0.782312925170068
Decision Tree Accuracy (with month column) for max_depth= 2 :  0.7959183673469388
Decision Tree Accuracy (with month column) for max_depth= 3 :  0.8095238095238095
Decision Tree Accuracy (with month column) for max_depth= 4 :  0.8095238095238095
Decision Tree Accuracy (with month column) for max_depth= 5 :  0.8027210884353742
Decision Tree Accuracy (with month column) for max_depth= 6 :  0.8027210884353742
Decision Tree Accuracy (with month column) for max_depth= 7 :  0.7959183673469388


In [14]:
from sklearn.metrics import confusion_matrix

# Use the Decision Tree model to predict the target variable for the test set
y_pred_dec_date = dec_date.predict(x_test_date)

# Compute the confusion matrix for the Decision Tree model predictions
conf_matrix_dec_date = confusion_matrix(y_test_date, y_pred_dec_date)

# Print the confusion matrix to the console
print("Confusion Matrix (with month column)")
print(conf_matrix_dec_date)

Confusion Matrix (with month column)
[[ 0  0  0  0  5]
 [ 0  0  0  0  5]
 [ 0  1 63  1 14]
 [ 0  0  1  4  1]
 [ 0  1  1  0 50]]


In [15]:
from sklearn.linear_model import LogisticRegression

# Create a new logistic regression model for the "x_date" and "y_date" datasets
lg_date = LogisticRegression()

# Fit the logistic regression model to the training data
lg_date.fit(x_train_date, y_train_date)

# Use the logistic regression model to predict the target variable for the test set
lg_date_score = lg_date.score(x_test_date, y_test_date)

# Print the accuracy score of the logistic regression model to the console
print("Logistic Accuracy (with month column): ", lg_date_score)

Logistic Accuracy (with month column):  0.8027210884353742


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [16]:
from sklearn.metrics import confusion_matrix

# Use the logistic regression model to predict the target variable for the test set
y_pred_lg_date = lg_date.predict(x_test_date)

# Compute the confusion matrix for the logistic regression model predictions
conf_matrix_date = confusion_matrix(y_test_date, y_pred_lg_date)

# Print the confusion matrix to the console
print("Confusion Matrix (with month column)")
print(conf_matrix_date)

Confusion Matrix (with month column)
[[ 0  0  0  0  5]
 [ 0  0  0  0  5]
 [ 0  0 64  0 15]
 [ 0  0  3  2  1]
 [ 0  0  0  0 52]]


In [17]:
df3 = pd.read_csv("dataset.csv")


In [30]:
import pandas as pd

# Assuming df3 is your DataFrame
numeric_columns = df3.select_dtypes(include=np.number).columns

# Calculate quartiles and IQR for numeric columns only
Q1 = df3[numeric_columns].quantile(0.25)
Q3 = df3[numeric_columns].quantile(0.75)
IQR = Q3 - Q1

# Use the IQR to filter out outliers in numeric columns
outlier_mask = ~((df3[numeric_columns] < (Q1 - 1.5 * IQR)) | (df3[numeric_columns] > (Q3 + 1.5 * IQR))).any(axis=1)

# Apply the mask to the original DataFrame
df3 = df3[outlier_mask]


In [19]:
df3.precipitation=np.sqrt(df3.precipitation)
df3.wind=np.sqrt(df3.wind)

In [31]:
lc = LabelEncoder()
df3["weather"]=lc.fit_transform(df3["weather"])
df3.head()

Unnamed: 0,date,precipitation,temp_max,temp_min,wind,weather
0,2012-01-01,0.0,12.8,5.0,2.167948,0
2,2012-01-03,0.894427,11.7,7.2,1.516575,2
4,2012-01-05,1.140175,8.9,2.8,2.469818,2
5,2012-01-06,1.581139,4.4,2.2,1.48324,2
6,2012-01-07,0.0,7.2,2.8,1.516575,2


In [36]:
df3.date = pd.to_datetime(df3.date)
df3.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1092 entries, 0 to 1460
Data columns (total 6 columns):
 #   Column         Non-Null Count  Dtype         
---  ------         --------------  -----         
 0   date           1092 non-null   datetime64[ns]
 1   precipitation  1092 non-null   float64       
 2   temp_max       1092 non-null   float64       
 3   temp_min       1092 non-null   float64       
 4   wind           1092 non-null   float64       
 5   weather        1092 non-null   int64         
dtypes: datetime64[ns](1), float64(4), int64(1)
memory usage: 59.7 KB


In [41]:
x_df3 = ((df3.loc[:,df3.columns!="weather"]).astype(np.int64)).values[:,0:]
y_df3 = df3["weather"].values

In [42]:
x_train_df3,x_test_df3,y_train_df3,y_test_df3 = train_test_split(x_df3,y_df3,test_size=0.1,random_state=2)

In [43]:
from sklearn.neighbors import KNeighborsClassifier

# Instantiate a KNN model
knn_df3 = KNeighborsClassifier()

# Train the KNN model using the training data
knn_df3.fit(x_train_df3, y_train_df3)

# Evaluate the accuracy of the KNN model on the test data
knn_score_df3 = knn_df3.score(x_test_df3, y_test_df3)

# Print the KNN model accuracy to the console
print("KNN Accuracy:", knn_score_df3)

KNN Accuracy: 0.5909090909090909


In [44]:
from sklearn.tree import DecisionTreeClassifier

# Create a list of max depth values to try
max_depth_range = list(range(1, 8))

# Train and evaluate a decision tree model with varying max depth values
for depth in max_depth_range:
    
    # Create a decision tree classifier with the current max depth value and other parameters
    dec_df3 = DecisionTreeClassifier(max_depth=depth, max_leaf_nodes=15, random_state=0)
    
    # Train the decision tree model on the training data
    dec_df3.fit(x_train_df3, y_train_df3)
    
    # Compute the accuracy of the decision tree model on the testing data
    dec_score_df3 = dec_df3.score(x_test_df3, y_test_df3)
    
    # Print the accuracy score to the console
    print("Decision Tree Accuracy: ", dec_score_df3)

Decision Tree Accuracy:  0.6818181818181818
Decision Tree Accuracy:  0.6818181818181818
Decision Tree Accuracy:  0.6909090909090909
Decision Tree Accuracy:  0.7090909090909091
Decision Tree Accuracy:  0.7363636363636363
Decision Tree Accuracy:  0.7363636363636363
Decision Tree Accuracy:  0.7545454545454545


In [45]:
from sklearn.linear_model import LogisticRegression

# Create a logistic regression model object
lg_df3 = LogisticRegression()

# Train the logistic regression model on the training data
lg_df3.fit(x_train_df3, y_train_df3)

# Evaluate the logistic regression model on the test data
# by computing the accuracy score
lg_score_df3 = lg_df3.score(x_test_df3, y_test_df3)

# Print the accuracy score to the console
print("Logistic Accuracy : ", lg_score_df3)

Logistic Accuracy :  0.02727272727272727


ABNORMAL_TERMINATION_IN_LNSRCH.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [46]:
dec_df3 = DecisionTreeClassifier(max_depth=4, max_leaf_nodes=15, random_state=0)

# Train the decision tree model on the training data
dec_df3.fit(x_train_df3, y_train_df3)

# Compute the accuracy of the decision tree model on the testing data
dec_score_df3 = dec_df3.score(x_test_df3, y_test_df3)

# Print the accuracy score to the console
print("Decision Tree Accuracy: ", dec_score_df3)

Decision Tree Accuracy:  0.7090909090909091


In [47]:
for i in (range(len(y_test_df3))):
    print("----------------------------------")
    ot = dec_df3.predict([x_test_df3[i]])
    if(ot==0):
        print("The weather predict is: Drizzle")
    elif(ot==1):
        print("The weather predict is: Fog")
    elif(ot==2):
        print("The weather predict is: Rain")
    elif(ot==3):
        print("The weather predict is: Snow")
    else:
        print("The weather predict is: Sun")
    ac = y_test_df3[i]
    if(ac==0):
        print("The weather actual is: Drizzle")
    elif(ac==1):
        print("The weather actual is: Fog")
    elif(ac==2):
        print("The weather actual is: Rain")
    elif(ac==3):
        print("The weather actual is: Snow")
    else:
        print("The weather actual is: Sun")

----------------------------------
The weather predict is: Sun
The weather actual is: Fog
----------------------------------
The weather predict is: Sun
The weather actual is: Sun
----------------------------------
The weather predict is: Sun
The weather actual is: Sun
----------------------------------
The weather predict is: Sun
The weather actual is: Sun
----------------------------------
The weather predict is: Sun
The weather actual is: Fog
----------------------------------
The weather predict is: Sun
The weather actual is: Sun
----------------------------------
The weather predict is: Sun
The weather actual is: Sun
----------------------------------
The weather predict is: Rain
The weather actual is: Rain
----------------------------------
The weather predict is: Rain
The weather actual is: Rain
----------------------------------
The weather predict is: Sun
The weather actual is: Sun
----------------------------------
The weather predict is: Sun
The weather actual is: Sun
------

In [49]:
input=[[1.14,8.9,2.8,2.46,2]]
ot = dec_df3.predict(input)
print("The weather is:")
if(ot==0):
    print("Drizzle")
elif(ot==1):
    print("Fog")
elif(ot==2):
    print("Rain")
elif(ot==3):
    print("Snow")
else:
    print("Sun")

The weather is:
Rain
