In [22]:
#This imports the Pandas library, commonly used for data manipulation and analysis in Python.
import pandas as pd

In [23]:
# Read the weather dataset from the CSV file "seattle-weather.csv" into a DataFrame named df,
# then preview its first rows.
df = pd.read_csv("seattle-weather.csv")
df.head()

Unnamed: 0,date,precipitation,temp_max,temp_min,wind,weather
0,1/1/2012,0.0,12.8,5.0,4.7,drizzle
1,1/2/2012,10.9,10.6,2.8,4.5,rain
2,1/3/2012,0.8,11.7,7.2,2.3,rain
3,1/4/2012,20.3,12.2,5.6,4.7,rain
4,1/5/2012,1.3,8.9,2.8,6.1,rain


In [24]:
# Count the missing entries in every column of df.
df.isna().sum()

date             0
precipitation    0
temp_max         0
temp_min         0
wind             0
weather          0
dtype: int64

In [25]:

def LabelEncoding(c):
    """Label-encode column ``c`` of the notebook-level DataFrame ``df`` in place.

    Args:
        c: Name of the column in ``df`` to encode.

    Returns:
        The distinct encoded values now present in ``df[c]``. The previous
        version computed this value and silently discarded it.

    Note:
        This reads and overwrites the global ``df``. The fitted encoder is a
        local variable, so it is not available to callers after the call.
    """
    from sklearn import preprocessing          # local import, kept from the original cell
    le = preprocessing.LabelEncoder()          # fresh encoder for this column
    df[c] = le.fit_transform(df[c])            # overwrite the column with its encoded values
    return df[c].unique()                      # hand the distinct codes back instead of dropping them

# Apply label encoding to the 'weather' column (its text labels in df are replaced by integer codes)
LabelEncoding("weather")

# Display the DataFrame after encoding; in the rows shown below 'drizzle' became 0 and 'rain' became 2
df

Unnamed: 0,date,precipitation,temp_max,temp_min,wind,weather
0,1/1/2012,0.0,12.8,5.0,4.7,0
1,1/2/2012,10.9,10.6,2.8,4.5,2
2,1/3/2012,0.8,11.7,7.2,2.3,2
3,1/4/2012,20.3,12.2,5.6,4.7,2
4,1/5/2012,1.3,8.9,2.8,6.1,2
...,...,...,...,...,...,...
1456,12/27/2015,8.6,4.4,1.7,2.9,2
1457,12/28/2015,1.5,5.0,1.7,1.3,2
1458,12/29/2015,0.0,7.2,0.6,2.6,1
1459,12/30/2015,0.0,5.6,-1.0,3.4,4


In [26]:
# Names of the numeric columns that are handed to normalize() for scaling
cols = [
    'precipitation',
    'temp_max',
    'temp_min',
    'wind',
]

In [27]:
def normalize(df, cols):
    """Scale the given columns of ``df`` in place by each column's maximum.

    Args:
        df: DataFrame whose columns are rescaled; it is modified in place.
        cols: Iterable of column names to rescale.

    Returns:
        The same ``df`` object that was passed in, so the call can be
        chained or displayed directly.

    Note:
        Only the maximum is used, so negative inputs stay negative after
        scaling. A column whose maximum is 0 is left unchanged, because
        dividing by it would fill the column with NaN/inf values.
    """
    for x in cols:
        peak = df[x].max()
        if peak == 0:
            # Dividing by a zero maximum would destroy the column; skip it.
            continue
        # Divide all values by the maximum value in that column
        df[x] = df[x] / peak
    return df

# Call the normalization function on the DataFrame 'df' and the selected columns.
# NOTE(review): the column maxima are taken over the full dataset, i.e. before the
# train/test split done in a later cell, so test rows influence the scaling — confirm
# this is intended.
normalize(df, cols)

# Display the normalized DataFrame (values can remain negative, e.g. temp_min in the last row shown)
df

Unnamed: 0,date,precipitation,temp_max,temp_min,wind,weather
0,1/1/2012,0.000000,0.359551,0.273224,0.494737,0
1,1/2/2012,0.194991,0.297753,0.153005,0.473684,2
2,1/3/2012,0.014311,0.328652,0.393443,0.242105,2
3,1/4/2012,0.363148,0.342697,0.306011,0.494737,2
4,1/5/2012,0.023256,0.250000,0.153005,0.642105,2
...,...,...,...,...,...,...
1456,12/27/2015,0.153846,0.123596,0.092896,0.305263,2
1457,12/28/2015,0.026834,0.140449,0.092896,0.136842,2
1458,12/29/2015,0.000000,0.202247,0.032787,0.273684,1
1459,12/30/2015,0.000000,0.157303,-0.054645,0.357895,4


In [28]:
# Delete the 'date' column from the DataFrame.
# errors='ignore' keeps this cell re-runnable: once 'date' is gone, a plain
# drop would raise KeyError on a second execution.
df = df.drop(columns=['date'], errors='ignore')
df

Unnamed: 0,precipitation,temp_max,temp_min,wind,weather
0,0.000000,0.359551,0.273224,0.494737,0
1,0.194991,0.297753,0.153005,0.473684,2
2,0.014311,0.328652,0.393443,0.242105,2
3,0.363148,0.342697,0.306011,0.494737,2
4,0.023256,0.250000,0.153005,0.642105,2
...,...,...,...,...,...
1456,0.153846,0.123596,0.092896,0.305263,2
1457,0.026834,0.140449,0.092896,0.136842,2
1458,0.000000,0.202247,0.032787,0.273684,1
1459,0.000000,0.157303,-0.054645,0.357895,4


In [29]:
# Features (independent variables): every column of df except 'weather'
x = df.drop(columns=['weather'])

# Target (dependent variable): the 'weather' column on its own
y = df.loc[:, 'weather']

In [30]:
from sklearn.model_selection import train_test_split
# Split the dataset into training and testing sets: 20% of the rows are held out for
# testing, and random_state=0 fixes the split so it is reproducible.
# NOTE(review): the per-class supports in the reports further down are very uneven
# (6 to 125 test rows); consider whether a stratified split is wanted — confirm before
# changing, since it would alter every downstream metric.
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size = 0.2, random_state = 0)

In [31]:
# Uncomment to install XGBoost into the kernel's environment: %pip install xgboost

In [32]:
from xgboost import XGBClassifier

In [33]:
# Create an instance of the XGBClassifier model named 'xg' with no explicit hyperparameters.
# Train (fit) the model on the training data (X_train) and training labels (y_train).

xg = XGBClassifier()
xg.fit(X_train, y_train)

In [34]:
# Retrieve the parameters of the trained XGBClassifier model 'xg' to inspect its configuration.
# Most entries in the output below are None because no hyperparameters were passed to the
# constructor; presumably XGBoost substitutes its own defaults for those — verify.

xg.get_params()

{'objective': 'multi:softprob',
 'base_score': None,
 'booster': None,
 'callbacks': None,
 'colsample_bylevel': None,
 'colsample_bynode': None,
 'colsample_bytree': None,
 'device': None,
 'early_stopping_rounds': None,
 'enable_categorical': False,
 'eval_metric': None,
 'feature_types': None,
 'feature_weights': None,
 'gamma': None,
 'grow_policy': None,
 'importance_type': None,
 'interaction_constraints': None,
 'learning_rate': None,
 'max_bin': None,
 'max_cat_threshold': None,
 'max_cat_to_onehot': None,
 'max_delta_step': None,
 'max_depth': None,
 'max_leaves': None,
 'min_child_weight': None,
 'missing': nan,
 'monotone_constraints': None,
 'multi_strategy': None,
 'n_estimators': None,
 'n_jobs': None,
 'num_parallel_tree': None,
 'random_state': None,
 'reg_alpha': None,
 'reg_lambda': None,
 'sampling_method': None,
 'scale_pos_weight': None,
 'subsample': None,
 'tree_method': None,
 'validate_parameters': None,
 'verbosity': None}

In [35]:
# Import functions to evaluate classification models:
# - classification_report: provides detailed metrics like precision, recall, F1-score for each class.
# - accuracy_score: calculates the overall accuracy of the model's predictions.

from sklearn.metrics import classification_report, accuracy_score

In [36]:

# Predict the labels for the test set using the trained baseline model
y_hat = xg.predict(X_test)
# Print the accuracy of the predictions compared to the true labels
print(accuracy_score(y_test, y_hat))
# Print detailed per-class metrics. zero_division=0 reports 0.0 for any class that is
# never predicted instead of emitting UndefinedMetricWarning; the reported numbers are
# the same as with the default setting.
print(classification_report(y_test, y_hat, zero_division=0))


0.757679180887372
              precision    recall  f1-score   support

           0       0.00      0.00      0.00        10
           1       0.15      0.07      0.10        29
           2       0.95      0.91      0.93       123
           3       1.00      0.33      0.50         6
           4       0.70      0.85      0.77       125

    accuracy                           0.76       293
   macro avg       0.56      0.43      0.46       293
weighted avg       0.73      0.76      0.74       293



In [37]:

# Hyperparameter search space used by the grid search
grid = dict(
    learning_rate=[0.1, 1, 0.01, 0.001],  # candidate learning rates
    gamma=[0, 1, 10, 100],                # candidate gamma (minimum loss reduction) values
)

In [38]:
from sklearn.model_selection import GridSearchCV

In [39]:
# Grid search over `grid` on a fresh XGBClassifier, scored with 10-fold cross-validation;
# verbose=2 prints detailed messages during the search.
model = GridSearchCV(estimator=XGBClassifier(), param_grid=grid, cv=10, verbose=2)

In [40]:
# Train the grid search on the training data: 16 parameter combinations x 10 folds = 160 fits
# (see the log below).
model.fit(X_train, y_train)

Fitting 10 folds for each of 16 candidates, totalling 160 fits
[CV] END .........................gamma=0, learning_rate=0.1; total time=   0.5s
[CV] END .........................gamma=0, learning_rate=0.1; total time=   0.1s
[CV] END .........................gamma=0, learning_rate=0.1; total time=   0.1s
[CV] END .........................gamma=0, learning_rate=0.1; total time=   0.1s
[CV] END .........................gamma=0, learning_rate=0.1; total time=   0.1s
[CV] END .........................gamma=0, learning_rate=0.1; total time=   0.1s
[CV] END .........................gamma=0, learning_rate=0.1; total time=   0.1s
[CV] END .........................gamma=0, learning_rate=0.1; total time=   0.1s
[CV] END .........................gamma=0, learning_rate=0.1; total time=   0.2s
[CV] END .........................gamma=0, learning_rate=0.1; total time=   0.1s
[CV] END ...........................gamma=0, learning_rate=1; total time=   0.1s
[CV] END ...........................gamma=0, l

In [41]:
# Predict test set labels using the best estimator found by GridSearchCV
grid_predictions = model.predict(X_test)
# Print accuracy of the grid search model's predictions
print(accuracy_score(y_test, grid_predictions))
# Print detailed per-class metrics. zero_division=0 reports 0.0 for any class that is
# never predicted instead of emitting the UndefinedMetricWarning seen in this cell's
# output; the reported numbers are the same as with the default setting.
print(classification_report(y_test, grid_predictions, zero_division=0))


0.8088737201365188
              precision    recall  f1-score   support

           0       0.00      0.00      0.00        10
           1       0.25      0.03      0.06        29
           2       0.97      0.90      0.93       123
           3       1.00      0.33      0.50         6
           4       0.72      0.98      0.83       125

    accuracy                           0.81       293
   macro avg       0.59      0.45      0.46       293
weighted avg       0.76      0.81      0.76       293



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [42]:
# Show the configuration of the best estimator selected by the grid search
print(model.best_estimator_)

XGBClassifier(base_score=None, booster=None, callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=None, device=None, early_stopping_rounds=None,
              enable_categorical=False, eval_metric=None, feature_types=None,
              feature_weights=None, gamma=1, grow_policy=None,
              importance_type=None, interaction_constraints=None,
              learning_rate=0.01, max_bin=None, max_cat_threshold=None,
              max_cat_to_onehot=None, max_delta_step=None, max_depth=None,
              max_leaves=None, min_child_weight=None, missing=nan,
              monotone_constraints=None, multi_strategy=None, n_estimators=None,
              n_jobs=None, num_parallel_tree=None, ...)
