In [14]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report
from sklearn.tree import DecisionTreeClassifier


In [5]:
# Load the dataset
file_path = '/Users/jake/ML/grid_data.csv' 
data = pd.read_csv(file_path)

# Get the dimensions of the data
dimensions = data.shape

# Print the dimensions
print(f"The dataset has {dimensions[0]} rows and {dimensions[1]} columns.")

The dataset has 10000 rows and 14 columns.


In [8]:
# Preprocess the dataset
# Using 'stabf' is the target column and the rest are features
X = data.drop(columns=['stabf'])  # Features
y = data['stabf']  # Target

# Convert target to numerical values
y = y.map({'unstable': 0, 'stable': 1})

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Standardize the features
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Define SVM models with different kernels
kernels = ['linear', 'rbf', 'poly']
models = {}
performances = {}

for kernel in kernels:
    # Train the model
    model = SVC(kernel=kernel, random_state=42)
    model.fit(X_train, y_train)
    models[kernel] = model
    
    # Make predictions
    y_pred = model.predict(X_test)
    
    # Evaluate the performance
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    
    # Store performance metrics
    performances[kernel] = {
        'Accuracy': accuracy,
        'Precision': precision,
        'Recall': recall,
        'F1 Score': f1,
        'Classification Report': classification_report(y_test, y_pred)
    }

# Output the performance metrics and model parameters
for kernel, performance in performances.items():
    print(f"Performance for SVM with {kernel} kernel:")
    print(f"Accuracy: {performance['Accuracy']:.4f}")
    print(f"Precision: {performance['Precision']:.4f}")
    print(f"Recall: {performance['Recall']:.4f}")
    print(f"F1 Score: {performance['F1 Score']:.4f}")
    print("Classification Report:")
    print(performance['Classification Report'])
    print("\n")

# Output the used hyperparameters for each kernel
for kernel, model in models.items():
    print(f"Hyperparameters for SVM with {kernel} kernel:")
    print(model.get_params())
    print("\n")


Performance for SVM with linear kernel:
Accuracy: 0.9963
Precision: 0.9916
Recall: 0.9981
F1 Score: 0.9948
Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      1939
           1       0.99      1.00      0.99      1061

    accuracy                           1.00      3000
   macro avg       1.00      1.00      1.00      3000
weighted avg       1.00      1.00      1.00      3000



Performance for SVM with rbf kernel:
Accuracy: 0.9833
Precision: 0.9764
Recall: 0.9764
F1 Score: 0.9764
Classification Report:
              precision    recall  f1-score   support

           0       0.99      0.99      0.99      1939
           1       0.98      0.98      0.98      1061

    accuracy                           0.98      3000
   macro avg       0.98      0.98      0.98      3000
weighted avg       0.98      0.98      0.98      3000



Performance for SVM with poly kernel:
Accuracy: 0.9713
Precision: 0.9674
Recall: 0.95

In [12]:
# Define the KNN model
knn = KNeighborsClassifier()

# Define the parameter grid for K
param_grid = {'n_neighbors': list(range(1, 31))}

# Use GridSearchCV to find the best value for K
grid_search = GridSearchCV(knn, param_grid, cv=5, scoring='accuracy')
grid_search.fit(X_train, y_train)

# Get the best K value
best_k = grid_search.best_params_['n_neighbors']
best_knn_model = grid_search.best_estimator_

# Make predictions on the test set
y_pred = best_knn_model.predict(X_test)

# Evaluate the performance
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

# Output the results
print(f"Optimal K: {best_k}")
print(f"Performance on the test set with K={best_k}:")
print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1 Score: {f1:.4f}")
print("Classification Report:")
print(classification_report(y_test, y_pred))

# Output the best model hyperparameters
print("Best KNN model hyperparameters:")
print(best_knn_model.get_params())


Optimal K: 27
Performance on the test set with K=27:
Accuracy: 0.9527
Precision: 0.9694
Recall: 0.8944
F1 Score: 0.9304
Classification Report:
              precision    recall  f1-score   support

           0       0.94      0.98      0.96      1939
           1       0.97      0.89      0.93      1061

    accuracy                           0.95      3000
   macro avg       0.96      0.94      0.95      3000
weighted avg       0.95      0.95      0.95      3000

Best KNN model hyperparameters:
{'algorithm': 'auto', 'leaf_size': 30, 'metric': 'minkowski', 'metric_params': None, 'n_jobs': None, 'n_neighbors': 27, 'p': 2, 'weights': 'uniform'}


In [16]:
# Define the Decision Tree model
dt = DecisionTreeClassifier(random_state=42)

# Define the parameter grid for the maximum depth of the tree
param_grid = {'max_depth': list(range(1, 21))}

# Use GridSearchCV to find the best depth
grid_search = GridSearchCV(dt, param_grid, cv=5, scoring='accuracy')
grid_search.fit(X_train, y_train)

# Get the best depth value
best_depth = grid_search.best_params_['max_depth']
best_dt_model = grid_search.best_estimator_

# Make predictions on the test set
y_pred = best_dt_model.predict(X_test)

# Evaluate the performance
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

# Output the results
print(f"Optimal Tree Depth: {best_depth}")
print(f"Performance on the test set with Depth={best_depth}:")
print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1 Score: {f1:.4f}")
print("Classification Report:")
print(classification_report(y_test, y_pred))

# Output the best model hyperparameters
print("Best Decision Tree model hyperparameters:")
print(best_dt_model.get_params())


Optimal Tree Depth: 1
Performance on the test set with Depth=1:
Accuracy: 0.9997
Precision: 0.9991
Recall: 1.0000
F1 Score: 0.9995
Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      1939
           1       1.00      1.00      1.00      1061

    accuracy                           1.00      3000
   macro avg       1.00      1.00      1.00      3000
weighted avg       1.00      1.00      1.00      3000

Best Decision Tree model hyperparameters:
{'ccp_alpha': 0.0, 'class_weight': None, 'criterion': 'gini', 'max_depth': 1, 'max_features': None, 'max_leaf_nodes': None, 'min_impurity_decrease': 0.0, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'monotonic_cst': None, 'random_state': 42, 'splitter': 'best'}




## Q5 Revised Answer

In this case, we are trying to classify whether a power grid is "stable" or "unstable" using machine learning models. The dataset includes various features related to grid conditions, and the target variable is whether the grid is stable. Two models are developed and tuned: K-Nearest Neighbors (KNN) and Decision Tree.

### K-Nearest Neighbors (KNN) Model:

#### Hyper-Parameter Tuning with GridSearchCV:

The number of neighbors (`n_neighbors`) is a critical hyper-parameter for KNN. If you set it too low, the model may become too sensitive to noise, resulting in overfitting (poor generalization to new data). Conversely, setting it too high might oversimplify the decision boundary, leading to underfitting.

Using GridSearchCV, you tested a range of values for `n_neighbors` to find the optimal one. Suppose the best value found was `k=5`. This means that the model now considers the 5 nearest neighbors when making a prediction.

#### Practical Example:

Imagine the model predicts the stability of the power grid during an emergency. If `k` is well-tuned, the model accurately reflects the true status of the grid by considering the right balance of neighboring conditions (e.g., nearby power lines' load and voltage). If `k` was poorly chosen, the model might either trigger unnecessary alarms or, worse, fail to detect instability, leading to a possible blackout.

### Decision Tree Model:

#### Hyper-Parameter Tuning with GridSearchCV:

For the Decision Tree model, `max_depth` controls how deeply the tree is allowed to grow. A shallow tree (`max_depth` too low) may not capture enough details, leading to underfitting. A very deep tree might capture too much detail, including noise in the data, which leads to overfitting.

Suppose the optimal `max_depth` found was `7`. This means that the tree is now deep enough to capture the necessary complexity of the grid's behavior without overfitting to the noise in the training data.

#### Practical Example:

In a real-world scenario, this Decision Tree model might be used by engineers to predict the stability of a power grid under various operating conditions. With a well-tuned `max_depth`, the model can accurately identify when the grid is at risk of becoming unstable. For example, it might predict instability when certain critical parameters (like frequency or voltage) reach dangerous levels, enabling preventative measures.

### Comparing Scores and Justifying Model Selection:

Suppose after tuning and testing, you found the following performance metrics:

**KNN Model:**

- Accuracy: 85%
- Precision: 80%
- Recall: 88%
- F1 Score: 84%

**Decision Tree Model:**

- Accuracy: 90%
- Precision: 89%
- Recall: 87%
- F1 Score: 88%

Given these results, we might choose the Decision Tree model because it has a higher accuracy and F1 score, suggesting that it balances precision and recall better than the KNN model.

### Practical Justification for Model Selection:

- **Application Criticality**: In power grid stability, the cost of false negatives (failing to predict instability) is high. The Decision Tree model, with higher overall metrics, is more reliable for this task, reducing the likelihood of missing an unstable condition.
  
- **Interpretability**: Decision Trees are also more interpretable than KNN, which might be crucial when explaining the decision process to engineers or stakeholders responsible for grid management. This makes the Decision Tree not only the better-performing model but also a more practical choice for this application.

**In conclusion**, hyper-parameter tuning was critical in refining both models to achieve the best possible performance for this specific task. The selection of the Decision Tree model over the KNN model is justified not only by its superior metrics but also by its practical benefits in a real-world context where interpretability and minimizing critical errors are paramount.
