In [11]:
import importlib
import mysklearn
importlib.reload(mysklearn)

import mysklearn.mypytable
importlib.reload(mysklearn.mypytable)
from mysklearn.mypytable import MyPyTable

import mysklearn.myclassifiers
importlib.reload(mysklearn.myclassifiers)
from mysklearn.myclassifiers import MyNaiveBayesClassifier
from mysklearn.myclassifiers import MyDecisionTreeClassifier
from mysklearn.myclassifiers import MyRandomForestClassifier

import mysklearn.myevaluation
importlib.reload(mysklearn.myevaluation)
import mysklearn.myevaluation as myevaluation

# Introduction

For this project, we used a fully synthetic dataset from Kaggle. It contains mostly continuous data. It has 15 total attributes and 10,000 instances. We tried to classify if a crop yield was Low, Medium, or High as labels because there was no existing attribute appropriate for prediction.

(findings here)
(best performing classifier)

# Data Analysis

Our dataset is mostly continuous with 10,000 instances and 15 attributes. The "Year" attribute is an integer representing the year of recorded instance values. The attributes Country,Region,Crop_Type, and Adaptation_Strategy are all categorical strings. The attributes Average_Temperature_C, Total_Precipitation_mm, CO2_Emissions_MT, Crop_Yield_MT_per_HA, Extreme_Weather_Events, Irrigation_Access_%, Pesticide_Use_KG_per_HA, Fertilizer_Use_KG_per_HA, Soil_Health_Index, and Economic_Impact_Million_USD are all float values.



## Relevant Summary Statistics

## Data Visualizations

(code cell below for function calls to display graphs?)

# Classification Results

In [12]:
import csv
from mysklearn.myclassifiers import MyRandomForestClassifier, MyKNeighborsClassifier
from mysklearn.myevaluation import train_test_split, confusion_matrix, accuracy_score



In [13]:
def load_data(filename):
    table = []
    with open(filename, 'r') as file:
        reader = csv.reader(file)
        header = next(reader)
        for row in reader:
            table.append(row)
    return header, table

def prepare_data(header, table):
    numeric_features = ['Average_Temperature_C', 'Total_Precipitation_mm', 
                       'CO2_Emissions_MT', 'Extreme_Weather_Events',
                       'Irrigation_Access_%', 'Pesticide_Use_KG_per_HA', 
                       'Fertilizer_Use_KG_per_HA', 'Soil_Health_Index']
    
    feature_indices = [header.index(feat) for feat in numeric_features]
    yield_index = header.index('Crop_Yield_MT_per_HA')
    
    X = []
    y_continuous = []
    for row in table:
        try:
            features = [float(row[i]) for i in feature_indices]
            yield_val = float(row[yield_index])
            X.append(features)
            y_continuous.append(yield_val)
        except:
            pass
    
    sorted_yields = sorted(y_continuous)
    p33_index = int(len(sorted_yields) * 0.33)
    p67_index = int(len(sorted_yields) * 0.67)
    p33 = sorted_yields[p33_index]
    p67 = sorted_yields[p67_index]
    
    y = []
    for yield_val in y_continuous:
        if yield_val < p33:
            y.append('Low')
        elif yield_val < p67:
            y.append('Medium')
        else:
            y.append('High')
    
    return X, y

def print_confusion_matrix(matrix, labels):
    print("\nConfusion Matrix:")
    print("=" * 50)
    print(f"{'':12}", end="")
    for label in labels:
        print(f"{label:>10}", end="")
    print()
    print("-" * 50)
    for i, label in enumerate(labels):
        print(f"{label:12}", end="")
        for j in range(len(labels)):
            print(f"{matrix[i][j]:>10}", end="")
        print()

def discretize_features(X):
    """Convert continuous features to categorical bins for Naive Bayes."""
    X_discretized = []
    
    # First, find min/max for each feature to create bins
    n_features = len(X[0])
    feature_mins = [min(instance[i] for instance in X) for i in range(n_features)]
    feature_maxs = [max(instance[i] for instance in X) for i in range(n_features)]
    
    for instance in X:
        discretized_instance = []
        for i, value in enumerate(instance):
            # Create 3 equal-width bins: Low, Medium, High
            range_size = (feature_maxs[i] - feature_mins[i]) / 3
            if value < feature_mins[i] + range_size:
                discretized_instance.append('Low')
            elif value < feature_mins[i] + 2 * range_size:
                discretized_instance.append('Medium')
            else:
                discretized_instance.append('High')
        X_discretized.append(discretized_instance)
    
    return X_discretized

In [14]:
filename = 'climate_change_impact_on_agriculture_2024.csv'
header, table = load_data(filename)
X, y = prepare_data(header, table)

print(f"Dataset: {len(X)} instances, {len(X[0])} features")
print(f"Classes: Low, Medium, High")

# Split 
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)
print(f"Training: {len(X_train)} instances")
print(f"Test: {len(X_test)} instances")

Dataset: 10000 instances, 8 features
Classes: Low, Medium, High
Training: 6700 instances
Test: 3300 instances


In [15]:
print("=" * 60)
print("RANDOM FOREST CLASSIFIER")
print("=" * 60)

rf = MyRandomForestClassifier(n_trees=10, max_depth=5)
rf.fit(X_train, y_train)
rf_pred = rf.predict(X_test)
rf_acc = accuracy_score(y_test, rf_pred)

print(f"Accuracy: {rf_acc:.4f} ({rf_acc*100:.2f}%)")

labels = ['Low', 'Medium', 'High']
rf_matrix = confusion_matrix(y_test, rf_pred, labels)
print_confusion_matrix(rf_matrix, labels)

RANDOM FOREST CLASSIFIER
Accuracy: 0.3282 (32.82%)

Confusion Matrix:
                   Low    Medium      High
--------------------------------------------------
Low                662       207       196
Medium             698       235       180
High               709       227       186


In [16]:
print("=" * 60)
print("K-NEAREST NEIGHBORS")
print("=" * 60)

knn = MyKNeighborsClassifier(n_neighbors=5)
knn.fit(X_train, y_train)
knn_pred = knn.predict(X_test)
knn_acc = accuracy_score(y_test, knn_pred)

print(f"Accuracy: {knn_acc:.4f} ({knn_acc*100:.2f}%)")

knn_matrix = confusion_matrix(y_test, knn_pred, labels)
print_confusion_matrix(knn_matrix, labels)


print("\n" + "=" * 60)
print("NAIVE BAYES CLASSIFIER")
print("=" * 60)
print("Discretizing continuous features into Low/Medium/High bins...")

# Discretize features for Naive Bayes
X_train_disc = discretize_features(X_train)
X_test_disc = discretize_features(X_test)

# Train Naive Bayes
nb = MyNaiveBayesClassifier()
nb.fit(X_train_disc, y_train)
nb_pred = nb.predict(X_test_disc)
nb_acc = accuracy_score(y_test, nb_pred)

print(f"\n‚úì Naive Bayes Accuracy: {nb_acc:.4f} ({nb_acc*100:.2f}%)")

nb_matrix = confusion_matrix(y_test, nb_pred, labels)
print_confusion_matrix(nb_matrix, labels)

# Calculate per-class accuracy
print("\nPer-Class Recognition Rates:")
for i, label in enumerate(labels):
    total = sum(nb_matrix[i])
    correct = nb_matrix[i][i]
    rate = (correct / total * 100) if total > 0 else 0
    print(f"  {label}: {correct}/{total} = {rate:.1f}%")

K-NEAREST NEIGHBORS
Accuracy: 0.3748 (37.48%)

Confusion Matrix:
                   Low    Medium      High
--------------------------------------------------
Low                349       377       339
Medium             331       411       371
High               277       368       477

NAIVE BAYES CLASSIFIER
Discretizing continuous features into Low/Medium/High bins...

‚úì Naive Bayes Accuracy: 0.4727 (47.27%)

Confusion Matrix:
                   Low    Medium      High
--------------------------------------------------
Low                550        37       478
Medium             442        33       638
High               113        32       977

Per-Class Recognition Rates:
  Low: 550/1065 = 51.6%
  Medium: 33/1113 = 3.0%
  High: 977/1122 = 87.1%


In [17]:
print("\n" + "=" * 70)
print("FINAL COMPARISON")
print("=" * 70)

print(f"\nOverall Accuracy:")
print(f"  Random Forest:  {rf_acc:.4f} ({rf_acc*100:.2f}%)")
print(f"  k-NN (k=5):     {knn_acc:.4f} ({knn_acc*100:.2f}%)")
print(f"  Naive Bayes:    {nb_acc:.4f} ({nb_acc*100:.2f}%)")

# Find winner
accuracies = [('Random Forest', rf_acc), ('k-NN', knn_acc), ('Naive Bayes', nb_acc)]
accuracies_sorted = sorted(accuracies, key=lambda x: x[1], reverse=True)

print(f"\nüèÜ Rankings:")
for i, (name, acc) in enumerate(accuracies_sorted, 1):
    print(f"  {i}. {name}: {acc:.4f} ({acc*100:.2f}%)")

winner = accuracies_sorted[0]
print(f"\nüèÜ Winner: {winner[0]} with {winner[1]*100:.2f}% accuracy!")

# Show differences
print(f"\nPerformance Gaps:")
print(f"  1st vs 2nd: {(accuracies_sorted[0][1] - accuracies_sorted[1][1])*100:.2f} percentage points")
print(f"  1st vs 3rd: {(accuracies_sorted[0][1] - accuracies_sorted[2][1])*100:.2f} percentage points")

print("\n" + "=" * 70)
print("ANALYSIS COMPLETE")
print("=" * 70)


FINAL COMPARISON

Overall Accuracy:
  Random Forest:  0.3282 (32.82%)
  k-NN (k=5):     0.3748 (37.48%)
  Naive Bayes:    0.4727 (47.27%)

üèÜ Rankings:
  1. Naive Bayes: 0.4727 (47.27%)
  2. k-NN: 0.3748 (37.48%)
  3. Random Forest: 0.3282 (32.82%)

üèÜ Winner: Naive Bayes with 47.27% accuracy!

Performance Gaps:
  1st vs 2nd: 9.79 percentage points
  1st vs 3rd: 14.45 percentage points

ANALYSIS COMPLETE


# Conclusion

# Acknowledgements

Claude AI was used for assistance in this project for helping understanding and developing Random Forest and its unit tests, and correcting bugs in our EDA code. Our Naive Bayes and kNN classifier code was taken from previous PAs.