In [1]:
# Import the project-local `mysklearn` package and reload it (and each submodule
# below) before importing names from it — presumably so that edits made to the
# package while the kernel is running are picked up (TODO confirm intent).
import importlib
import mysklearn
importlib.reload(mysklearn)

# Table helper from the project package.
import mysklearn.mypytable
importlib.reload(mysklearn.mypytable)
from mysklearn.mypytable import MyPyTable

# Classifier implementations from the project package.
import mysklearn.myclassifiers
importlib.reload(mysklearn.myclassifiers)
from mysklearn.myclassifiers import MyNaiveBayesClassifier
from mysklearn.myclassifiers import MyDecisionTreeClassifier
from mysklearn.myclassifiers import MyRandomForestClassifier

# Evaluation helpers, available below under the `myevaluation` alias.
import mysklearn.myevaluation
importlib.reload(mysklearn.myevaluation)
import mysklearn.myevaluation as myevaluation

# Introduction

For this project, we used a fully synthetic dataset from Kaggle. It has 15 attributes and 10,000 instances, and most of the attributes are continuous. Because no existing attribute was appropriate as a class label, we binned the continuous crop yield (`Crop_Yield_MT_per_HA`) into three labels (Low, Medium, and High) and tried to predict that label.

- **TODO:** Summarize the key findings here.
- **TODO:** State which classifier performed best.

# Data Analysis

Data information: (details here)



## Relevant Summary Statistics

## Data Visualizations

**TODO:** Add a code cell below that calls the plotting functions to display the graphs.

# Classification Results

In [None]:
import csv
import math

from mysklearn.myclassifiers import MyRandomForestClassifier, MyKNeighborsClassifier
from mysklearn.myevaluation import train_test_split, confusion_matrix, accuracy_score



In [None]:
def load_data(filename):
    """Read a CSV file into a header row and a list of data rows.

    Args:
        filename (str): Path of the CSV file to read.

    Returns:
        tuple: ``(header, table)``. ``header`` is the first row of the file as a
        list of str; ``table`` is a list of the remaining rows, each a list of
        str. No type conversion is performed.

    Raises:
        ValueError: If the file contains no rows, so there is no header.
        OSError: If the file cannot be opened.
    """
    # newline='' leaves newline handling to the csv module, as its documentation
    # requires; otherwise line breaks inside quoted fields are altered by
    # universal-newline translation before the reader sees them.
    with open(filename, 'r', newline='') as file:
        reader = csv.reader(file)
        header = next(reader, None)
        if header is None:
            # Surface a clear error instead of a bare StopIteration.
            raise ValueError(f"{filename!r} is empty: no header row found")
        table = list(reader)
    return header, table

def prepare_data(header, table):
    """Build a numeric feature matrix and Low/Medium/High yield labels.

    Eight fixed numeric columns are extracted as features and the
    ``Crop_Yield_MT_per_HA`` column is discretized into three classes. The two
    cut points are the sorted yield values at positions ``int(n * 0.33)`` and
    ``int(n * 0.67)``, computed over every usable row passed in: yields below
    the first cut are ``'Low'``, below the second ``'Medium'``, otherwise
    ``'High'``.

    Rows are skipped (best effort) when any needed cell is missing, cannot be
    parsed as a float, or parses to NaN/infinity.

    Args:
        header (list of str): Column names; must contain the eight feature
            columns and ``Crop_Yield_MT_per_HA``.
        table (list of list): Data rows aligned with ``header``.

    Returns:
        tuple: ``(X, y)``. ``X`` is a list of 8-element float lists and ``y`` is
        the parallel list of ``'Low'`` / ``'Medium'`` / ``'High'`` labels. Both
        are empty when no row is usable.

    Raises:
        ValueError: If a required column name is missing from ``header``.
    """
    numeric_features = ['Average_Temperature_C', 'Total_Precipitation_mm',
                        'CO2_Emissions_MT', 'Extreme_Weather_Events',
                        'Irrigation_Access_%', 'Pesticide_Use_KG_per_HA',
                        'Fertilizer_Use_KG_per_HA', 'Soil_Health_Index']

    feature_indices = [header.index(feat) for feat in numeric_features]
    yield_index = header.index('Crop_Yield_MT_per_HA')

    X = []
    y_continuous = []
    for row in table:
        try:
            features = [float(row[i]) for i in feature_indices]
            yield_val = float(row[yield_index])
        except (ValueError, IndexError, TypeError):
            # Malformed row (short row, blank or non-numeric cell): skip it,
            # but let unrelated errors such as KeyboardInterrupt propagate.
            continue
        # 'nan'/'inf' parse as floats but would corrupt the sorted thresholds
        # below and end up labelled 'High', so treat them as unusable.
        if not all(math.isfinite(value) for value in features + [yield_val]):
            continue
        X.append(features)
        y_continuous.append(yield_val)

    # Nothing usable: return empty results rather than indexing an empty list.
    if not y_continuous:
        return [], []

    sorted_yields = sorted(y_continuous)
    p33 = sorted_yields[int(len(sorted_yields) * 0.33)]
    p67 = sorted_yields[int(len(sorted_yields) * 0.67)]

    y = []
    for yield_val in y_continuous:
        if yield_val < p33:
            y.append('Low')
        elif yield_val < p67:
            y.append('Medium')
        else:
            y.append('High')

    return X, y

def print_confusion_matrix(matrix, labels):
    """Print a confusion matrix as a fixed-width text table.

    Args:
        matrix: Indexable as ``matrix[i][j]`` for every pair of positions in
            ``labels``; each cell is printed right-aligned in 10 characters.
        labels (list of str): Class names, used as both the column titles and
            the row titles, in the same order.
    """
    column_count = len(labels)
    title_line = f"{'':12}" + "".join(f"{name:>10}" for name in labels)

    print("\nConfusion Matrix:")
    print("=" * 50)
    print(title_line)
    print("-" * 50)
    for row_pos, row_name in enumerate(labels):
        cells = "".join(
            f"{matrix[row_pos][col_pos]:>10}" for col_pos in range(column_count)
        )
        print(f"{row_name:12}" + cells)

In [None]:
# Load the raw CSV and convert it into a numeric feature matrix X and class labels y.
filename = 'climate_change_impact_on_agriculture_2024.csv'
header, table = load_data(filename)
X, y = prepare_data(header, table)

# Fail early with a clear message rather than an IndexError on X[0] below.
if not X:
    raise ValueError(f"No usable rows were parsed from {filename!r}")

print(f"Dataset: {len(X)} instances, {len(X[0])} features")
print("Classes: Low, Medium, High")

# Hold out 33% of the instances for testing. random_state is fixed, presumably so
# the split is reproducible across runs (confirm against mysklearn.myevaluation).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)
print(f"Training: {len(X_train)} instances")
print(f"Test: {len(X_test)} instances")

In [None]:
# Fit a random forest on the training split and score it on the held-out test split.
print("=" * 60)
print("RANDOM FOREST CLASSIFIER")
print("=" * 60)

# NOTE(review): no seed is passed to the forest here, so results may differ
# between runs unless MyRandomForestClassifier seeds itself — confirm.
rf = MyRandomForestClassifier(n_trees=10, max_depth=5)
rf.fit(X_train, y_train)
rf_pred = rf.predict(X_test)
rf_acc = accuracy_score(y_test, rf_pred)

# Report accuracy both as a fraction and as a percentage.
print(f"Accuracy: {rf_acc:.4f} ({rf_acc*100:.2f}%)")

# Fixed label order for the confusion matrix; `labels` is also used by the k-NN cell.
labels = ['Low', 'Medium', 'High']
rf_matrix = confusion_matrix(y_test, rf_pred, labels)
print_confusion_matrix(rf_matrix, labels)

In [None]:
# Fit a k-nearest-neighbors classifier on the same training split and score it
# on the same test split as the random forest.
print("=" * 60)
print("K-NEAREST NEIGHBORS")
print("=" * 60)

# NOTE(review): X holds the raw float values produced by prepare_data (no scaling
# is applied there). If MyKNeighborsClassifier does not normalize internally,
# features with large numeric ranges will dominate the distance — confirm.
knn = MyKNeighborsClassifier(n_neighbors=5)
knn.fit(X_train, y_train)
knn_pred = knn.predict(X_test)
knn_acc = accuracy_score(y_test, knn_pred)

# Report accuracy both as a fraction and as a percentage.
print(f"Accuracy: {knn_acc:.4f} ({knn_acc*100:.2f}%)")

# `labels` is defined in the random forest cell, which must have been run first.
knn_matrix = confusion_matrix(y_test, knn_pred, labels)
print_confusion_matrix(knn_matrix, labels)

In [None]:
# Side-by-side accuracy of the two classifiers, both scored against y_test.
print("=" * 60)
print("COMPARISON")
print("=" * 60)
print(f"Random Forest: {rf_acc:.4f}")
print(f"k-NN:          {knn_acc:.4f}")

# Name the winner; the margin is the accuracy difference multiplied by 100.
# A tie is reported as a tie rather than as a k-NN win by 0.00%.
if rf_acc > knn_acc:
    print(f"\nRandom Forest wins by {(rf_acc - knn_acc)*100:.2f}%")
elif knn_acc > rf_acc:
    print(f"\nk-NN wins by {(knn_acc - rf_acc)*100:.2f}%")
else:
    print("\nTie: both classifiers reached the same accuracy")

# Conclusion

# Acknowledgements