# Spruce tree type detection

In [None]:
# Bar chart of the number of rows per Tree_Type value.
fig, ax = plt.subplots(figsize=(8, 6))
sns.countplot(data=data, x='Tree_Type', ax=ax)
ax.set_title('Class Distribution')
plt.show()

## 2. Data Pre-processing
### Encode categorical variable 'Tree_Type'

In [None]:
# Learn the Tree_Type label -> integer mapping, then overwrite the column with
# the integer codes. The fitted encoder is kept so codes can be mapped back later.
label_encoder = LabelEncoder().fit(data['Tree_Type'])
data['Tree_Type'] = label_encoder.transform(data['Tree_Type'])

# Show the first 42 encoded values as a sanity check.
print(data['Tree_Type'].iloc[:42])

### Replace Soil_Type1 to Soil_Type38 with a single column Soil_Type

In [None]:
# Collapse the Soil_Type1..Soil_Type38 columns into one integer `Soil_Type` column.
# NOTE(review): presumably these are one-hot indicator columns -- confirm against
# the dataset. idxmax returns the first column holding the row maximum, so a row
# with no flag set would silently map to soil type 1.
soil_type_cols = [f'Soil_Type{i}' for i in range(1, 39)]

# Guard on the output column so re-running this cell after the source columns
# have been dropped is a no-op instead of a KeyError.
if 'Soil_Type' not in data.columns:
    soil_type_names = data[soil_type_cols].idxmax(axis=1)
    # expand=False yields a Series (not a one-column DataFrame) for the single
    # capture group, which is what a column assignment expects.
    data['Soil_Type'] = soil_type_names.str.extract(r'(\d+)', expand=False).astype(int)
    data.drop(soil_type_cols, axis=1, inplace=True)

### Split the dataset into training and testing sets


In [None]:
# Separate the target column from the feature columns.
y = data['Tree_Type']
X = data.drop(columns='Tree_Type')

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

### Standardize/normalize data if necessary


In [None]:
# Learn the scaling statistics from the training split only, then apply the same
# transformation to both splits so no test-set statistics enter the scaler.
scaler = StandardScaler().fit(X_train)
X_train, X_test = scaler.transform(X_train), scaler.transform(X_test)

## 3. Identification of Target Concept
### Target variable: 'Tree_Type'

## 4. Selection and Parameterization of Learning Algorithms
### Initialize models


In [None]:
# Candidate classifiers to compare; hyperparameters are left at library defaults.
# The tree-based estimators are randomized, so they are seeded to make the
# reported scores repeatable across runs (42 matches the seed used for the
# train/test split).
models = [
    DecisionTreeClassifier(random_state=42),
    RandomForestClassifier(random_state=42),
    GradientBoostingClassifier(random_state=42),
    KNeighborsClassifier(),
    SVC(),
]

## 5. Training and Evaluation
### Train and evaluate models

In [None]:
# Fit every candidate on the training split, report hold-out metrics on the test
# split, then report a 5-fold cross-validation accuracy.
# After the loop, `model` and `y_pred` still hold the last model and its predictions.
for model in models:
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    print(type(model).__name__)
    print("Accuracy:", accuracy_score(y_test, y_pred))
    print("F1-Score:", f1_score(y_test, y_pred, average='macro'))
    print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
    print("Classification Report:\n", classification_report(y_test, y_pred))

    # Cross-validate on the training split rather than the raw X/y: X is never
    # scaled (only X_train/X_test are), so scoring on it would evaluate the
    # scale-sensitive models (KNN, SVC) on different features than they were
    # fitted on, and would also fold the held-out test rows into the estimate.
    # NOTE(review): the scaler was fitted on all of X_train, so CV folds share
    # scaling statistics; wrap scaler + model in a Pipeline if that matters.
    scores = cross_val_score(model, X_train, y_train, cv=5, scoring='accuracy')
    print("Cross-Validation Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))
    print("-" * 50)

## 6. Results Comparison
### Visualize and compare results

In [None]:
# Heatmap of the confusion matrix for the most recently evaluated model:
# `y_pred` and `model` are whatever the training loop left behind, i.e. the
# last entry of `models`.
plt.figure(figsize=(10, 8))
# fmt='d' prints the cell counts as plain integers; seaborn's default '.2g'
# format would show counts of 100 or more in scientific notation.
sns.heatmap(confusion_matrix(y_test, y_pred), annot=True, fmt='d', cmap='Blues')
plt.xlabel('Predicted Labels')
plt.ylabel('True Labels')
# Name the model in the title so the figure is not mistaken for a summary of all models.
plt.title(f'Confusion Matrix ({type(model).__name__})')
plt.show()

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, f1_score
import matplotlib.pyplot as plt
import seaborn as sns

# Dataset

In [None]:
# Read the dataset; the path is relative to the notebook's working directory.
data = pd.read_csv(filepath_or_buffer="dataset/Spruce.csv")

## 1. Exploratory Data Analysis (EDA)


In [None]:
# Preview the first rows of the frame.
preview_rows = data.head()
print(preview_rows)

# Count the rows per Tree_Type value to see how balanced the classes are.
class_counts = data['Tree_Type'].value_counts()
print(class_counts)

### Visualize the class distribution
