In [4]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.impute import SimpleImputer
import pickle
import os

In [5]:
# Function to load data
def load_data():
    try:
        X = pd.read_feather("./data/train_data.feather")
        y = pd.read_feather("./data/train_label.feather")['status_group']
    except FileNotFoundError:
        X = pd.read_csv("./data/train_data.csv", parse_dates=["date_recorded"])
        y = pd.read_csv("./data/train_label.csv")['status_group']

        # Preprocessing
        X.drop(["wpt_name", "subvillage"], axis=1, inplace=True)
        
        # Handling categorical data
        X = pd.get_dummies(X, drop_first=True)
        
        # Convert labels to numeric
        label_mapping = {'functional': 0, 'non functional': 1, 'functional needs repair': 2}
        y.replace(label_mapping, inplace=True)

        # Save preprocessed data
        X.to_feather("./data/train_data.feather")
        y.to_feather("./data/train_label.feather")
    
    return X, y
    
# Load data
X, y = load_data()

In [3]:
# Splitting data into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Simple Imputer for missing values
imputer = SimpleImputer(strategy='mean')
X_train = imputer.fit_transform(X_train)
X_val = imputer.transform(X_val)

# Decision Tree Model
dtree = DecisionTreeClassifier(max_depth=3, random_state=42)
dtree.fit(X_train, y_train)

# Random Forest Model
forest_model_path = "./models/forest_0.pickle"
if os.path.exists(forest_model_path):
    with open(forest_model_path, "rb") as f:
        forest = pickle.load(f)
else:
    forest = RandomForestClassifier(max_depth=4, max_features="log2", 
                                    bootstrap=True, max_samples=1000, 
                                    n_jobs=-1, class_weight="balanced_subsample", 
                                    random_state=51)
    forest.fit(X_train, y_train)
    with open(forest_model_path, "wb") as f:
        pickle.dump(forest, f)

# Evaluation function
def evaluate_model(model, X_val, y_val):
    predictions = model.predict(X_val)
    print("Classification Report:\n", classification_report(y_val, predictions))
    cm = confusion_matrix(y_val, predictions)
    sns.heatmap(cm, annot=True, fmt='d')
    plt.title('Confusion Matrix')
    plt.ylabel('Actual')
    plt.xlabel('Predicted')
    plt.show()

# Evaluate models
evaluate_model(dtree, X_val, y_val)
evaluate_model(forest, X_val, y_val)

TypeError: float() argument must be a string or a number, not 'Timestamp'