In [None]:
import pandas as pd
import seaborn as sns
import numpy as np
import datetime

from src.load_data import *

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [None]:
# Load the training features with their target, and the test features.
# NOTE(review): get_train_data / get_test_data are presumably supplied by the
# star import from src.load_data — confirm. Later cells read X_train.columns
# and X_test.index, so both objects are presumably pandas DataFrames.
X_train, y_train = get_train_data()
X_test = get_test_data()

In [None]:
# Show the training features as this cell's output for a quick visual check.
X_train

In [None]:
from sklearn.linear_model import LogisticRegression

# Baseline: a default logistic regression fitted on the complete training set.
# fit() hands back the estimator itself, so creation and fitting are chained.
lr_model = LogisticRegression().fit(X_train, y_train)

# Only in-sample accuracy is measured here (no hold-out data is involved), so
# this number does not say anything about generalisation.
accuracy_train = accuracy_score(y_true=y_train, y_pred=lr_model.predict(X_train))
print(f"Train accuracy: {accuracy_train:.4f}")

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Stratified 80/20 hold-out split. The seed is pinned (same value as the other
# seeded splits in this notebook) so the scores printed below are reproducible
# after a kernel restart instead of changing on every run.
X_tr, X_v, y_tr, y_v = train_test_split(
    X_train, y_train, test_size=0.2, stratify=y_train, random_state=42
)

# The tree is seeded too, so repeated runs build the same model.
dt_model = DecisionTreeClassifier(random_state=42)
dt_model.fit(X_tr, y_tr)

# Compare in-sample and hold-out accuracy to see how much the tree overfits.
accuracy_tr = accuracy_score(y_tr, dt_model.predict(X_tr))
accuracy_v = accuracy_score(y_v, dt_model.predict(X_v))
print(f"Accuracy on training data: {accuracy_tr:.4f}")
print(f"Accuracy on validation data: {accuracy_v:.4f}")

# Optional export of test-set predictions (disabled):
# timestamp = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
# pd.DataFrame({
#     "Id": X_test.index,
#     "Cover_Type": dt_model.predict(X_test)
# }).to_csv(f"output/dt_{timestamp}.csv", index=False)

In [None]:
from lazypredict.Supervised import LazyClassifier

# Seeded, stratified hold-out: 80% to fit the candidate models, 20% to score them.
X_tr, X_t, y_tr, y_t = train_test_split(
    X_train,
    y_train,
    stratify=y_train,
    random_state=42,
    test_size=0.2,
)

# Let LazyPredict fit its collection of classifiers on the split above;
# warnings raised by individual models are not suppressed.
clf = LazyClassifier(custom_metric=None, ignore_warnings=False)
models, predictions = clf.fit(X_tr, X_t, y_tr, y_t)

# Print the first object returned by fit().
print(models)

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Stratified 80/20 hold-out split. The seed is pinned (same value as the other
# seeded splits in this notebook) so the scores printed below are reproducible
# after a kernel restart instead of changing on every run.
X_tr, X_v, y_tr, y_v = train_test_split(
    X_train, y_train, test_size=0.2, stratify=y_train, random_state=42
)

# The forest is seeded as well: a later cell reads rf_model.feature_importances_,
# and an unseeded forest would make that ranking drift between runs.
rf_model = RandomForestClassifier(random_state=42)
rf_model.fit(X_tr, y_tr)

# Compare in-sample and hold-out accuracy to see how much the forest overfits.
accuracy_tr = accuracy_score(y_tr, rf_model.predict(X_tr))
accuracy_v = accuracy_score(y_v, rf_model.predict(X_v))
print(f"Accuracy on training data: {accuracy_tr:.4f}")
print(f"Accuracy on validation data: {accuracy_v:.4f}")

# Optional export of test-set predictions (disabled):
# timestamp = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
# pd.DataFrame({
#     "Id": X_test.index,
#     "Cover_Type": rf_model.predict(X_test)
# }).to_csv(f"output/rf_{timestamp}.csv", index=False)

In [None]:
# Rank the features by the fitted random forest's importance scores,
# most important first. The result is a one-column frame ("importance")
# indexed by feature name ("feature").
(
    pd.Series(
        rf_model.feature_importances_,
        index=X_train.columns,
        name="importance",
    )
    .rename_axis("feature")
    .sort_values(ascending=False)
    .to_frame()
)

In [None]:
# sns.heatmap(
#     pd.DataFrame(
#         {"feature": X_train.columns, "importance": rf_model.feature_importances_}
#     )
#     .set_index("feature")
#     .sort_values("importance", ascending=False)
# )

In [None]:
from sklearn.preprocessing import LabelEncoder
from xgboost import XGBClassifier

# Re-map the target to consecutive integer codes 0..n_classes-1 before training
# (presumably because XGBClassifier expects zero-based class labels — confirm
# against the installed xgboost version).
le = LabelEncoder()
y_encoded = le.fit_transform(y_train)

print("Encoded labels:", np.unique(y_encoded))
print("Number of classes:", len(le.classes_))


xgb = XGBClassifier()

# Seeded, stratified 80/20 hold-out split on the encoded target.
X_tr, X_t, y_tr, y_t = train_test_split(
    X_train, y_encoded, test_size=0.2, stratify=y_encoded, random_state=42
)

xgb.fit(X_tr, y_tr)
y_pred = xgb.predict(X_t)

# accuracy_score is already imported in the notebook's first cell, so the
# duplicate mid-cell import that used to sit here has been dropped.
print("Accuracy:", accuracy_score(y_t, y_pred))

# Optional export of test-set predictions (disabled). Predictions are mapped
# back to the original labels with the fitted encoder rather than a hard-coded
# "+ 1", which is only right if the labels are exactly 1..n_classes.
# timestamp = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
# pd.DataFrame(
#     {"Id": X_test.index, "Cover_Type": le.inverse_transform(xgb.predict(X_test))}
# ).to_csv(f"output/xgb_{timestamp}.csv", index=False)

In [None]:
from lightgbm import LGBMClassifier

# Stratified 80/20 hold-out split. The seed is pinned (same value as the other
# seeded splits in this notebook) so the scores printed below are reproducible
# after a kernel restart instead of changing on every run.
X_tr, X_v, y_tr, y_v = train_test_split(
    X_train, y_train, test_size=0.2, stratify=y_train, random_state=42
)

# Seeded to match the LGBMClassifier used in the stacking cell further down.
lgbm_model = LGBMClassifier(random_state=42)
lgbm_model.fit(X_tr, y_tr)

# Compare in-sample and hold-out accuracy to see how much the model overfits.
accuracy_tr = accuracy_score(y_tr, lgbm_model.predict(X_tr))
accuracy_v = accuracy_score(y_v, lgbm_model.predict(X_v))
print(f"Accuracy on training data: {accuracy_tr:.4f}")
print(f"Accuracy on validation data: {accuracy_v:.4f}")

# Optional export of test-set predictions (disabled):
# timestamp = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
# pd.DataFrame({
#     "Id": X_test.index,
#     "Cover_Type": lgbm_model.predict(X_test)
# }).to_csv(f"output/lgbm_{timestamp}.csv", index=False)

In [None]:
from sklearn.ensemble import StackingClassifier

# Level-0 learners: a seeded random forest and a seeded LightGBM model.
estimators = [
    ("rf", RandomForestClassifier(random_state=42, n_estimators=200)),
    ("lgbm", LGBMClassifier(random_state=42, n_estimators=300)),
]

# The base learners' class probabilities feed a logistic-regression
# meta-learner. The ensemble is trained on the complete training set; fit()
# returns the estimator itself, so `stack` ends up bound to the fitted ensemble.
stack = StackingClassifier(
    final_estimator=LogisticRegression(max_iter=1000),
    stack_method="predict_proba",
    estimators=estimators,
).fit(X_train, y_train)

# Optional export of test-set predictions (disabled):
# timestamp = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
# pd.DataFrame({"Id": X_test.index, "Cover_Type": stack.predict(X_test)}).to_csv(
#     f"output/stack_{timestamp}.csv", index=False
# )

In [None]:
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.model_selection import StratifiedKFold
from catboost import CatBoostClassifier

# Base models. Every learner is seeded (same value as the smaller stacking
# ensemble defined earlier in this notebook) so that re-running the cell
# rebuilds the same ensemble.
base_estimators = [
    (
        "cat",
        CatBoostClassifier(
            loss_function="MultiClass",
            verbose=0,
            random_seed=42,
        ),
    ),
    (
        "lgbm",
        LGBMClassifier(
            n_estimators=500,
            max_depth=-1,
            random_state=42,
        ),
    ),
    (
        "xgb",
        XGBClassifier(
            n_estimators=500,
            learning_rate=0.05,
            random_state=42,
        ),
    ),
    (
        "et",
        ExtraTreesClassifier(
            n_estimators=500,
            random_state=42,
        ),
    ),
    ("rf", RandomForestClassifier(n_estimators=500, random_state=42)),
]

# Meta-learner on top of the base models' class probabilities. The iteration
# cap is raised to match the earlier stacking cell instead of relying on the
# library default.
meta_model = LogisticRegression(max_iter=1000)

# 5-fold stratified CV is used to produce the base-model predictions the
# meta-learner is trained on; n_jobs=-1 fits the base models in parallel.
stack = StackingClassifier(
    estimators=base_estimators,
    final_estimator=meta_model,
    cv=StratifiedKFold(n_splits=5),
    stack_method="predict_proba",
    n_jobs=-1,
)

# Train stacking ensemble on the complete training set.
stack.fit(X_train, y_train)

# Optional export of test-set predictions (disabled):
# timestamp = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
# pd.DataFrame({"Id": X_test.index, "Cover_Type": stack.predict(X_test)}).to_csv(
#     f"output/stack_{timestamp}.csv", index=False
# )