# In [None]:
# =====================================================
# 1) IMPORTS
# =====================================================
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import accuracy_score, classification_report, mean_squared_error, r2_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
import matplotlib.pyplot as plt

# =====================================================
# 2) LOAD THE DATASET
# =====================================================
df = pd.read_csv("Worldwide Travel Cities.csv")

# =====================================================
# (Preprocessing is assumed to have been done already.)
# Here we derive the "relevance" target and encode the
# categorical columns, including the budget level.
# =====================================================

# Example target: 1 = relevant, 0 = not relevant. Placeholder rule (every row
# whose budget_level is not "Luxury" counts as relevant) -- adapt it to the
# project, e.g. by comparing against the user's budget.
# This must run BEFORE the label encoding below, while budget_level still
# holds its original string labels.
# NOTE(review): the target is derived solely from budget_level, which is also
# one of the features in X, so a classifier can recover the target from that
# single column. Scores obtained with this placeholder are not meaningful.
df["target"] = (df["budget_level"] != "Luxury").astype(int)

# Encode the categorical columns as integer codes. The fitted encoders are
# kept (keyed by column name) so raw user input can later be encoded with
# encoders[col].transform(...) and predictions decoded with
# encoders[col].inverse_transform(...). Inspect encoders[col].classes_ for the
# code -> label mapping; the codes are not a budget ranking.
label_cols = ["country", "region", "budget_level"]
encoders = {}
for col in label_cols:
    encoder = LabelEncoder()
    df[col] = encoder.fit_transform(df[col])
    encoders[col] = encoder

# Model features: encoded budget level followed by the nine activity ratings.
# Any row passed to predict() must follow this exact column order.
X = df[["budget_level", "culture", "adventure", "nature", "beaches", "nightlife", "cuisine", "wellness", "urban", "seclusion"]]
y = df["target"]

# =====================================================
# 3) TRAIN / TEST SPLIT
# =====================================================
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# identical from one run to the next.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)

# =====================================================
# 4) MODEL 1 : LOGISTIC REGRESSION (classification)
# =====================================================
# fit() returns the estimator itself, so construction and training are chained.
log_reg = LogisticRegression(max_iter=1000).fit(X_train, y_train)
pred1 = log_reg.predict(X_test)

# Report hold-out accuracy followed by the per-class precision/recall/F1 table.
print("\n Logistic Regression")
print("Accuracy:", accuracy_score(y_true=y_test, y_pred=pred1))
print(classification_report(y_true=y_test, y_pred=pred1))

# =====================================================
# 5) MODEL 2 : RANDOM FOREST CLASSIFIER (classification)
# =====================================================
# The forest is seeded with the same value as the train/test split so that the
# printed metrics are reproducible from one run to the next.
rf_clf = RandomForestClassifier(n_estimators=200, random_state=42)
rf_clf.fit(X_train, y_train)
pred2 = rf_clf.predict(X_test)

# Report hold-out accuracy followed by the per-class precision/recall/F1 table.
print("\n Random Forest Classifier")
print("Accuracy:", accuracy_score(y_test, pred2))
print(classification_report(y_test, pred2))


# =====================================================
# 6) MODEL 3 : RANDOM FOREST REGRESSOR (regression)
# Goal: predict a relevance score used for ranking
# =====================================================

# Example relevance score = row-wise sum of the nine activity ratings.
# NOTE(review): every summed column is also a feature in X, so the score is an
# exact function of the inputs; replace it with a real relevance signal before
# drawing conclusions from the metrics below.
df["score"] = df[["culture", "adventure", "nature", "beaches", "nightlife", "cuisine", "wellness", "urban", "seclusion"]].sum(axis=1)

y_reg = df["score"]
# Same test_size and seed as the classification split.
X_train_r, X_test_r, y_train_r, y_test_r = train_test_split(X, y_reg, test_size=0.2, random_state=42)

# Seeded so the reported metrics are reproducible across runs.
rf_reg = RandomForestRegressor(n_estimators=200, random_state=42)
rf_reg.fit(X_train_r, y_train_r)
pred3 = rf_reg.predict(X_test_r)

print("\n Random Forest Regressor")
# RMSE is computed as sqrt(MSE): the `squared=False` keyword of
# mean_squared_error was deprecated in scikit-learn 1.4 and removed in 1.6,
# whereas this form works on every version.
print("RMSE:", mean_squared_error(y_test_r, pred3) ** 0.5)
print("R²:", r2_score(y_test_r, pred3))

# =====================================================
# 7) PREDICTION FOR THE USER
# (to be wired to a Streamlit form later)
# =====================================================
# One row of user preferences, in the same order as the columns of X:
# encoded budget_level first, then the nine activity ratings.
# NOTE(review): the leading 2 is an encoded budget_level code; confirm which
# label it stands for against the encoder fitted during preprocessing.
example_user = [[2, 5, 4, 4, 3, 3, 5, 3, 4, 2]]  # <-- replace with user input

# The models were fitted on a DataFrame, so the input is wrapped with the same
# column names. This avoids scikit-learn's "X does not have valid feature
# names" warning and fails loudly if the number of values does not match X.
example_user_df = pd.DataFrame(example_user, columns=X.columns)

print("\nDestination pertinente ? (0 = non, 1 = oui)")
print(rf_clf.predict(example_user_df))
print("\nScore de pertinence prédictif :")
print(rf_reg.predict(example_user_df))
