In [3]:
import pandas as pd
from catboost import CatBoostRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, r2_score
from datetime import timedelta

# Load the raw customer-visit records. The location is an absolute,
# machine-specific path, so this cell only runs where that file exists.
source_csv = "C:\\Users\\Harini CS\\Downloads\\hotel analysis\\hotel_customer_visits_1000.csv"
df = pd.read_csv(source_csv)

# Both visit-date columns are parsed with an explicit day-month-year format.
for date_column in ('lastvisitdate', 'previousvisitdate'):
    df[date_column] = pd.to_datetime(df[date_column], format="%d-%m-%Y")

# Regression target: whole days between the previous visit and the last visit.
df['visit_gap'] = df['lastvisitdate'].sub(df['previousvisitdate']).dt.days

# Columns handed to the model as predictors.
features = [
    'totalvisityear',
    'lastvisitmonth',
    'lastvisitweekday',
    'previousvisitmonth',
    'previousvisitweekday',
    'visits_in_month',
]
target_column = 'visit_gap'
X, y = df[features], df[target_column]

# Hold out 20% of the rows for evaluation; the fixed random_state keeps the
# split repeatable between runs.
split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts

# Fit a CatBoost regressor; only `verbose` is overridden (0 = no training log),
# every other hyperparameter is left at the library default.
model = CatBoostRegressor(verbose=0)
model.fit(X_train, y_train)

# Evaluate on the held-out rows only.
y_pred = model.predict(X_test)

# Report each metric on its own line, labelled exactly as before.
for metric_label, metric_fn in (("MAE:", mean_absolute_error), ("R² Score:", r2_score)):
    print(metric_label, metric_fn(y_test, y_pred))

# Score every row (training and held-out alike) and keep the gap as whole days.
predicted_days = model.predict(X).round().astype(int)
df['predicted_gap'] = predicted_days

# Project the next visit by adding the predicted gap to the last visit date.
# NOTE(review): the model is fit on `visit_gap` (previous -> last visit), so
# this step assumes the next gap behaves like the one already observed -
# confirm that this is the intended approximation.
gap_offset = pd.to_timedelta(df['predicted_gap'], unit='d')
df['predicted_next_visitdate'] = df['lastvisitdate'] + gap_offset

# Write the enriched table next to the input file (absolute, machine-specific path).
output_csv = "C:\\Users\\Harini CS\\Downloads\\hotel analysis\\hotel_customer_visits_with_predictions.csv"
df.to_csv(output_csv, index=False)
print("Updated dataset with predictions saved successfully.")

# Show the first few predictions as a quick sanity check.
preview_columns = ['customername', 'lastvisitdate', 'predicted_next_visitdate', 'predicted_gap']
print(df[preview_columns].head())


MAE: 8.3653200448667
R² Score: 0.5254546285364763
Updated dataset with predictions saved successfully.
  customername lastvisitdate predicted_next_visitdate  predicted_gap
0         cus1    2025-01-24               2025-02-27             34
1         cus2    2025-04-15               2025-05-17             32
2         cus3    2025-04-17               2025-06-06             50
3         cus4    2025-04-14               2025-05-14             30
4         cus5    2025-05-14               2025-06-04             21


In [3]:
# Target calendar month for the forecast count. Change these two values to
# query a different month; the printed label follows them automatically.
month = 6      # June
year = 2025

# Rows whose predicted next visit falls inside the chosen month and year.
# The variable keeps its original name (`june_visits`) so any cell that
# refers to it still works, even when `month` is set to something else.
predicted_dates = df['predicted_next_visitdate']
in_target_month = (predicted_dates.dt.month == month) & (predicted_dates.dt.year == year)
june_visits = df[in_target_month]

# Derive the month label from `month` instead of hard-coding "June", so the
# message cannot drift out of sync with the filter. `Timestamp.month_name()`
# returns the English name by default, giving the same text as before for month=6.
month_label = pd.Timestamp(year=year, month=month, day=1).month_name()

print(f"Number of customers predicted to visit in {month_label} {year}: {len(june_visits)}")


Number of customers predicted to visit in June 2025: 167


In [4]:
import joblib

# Persist the fitted regressor to the current working directory. The call is
# left as the final expression so the notebook still shows the list of
# written file names returned by joblib.
model_artifact = "catboost_model.pkl"
joblib.dump(model, model_artifact)


['catboost_model.pkl']

In [4]:
import os

# Show the kernel's working directory; relative paths such as the saved
# model file resolve against it.
working_dir = os.getcwd()
print("📂 Running from directory:", working_dir)

📂 Running from directory: c:\Users\Harini CS\Downloads
