In [None]:
# File: group2_script.py

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score
from xgboost import XGBRegressor

# Read the raw store records from disk
df = pd.read_csv("REAL_DATA.csv")

# Print a structural summary (column names, dtypes, non-null counts)
print("Dataset Info:")
df.info()

# Parse the day/month/year strings in 'date' into proper timestamps
parsed_dates = pd.to_datetime(df['date'], format='%d/%m/%Y')

# Swap in the parsed dates and derive the calendar features from them.
# 'is_weekend' is 1 when day_of_week is 6 or 7 and 0 otherwise
# (presumably Saturday/Sunday in a 1-7 encoding -- TODO confirm).
df = df.assign(
    date=parsed_dates,
    month=parsed_dates.dt.month,
    year=parsed_dates.dt.year,
    is_weekend=df['day_of_week'].isin([6, 7]).astype(int),
)

# Keep only rows whose 'state_holiday' is one of the recognised codes
valid_holidays = ['0', 'a', 'b', 'c']
df = df[df['state_holiday'].isin(valid_holidays)]

# One-hot encode 'state_holiday'. drop_first=True omits the indicator for
# the first category so the remaining columns are not redundant.
df = pd.get_dummies(df, columns=['state_holiday'], drop_first=True)

# Keep only the days on which the store was open (open == 1).
# The explicit .copy() makes the result an independent DataFrame, so the
# column assignments made later in the script do not raise pandas'
# SettingWithCopyWarning on a filtered slice.
df = df[df['open'] == 1].copy()

# Fall back to a synthetic target when the data carries no 'sales' column.
# NOTE(review): the synthetic target is an exact multiple of
# 'nb_customers_on_day', and that column stays among the features below, so
# validation scores obtained on simulated data mostly reflect how well the
# model recovers this formula -- confirm this is intended.
if 'sales' not in df:
    print("The 'sales' column is missing. Simulating sales data for training...")
    df['sales'] = df['nb_customers_on_day'].mul(10)  # placeholder formula

# Target vector, then the feature matrix: everything except the target,
# the raw timestamp and the 'open' flag (always 1 once closed days are removed)
y = df['sales']
X = df.drop(columns=['sales', 'date', 'open'])

# Hold out 20% of the rows for validation; the fixed seed keeps the split
# reproducible between runs
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Fit a random forest on the training split.
#   n_estimators: number of decision trees grown in the forest.
#   random_state: seed for the estimator's randomness, fixed here so that
#                 repeated runs produce the same model.
print("\nTraining Random Forest Regressor...")
rf_model = RandomForestRegressor(n_estimators=100, random_state=42).fit(
    X_train, y_train
)

# Score the forest on the held-out validation split
y_pred_rf = rf_model.predict(X_val)
r2_rf = r2_score(y_true=y_val, y_pred=y_pred_rf)
print(f"Validation R² Score (Random Forest): {r2_rf}")

# Fit a gradient-boosted tree ensemble on the same training split
print("\nTraining XGBoost Regressor...")
xgb_model = XGBRegressor(
    n_estimators=100,
    learning_rate=0.1,
    random_state=42,
).fit(X_train, y_train)

# Score the boosted model on the held-out validation split
y_pred_xgb = xgb_model.predict(X_val)
r2_xgb = r2_score(y_true=y_val, y_pred=y_pred_xgb)
print(f"Validation R² Score (XGBoost): {r2_xgb}")

# Run the fitted XGBoost model over every row of the feature matrix
# (training and validation rows alike) and attach the result to the frame
print("\nPredicting sales for the entire dataset...")
all_predictions = xgb_model.predict(X)
df['predicted_sales'] = all_predictions

# Write the frame, now including 'predicted_sales', to a new CSV without
# the DataFrame's row index
output_csv = "REAL_DATA_with_predictions.csv"
df.to_csv(output_csv, index=False)
print("Predicted sales have been saved to 'REAL_DATA_with_predictions.csv'.")

# Save the R² score of the XGBoost model to a text file.
# The text contains the non-ASCII character "²", so the encoding is pinned
# to UTF-8 rather than left to the platform's locale default, which may be
# unable to encode it or may produce a file other tools misread.
with open("r2.txt", "w", encoding="utf-8") as f:
    f.write(f"XGBoost R² Score: {r2_xgb}")

Dataset Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 71205 entries, 0 to 71204
Data columns (total 9 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   index                71205 non-null  int64 
 1   store_ID             71205 non-null  int64 
 2   day_of_week          71205 non-null  int64 
 3   date                 71205 non-null  object
 4   nb_customers_on_day  71205 non-null  int64 
 5   open                 71205 non-null  int64 
 6   promotion            71205 non-null  int64 
 7   state_holiday        71205 non-null  object
 8   school_holiday       71205 non-null  int64 
dtypes: int64(7), object(2)
memory usage: 4.9+ MB
The 'sales' column is missing. Simulating sales data for training...

Training Random Forest Regressor...
Validation R² Score (Random Forest): 0.9999857420545276

Training XGBoost Regressor...
Validation R² Score (XGBoost): 0.9960778951644897

Predicting sales for the entire dataset...
Pr