**Code for PCA and Model Training**

In [2]:
import pandas as pd
import statsmodels.api as sm
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_squared_error, r2_score

# Load your dataset from an Excel file
df = pd.read_excel('/content/data_transformed_missing-1.xlsx')

# Label encode categorical variables
label_encoder = LabelEncoder()
df['State'] = label_encoder.fit_transform(df['State'])
df['Location'] = label_encoder.fit_transform(df['Location'])
df['Region'] = label_encoder.fit_transform(df['Region'])

# Check for missing values and drop rows with missing target variable
data_clean = df.dropna(subset=['Quantity Sold'])

# Encode categorical variables using get_dummies
data_encoded = pd.get_dummies(data_clean, drop_first=True)

# Separate the features and target variable
X = data_encoded.drop('Quantity Sold', axis=1)
y = data_encoded['Quantity Sold']

# Add a constant term to the independent variables matrix
X = sm.add_constant(X)

# Fit the OLS model
model = sm.OLS(y, X).fit()

# Predict the target variable
y_pred = model.predict(X)

# Calculate R-squared
r_squared = r2_score(y, y_pred)

# Calculate Mean Squared Error
mse = mean_squared_error(y, y_pred)

# Print R-squared and MSE
print(f'R-squared: {r_squared}')
print(f'Mean Squared Error: {mse}')

# Print the summary statistics of the model
print(model.summary())

# Save the predictions to an Excel file
predictions_df = pd.DataFrame({
    'Actual Quantity Sold': y,
    'OLS Predictions': y_pred
})
predictions_df.to_excel('ols_model_predictions.xlsx', index=False)
files.download('ols_model_predictions.xlsx')

R-squared: 0.4424198885999937
Mean Squared Error: 553797.3297411829
                            OLS Regression Results                            
Dep. Variable:          Quantity Sold   R-squared:                       0.442
Model:                            OLS   Adj. R-squared:                  0.404
Method:                 Least Squares   F-statistic:                     11.60
Date:                Mon, 18 Dec 2023   Prob (F-statistic):           5.61e-22
Time:                        04:53:02   Log-Likelihood:                -2015.8
No. Observations:                 251   AIC:                             4066.
Df Residuals:                     234   BIC:                             4126.
Df Model:                          16                                         
Covariance Type:            nonrobust                                         
                                      coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------

NameError: ignored

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.linear_model import Ridge, Lasso, LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.pipeline import make_pipeline

# Load the dataset
file_path = '/content/data_transformed_missing-1.xlsx'
data = pd.read_excel(file_path)

# Check for missing values and drop rows with missing target variable
data_clean = data.dropna(subset=['Quantity Sold'])

# Encode categorical variables using get_dummies
data_encoded = pd.get_dummies(data_clean, drop_first=True)

# Separate the features and target variable
X = data_encoded.drop('Quantity Sold', axis=1)
y = data_encoded['Quantity Sold']

# Split the dataset into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardize the features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Train Ridge regression model
ridge_model = Ridge(alpha=1.0)
ridge_model.fit(X_train_scaled, y_train)
y_pred_ridge = ridge_model.predict(X_test_scaled)

# Train Lasso regression model
lasso_model = Lasso(alpha=1.0)
lasso_model.fit(X_train_scaled, y_train)
y_pred_lasso = lasso_model.predict(X_test_scaled)

# Train Random Forest regression model
random_forest_model = RandomForestRegressor(n_estimators=100, random_state=42)
random_forest_model.fit(X_train, y_train)
y_pred_rf = random_forest_model.predict(X_test)

feature_importance = random_forest_model.feature_importances_
feature_importance_df = pd.DataFrame({
    'Feature': X.columns,
    'Importance': feature_importance
})

# Sort by importance in descending order
feature_importance_df = feature_importance_df.sort_values(by='Importance', ascending=False)
print("Feature Importance:")
print(feature_importance_df)
# Create and fit PCR model
pcr_pipeline = make_pipeline(StandardScaler(), PCA(n_components=10), LinearRegression())
pcr_pipeline.fit(X_train, y_train)
y_pred_pcr = pcr_pipeline.predict(X_test)

# Create and fit neural network model
mlp_model = MLPRegressor(hidden_layer_sizes=(100, 50), max_iter=500, random_state=42)
mlp_model.fit(X_train_scaled, y_train)
y_pred_nn = mlp_model.predict(X_test_scaled)

# Calculate R-squared and MSE for each model
metrics = {
    'Ridge': {'R2': r2_score(y_test, y_pred_ridge), 'MSE': mean_squared_error(y_test, y_pred_ridge)},
    'Lasso': {'R2': r2_score(y_test, y_pred_lasso), 'MSE': mean_squared_error(y_test, y_pred_lasso)},
    'Random Forest': {'R2': r2_score(y_test, y_pred_rf), 'MSE': mean_squared_error(y_test, y_pred_rf)},
    'PCR': {'R2': r2_score(y_test, y_pred_pcr), 'MSE': mean_squared_error(y_test, y_pred_pcr)},
    'Neural Network': {'R2': r2_score(y_test, y_pred_nn), 'MSE': mean_squared_error(y_test, y_pred_nn)}
}

# Output the performance
for model, values in metrics.items():
    print(f'{model} - R2: {values["R2"]}, MSE: {values["MSE"]}')

# Create a DataFrame for predictions
predictions_df = pd.DataFrame({
    'Actual Quantity Sold': y_test,
    'Ridge Predictions': y_pred_ridge,
    'Lasso Predictions': y_pred_lasso,
    'Random Forest Predictions': y_pred_rf,
    'PCR Predictions': y_pred_pcr,
    'Neural Network Predictions': y_pred_nn
})

# Save the predictions to an Excel file
predictions_df.to_excel('burger_sales_predictions.xlsx', index=False)
files.download('burger_sales_predictions.xlsx')

**Code for Finding Best Features**

In [2]:
from sklearn.decomposition import PCA

pca = PCA(n_components=26)
X_train_pca = pca.fit_transform(X_train_scaled)
X_test_pca = pca.transform(X_test_scaled)

# Print variable names and explained variance ratio for each principal component
print("Variable Name - Explained Variance Ratio for each Principal Component:")
for var_name, explained_variance in zip(X.columns, pca.explained_variance_ratio_):
    print(f'{var_name} - {explained_variance}')

NameError: ignored