<a href="https://colab.research.google.com/github/jxin22/Jupyter-Notebook/blob/main/Model_Compare.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import Ridge, Lasso, ElasticNet
from sklearn.decomposition import PCA
from sklearn.cross_decomposition import PLSRegression
from sklearn.metrics import mean_squared_error, r2_score

# Read the cleaned data
data = pd.read_csv("chipotle_cleaned_data (1).csv")

# Extract satisfaction score (OSAT) as the target variable
y = data['OSAT']

# Extract the 40 satisfaction assessment items from the questionnaire as feature variables
feature_columns = ['A1_Lo', 'A1_L1', 'A1_L2', 'A1_L3', 'A1_L4', 'A1_L5', 'A1_L6', 'A1_L7', 'A1_L8', 'A1_L9',
                   'A1_10', 'A1_11', 'A1_12', 'A1_13', 'A1_14', 'A1_15', 'A1_16', 'A1_17', 'A1_18', 'A1_19',
                   'A1_20', 'A1_21', 'A1_22', 'A1_23', 'A1_24', 'A1_25', 'A1_26', 'A1_27', 'A1_28', 'A1_29',
                   'A1_30', 'A1_31', 'A1_32', 'A1_33', 'A1_34', 'A1_35', 'A1_36', 'A1_37', 'A1_38', 'A1_39']
X = data[feature_columns]

# Fill missing values with mean
imputer = SimpleImputer(strategy='mean')
X = pd.DataFrame(imputer.fit_transform(X), columns=feature_columns)

# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize models
ridge = Ridge(alpha=1.0)
lasso = Lasso(alpha=0.1)
elastic_net = ElasticNet(alpha=0.1, l1_ratio=0.5)
pcr = PCA(n_components=10)
plsr = PLSRegression(n_components=10)

# Train and evaluate models using cross-validation
models = [
    ('Ridge Regression', ridge),
    ('Lasso Regression', lasso),
    ('Elastic Net', elastic_net),
    ('Principal Component Regression', pcr),
    ('Partial Least Squares Regression', plsr)
]

for name, model in models:
    if name in ['Principal Component Regression', 'Partial Least Squares Regression']:
        model.fit(X_train, y_train)
        X_train_transformed = model.transform(X_train)
        X_test_transformed = model.transform(X_test)
        ridge_cv = Ridge(alpha=1.0)
        scores = cross_val_score(ridge_cv, X_train_transformed, y_train, cv=5, scoring='neg_mean_squared_error')
    else:
        scores = cross_val_score(model, X_train, y_train, cv=5, scoring='neg_mean_squared_error')

    print(f"{name}:")
    print("Cross-validation scores:", -scores)
    print("Mean cross-validation score:", -scores.mean())

    if name in ['Principal Component Regression', 'Partial Least Squares Regression']:
        y_pred = ridge_cv.fit(X_train_transformed, y_train).predict(X_test_transformed)
    else:
        y_pred = model.fit(X_train, y_train).predict(X_test)

    mse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    print("Test MSE:", mse)
    print("Test R^2:", r2)
    print()

# Get the feature coefficients for Ridge Regression
ridge.fit(X_train, y_train)
coefficients = pd.DataFrame({'Feature': feature_columns, 'Coefficient': ridge.coef_})
coefficients = coefficients.sort_values(by='Coefficient', ascending=False)

print("Key Drivers and their Coefficients (Ridge Regression):")
print(coefficients)

Ridge Regression:
Cross-validation scores: [0.81688688 1.3827237  0.88930528 0.85437809 1.04224932]
Mean cross-validation score: 0.9971086527967824
Test MSE: 0.6310725469007388
Test R^2: 0.6777387611131374

Lasso Regression:
Cross-validation scores: [0.81276165 1.37643898 0.80274433 0.88399689 0.97675573]
Mean cross-validation score: 0.9705395148529481
Test MSE: 0.6067297373853577
Test R^2: 0.6901695727384252

Elastic Net:
Cross-validation scores: [0.79531577 1.37609312 0.81482565 0.86105826 0.98099765]
Mean cross-validation score: 0.9656580882074511
Test MSE: 0.5943734412169908
Test R^2: 0.6964793945343926

Principal Component Regression:
Cross-validation scores: [0.83336096 1.27928421 0.8128478  0.94402489 0.99160779]
Mean cross-validation score: 0.9722251306405628
Test MSE: 0.6326874746708268
Test R^2: 0.6769140879017019

Partial Least Squares Regression:
Cross-validation scores: [0.70368361 1.16977425 0.6813734  0.72244099 0.8486172 ]
Mean cross-validation score: 0.8251778916135073

In [None]:
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import ElasticNet
from sklearn.metrics import mean_squared_error, r2_score

# Read the cleaned data
data = pd.read_csv("chipotle_cleaned_data (1).csv")

# Extract satisfaction score (OSAT) as the target variable
y = data['OSAT']

# Extract the 40 satisfaction assessment items from the questionnaire as feature variables
feature_columns = ['A1_Lo', 'A1_L1', 'A1_L2', 'A1_L3', 'A1_L4', 'A1_L5', 'A1_L6', 'A1_L7', 'A1_L8', 'A1_L9',
                   'A1_10', 'A1_11', 'A1_12', 'A1_13', 'A1_14', 'A1_15', 'A1_16', 'A1_17', 'A1_18', 'A1_19',
                   'A1_20', 'A1_21', 'A1_22', 'A1_23', 'A1_24', 'A1_25', 'A1_26', 'A1_27', 'A1_28', 'A1_29',
                   'A1_30', 'A1_31', 'A1_32', 'A1_33', 'A1_34', 'A1_35', 'A1_36', 'A1_37', 'A1_38', 'A1_39']
X = data[feature_columns]

# Fill missing values with mean
imputer = SimpleImputer(strategy='mean')
X = pd.DataFrame(imputer.fit_transform(X), columns=feature_columns)

# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define the range of hyperparameters to search
param_grid = {
    'alpha': [0.01, 0.1, 1.0, 10.0],
    'l1_ratio': [0.1, 0.3, 0.5, 0.7, 0.9]
}

# Create an instance of ElasticNet
elastic_net = ElasticNet()

# Perform grid search with cross-validation
grid_search = GridSearchCV(estimator=elastic_net, param_grid=param_grid, cv=5, scoring='neg_mean_squared_error')
grid_search.fit(X_train, y_train)

# Get the best model and its parameters
best_model = grid_search.best_estimator_
best_params = grid_search.best_params_
print("Best parameters:", best_params)

# Make predictions using the best model
y_pred = best_model.predict(X_test)

# Evaluate the model
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print("Test MSE:", mse)
print("Test R^2:", r2)

# Get the feature coefficients
coefficients = pd.DataFrame({'Feature': feature_columns, 'Coefficient': best_model.coef_})
coefficients = coefficients.sort_values(by='Coefficient', ascending=False)

print("Key Drivers and their Coefficients (Elastic Net):")
print(coefficients)

Best parameters: {'alpha': 0.1, 'l1_ratio': 0.3}
Test MSE: 0.590461480560833
Test R^2: 0.698477062304479
Key Drivers and their Coefficients (Elastic Net):
   Feature  Coefficient
2    A1_L2     0.152874
1    A1_L1     0.147132
13   A1_13     0.112891
0    A1_Lo     0.108284
14   A1_14     0.106064
19   A1_19     0.084204
39   A1_39     0.066624
33   A1_33     0.063970
17   A1_17     0.063316
35   A1_35     0.060135
18   A1_18     0.051598
34   A1_34     0.036614
21   A1_21     0.031319
24   A1_24     0.029004
20   A1_20     0.022114
25   A1_25     0.019915
11   A1_11     0.016628
6    A1_L6     0.014507
36   A1_36     0.000000
30   A1_30     0.000000
28   A1_28    -0.000000
38   A1_38     0.000000
27   A1_27     0.000000
26   A1_26     0.000000
31   A1_31     0.000000
7    A1_L7    -0.000000
8    A1_L8    -0.000000
22   A1_22    -0.000000
3    A1_L3     0.000000
4    A1_L4     0.000000
16   A1_16     0.000000
10   A1_10    -0.000000
9    A1_L9     0.000000
23   A1_23     0.000000
15   

In [None]:
import pandas as pd

# Store Coefficients to Dictionary
coefficients = pd.DataFrame({
    'Feature': ['A1_L2', 'A1_L1', 'A1_13', 'A1_Lo', 'A1_14', 'A1_19', 'A1_39', 'A1_33', 'A1_17', 'A1_35',
                'A1_18', 'A1_34', 'A1_21', 'A1_24', 'A1_20', 'A1_25', 'A1_11', 'A1_L6', 'A1_36', 'A1_30',
                'A1_28', 'A1_38', 'A1_27', 'A1_26', 'A1_31', 'A1_L7', 'A1_L8', 'A1_22', 'A1_L3', 'A1_L4',
                'A1_16', 'A1_10', 'A1_L9', 'A1_23', 'A1_15', 'A1_37', 'A1_32', 'A1_12', 'A1_L5', 'A1_29'],
    'Coefficient': [0.152874, 0.147132, 0.112891, 0.108284, 0.106064, 0.084204, 0.066624, 0.063970, 0.063316,
                    0.060135, 0.051598, 0.036614, 0.031319, 0.029004, 0.022114, 0.019915, 0.016628, 0.014507,
                    0.000000, 0.000000, -0.000000, 0.000000, 0.000000, 0.000000, 0.000000, -0.000000, -0.000000,
                    -0.000000, 0.000000, 0.000000, 0.000000, -0.000000, 0.000000, 0.000000, -0.015750, -0.026851,
                    -0.028529, -0.032299, -0.064901, -0.086442]
})

def satisfaction_simulator(improvements):
    impact_percentage = 0
    for feature, improvement_percentage in improvements.items():
        coefficient = coefficients.loc[coefficients['Feature'] == feature, 'Coefficient'].values[0]
        impact_percentage += coefficient * improvement_percentage / 100
    return impact_percentage * 100

In [None]:
# Example
improvements = {
    'A1_L2': 10,
    'A1_L1': 5,
    'A1_13': 8
}

impact_percentage = satisfaction_simulator(improvements)
print(f"Overall satisfaction is expected to increase: {impact_percentage:.2f}%")

Overall satisfaction is expected to increase: 3.17%
