# Group Project
## NAMES

In [19]:
import polars as pl
import plotly.express as px
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.metrics import confusion_matrix, accuracy_score

## Data Prep

In [5]:
#reading in restaurant specific data
restaurant_accepts = pl.read_csv('chefmozaccepts.csv')
restaurant_cuisine = pl.read_csv('chefmozcuisine.csv')
restaurant_hours = pl.read_csv('chefmozhours4.csv')
restaurant_parking = pl.read_csv('chefmozparking.csv')
restaurant_geo = pl.read_csv('geoplaces2.csv')

#reading in user specific data
user_cuisine = pl.read_csv('usercuisine.csv')
user_payment = pl.read_csv('userpayment.csv')
user_profile = pl.read_csv('userprofile.csv')

#reading in rating data
ratings = pl.read_csv('rating_final.csv')

In [6]:
#joining all restaurant data frames by 'placeID'
restaurant_df = (
    restaurant_geo
    .join(restaurant_accepts, on="placeID", how="left")
    .join(restaurant_cuisine, on="placeID", how="left")
    .join(restaurant_hours, on="placeID", how="left")
    .join(restaurant_parking, on="placeID", how="left")
)
print(restaurant_df)

#joining all user data frames by 'userID'
user_df = (
    user_profile
    .join(user_cuisine, on="userID", how="left")
    .join(user_payment, on="userID", how="left")
)
print(user_df)

shape: (901, 26)
┌─────────┬───────────┬────────────┬────────────┬───┬──────────┬───────────┬───────────┬───────────┐
│ placeID ┆ latitude  ┆ longitude  ┆ the_geom_m ┆ … ┆ Rcuisine ┆ hours     ┆ days      ┆ parking_l │
│ ---     ┆ ---       ┆ ---        ┆ eter       ┆   ┆ ---      ┆ ---       ┆ ---       ┆ ot        │
│ i64     ┆ f64       ┆ f64        ┆ ---        ┆   ┆ str      ┆ str       ┆ str       ┆ ---       │
│         ┆           ┆            ┆ str        ┆   ┆          ┆           ┆           ┆ str       │
╞═════════╪═══════════╪════════════╪════════════╪═══╪══════════╪═══════════╪═══════════╪═══════════╡
│ 134999  ┆ 18.915421 ┆ -99.184871 ┆ 0101000020 ┆ … ┆ Japanese ┆ 11:00-21: ┆ Mon;Tue;W ┆ none      │
│         ┆           ┆            ┆ 957F000088 ┆   ┆          ┆ 00;       ┆ ed;Thu;Fr ┆           │
│         ┆           ┆            ┆ 568DE35671 ┆   ┆          ┆           ┆ i;        ┆           │
│         ┆           ┆            ┆ …          ┆   ┆          ┆          

In [7]:

print(restaurant_df)
print(restaurant_df.columns)

shape: (901, 26)
┌─────────┬───────────┬────────────┬────────────┬───┬──────────┬───────────┬───────────┬───────────┐
│ placeID ┆ latitude  ┆ longitude  ┆ the_geom_m ┆ … ┆ Rcuisine ┆ hours     ┆ days      ┆ parking_l │
│ ---     ┆ ---       ┆ ---        ┆ eter       ┆   ┆ ---      ┆ ---       ┆ ---       ┆ ot        │
│ i64     ┆ f64       ┆ f64        ┆ ---        ┆   ┆ str      ┆ str       ┆ str       ┆ ---       │
│         ┆           ┆            ┆ str        ┆   ┆          ┆           ┆           ┆ str       │
╞═════════╪═══════════╪════════════╪════════════╪═══╪══════════╪═══════════╪═══════════╪═══════════╡
│ 134999  ┆ 18.915421 ┆ -99.184871 ┆ 0101000020 ┆ … ┆ Japanese ┆ 11:00-21: ┆ Mon;Tue;W ┆ none      │
│         ┆           ┆            ┆ 957F000088 ┆   ┆          ┆ 00;       ┆ ed;Thu;Fr ┆           │
│         ┆           ┆            ┆ 568DE35671 ┆   ┆          ┆           ┆ i;        ┆           │
│         ┆           ┆            ┆ …          ┆   ┆          ┆          

In [8]:
print(user_df)
print(user_df.columns)

shape: (416, 21)
┌────────┬───────────┬─────────────┬────────┬───┬────────┬────────┬────────────────┬───────────────┐
│ userID ┆ latitude  ┆ longitude   ┆ smoker ┆ … ┆ budget ┆ height ┆ Rcuisine       ┆ Upayment      │
│ ---    ┆ ---       ┆ ---         ┆ ---    ┆   ┆ ---    ┆ ---    ┆ ---            ┆ ---           │
│ str    ┆ f64       ┆ f64         ┆ str    ┆   ┆ str    ┆ f64    ┆ str            ┆ str           │
╞════════╪═══════════╪═════════════╪════════╪═══╪════════╪════════╪════════════════╪═══════════════╡
│ U1001  ┆ 22.139997 ┆ -100.978803 ┆ false  ┆ … ┆ medium ┆ 1.77   ┆ American       ┆ cash          │
│ U1002  ┆ 22.150087 ┆ -100.983325 ┆ false  ┆ … ┆ low    ┆ 1.87   ┆ Mexican        ┆ cash          │
│ U1003  ┆ 22.119847 ┆ -100.946527 ┆ false  ┆ … ┆ low    ┆ 1.69   ┆ Mexican        ┆ cash          │
│ U1004  ┆ 18.867    ┆ -99.183     ┆ false  ┆ … ┆ medium ┆ 1.53   ┆ Bakery         ┆ cash          │
│ U1004  ┆ 18.867    ┆ -99.183     ┆ false  ┆ … ┆ medium ┆ 1.53   ┆ Bakery

In [12]:
user_df.head()

userID,latitude,longitude,smoker,drink_level,dress_preference,ambience,transport,marital_status,hijos,birth_year,interest,personality,religion,activity,color,weight,budget,height,Rcuisine,Upayment
str,f64,f64,str,str,str,str,str,str,str,i64,str,str,str,str,str,i64,str,f64,str,str
"""U1001""",22.139997,-100.978803,"""false""","""abstemious""","""informal""","""family""","""on foot""","""single""","""independent""",1989,"""variety""","""thrifty-protector""","""none""","""student""","""black""",69,"""medium""",1.77,"""American""","""cash"""
"""U1002""",22.150087,-100.983325,"""false""","""abstemious""","""informal""","""family""","""public""","""single""","""independent""",1990,"""technology""","""hunter-ostentatious""","""Catholic""","""student""","""red""",40,"""low""",1.87,"""Mexican""","""cash"""
"""U1003""",22.119847,-100.946527,"""false""","""social drinker""","""formal""","""family""","""public""","""single""","""independent""",1989,"""none""","""hard-worker""","""Catholic""","""student""","""blue""",60,"""low""",1.69,"""Mexican""","""cash"""
"""U1004""",18.867,-99.183,"""false""","""abstemious""","""informal""","""family""","""public""","""single""","""independent""",1940,"""variety""","""hard-worker""","""none""","""professional""","""green""",44,"""medium""",1.53,"""Bakery""","""cash"""
"""U1004""",18.867,-99.183,"""false""","""abstemious""","""informal""","""family""","""public""","""single""","""independent""",1940,"""variety""","""hard-worker""","""none""","""professional""","""green""",44,"""medium""",1.53,"""Bakery""","""bank_debit_cards"""


In [27]:
restaurant_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,16,17,18,19,20,21,22,23,24,25
0,134999,18.915421,-99.184871,0101000020957F000088568DE356715AC138C0A525FC46...,Kiku Cuernavaca,Revolucion,Cuernavaca,Morelos,Mexico,?,...,kikucuernavaca.com.mx,familiar,f,closed,none,,Japanese,11:00-21:00;,Mon;Tue;Wed;Thu;Fri;,none
1,134999,18.915421,-99.184871,0101000020957F000088568DE356715AC138C0A525FC46...,Kiku Cuernavaca,Revolucion,Cuernavaca,Morelos,Mexico,?,...,kikucuernavaca.com.mx,familiar,f,closed,none,,Japanese,11:00-21:00;,Sat;,none
2,134999,18.915421,-99.184871,0101000020957F000088568DE356715AC138C0A525FC46...,Kiku Cuernavaca,Revolucion,Cuernavaca,Morelos,Mexico,?,...,kikucuernavaca.com.mx,familiar,f,closed,none,,Japanese,11:00-21:00;,Sun;,none
3,132825,22.147392,-100.983092,0101000020957F00001AD016568C4858C1243261274BA5...,puesto de tacos,esquina santos degollado y leon guzman,s.l.p.,s.l.p.,mexico,?,...,?,familiar,f,open,none,cash,Mexican,09:00-12:00;,Mon;Tue;Wed;Thu;Fri;,none
4,132825,22.147392,-100.983092,0101000020957F00001AD016568C4858C1243261274BA5...,puesto de tacos,esquina santos degollado y leon guzman,s.l.p.,s.l.p.,mexico,?,...,?,familiar,f,open,none,cash,Mexican,09:00-12:00;,Sat;,none


In [26]:
restaurant_df = pd.DataFrame(restaurant_df)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,16,17,18,19,20,21,22,23,24,25
0,134999,18.915421,-99.184871,0101000020957F000088568DE356715AC138C0A525FC46...,Kiku Cuernavaca,Revolucion,Cuernavaca,Morelos,Mexico,?,...,kikucuernavaca.com.mx,familiar,f,closed,none,,Japanese,11:00-21:00;,Mon;Tue;Wed;Thu;Fri;,none
1,134999,18.915421,-99.184871,0101000020957F000088568DE356715AC138C0A525FC46...,Kiku Cuernavaca,Revolucion,Cuernavaca,Morelos,Mexico,?,...,kikucuernavaca.com.mx,familiar,f,closed,none,,Japanese,11:00-21:00;,Sat;,none
2,134999,18.915421,-99.184871,0101000020957F000088568DE356715AC138C0A525FC46...,Kiku Cuernavaca,Revolucion,Cuernavaca,Morelos,Mexico,?,...,kikucuernavaca.com.mx,familiar,f,closed,none,,Japanese,11:00-21:00;,Sun;,none
3,132825,22.147392,-100.983092,0101000020957F00001AD016568C4858C1243261274BA5...,puesto de tacos,esquina santos degollado y leon guzman,s.l.p.,s.l.p.,mexico,?,...,?,familiar,f,open,none,cash,Mexican,09:00-12:00;,Mon;Tue;Wed;Thu;Fri;,none
4,132825,22.147392,-100.983092,0101000020957F00001AD016568C4858C1243261274BA5...,puesto de tacos,esquina santos degollado y leon guzman,s.l.p.,s.l.p.,mexico,?,...,?,familiar,f,open,none,cash,Mexican,09:00-12:00;,Sat;,none


## Model creation

In [3]:
# Create K-Means Clustering Pipeline
def create_pipeline(num_clusters, random_seed=42):
    """
    Creates a machine learning pipeline with a scaler and KMeans.
    """
    pipeline = Pipeline([
        ('scaler', StandardScaler()),
        ('kmeans', KMeans(n_clusters=num_clusters, random_state=random_seed))
    ])
    return pipeline

# Determining Optimal Clusters with the Elbow Method
def calculate_totwithinss(ratings_df, k):
    """
    Fits a KMeans pipeline and returns the total within-cluster sum of squares.
    """
    kmeans_pipeline = create_pipeline(k, random_seed=10)
    # Convert Polars DataFrame to NumPy array
    ratings_np = ratings_df.to_numpy()
    kmeans_pipeline.fit(ratings_np)
    return kmeans_pipeline['kmeans'].inertia_

In [None]:
# Evaluate for k 
k_values = range(1, 10)
totwithinss_values = [calculate_totwithinss(ratings_df, k) for k in k_values]

# Create a DataFrame for results
kmeans_results = pl.DataFrame({
    'num_clusters': k_values,
    'tot_withinss': totwithinss_values
})


In [None]:
# Plot the elbow method using Plotly Express
elbow_plot = px.line(
    data_frame=kmeans_results.to_pandas(),
    x='num_clusters',
    y='tot_withinss',
    markers=True,
    labels={
        'num_clusters': 'Number of Clusters',
        'tot_withinss': 'Total Within SS'
    },
    title='Elbow Method for Optimal k'
)

elbow_plot.show()

In [None]:
# Final K-Means Clustering
# Choose the number of clusters based on the elbow method
optimal_k = 2

# Create and fit the final pipeline
rating_kmeans_pipeline = create_pipeline(optimal_k)
rating_kmeans_pipeline.fit(ratings_df.to_numpy())

# Add cluster labels back to Polars DataFrame (1-indexed and categorical)
rating_with_clusters = ratings_df.with_columns(
    pl.Series(
        name="segment_number",
        values=(rating_kmeans_pipeline['kmeans'].labels_ + 1).astype(str)
    ).cast(pl.Categorical)
)

# Show the results
print(rating_with_clusters.head())

In [None]:
# Calculate summary statistics for each segment
# Segment Analysis
segment_summary = rating_with_clusters.group_by('rating').agg(
    [
        pl.mean('rating').alias('mean_rating'),
        pl.mean('food_rating').alias('mean_food_rating'),
        pl.mean('service_rating').alias('mean_service_rating'),
        pl.len().alias('n')
    ]
)

segment_summary

## Model Selection

In [None]:
# Convert Polars to Pandas
customer_data_transformed_df = customer_data_transformed.to_pandas()

# Encode target variable
label_encoder = LabelEncoder()
customer_data_transformed_df['made_purchase'] = label_encoder.fit_transform(
    customer_data_transformed_df['made_purchase']
)

In [None]:
# Define predictors and target
features = ['most_recent_transaction', 'log_number_of_transactions', 'log_total_revenue']
target = 'made_purchase'

# Preprocessing
categorical_features = []
numeric_features = features

preprocessor = ColumnTransformer(
    transformers=[
        ('num', 'passthrough', numeric_features),
        ('cat', OneHotEncoder(), categorical_features)
    ]
)

In [None]:
# --- Split Data ---
train, test = train_test_split(customer_data_transformed_df, test_size=0.3, random_state=101)

# --- Define Models ---
models = {
    'Linear Discriminant Analysis': LinearDiscriminantAnalysis(),
    'Logistic Regression': LogisticRegression(max_iter=1000, solver='liblinear'),
    'Random Forest': RandomForestClassifier(n_estimators=100, random_state=101),
    # 'XGBoost': XGBClassifier(use_label_encoder=False, eval_metric='mlogloss', random_state=101)
}

In [None]:
accuracy_scores = {}

# --- Train and Evaluate Models ---
for model_name, model in models.items():
    pipeline = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('classifier', model)
    ])
    
    pipeline.fit(train[features], train[target])
    test['pred_purchase'] = pipeline.predict(test[features])
    
    # Metrics
    target_names = [str(name) for name in label_encoder.classes_]
    conf_matrix = confusion_matrix(test[target], test['pred_purchase'])
    accuracy = accuracy_score(test[target], test['pred_purchase'])
    accuracy_scores[model_name] = accuracy

    print(f"\nModel: {model_name}")
    print("Confusion Matrix:\n", conf_matrix)
    print("Accuracy:", accuracy)

In [None]:
# --- Select Best Model ---
best_model_name = max(accuracy_scores, key=accuracy_scores.get)
best_accuracy_score = accuracy_scores[best_model_name]
best_model = models[best_model_name]

print(f"\n✅ Best Model: {best_model_name} (Accuracy: {best_accuracy_score:.4f})")

# --- Retrain Best Model and Predict Probabilities ---
best_model_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', best_model)
])
best_model_pipeline.fit(train[features], train[target])

In [None]:
# Predictions
predicted_outcome = best_model_pipeline.predict(test[features])
predicted_prob = best_model_pipeline.predict_proba(test[features])[:, 1]

# Add to test set
test['pred_purchase'] = predicted_outcome
test['pred_probability'] = predicted_prob

In [None]:
# Convert back to Polars DataFrame
customer_test = pl.DataFrame(test)

# View the first few rows
print(customer_test.head())