# Group Project
## NAMES

In [31]:
import polars as pl
import plotly.express as px
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.metrics import confusion_matrix, accuracy_score
import pyarrow

## Data Prep

In [32]:
#reading in restaurant specific data
restaurant_accepts = pl.read_csv('chefmozaccepts.csv')
restaurant_cuisine = pl.read_csv('chefmozcuisine.csv')
restaurant_hours = pl.read_csv('chefmozhours4.csv')
restaurant_parking = pl.read_csv('chefmozparking.csv')
restaurant_geo = pl.read_csv('geoplaces2.csv')

#reading in user specific data
user_cuisine = pl.read_csv('usercuisine.csv')
user_payment = pl.read_csv('userpayment.csv')
user_profile = pl.read_csv('userprofile.csv')

#reading in rating data
ratings = pl.read_csv('rating_final.csv')

In [33]:
#joining all restaurant data frames by 'placeID'
restaurant_df = (
    restaurant_geo
    .join(restaurant_accepts, on="placeID", how="left")
    .join(restaurant_cuisine, on="placeID", how="left")
    .join(restaurant_hours, on="placeID", how="left")
    .join(restaurant_parking, on="placeID", how="left")
)
print(restaurant_df)

#joining all user data frames by 'userID'
user_df = (
    user_profile
    .join(user_cuisine, on="userID", how="left")
    .join(user_payment, on="userID", how="left")
)
print(user_df)

shape: (901, 26)
┌─────────┬───────────┬────────────┬────────────┬───┬──────────┬───────────┬───────────┬───────────┐
│ placeID ┆ latitude  ┆ longitude  ┆ the_geom_m ┆ … ┆ Rcuisine ┆ hours     ┆ days      ┆ parking_l │
│ ---     ┆ ---       ┆ ---        ┆ eter       ┆   ┆ ---      ┆ ---       ┆ ---       ┆ ot        │
│ i64     ┆ f64       ┆ f64        ┆ ---        ┆   ┆ str      ┆ str       ┆ str       ┆ ---       │
│         ┆           ┆            ┆ str        ┆   ┆          ┆           ┆           ┆ str       │
╞═════════╪═══════════╪════════════╪════════════╪═══╪══════════╪═══════════╪═══════════╪═══════════╡
│ 134999  ┆ 18.915421 ┆ -99.184871 ┆ 0101000020 ┆ … ┆ Japanese ┆ 11:00-21: ┆ Mon;Tue;W ┆ none      │
│         ┆           ┆            ┆ 957F000088 ┆   ┆          ┆ 00;       ┆ ed;Thu;Fr ┆           │
│         ┆           ┆            ┆ 568DE35671 ┆   ┆          ┆           ┆ i;        ┆           │
│         ┆           ┆            ┆ …          ┆   ┆          ┆          

In [34]:

print(restaurant_df)
print(restaurant_df.columns)

shape: (901, 26)
┌─────────┬───────────┬────────────┬────────────┬───┬──────────┬───────────┬───────────┬───────────┐
│ placeID ┆ latitude  ┆ longitude  ┆ the_geom_m ┆ … ┆ Rcuisine ┆ hours     ┆ days      ┆ parking_l │
│ ---     ┆ ---       ┆ ---        ┆ eter       ┆   ┆ ---      ┆ ---       ┆ ---       ┆ ot        │
│ i64     ┆ f64       ┆ f64        ┆ ---        ┆   ┆ str      ┆ str       ┆ str       ┆ ---       │
│         ┆           ┆            ┆ str        ┆   ┆          ┆           ┆           ┆ str       │
╞═════════╪═══════════╪════════════╪════════════╪═══╪══════════╪═══════════╪═══════════╪═══════════╡
│ 134999  ┆ 18.915421 ┆ -99.184871 ┆ 0101000020 ┆ … ┆ Japanese ┆ 11:00-21: ┆ Mon;Tue;W ┆ none      │
│         ┆           ┆            ┆ 957F000088 ┆   ┆          ┆ 00;       ┆ ed;Thu;Fr ┆           │
│         ┆           ┆            ┆ 568DE35671 ┆   ┆          ┆           ┆ i;        ┆           │
│         ┆           ┆            ┆ …          ┆   ┆          ┆          

In [35]:
print(user_df)
print(user_df.columns)

shape: (416, 21)
┌────────┬───────────┬─────────────┬────────┬───┬────────┬────────┬────────────────┬───────────────┐
│ userID ┆ latitude  ┆ longitude   ┆ smoker ┆ … ┆ budget ┆ height ┆ Rcuisine       ┆ Upayment      │
│ ---    ┆ ---       ┆ ---         ┆ ---    ┆   ┆ ---    ┆ ---    ┆ ---            ┆ ---           │
│ str    ┆ f64       ┆ f64         ┆ str    ┆   ┆ str    ┆ f64    ┆ str            ┆ str           │
╞════════╪═══════════╪═════════════╪════════╪═══╪════════╪════════╪════════════════╪═══════════════╡
│ U1001  ┆ 22.139997 ┆ -100.978803 ┆ false  ┆ … ┆ medium ┆ 1.77   ┆ American       ┆ cash          │
│ U1002  ┆ 22.150087 ┆ -100.983325 ┆ false  ┆ … ┆ low    ┆ 1.87   ┆ Mexican        ┆ cash          │
│ U1003  ┆ 22.119847 ┆ -100.946527 ┆ false  ┆ … ┆ low    ┆ 1.69   ┆ Mexican        ┆ cash          │
│ U1004  ┆ 18.867    ┆ -99.183     ┆ false  ┆ … ┆ medium ┆ 1.53   ┆ Bakery         ┆ cash          │
│ U1004  ┆ 18.867    ┆ -99.183     ┆ false  ┆ … ┆ medium ┆ 1.53   ┆ Bakery

In [36]:
user_df.head()

userID,latitude,longitude,smoker,drink_level,dress_preference,ambience,transport,marital_status,hijos,birth_year,interest,personality,religion,activity,color,weight,budget,height,Rcuisine,Upayment
str,f64,f64,str,str,str,str,str,str,str,i64,str,str,str,str,str,i64,str,f64,str,str
"""U1001""",22.139997,-100.978803,"""false""","""abstemious""","""informal""","""family""","""on foot""","""single""","""independent""",1989,"""variety""","""thrifty-protector""","""none""","""student""","""black""",69,"""medium""",1.77,"""American""","""cash"""
"""U1002""",22.150087,-100.983325,"""false""","""abstemious""","""informal""","""family""","""public""","""single""","""independent""",1990,"""technology""","""hunter-ostentatious""","""Catholic""","""student""","""red""",40,"""low""",1.87,"""Mexican""","""cash"""
"""U1003""",22.119847,-100.946527,"""false""","""social drinker""","""formal""","""family""","""public""","""single""","""independent""",1989,"""none""","""hard-worker""","""Catholic""","""student""","""blue""",60,"""low""",1.69,"""Mexican""","""cash"""
"""U1004""",18.867,-99.183,"""false""","""abstemious""","""informal""","""family""","""public""","""single""","""independent""",1940,"""variety""","""hard-worker""","""none""","""professional""","""green""",44,"""medium""",1.53,"""Bakery""","""cash"""
"""U1004""",18.867,-99.183,"""false""","""abstemious""","""informal""","""family""","""public""","""single""","""independent""",1940,"""variety""","""hard-worker""","""none""","""professional""","""green""",44,"""medium""",1.53,"""Bakery""","""bank_debit_cards"""


In [37]:
restaurant_df.head()

placeID,latitude,longitude,the_geom_meter,name,address,city,state,country,fax,zip,alcohol,smoking_area,dress_code,accessibility,price,url,Rambience,franchise,area,other_services,Rpayment,Rcuisine,hours,days,parking_lot
i64,f64,f64,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str
134999,18.915421,-99.184871,"""0101000020957F000088568DE35671…","""Kiku Cuernavaca""","""Revolucion""","""Cuernavaca""","""Morelos""","""Mexico""","""?""","""?""","""No_Alcohol_Served""","""none""","""informal""","""no_accessibility""","""medium""","""kikucuernavaca.com.mx""","""familiar""","""f""","""closed""","""none""",,"""Japanese""","""11:00-21:00;""","""Mon;Tue;Wed;Thu;Fri;""","""none"""
134999,18.915421,-99.184871,"""0101000020957F000088568DE35671…","""Kiku Cuernavaca""","""Revolucion""","""Cuernavaca""","""Morelos""","""Mexico""","""?""","""?""","""No_Alcohol_Served""","""none""","""informal""","""no_accessibility""","""medium""","""kikucuernavaca.com.mx""","""familiar""","""f""","""closed""","""none""",,"""Japanese""","""11:00-21:00;""","""Sat;""","""none"""
134999,18.915421,-99.184871,"""0101000020957F000088568DE35671…","""Kiku Cuernavaca""","""Revolucion""","""Cuernavaca""","""Morelos""","""Mexico""","""?""","""?""","""No_Alcohol_Served""","""none""","""informal""","""no_accessibility""","""medium""","""kikucuernavaca.com.mx""","""familiar""","""f""","""closed""","""none""",,"""Japanese""","""11:00-21:00;""","""Sun;""","""none"""
132825,22.147392,-100.983092,"""0101000020957F00001AD016568C48…","""puesto de tacos""","""esquina santos degollado y leo…","""s.l.p.""","""s.l.p.""","""mexico""","""?""","""78280""","""No_Alcohol_Served""","""none""","""informal""","""completely""","""low""","""?""","""familiar""","""f""","""open""","""none""","""cash""","""Mexican""","""09:00-12:00;""","""Mon;Tue;Wed;Thu;Fri;""","""none"""
132825,22.147392,-100.983092,"""0101000020957F00001AD016568C48…","""puesto de tacos""","""esquina santos degollado y leo…","""s.l.p.""","""s.l.p.""","""mexico""","""?""","""78280""","""No_Alcohol_Served""","""none""","""informal""","""completely""","""low""","""?""","""familiar""","""f""","""open""","""none""","""cash""","""Mexican""","""09:00-12:00;""","""Sat;""","""none"""


## User Data Clustering

In [38]:
user_df = user_df.with_columns([
    pl.when(pl.col(col).cast(str).str.to_lowercase().is_in(["?", "none"]))
    .then(None)
    .otherwise(pl.col(col))
    .alias(col)
    for col in user_df.columns
])
print(len(user_df))
user_df.null_count()

416


userID,latitude,longitude,smoker,drink_level,dress_preference,ambience,transport,marital_status,hijos,birth_year,interest,personality,religion,activity,color,weight,budget,height,Rcuisine,Upayment
u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32
0,0,0,3,0,6,6,8,4,13,0,38,0,53,8,0,0,7,0,0,5


In [60]:
joined_user_df = ratings.join(user_df, on="userID", how="inner")
joined_user_df.head()

userID,placeID,rating,food_rating,service_rating,latitude,longitude,smoker,drink_level,dress_preference,ambience,transport,marital_status,hijos,birth_year,interest,personality,religion,activity,color,weight,budget,height,Rcuisine,Upayment
str,i64,i64,i64,i64,f64,f64,str,str,str,str,str,str,str,i64,str,str,str,str,str,i64,str,f64,str,str
"""U1077""",135085,2,2,2,22.156469,-100.98554,"""false""","""social drinker""","""elegant""","""family""","""public""","""married""","""kids""",1987,"""technology""","""thrifty-protector""","""Catholic""","""student""","""blue""",65,"""medium""",1.71,"""Mexican""","""VISA"""
"""U1077""",135085,2,2,2,22.156469,-100.98554,"""false""","""social drinker""","""elegant""","""family""","""public""","""married""","""kids""",1987,"""technology""","""thrifty-protector""","""Catholic""","""student""","""blue""",65,"""medium""",1.71,"""Mexican""","""cash"""
"""U1077""",135085,2,2,2,22.156469,-100.98554,"""false""","""social drinker""","""elegant""","""family""","""public""","""married""","""kids""",1987,"""technology""","""thrifty-protector""","""Catholic""","""student""","""blue""",65,"""medium""",1.71,"""Mexican""","""bank_debit_cards"""
"""U1077""",135038,2,2,1,22.156469,-100.98554,"""false""","""social drinker""","""elegant""","""family""","""public""","""married""","""kids""",1987,"""technology""","""thrifty-protector""","""Catholic""","""student""","""blue""",65,"""medium""",1.71,"""Mexican""","""VISA"""
"""U1077""",135038,2,2,1,22.156469,-100.98554,"""false""","""social drinker""","""elegant""","""family""","""public""","""married""","""kids""",1987,"""technology""","""thrifty-protector""","""Catholic""","""student""","""blue""",65,"""medium""",1.71,"""Mexican""","""cash"""


In [73]:
user_data = joined_user_df.drop(["Rcuisine", "Upayment", "color", "religion", "height", "placeID", "userID"])
user_data.head()


rating,food_rating,service_rating,latitude,longitude,smoker,drink_level,dress_preference,ambience,transport,marital_status,hijos,birth_year,interest,personality,activity,weight,budget
i64,i64,i64,f64,f64,str,str,str,str,str,str,str,i64,str,str,str,i64,str
2,2,2,22.156469,-100.98554,"""false""","""social drinker""","""elegant""","""family""","""public""","""married""","""kids""",1987,"""technology""","""thrifty-protector""","""student""",65,"""medium"""
2,2,2,22.156469,-100.98554,"""false""","""social drinker""","""elegant""","""family""","""public""","""married""","""kids""",1987,"""technology""","""thrifty-protector""","""student""",65,"""medium"""
2,2,2,22.156469,-100.98554,"""false""","""social drinker""","""elegant""","""family""","""public""","""married""","""kids""",1987,"""technology""","""thrifty-protector""","""student""",65,"""medium"""
2,2,1,22.156469,-100.98554,"""false""","""social drinker""","""elegant""","""family""","""public""","""married""","""kids""",1987,"""technology""","""thrifty-protector""","""student""",65,"""medium"""
2,2,1,22.156469,-100.98554,"""false""","""social drinker""","""elegant""","""family""","""public""","""married""","""kids""",1987,"""technology""","""thrifty-protector""","""student""",65,"""medium"""


In [74]:
categorical_cols = [
    "smoker", "drink_level", "dress_preference", "ambience", "transport",
    "marital_status", "hijos", "interest", "personality", "activity", "budget", "rating",
    "food_rating", "service_rating"
]

# One-hot encode
user_data_dummies = user_data.to_dummies(columns=categorical_cols)

In [75]:
df_pd = user_data_dummies.to_pandas()

# Scale the data
scaler = StandardScaler()
df_scaled = scaler.fit_transform(df_pd)

In [76]:
# Calculate inertia for k from 1 to 10
inertia = []
k_range = range(1, 11)

for k in k_range:
    kmeans = KMeans(n_clusters=k, random_state=42)
    kmeans.fit(df_pd)
    inertia.append(kmeans.inertia_)

# Create a DataFrame for Plotly
elbow_df = pd.DataFrame({
    "k": list(k_range),
    "inertia": inertia
})

# Create interactive elbow plot
fig = px.line(elbow_df, x="k", y="inertia", markers=True,
              title="Elbow Method for Optimal k",
              labels={"k": "Number of Clusters", "inertia": "Inertia"})

fig.update_layout(xaxis=dict(dtick=1), template="plotly_white")
fig.show()

In [84]:
num_clusters = 3
kmeans = KMeans(n_clusters=num_clusters, random_state=42)
clusters = kmeans.fit_predict(df_scaled)

# Add cluster labels to the original Polars DataFrame
user_data = user_data.with_columns(pl.Series("cluster", clusters))

In [85]:
user_data.head(10)

rating,food_rating,service_rating,latitude,longitude,smoker,drink_level,dress_preference,ambience,transport,marital_status,hijos,birth_year,interest,personality,activity,weight,budget,cluster
i64,i64,i64,f64,f64,str,str,str,str,str,str,str,i64,str,str,str,i64,str,i32
2,2,2,22.156469,-100.98554,"""false""","""social drinker""","""elegant""","""family""","""public""","""married""","""kids""",1987,"""technology""","""thrifty-protector""","""student""",65,"""medium""",0
2,2,2,22.156469,-100.98554,"""false""","""social drinker""","""elegant""","""family""","""public""","""married""","""kids""",1987,"""technology""","""thrifty-protector""","""student""",65,"""medium""",0
2,2,2,22.156469,-100.98554,"""false""","""social drinker""","""elegant""","""family""","""public""","""married""","""kids""",1987,"""technology""","""thrifty-protector""","""student""",65,"""medium""",0
2,2,1,22.156469,-100.98554,"""false""","""social drinker""","""elegant""","""family""","""public""","""married""","""kids""",1987,"""technology""","""thrifty-protector""","""student""",65,"""medium""",0
2,2,1,22.156469,-100.98554,"""false""","""social drinker""","""elegant""","""family""","""public""","""married""","""kids""",1987,"""technology""","""thrifty-protector""","""student""",65,"""medium""",0
2,2,1,22.156469,-100.98554,"""false""","""social drinker""","""elegant""","""family""","""public""","""married""","""kids""",1987,"""technology""","""thrifty-protector""","""student""",65,"""medium""",0
2,2,2,22.156469,-100.98554,"""false""","""social drinker""","""elegant""","""family""","""public""","""married""","""kids""",1987,"""technology""","""thrifty-protector""","""student""",65,"""medium""",0
2,2,2,22.156469,-100.98554,"""false""","""social drinker""","""elegant""","""family""","""public""","""married""","""kids""",1987,"""technology""","""thrifty-protector""","""student""",65,"""medium""",0
2,2,2,22.156469,-100.98554,"""false""","""social drinker""","""elegant""","""family""","""public""","""married""","""kids""",1987,"""technology""","""thrifty-protector""","""student""",65,"""medium""",0
1,2,2,22.156469,-100.98554,"""false""","""social drinker""","""elegant""","""family""","""public""","""married""","""kids""",1987,"""technology""","""thrifty-protector""","""student""",65,"""medium""",0


In [86]:
fig = px.histogram(user_data, x="cluster", title="User Count by Cluster Segment")
fig.show()

In [87]:
segment_summary = user_data.group_by('cluster').agg([
    pl.mean('latitude').alias('mean_latitude'),
    pl.mean('longitude').alias('mean_longitude'),
    pl.mean('birth_year').alias('mean_birth_year'),
    pl.mean('weight').alias('mean_weight'),
    pl.len().alias('n_users')
])

segment_summary



cluster,mean_latitude,mean_longitude,mean_birth_year,mean_weight,n_users
i32,f64,f64,f64,f64,u32
2,22.284009,-100.727253,1986.410233,72.395871,1114
0,21.422055,-100.409068,1979.871746,60.565079,1575
1,22.170335,-100.950199,1988.023368,65.997938,1455


In [88]:
# List of categorical columns
categorical_columns = ["smoker", "drink_level", "dress_preference", "ambience", "transport",
    "marital_status", "hijos", "interest", "personality", "activity", "budget", "rating",
    "food_rating", "service_rating"]

# Initialize list to store distributions
category_distribution = []

# For each categorical column, compute the distribution of values by cluster
for col in categorical_columns:
    distribution = user_data.group_by('cluster').agg(
        [
            pl.col(col).filter(pl.col(col).is_not_null()).count().alias(f'{col}_count')
        ]
    )
    category_distribution.append(distribution)


category_distribution

[shape: (3, 2)
 ┌─────────┬──────────────┐
 │ cluster ┆ smoker_count │
 │ ---     ┆ ---          │
 │ i32     ┆ u32          │
 ╞═════════╪══════════════╡
 │ 0       ┆ 1544         │
 │ 1       ┆ 1455         │
 │ 2       ┆ 1114         │
 └─────────┴──────────────┘,
 shape: (3, 2)
 ┌─────────┬───────────────────┐
 │ cluster ┆ drink_level_count │
 │ ---     ┆ ---               │
 │ i32     ┆ u32               │
 ╞═════════╪═══════════════════╡
 │ 0       ┆ 1575              │
 │ 1       ┆ 1455              │
 │ 2       ┆ 1114              │
 └─────────┴───────────────────┘,
 shape: (3, 2)
 ┌─────────┬────────────────────────┐
 │ cluster ┆ dress_preference_count │
 │ ---     ┆ ---                    │
 │ i32     ┆ u32                    │
 ╞═════════╪════════════════════════╡
 │ 0       ┆ 1513                   │
 │ 1       ┆ 1455                   │
 │ 2       ┆ 1114                   │
 └─────────┴────────────────────────┘,
 shape: (3, 2)
 ┌─────────┬────────────────┐
 │ cluster ┆ amb

## Resturants Clustering

In [48]:
restaurant_df = restaurant_df.with_columns([
    pl.when(pl.col(col).cast(str).str.to_lowercase().is_in(["?", "none"]))
    .then(None)
    .otherwise(pl.col(col))
    .alias(col)
    for col in restaurant_df.columns
])
print(len(restaurant_df))
restaurant_df.null_count()

901


placeID,latitude,longitude,the_geom_meter,name,address,city,state,country,fax,zip,alcohol,smoking_area,dress_code,accessibility,price,url,Rambience,franchise,area,other_services,Rpayment,Rcuisine,hours,days,parking_lot
u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32
0,0,0,0,0,138,93,90,156,901,417,0,429,0,0,0,780,0,0,0,802,51,213,16,16,340


In [90]:
joined_rest_df = ratings.join(restaurant_df, on="placeID", how="inner")
joined_rest_df.head()

userID,placeID,rating,food_rating,service_rating,latitude,longitude,the_geom_meter,name,address,city,state,country,fax,zip,alcohol,smoking_area,dress_code,accessibility,price,url,Rambience,franchise,area,other_services,Rpayment,Rcuisine,hours,days,parking_lot
str,i64,i64,i64,i64,f64,f64,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str
"""U1077""",135085,2,2,2,22.150802,-100.98268,"""0101000020957F00009F823DA60948…","""Tortas Locas Hipocampo""","""Venustiano Carranza 719 Centro""","""San Luis Potosi""","""SLP""","""Mexico""",,"""78000""","""No_Alcohol_Served""","""not permitted""","""informal""","""no_accessibility""","""medium""",,"""familiar""","""f""","""closed""",,"""cash""","""Fast_Food""","""00:00-00:00;""","""Mon;Tue;Wed;Thu;Fri;""","""public"""
"""U1077""",135085,2,2,2,22.150802,-100.98268,"""0101000020957F00009F823DA60948…","""Tortas Locas Hipocampo""","""Venustiano Carranza 719 Centro""","""San Luis Potosi""","""SLP""","""Mexico""",,"""78000""","""No_Alcohol_Served""","""not permitted""","""informal""","""no_accessibility""","""medium""",,"""familiar""","""f""","""closed""",,"""cash""","""Fast_Food""","""00:00-00:00;""","""Sat;""","""public"""
"""U1077""",135085,2,2,2,22.150802,-100.98268,"""0101000020957F00009F823DA60948…","""Tortas Locas Hipocampo""","""Venustiano Carranza 719 Centro""","""San Luis Potosi""","""SLP""","""Mexico""",,"""78000""","""No_Alcohol_Served""","""not permitted""","""informal""","""no_accessibility""","""medium""",,"""familiar""","""f""","""closed""",,"""cash""","""Fast_Food""","""00:00-00:00;""","""Sun;""","""public"""
"""U1077""",135038,2,2,1,22.155651,-100.977767,"""0101000020957F0000506149736E47…","""Restaurant la Chalita""","""Guajardo Sn San Luis Potosi Ce…","""San Luis Potosi""","""SLP""","""Mexico""",,"""78000""","""No_Alcohol_Served""","""section""","""informal""","""no_accessibility""","""medium""",,"""familiar""","""f""","""closed""",,"""cash""",,"""08:00-17:00;""","""Mon;Tue;Wed;Thu;Fri;""",
"""U1077""",135038,2,2,1,22.155651,-100.977767,"""0101000020957F0000506149736E47…","""Restaurant la Chalita""","""Guajardo Sn San Luis Potosi Ce…","""San Luis Potosi""","""SLP""","""Mexico""",,"""78000""","""No_Alcohol_Served""","""section""","""informal""","""no_accessibility""","""medium""",,"""familiar""","""f""","""closed""",,"""cash""",,"""08:00-17:00;""","""Sat;""",


In [91]:
restaurant_data = joined_rest_df.drop(["fax", "zip", "url", "Rpayment","days", "area", "hours", "other_services", "Rpayment", "placeID", "userID"])
restaurant_data.head()

rating,food_rating,service_rating,latitude,longitude,the_geom_meter,name,address,city,state,country,alcohol,smoking_area,dress_code,accessibility,price,Rambience,franchise,Rcuisine,parking_lot
i64,i64,i64,f64,f64,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str
2,2,2,22.150802,-100.98268,"""0101000020957F00009F823DA60948…","""Tortas Locas Hipocampo""","""Venustiano Carranza 719 Centro""","""San Luis Potosi""","""SLP""","""Mexico""","""No_Alcohol_Served""","""not permitted""","""informal""","""no_accessibility""","""medium""","""familiar""","""f""","""Fast_Food""","""public"""
2,2,2,22.150802,-100.98268,"""0101000020957F00009F823DA60948…","""Tortas Locas Hipocampo""","""Venustiano Carranza 719 Centro""","""San Luis Potosi""","""SLP""","""Mexico""","""No_Alcohol_Served""","""not permitted""","""informal""","""no_accessibility""","""medium""","""familiar""","""f""","""Fast_Food""","""public"""
2,2,2,22.150802,-100.98268,"""0101000020957F00009F823DA60948…","""Tortas Locas Hipocampo""","""Venustiano Carranza 719 Centro""","""San Luis Potosi""","""SLP""","""Mexico""","""No_Alcohol_Served""","""not permitted""","""informal""","""no_accessibility""","""medium""","""familiar""","""f""","""Fast_Food""","""public"""
2,2,1,22.155651,-100.977767,"""0101000020957F0000506149736E47…","""Restaurant la Chalita""","""Guajardo Sn San Luis Potosi Ce…","""San Luis Potosi""","""SLP""","""Mexico""","""No_Alcohol_Served""","""section""","""informal""","""no_accessibility""","""medium""","""familiar""","""f""",,
2,2,1,22.155651,-100.977767,"""0101000020957F0000506149736E47…","""Restaurant la Chalita""","""Guajardo Sn San Luis Potosi Ce…","""San Luis Potosi""","""SLP""","""Mexico""","""No_Alcohol_Served""","""section""","""informal""","""no_accessibility""","""medium""","""familiar""","""f""",,


In [98]:
categorical_rest_cols = [
    "the_geom_meter","name", "address", "city", "state", "country",
    "alcohol", "smoking_area", "dress_code", "accessibility", "price", "Rambience",
    "franchise", "Rcuisine", "parking_lot", "rating",
    "food_rating", "service_rating", "the_geom_meter"
]

# One-hot encode
restaurant_data_dummies = restaurant_data.to_dummies(columns=categorical_rest_cols)

In [99]:
rest_df_pd = restaurant_data_dummies.to_pandas()

# Scale the data
scaler = StandardScaler()
rest_df_scaled = scaler.fit_transform(rest_df_pd)

In [112]:
# Calculate inertia for k from 1 to 10
inertia = []
k_range = range(1, 11)

for k in k_range:
    kmeans = KMeans(n_clusters=k, random_state=42)
    kmeans.fit(rest_df_scaled)
    inertia.append(kmeans.inertia_)

# Create a DataFrame for Plotly
elbow_df = pd.DataFrame({
    "k": list(k_range),
    "inertia": inertia
})

# Create interactive elbow plot
fig = px.line(elbow_df, x="k", y="inertia", markers=True,
              title="Elbow Method for Optimal k",
              labels={"k": "Number of Clusters", "inertia": "Inertia"})

fig.update_layout(xaxis=dict(dtick=1), template="plotly_white")
fig.show()

In [119]:
num_clusters = 9
kmeans = KMeans(n_clusters=num_clusters, random_state=42)
clusters = kmeans.fit_predict(rest_df_scaled)

# Add cluster labels to the original Polars DataFrame
restaurant_data = restaurant_data.with_columns(pl.Series("cluster", clusters))


In [120]:
fig = px.histogram(restaurant_data, x="cluster", title="Restaurant Count by Cluster Segment")
fig.show()

In [121]:
rest_segment_summary = restaurant_data.group_by('cluster').agg([
    pl.mean('latitude').alias('mean_latitude'),
    pl.mean('longitude').alias('mean_longitude'),
    pl.len().alias('n_users')
])

rest_segment_summary

cluster,mean_latitude,mean_longitude,n_users
i32,f64,f64,u32
5,22.030034,-100.838244,783
6,22.149709,-100.976093,90
0,22.154687,-100.996617,153
7,22.144979,-101.005683,90
8,22.147145,-100.974494,96
2,21.986916,-100.698592,6943
4,18.925773,-99.232636,60
1,22.147175,-100.974269,72
3,22.142273,-100.942654,15


## Model Selection

In [None]:
# Convert Polars to Pandas
customer_data_transformed_df = customer_data_transformed.to_pandas()

# Encode target variable
label_encoder = LabelEncoder()
customer_data_transformed_df['made_purchase'] = label_encoder.fit_transform(
    customer_data_transformed_df['made_purchase']
)

In [None]:
# Define predictors and target
features = ['most_recent_transaction', 'log_number_of_transactions', 'log_total_revenue']
target = 'made_purchase'

# Preprocessing
categorical_features = []
numeric_features = features

preprocessor = ColumnTransformer(
    transformers=[
        ('num', 'passthrough', numeric_features),
        ('cat', OneHotEncoder(), categorical_features)
    ]
)

In [None]:
# --- Split Data ---
train, test = train_test_split(customer_data_transformed_df, test_size=0.3, random_state=101)

# --- Define Models ---
models = {
    'Linear Discriminant Analysis': LinearDiscriminantAnalysis(),
    'Logistic Regression': LogisticRegression(max_iter=1000, solver='liblinear'),
    'Random Forest': RandomForestClassifier(n_estimators=100, random_state=101),
    # 'XGBoost': XGBClassifier(use_label_encoder=False, eval_metric='mlogloss', random_state=101)
}

In [None]:
accuracy_scores = {}

# --- Train and Evaluate Models ---
for model_name, model in models.items():
    pipeline = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('classifier', model)
    ])
    
    pipeline.fit(train[features], train[target])
    test['pred_purchase'] = pipeline.predict(test[features])
    
    # Metrics
    target_names = [str(name) for name in label_encoder.classes_]
    conf_matrix = confusion_matrix(test[target], test['pred_purchase'])
    accuracy = accuracy_score(test[target], test['pred_purchase'])
    accuracy_scores[model_name] = accuracy

    print(f"\nModel: {model_name}")
    print("Confusion Matrix:\n", conf_matrix)
    print("Accuracy:", accuracy)

In [None]:
# --- Select Best Model ---
best_model_name = max(accuracy_scores, key=accuracy_scores.get)
best_accuracy_score = accuracy_scores[best_model_name]
best_model = models[best_model_name]

print(f"\n✅ Best Model: {best_model_name} (Accuracy: {best_accuracy_score:.4f})")

# --- Retrain Best Model and Predict Probabilities ---
best_model_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', best_model)
])
best_model_pipeline.fit(train[features], train[target])

In [None]:
# Predictions
predicted_outcome = best_model_pipeline.predict(test[features])
predicted_prob = best_model_pipeline.predict_proba(test[features])[:, 1]

# Add to test set
test['pred_purchase'] = predicted_outcome
test['pred_probability'] = predicted_prob

In [None]:
# Convert back to Polars DataFrame
customer_test = pl.DataFrame(test)

# View the first few rows
print(customer_test.head())