## 1. Read data

In [1]:
business_df = spark.read.parquet("gs://msca-bdp-student-gcs/Group_5_final_project/store_df/")
business_df.show(3)

25/12/01 18:18:11 WARN org.apache.spark.sql.catalyst.util.package: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.
[Stage 1:>                                                          (0 + 1) / 1]

+--------------------+--------------------+----------+--------------------+--------------------+--------------------+------------------+------------------+--------------------+--------------+-----+--------------------+-------------------+--------------------+---------------+--------------------+---------------+-----------------+--------------------+----------------------+-----------------+----------------------+---------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------+--------------------+-----+--------------+---------+---------------+------------------------+----------------+-------------+-----------------------+------------------+------------+
|             gmap_id|             address|avg_rating|            category|         description|               hours|          latitude|         longitude|          store_name|num_of_reviews|price|    relative_results|              state|  MISC_Accessibility|MISC_Acti

                                                                                

In [2]:
print(business_df.count())



169478


                                                                                

In [3]:
from pyspark.sql import functions as F

# =========================
# Parameters for shrinkage
# =========================
k = 10  # pseudo-count (adjust this to control shrinkage strength)

# global mean of predicted sentiment across all stores
global_mean = business_df.agg(F.mean("avg_predicted_sentiment")).collect()[0][0]

# =========================
# Compute shrunk sentiment
# =========================
business_df = business_df.withColumn(
    "shrunk_sentiment",
    (F.col("avg_predicted_sentiment") * F.col("review_count") + global_mean * k) /
    (F.col("review_count") + k)
)

# =========================
# Show results
# =========================
business_df.select(
    "store_name",
    "avg_predicted_sentiment",
    "review_count",
    "shrunk_sentiment",
    "avg_rating"
).show(10, truncate=False)


                                                                                

+-------------------------------+-----------------------+------------+------------------+----------+
|store_name                     |avg_predicted_sentiment|review_count|shrunk_sentiment  |avg_rating|
+-------------------------------+-----------------------+------------+------------------+----------+
|Checkers                       |3.1702048057536985     |11          |3.6295623775899735|3.5       |
|Momo Asian Fusion              |4.58498825475099       |5           |4.28489988932358  |4.6       |
|Rosso Pizzeria & Mozzarella Bar|3.864661623144518      |5           |4.044791012121423 |4.2       |
|Junior's Restaurant & Bakery   |4.290628785509187      |15          |4.228319553949462 |4.4       |
|Woori Village                  |3.1132834520223933     |10          |3.624069579316134 |3.5       |
|The Flame Broiler              |4.007555898949703      |5           |4.092422437389818 |3.8       |
|200 Fifth                      |4.116263523339339      |14          |4.124010266368729 |4.

## 2. Checking missing value

In [4]:
from pyspark.sql.functions import col, sum as spark_sum, lit
from pyspark.sql import Row

# Step 1: compute missing counts (one row)
missing_df = business_df.select([
    spark_sum(col(c).isNull().cast("int")).alias(c)
    for c in business_df.columns
])

# Step 2: collect as Python dict
missing_dict = missing_df.first().asDict()

# Step 3: convert to list of Rows
rows = [Row(column=col_name, missing_count=missing_dict[col_name])
        for col_name in missing_dict]

# Step 4: create a transposed Spark DataFrame
transposed_missing_df = spark.createDataFrame(rows)

# Step 5: show results
transposed_missing_df.show(50, truncate=False)



+------------------------+-------------+
|column                  |missing_count|
+------------------------+-------------+
|gmap_id                 |0            |
|address                 |0            |
|avg_rating              |0            |
|category                |0            |
|description             |72284        |
|hours                   |8333         |
|latitude                |0            |
|longitude               |0            |
|store_name              |0            |
|num_of_reviews          |0            |
|price                   |47163        |
|relative_results        |12725        |
|state                   |46186        |
|MISC_Accessibility      |33726        |
|MISC_Activities         |169478       |
|MISC_Amenities          |26983        |
|MISC_Atmosphere         |38927        |
|MISC_Crowd              |52454        |
|MISC_Dining_options     |50973        |
|MISC_From_the_business  |163447       |
|MISC_Getting_here       |169478       |
|MISC_Health_and

                                                                                

In [5]:
cols_to_drop = [
    "MISC_Activities", "MISC_From_the_business", "MISC_Getting_here",
    "MISC_Health_and_safety", "MISC_Lodging_options", "MISC_Recycling"
    "review_count", 
]

business_df_clean = business_df.drop(*cols_to_drop)

In [6]:
from pyspark.sql.types import DoubleType

business_df_clean = business_df_clean.withColumn("irs_estimated_population", col("irs_estimated_population").cast(DoubleType()))

#### One-hot encoding top 3 categories for each categorical column

In [7]:
from pyspark.sql.functions import explode, col, when, array_contains

misc_cols = [
    "MISC_Accessibility", "MISC_Amenities", "MISC_Atmosphere",
    "MISC_Crowd", "MISC_Dining_options", "MISC_Offerings",
    "MISC_Payments", "MISC_Planning", "MISC_Popular_for",
    "MISC_Service_options", "MISC_Highlights"
]

for c in misc_cols:
    # Explode the array to count frequency
    top_items = (business_df_clean
                 .select(explode(col(c)).alias("item"))
                 .groupBy("item")
                 .count()
                 .orderBy(col("count").desc())
                 .limit(3)
                 .collect())
    
    top_items_list = [row["item"] for row in top_items if row["item"] is not None]

    # Create one-hot columns for top 3
    for item in top_items_list:
        col_name = f"{c}_{item.replace(' ', '_')}_flag"
        business_df_clean = business_df_clean.withColumn(
            col_name,
            when(array_contains(col(c), item), 1).otherwise(0)
        )

                                                                                

In [8]:
business_df_clean.show(3)

+--------------------+--------------------+----------+--------------------+--------------------+--------------------+------------------+------------------+--------------------+--------------+-----+--------------------+-------------------+--------------------+--------------------+---------------+-----------------+--------------------+---------------+--------------------+--------------------+--------------------+--------------------+--------------+--------------------+-----+--------------+---------+---------------+------------------------+----------------+-------------+-----------------------+------------------+------------+------------------+------------------------------------------------------+------------------------------------------------------+---------------------------------------------------------+---------------------------------+-------------------------------+----------------------------+---------------------------+-------------------------+-----------------------------+----

#### Compare model performance on predicting closure

In [9]:
from pyspark.sql.functions import col, when, isnan
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.classification import LogisticRegression, RandomForestClassifier, GBTClassifier
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator
from pyspark.ml.evaluation import BinaryClassificationEvaluator, MulticlassClassificationEvaluator
from pyspark.ml import Pipeline

# -----------------------------------------------------
# 1. Feature Columns
# -----------------------------------------------------
feature_cols = [
    "avg_rating", 
    "price_numeric", 
    "num_of_reviews", 
    "irs_estimated_population",
    "shrunk_sentiment", 
    "sentiment_std",
]

one_hot_cols = [c for c in business_df_clean.columns if c.endswith("_flag")] 
feature_cols += one_hot_cols

# -----------------------------------------------------
# 2. Check for missing columns (safety)
# -----------------------------------------------------
missing = [c for c in feature_cols if c not in business_df_clean.columns]
if missing:
    raise Exception(f"Missing required feature columns: {missing}")

# -----------------------------------------------------
# 3. Clean NaN / Null values
# -----------------------------------------------------
for c in feature_cols:
    business_df_clean = business_df_clean.withColumn(
        c, when(isnan(col(c)) | col(c).isNull(), 0).otherwise(col(c))
    )

# -----------------------------------------------------
# 4. Compute class weights
# -----------------------------------------------------
counts = business_df_clean.groupBy("permanent_closed").count().collect()
count_0 = next(r['count'] for r in counts if r['permanent_closed'] == 0)
count_1 = next(r['count'] for r in counts if r['permanent_closed'] == 1)

majority = max(count_0, count_1)
minority = min(count_0, count_1)

weight_for_0 = majority / count_0
weight_for_1 = majority / count_1

business_df_clean = business_df_clean.withColumn(
    "classWeight",
    when(col("permanent_closed") == 0, weight_for_0).otherwise(weight_for_1)
)

print("Class Weights:", weight_for_0, weight_for_1)
print("Class counts:", count_0, count_1)

# -----------------------------------------------------
# 5. Train/test split
# -----------------------------------------------------
train_df, test_df = business_df_clean.randomSplit([0.8, 0.2], seed=42)

# -----------------------------------------------------
# 6. Assemble features
# -----------------------------------------------------
assembler = VectorAssembler(
    inputCols=feature_cols,
    outputCol="features",
    handleInvalid="keep"   # <---- prevents schema errors
)

# -----------------------------------------------------
# 7. Evaluators
# -----------------------------------------------------
evaluator_auc = BinaryClassificationEvaluator(
    labelCol="permanent_closed", metricName="areaUnderROC")
evaluator_acc = MulticlassClassificationEvaluator(
    labelCol="permanent_closed", metricName="accuracy")
evaluator_f1 = MulticlassClassificationEvaluator(
    labelCol="permanent_closed", metricName="f1")
evaluator_precision = MulticlassClassificationEvaluator(
    labelCol="permanent_closed", metricName="weightedPrecision")
evaluator_recall = MulticlassClassificationEvaluator(
    labelCol="permanent_closed", metricName="weightedRecall")

def evaluate_model(predictions, name="Model"):
    auc = evaluator_auc.evaluate(predictions)
    acc = evaluator_acc.evaluate(predictions)
    f1 = evaluator_f1.evaluate(predictions)
    precision = evaluator_precision.evaluate(predictions)
    recall = evaluator_recall.evaluate(predictions)

    print(f"\n===== {name} Evaluation =====")
    print(f"AUC:       {auc:.4f}")
    print(f"Accuracy:  {acc:.4f}")
    print(f"F1 Score:  {f1:.4f}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall:    {recall:.4f}")


Class Weights: 1.0 6.365726454865487
Class counts: 146469 23009


#### Logistic regression

In [10]:
from pyspark.sql.functions import col, when
from pyspark.ml.classification import LogisticRegression
from pyspark.ml import Pipeline
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator
from pyspark.ml.evaluation import BinaryClassificationEvaluator, MulticlassClassificationEvaluator
from pyspark.ml.feature import VectorAssembler
from pyspark.mllib.evaluation import MulticlassMetrics
import pandas as pd

# ============================================================
# 0. Handle class imbalance (recommended: class weights)
# ============================================================

# Count classes
counts = train_df.groupBy("permanent_closed").count().collect()
count_0 = [r["count"] for r in counts if r["permanent_closed"] == 0][0]
count_1 = [r["count"] for r in counts if r["permanent_closed"] == 1][0]

# Assign weights: majority = 1, minority = ratio
ratio = count_0 / count_1
train_df_balanced = train_df.withColumn(
    "weight",
    when(col("permanent_closed") == 1, ratio).otherwise(1.0)
)

print(f"Class 0: {count_0}, Class 1: {count_1}, Weight ratio: {ratio:.2f}")

# ============================================================
# 1. Logistic Regression with CV
# ============================================================

lr = LogisticRegression(
    labelCol="permanent_closed",
    featuresCol="features",
    weightCol="weight",   # ⬅ KEY FIX FOR IMBALANCE
    maxIter=25
)

lr_pipeline = Pipeline(stages=[assembler, lr])

paramGrid_lr = (ParamGridBuilder()
    .addGrid(lr.regParam, [0.0, 0.01, 0.1])
    .addGrid(lr.elasticNetParam, [0.0, 0.5, 1.0])
    .build()
)

cv_lr = CrossValidator(
    estimator=lr_pipeline,
    estimatorParamMaps=paramGrid_lr,
    evaluator=evaluator_auc,
    numFolds=3,
    parallelism=2
)

cv_model_lr = cv_lr.fit(train_df_balanced)
predictions_lr = cv_model_lr.transform(test_df)

evaluate_model(predictions_lr, "Logistic Regression (Weighted)")

# ============================================================
# 2. Confusion Matrix
# ============================================================

predictionAndLabels = predictions_lr.select("prediction", "permanent_closed") \
                                   .rdd.map(lambda r: (float(r["prediction"]), float(r["permanent_closed"])))

metrics = MulticlassMetrics(predictionAndLabels)
conf_matrix = metrics.confusionMatrix().toArray()

conf_df = pd.DataFrame(
    conf_matrix,
    index=["Actual 0", "Actual 1"],
    columns=["Predicted 0", "Predicted 1"]
)

print("\n===== Confusion Matrix =====")
print(conf_df)

# ============================================================
# 3. Feature Importance (LR coefficients)
# ============================================================

lr_model = cv_model_lr.bestModel.stages[-1]

feature_importance = pd.DataFrame({
    "feature": feature_cols,
    "coefficient": lr_model.coefficients
})

feature_importance["abs_coeff"] = feature_importance["coefficient"].abs()
feature_importance = feature_importance.sort_values("abs_coeff", ascending=False)

print("\n===== Logistic Regression Feature Importance =====")
print(feature_importance[["feature", "coefficient"]].to_string(index=False))


                                                                                

Class 0: 117368, Class 1: 18444, Weight ratio: 6.36


25/12/01 18:19:01 WARN com.github.fommil.netlib.BLAS: Failed to load implementation from: com.github.fommil.netlib.NativeSystemBLAS
25/12/01 18:19:01 WARN com.github.fommil.netlib.BLAS: Failed to load implementation from: com.github.fommil.netlib.NativeRefBLAS
                                                                                


===== Logistic Regression (Weighted) Evaluation =====
AUC:       0.9566
Accuracy:  0.8944
F1 Score:  0.9043
Precision: 0.9309
Recall:    0.8944





===== Confusion Matrix =====
          Predicted 0  Predicted 1
Actual 0      25814.0       3261.0
Actual 1        302.0       4356.0

===== Logistic Regression Feature Importance =====
                                                  feature   coefficient
                              MISC_Atmosphere_Casual_flag -1.499952e+00
                        MISC_Service_options_Takeout_flag -1.430408e+00
                        MISC_Popular_for_Solo_dining_flag -1.269532e+00
                        MISC_Amenities_Good_for_kids_flag  1.262651e+00
                           MISC_Offerings_Quick_bite_flag -1.238925e+00
                        MISC_Service_options_Dine-in_flag -1.224021e+00
                           MISC_Payments_Debit_cards_flag  1.159287e+00
       MISC_Planning_Dinner_reservations_recommended_flag -1.066499e+00
                          MISC_Payments_Credit_cards_flag -1.044560e+00
                       MISC_Service_options_Delivery_flag  1.034298e+00
                   MI

                                                                                

#### Random Forest

In [11]:
from pyspark.sql import SparkSession
import logging

# If you haven't already, create Spark session
spark = SparkSession.builder.getOrCreate()

# Set Spark log level to WARN or ERROR (suppress INFO and repeated WARN)
spark.sparkContext.setLogLevel("ERROR")

# Optionally, suppress some specific loggers
log4jLogger = spark._jvm.org.apache.log4j
log4jLogger.LogManager.getLogger("org.apache.spark.scheduler.DAGScheduler").setLevel(log4jLogger.Level.ERROR)
log4jLogger.LogManager.getLogger("org.apache.spark.storage.BlockManager").setLevel(log4jLogger.Level.ERROR)


In [12]:
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml import Pipeline
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator
from pyspark.mllib.evaluation import MulticlassMetrics
import pandas as pd

# -------------------------
# Random Forest + Pipeline
# -------------------------
rf = RandomForestClassifier(
    labelCol="permanent_closed",
    featuresCol="features",
    weightCol="weight"
)

rf_pipeline = Pipeline(stages=[assembler, rf])

paramGrid_rf = (ParamGridBuilder()
                .addGrid(rf.maxDepth, [5, 10])
                .addGrid(rf.numTrees, [50, 100])
                .build())

cv_rf = CrossValidator(
    estimator=rf_pipeline,
    estimatorParamMaps=paramGrid_rf,
    evaluator=evaluator_auc,
    numFolds=3,
    parallelism=2
)

cv_model_rf = cv_rf.fit(train_df_balanced)
predictions_rf = cv_model_rf.transform(test_df)

evaluate_model(predictions_rf, "Random Forest (Weighted)")

predictionAndLabels = predictions_rf.select("prediction", "permanent_closed") \
                                   .rdd.map(lambda row: (float(row['prediction']), float(row['permanent_closed'])))

metrics = MulticlassMetrics(predictionAndLabels)
conf_matrix = metrics.confusionMatrix().toArray()

print("\n===== Random Forest Confusion Matrix =====")
conf_df = pd.DataFrame(conf_matrix, 
                       index=["Actual 0", "Actual 1"], 
                       columns=["Predicted 0", "Predicted 1"])
print(conf_df)

rf_model = cv_model_rf.bestModel.stages[-1]

feature_importance = pd.DataFrame({
    "feature": feature_cols,
    "importance": rf_model.featureImportances.toArray()
}).sort_values("importance", ascending=False)

print("\n===== Random Forest Feature Importance =====")
print(feature_importance.to_string(index=False))


                                                                                


===== Random Forest (Weighted) Evaluation =====
AUC:       0.9694
Accuracy:  0.9107
F1 Score:  0.9185
Precision: 0.9411
Recall:    0.9107


[Stage 2647:==>                                                   (1 + 22) / 23]


===== Random Forest Confusion Matrix =====
          Predicted 0  Predicted 1
Actual 0      26226.0       2849.0
Actual 1        164.0       4494.0

===== Random Forest Feature Importance =====
                                                  feature  importance
                        MISC_Popular_for_Solo_dining_flag    0.165834
                              MISC_Atmosphere_Casual_flag    0.157704
                           MISC_Offerings_Quick_bite_flag    0.127008
                        MISC_Service_options_Takeout_flag    0.099606
                             MISC_Popular_for_Dinner_flag    0.057428
                                           num_of_reviews    0.055458
                              MISC_Popular_for_Lunch_flag    0.054846
                        MISC_Service_options_Dine-in_flag    0.049832
                                   MISC_Crowd_Groups_flag    0.037430
   MISC_Accessibility_Wheelchair_accessible_entrance_flag    0.033213
                                 MI

                                                                                

#### Gradient-Boosted

In [13]:
import pandas as pd
from pyspark.ml.classification import GBTClassifier
from pyspark.ml import Pipeline
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator
from pyspark.mllib.evaluation import MulticlassMetrics

# ---------------------------------------------------------
# 0. Handle class imbalance (recommended for GBT)
# ---------------------------------------------------------
# Compute class weights (inverse frequency)
counts = train_df.groupBy("permanent_closed").count().toPandas()
n0 = counts[counts["permanent_closed"] == 0]["count"].values[0]
n1 = counts[counts["permanent_closed"] == 1]["count"].values[0]

minority_weight = max(n0, n1) / min(n0, n1)

# Attach weight column
train_df = train_df.withColumn(
    "weight",
    (1.0 * (train_df.permanent_closed == 1).cast("int")) * minority_weight +
    (1.0 * (train_df.permanent_closed == 0).cast("int"))
)

# ---------------------------------------------------------
# 1. Gradient-Boosted Trees CV
# ---------------------------------------------------------
gbt = GBTClassifier(
    labelCol="permanent_closed",
    featuresCol="features",
    weightCol="weight",
    maxIter=50,   # upper bound, CV will tune
    maxDepth=5    # upper bound, CV will tune
)

gbt_pipeline = Pipeline(stages=[assembler, gbt])

paramGrid_gbt = (ParamGridBuilder()
                 .addGrid(gbt.maxDepth, [3, 5])
                 .addGrid(gbt.maxIter, [20, 50])
                 .build())

cv_gbt = CrossValidator(
    estimator=gbt_pipeline,
    estimatorParamMaps=paramGrid_gbt,
    evaluator=evaluator_auc,
    numFolds=3,
    parallelism=2
)

cv_model_gbt = cv_gbt.fit(train_df)
predictions_gbt = cv_model_gbt.transform(test_df)

evaluate_model(predictions_gbt, "Gradient-Boosted Trees")


# ---------------------------------------------------------
# 2. Confusion Matrix
# ---------------------------------------------------------
predictionAndLabels = predictions_gbt.select("prediction", "permanent_closed") \
    .rdd.map(lambda row: (float(row["prediction"]), float(row["permanent_closed"])))

metrics = MulticlassMetrics(predictionAndLabels)
conf_matrix = metrics.confusionMatrix().toArray()

print("\n===== Gradient-Boosted Trees Confusion Matrix =====")
conf_df = pd.DataFrame(
    conf_matrix,
    index=["Actual 0", "Actual 1"],
    columns=["Predicted 0", "Predicted 1"]
)
print(conf_df)


# ---------------------------------------------------------
# 3. Feature Importance
# ---------------------------------------------------------
gbt_model = cv_model_gbt.bestModel.stages[-1]

feature_importance = pd.DataFrame({
    "feature": feature_cols,
    "importance": gbt_model.featureImportances.toArray()
}).sort_values("importance", ascending=False)

print("\n===== Gradient-Boosted Trees Feature Importance =====")
pd.set_option("display.max_rows", None)
print(feature_importance.to_string(index=False))


                                                                                


===== Gradient-Boosted Trees Evaluation =====
AUC:       0.9699
Accuracy:  0.9164
F1 Score:  0.9233
Precision: 0.9428
Recall:    0.9164





===== Gradient-Boosted Trees Confusion Matrix =====
          Predicted 0  Predicted 1
Actual 0      26442.0       2633.0
Actual 1        187.0       4471.0

===== Gradient-Boosted Trees Feature Importance =====
                                                  feature  importance
                        MISC_Popular_for_Solo_dining_flag    0.444756
                        MISC_Service_options_Takeout_flag    0.118190
                                           num_of_reviews    0.084436
                              MISC_Atmosphere_Casual_flag    0.064586
                        MISC_Amenities_Good_for_kids_flag    0.039162
                        MISC_Service_options_Dine-in_flag    0.037595
   MISC_Accessibility_Wheelchair_accessible_entrance_flag    0.025374
                         MISC_Offerings_Comfort_food_flag    0.021081
                   MISC_Payments_NFC_mobile_payments_flag    0.020428
                                            price_numeric    0.015760
                 

                                                                                

#### Interpret feature importance and direction

In [14]:
import pandas as pd
from pyspark.ml.feature import VectorAssembler, StandardScaler
from pyspark.ml.classification import LogisticRegression

# ============================================================
# 0. Prepare features for Elastic Net Logistic Regression
# ============================================================

assembler_lasso = VectorAssembler(
    inputCols=feature_cols,
    outputCol="features_lasso"
)

# Remove old features_lasso column if it exists
if "features_lasso" in train_df.columns:
    train_df = train_df.drop("features_lasso")
if "features_lasso" in test_df.columns:
    test_df = test_df.drop("features_lasso")

train_df_lasso = assembler_lasso.transform(train_df)
test_df_lasso = assembler_lasso.transform(test_df)

# Standardize numeric features
scaler = StandardScaler(inputCol="features_lasso", outputCol="features_scaled", withMean=True, withStd=True)
scaler_model = scaler.fit(train_df_lasso)
train_df_scaled = scaler_model.transform(train_df_lasso)
test_df_scaled = scaler_model.transform(test_df_lasso)

# ============================================================
# 1. Train Elastic Net Logistic Regression (for direction)
# ============================================================

elastic_lr = LogisticRegression(
    featuresCol="features_scaled",
    labelCol="permanent_closed",
    elasticNetParam=0.2,   # Elastic Net (mix of Lasso & Ridge)
    regParam=0.01,
    maxIter=50
)

elastic_model = elastic_lr.fit(train_df_scaled)

# ============================================================
# 2. Extract coefficients with direction
# ============================================================

elastic_coef = pd.DataFrame({
    "feature": feature_cols,
    "LR_Coefficient": elastic_model.coefficients.toArray()
})
elastic_coef["abs_coef"] = elastic_coef["LR_Coefficient"].abs()

# ============================================================
# 3. Gradient-Boosted Trees feature importance
# ============================================================

gbt_importances = gbt_model.featureImportances.toArray()
if len(gbt_importances) != len(feature_cols):
    usable_cols = feature_cols[:len(gbt_importances)]
else:
    usable_cols = feature_cols

feature_importance_gbt = pd.DataFrame({
    "feature": usable_cols,
    "GBT_Importance": gbt_importances
}).sort_values("GBT_Importance", ascending=False)

# ============================================================
# 4. Merge GBT importance with LR coefficients
# ============================================================

final_df = feature_importance_gbt.merge(
    elastic_coef,
    on="feature",
    how="left"
)

# ============================================================
# 5. Determine effect direction
# ============================================================

def determine_effect(coef):
    if pd.isna(coef) or abs(coef) < 0.01:
        return "Neutral / Not Selected"
    elif coef > 0.15:
        return "↑ Strong Failure Driver"
    elif coef > 0:
        return "↑ Weak Failure Driver"
    elif coef < -0.15:
        return "↓ Strong Survival Driver"
    else:
        return "↓ Weak Survival Driver"

final_df["Closure_Effect_Direction"] = final_df["LR_Coefficient"].apply(determine_effect)

# ============================================================
# 6. Select top N features by GBT importance
# ============================================================

top_n = 50
top_features = final_df.head(top_n).copy()

# ============================================================
# 7. Clean feature names for presentation
# ============================================================

top_features["Feature_Clean"] = (
    top_features["feature"]
    .str.replace("MISC_", "", regex=False)
    .str.replace("_flag", "", regex=False)
    .str.replace("_", " ", regex=False)
)

# ============================================================
# 8. Final presentation table
# ============================================================

final_cols = [
    "Feature_Clean",
    "GBT_Importance",
    "LR_Coefficient",
    "Closure_Effect_Direction"
]

print("\n#############################################################")
print("###  Top Predictors with GBT Importance and Direction ###")
print("#############################################################\n")

print(top_features[final_cols].to_string(index=False))


                                                                                


#############################################################
###  Top Predictors with GBT Importance and Direction ###
#############################################################

                                  Feature_Clean  GBT_Importance  LR_Coefficient Closure_Effect_Direction
                        Popular for Solo dining        0.444756       -0.468713 ↓ Strong Survival Driver
                        Service options Takeout        0.118190       -0.313002 ↓ Strong Survival Driver
                                 num of reviews        0.084436       -0.559200 ↓ Strong Survival Driver
                              Atmosphere Casual        0.064586       -0.443714 ↓ Strong Survival Driver
                        Amenities Good for kids        0.039162        0.405848  ↑ Strong Failure Driver
                        Service options Dine-in        0.037595       -0.500027 ↓ Strong Survival Driver
   Accessibility Wheelchair accessible entrance        0.025374       -0.314937 ↓

#### predict all dataset and store the result

In [39]:
from pyspark.ml.functions import vector_to_array
from pyspark.sql.functions import col

# 1️⃣ Transform all data
all_predictions = cv_model_rf.bestModel.transform(business_df_clean)

# 2️⃣ Convert SparseVector probability to array
all_predictions = all_predictions.withColumn(
    "prob_array",
    vector_to_array(col("business_closure_prob"))
)

# 3️⃣ Extract the probability of closure (class 1)
all_predictions = all_predictions.withColumn(
    "closure_prob",
    col("prob_array")[1]  # now safe
)

# 4️⃣ Select the final columns
biz_closure_prob_df = all_predictions.select(
    "gmap_id",
    "store_name",
    "prediction",
    "closure_prob"
)

# 5️⃣ Show results
biz_closure_prob_df.show(10, truncate=False)


+-------------------------------------+-------------------------------+----------+--------------------+
|gmap_id                              |store_name                     |prediction|closure_prob        |
+-------------------------------------+-------------------------------+----------+--------------------+
|0x89e85a90fa80ea79:0x2f56cdd1f58118f |Checkers                       |0.0       |0.007217116418191323|
|0x89c2c219189e5189:0x684a238fa71eb176|Momo Asian Fusion              |0.0       |0.017988535282472868|
|0x8085b6a8ebae49f1:0x9f24861c2d643f9a|Rosso Pizzeria & Mozzarella Bar|1.0       |0.9591585341045397  |
|0x89c258545813c6bf:0x8ee1343834123591|Junior's Restaurant & Bakery   |1.0       |0.5504999064554688  |
|0x880fc80bb9076013:0xf73fd28c4d61b7ae|Woori Village                  |0.0       |0.18938758140253537 |
|0x80dce7e4827a029f:0xcb9497eb98076b9d|The Flame Broiler              |0.0       |0.05205610831563328 |
|0x89c25baa8b6fb9c9:0x5c4f2f36d6850943|200 Fifth                

In [40]:
biz_closure_prob_df.write.mode("overwrite").parquet("gs://msca-bdp-student-gcs/Group_5_final_project/biz_closure_prob_df/") 

                                                                                

#### EDA

In [15]:
from pyspark.sql.functions import col, count, round

# Basic statistics
business_df.select("avg_rating").describe().show()


# Optional: Bin ratings into ranges (e.g., 1-2, 2-3, 3-4, 4-5)
from pyspark.sql.functions import when

from pyspark.sql.functions import when, col

business_df = business_df.withColumn(
    "rating_bin",
    when(col("avg_predicted_sentiment") < 1.5, "1-1.5") \
    .when(col("avg_predicted_sentiment") < 2, "1.5-2") \
    .when(col("avg_predicted_sentiment") < 2.5, "2-2.5") \
    .when(col("avg_predicted_sentiment") < 3, "2.5-3") \
    .when(col("avg_predicted_sentiment") < 3.5, "3-3.5") \
    .when(col("avg_predicted_sentiment") < 4, "3.5-4") \
    .when(col("avg_predicted_sentiment") < 4.5, "4-4.5") \
    .otherwise("4.5-5")
)


business_df.groupBy("rating_bin").agg(count("*").alias("count")) \
               .orderBy("rating_bin") \
               .show()

                                                                                

+-------+------------------+
|summary|        avg_rating|
+-------+------------------+
|  count|            169478|
|   mean| 4.199176294268276|
| stddev|0.4117197819420847|
|    min|               1.0|
|    max|               5.0|
+-------+------------------+





+----------+-----+
|rating_bin|count|
+----------+-----+
|     1.5-2|   42|
|     2-2.5|  486|
|     2.5-3| 2874|
|     3-3.5|12928|
|     3.5-4|40745|
|     4-4.5|75402|
|     4.5-5|37001|
+----------+-----+



                                                                                

In [16]:
combined_df = spark.read.parquet("gs://msca-bdp-student-gcs/Group_5_final_project/combined_rest_df/") 
combined_df.show(3)

+--------------------+--------------+------+----+--------------------+-------------+--------------------+--------------------+----------+--------------------+-----------+--------------------+------------------+------------------+--------------------+--------------+-----+--------------------+------------------+------------------+---------------+---------------+---------------+--------------------+-------------------+----------------------+-----------------+----------------------+---------------+--------------------+--------------------+-------------+-------------+--------------------+--------------+--------------------+-----+--------+---------+-------------+------------------------+
|             gmap_id|     cust_name|rating|resp|                text|         time|             user_id|             address|avg_rating|            category|description|               hours|          latitude|         longitude|          store_name|num_of_reviews|price|    relative_results|             stat

In [17]:
from pyspark.sql.functions import when, col
combined_df = combined_df.withColumn(
    "permanent_closed",
    when(col("state").like("%Permanently closed%"), 1).otherwise(0)
)

In [18]:
from pyspark.sql.functions import udf, col
from pyspark.sql.types import ArrayType, StringType
from pyspark.ml.feature import Tokenizer, StopWordsRemover, HashingTF, IDF, VectorAssembler
from pyspark.ml.classification import GBTClassifier
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder

import re

#### Most common tokens for restaurant

In [24]:
from pyspark.sql import functions as F

# Filter reviews for closed stores
closed_reviews = english_df.filter(F.col("permanent_closed") == 1)

# Explode english_tokens into separate rows
closed_tokens = closed_reviews.select(F.explode(F.col("english_tokens")).alias("token"))

# Count token frequency
closed_token_counts = closed_tokens.groupBy("token").count().orderBy(F.desc("count"))

# Show top 20 most common tokens
closed_token_counts.show(50, truncate=False)




+----------+------+
|token     |count |
+----------+------+
|food      |431697|
|great     |330406|
|good      |305473|
|place     |203824|
|service   |173118|
|like      |99338 |
|best      |98113 |
|nice      |92601 |
|really    |85097 |
|love      |84047 |
|go        |81500 |
|one       |78695 |
|get       |76827 |
|friendly  |75135 |
|staff     |73305 |
|time      |69861 |
|always    |65960 |
|delicious |64934 |
|back      |61985 |
|restaurant|60099 |
|chicken   |58369 |
|excellent |53132 |
|pizza     |52727 |
|order     |51318 |
|also      |50503 |
|even      |49435 |
|little    |48366 |
|ordered   |48013 |
|definitely|47579 |
|got       |47344 |
|amazing   |45921 |
|never     |45329 |
|eat       |43611 |
|come      |43357 |
|try       |42644 |
|recommend |41082 |
|us        |40773 |
|fresh     |39197 |
|came      |38897 |
|menu      |38123 |
|went      |36914 |
|people    |35815 |
|first     |33765 |
|pretty    |33491 |
|well      |33152 |
|made      |33033 |
|awesome   |32877 |


                                                                                

#### log-odds

In [21]:
import re
from pyspark.sql import functions as F
from pyspark.sql.types import ArrayType, StringType
from pyspark.ml.feature import Tokenizer, StopWordsRemover

# ----------------------------
# 1. Keep only rows where both text and rating are not null
# ----------------------------
combined_df_text = combined_df.filter(
    (F.col("text").isNotNull()) & 
    (F.col("rating").isNotNull())
)

# ----------------------------
# 2. Tokenize
# ----------------------------
tokenizer = Tokenizer(inputCol="text", outputCol="tokens")
tokenized_df = tokenizer.transform(combined_df_text)

# ----------------------------
# 3. Remove stop words
# ----------------------------
stop_remover = StopWordsRemover(inputCol="tokens", outputCol="tokens_clean")
clean_df = stop_remover.transform(tokenized_df)

# ----------------------------
# 4. Keep only English tokens
# ----------------------------
english_udf = F.udf(
    lambda tokens: [t.lower() for t in tokens if re.fullmatch(r"[a-zA-Z]+", t)],
    ArrayType(StringType())
)

english_df = clean_df.withColumn("english_tokens", english_udf(F.col("tokens_clean")))

# ----------------------------
# 5. Filter closed and open stores
# ----------------------------
closed_df = english_df.filter(F.col("permanent_closed") == 1)
open_df = english_df.filter(F.col("permanent_closed") == 0)

# ----------------------------
# 6. Explode tokens into single rows
# ----------------------------
closed_tokens = closed_df.select(F.explode(F.col("english_tokens")).alias("token"))
open_tokens = open_df.select(F.explode(F.col("english_tokens")).alias("token"))

# ----------------------------
# 7. Count token occurrences
# ----------------------------
closed_counts = closed_tokens.groupBy("token").count().withColumnRenamed("count", "closed_count")
open_counts = open_tokens.groupBy("token").count().withColumnRenamed("count", "open_count")

# ----------------------------
# 8. Merge counts and compute log-odds ratio
# ----------------------------
token_stats = closed_counts.join(open_counts, on="token", how="outer").fillna(0)
token_stats = token_stats.withColumn(
    "log_odds_ratio",
    F.log( (F.col("closed_count") + 1) / (F.col("open_count") + 1) )
)


In [23]:
# ----------------------------
# 9. Sort by most indicative tokens for closure
# ----------------------------
top_tokens_closed = token_stats.orderBy(F.col("log_odds_ratio").desc())
top_tokens_closed.show(50, truncate=False)

# ----------------------------
# 10. Optionally, sort for tokens indicative of open stores
# ----------------------------
top_tokens_open = token_stats.orderBy(F.col("log_odds_ratio").asc())
top_tokens_open.show(50, truncate=False)


                                                                                

+----------+------------+----------+------------------+
|token     |closed_count|open_count|log_odds_ratio    |
+----------+------------+----------+------------------+
|zume      |28          |0         |3.367295829986474 |
|cruffins  |83          |2         |3.332204510175204 |
|baconslut |21          |0         |3.091042453358316 |
|hopcat    |20          |0         |3.044522437723423 |
|slyder    |17          |0         |2.8903717578961645|
|cocotero  |16          |0         |2.833213344056216 |
|lallisse  |15          |0         |2.772588722239781 |
|brezo     |15          |0         |2.772588722239781 |
|nellcote  |15          |0         |2.772588722239781 |
|tondaku   |14          |0         |2.70805020110221  |
|wildfox   |13          |0         |2.6390573296152584|
|burritt   |13          |0         |2.6390573296152584|
|cardoz    |13          |0         |2.6390573296152584|
|fifolet   |12          |0         |2.5649493574615367|
|duidough  |12          |0         |2.5649493574



+---------------+------------+----------+-------------------+
|token          |closed_count|open_count|log_odds_ratio     |
+---------------+------------+----------+-------------------+
|mcgriddle      |0           |888       |-6.790097235513905 |
|venti          |3           |2852      |-6.5698319900936095|
|playplace      |0           |706       |-6.561030665896573 |
|bussin         |0           |529       |-6.272877006546167 |
|whoppers       |4           |2584      |-6.248042874508429 |
|jitb           |0           |504       |-6.22455842927536  |
|casinos        |2           |1489      |-6.207919110271395 |
|mcmuffins      |1           |968       |-6.183117431330821 |
|pizookie       |0           |474       |-6.163314804034641 |
|culvers        |2           |1410      |-6.153441663184704 |
|mcgriddles     |0           |442       |-6.093569770045136 |
|frape          |0           |430       |-6.066108090103747 |
|distancing     |30          |12672     |-6.013241820596918 |
|bolillo

                                                                                

#### Top N-grams (phrases)

In [25]:
from pyspark.ml.feature import NGram

# -------------------------------------------------
# 1. Create bigrams & trigrams from english_tokens
# -------------------------------------------------
bigrammer = NGram(n=2, inputCol="english_tokens", outputCol="bigrams")
trigrammer = NGram(n=3, inputCol="english_tokens", outputCol="trigrams")

df_with_bigrams = bigrammer.transform(english_df)
df_with_ngrams = trigrammer.transform(df_with_bigrams)

# -------------------------------------------------
# 2. Filter open vs closed stores
# -------------------------------------------------
closed_ngram_df = df_with_ngrams.filter(F.col("permanent_closed") == 1)
open_ngram_df   = df_with_ngrams.filter(F.col("permanent_closed") == 0)

# -------------------------------------------------
# 3. Explode bigrams & trigrams
# -------------------------------------------------
closed_bigrams = closed_ngram_df.select(F.explode(F.col("bigrams")).alias("ngram"))
open_bigrams   = open_ngram_df.select(F.explode(F.col("bigrams")).alias("ngram"))

closed_trigrams = closed_ngram_df.select(F.explode(F.col("trigrams")).alias("ngram"))
open_trigrams   = open_ngram_df.select(F.explode(F.col("trigrams")).alias("ngram"))

# -------------------------------------------------
# 4. Count n-grams
# -------------------------------------------------
closed_bigram_counts = closed_bigrams.groupBy("ngram").count().withColumnRenamed("count", "closed_count")
open_bigram_counts   = open_bigrams.groupBy("ngram").count().withColumnRenamed("count", "open_count")

closed_trigram_counts = closed_trigrams.groupBy("ngram").count().withColumnRenamed("count", "closed_count")
open_trigram_counts   = open_trigrams.groupBy("ngram").count().withColumnRenamed("count", "open_count")

# -------------------------------------------------
# 5. Compute log odds ratio (like token analysis)
# -------------------------------------------------
def compute_stats(closed_df, open_df):
    stats = closed_df.join(open_df, on="ngram", how="outer").fillna(0)
    stats = stats.withColumn(
        "log_odds_ratio",
        F.log((F.col("closed_count") + 1) / (F.col("open_count") + 1))
    )
    return stats

bigram_stats  = compute_stats(closed_bigram_counts, open_bigram_counts)
trigram_stats = compute_stats(closed_trigram_counts, open_trigram_counts)

# -------------------------------------------------
# 6. Rank n-grams most indicative of closed stores
# -------------------------------------------------
top_bigrams_closed = bigram_stats.orderBy(F.col("log_odds_ratio").desc())
top_trigrams_closed = trigram_stats.orderBy(F.col("log_odds_ratio").desc())

print("\n TOP CLOSURE BIGRAMS")
top_bigrams_closed.show(50, truncate=False)

print("\n TOP CLOSURE TRIGRAMS")
top_trigrams_closed.show(50, truncate=False)

# -------------------------------------------------
# 7. Also rank open-store ngrams (healthy indicators)
# -------------------------------------------------
top_bigrams_open = bigram_stats.orderBy(F.col("log_odds_ratio").asc())
top_trigrams_open = trigram_stats.orderBy(F.col("log_odds_ratio").asc())

print("\n TOP OPEN-STORE BIGRAMS")
top_bigrams_open.show(50, truncate=False)

print("\n TOP OPEN-STORE TRIGRAMS")
top_trigrams_open.show(50, truncate=False)


 TOP CLOSURE BIGRAMS


                                                                                

+-------------------+------------+----------+------------------+
|ngram              |closed_count|open_count|log_odds_ratio    |
+-------------------+------------+----------+------------------+
|little bucharest   |32          |0         |3.4965075614664802|
|hu kitchen         |24          |0         |3.2188758248682006|
|lunar blossom      |20          |0         |3.044522437723423 |
|viper alley        |19          |0         |2.995732273553991 |
|bar agricole       |18          |0         |2.9444389791664403|
|mumbai tandoor     |37          |1         |2.9444389791664403|
|scammers scammers  |56          |2         |2.9444389791664403|
|bombay bread       |18          |0         |2.9444389791664403|
|firefly grill      |18          |0         |2.9444389791664403|
|la tabun           |17          |0         |2.8903717578961645|
|top wok            |15          |0         |2.772588722239781 |
|ringer hut         |14          |0         |2.70805020110221  |
|cafe clover        |13  

                                                                                

+-----------------------------------+------------+----------+------------------+
|ngram                              |closed_count|open_count|log_odds_ratio    |
+-----------------------------------+------------+----------+------------------+
|scammers scammers scammers         |55          |0         |4.02535169073515  |
|lincoln square steak               |18          |0         |2.9444389791664403|
|fluffy fluffy fluffy               |17          |0         |2.8903717578961645|
|uuuuu uuuuu uuuuu                  |16          |0         |2.833213344056216 |
|chilaquiles chilaquiles chilaquiles|16          |0         |2.833213344056216 |
|tell traditional burgers           |15          |0         |2.772588722239781 |
|bombay bread bar                   |14          |0         |2.70805020110221  |
|best tell traditional              |13          |0         |2.6390573296152584|
|mozzarella bar pizza               |13          |0         |2.6390573296152584|
|bar pizza e                

                                                                                

+-------------------+------------+----------+-------------------+
|ngram              |closed_count|open_count|log_odds_ratio     |
+-------------------+------------+----------+-------------------+
|love starbucks     |1           |4498      |-7.718463248281227 |
|favorite starbucks |0           |1775      |-7.4821189235521155|
|good stores        |0           |1299      |-7.170119543449628 |
|starbucks ever     |1           |2536      |-7.145590379039336 |
|worst starbucks    |1           |2358      |-7.07284589884773  |
|lines drive        |0           |1036      |-6.9440872082295275|
|slowest starbucks  |0           |1012      |-6.920671504248683 |
|double animal      |1           |1787      |-6.795705775173515 |
|sausage mcmuffin   |0           |870       |-6.769641976852503 |
|best starbucks     |3           |3252      |-6.7010385653534605|
|use mobile         |0           |795       |-6.679599185844383 |
|nice mall          |2           |2360      |-6.668228248417403 |
|another s



+--------------------------+------------+----------+-------------------+
|ngram                     |closed_count|open_count|log_odds_ratio     |
+--------------------------+------------+----------+-------------------+
|long line drive           |0           |1633      |-7.3987862754199485|
|love jack box             |0           |937       |-6.843749949006225 |
|worst starbucks ever      |0           |808       |-6.695798917058492 |
|double double animal      |1           |1545      |-6.6502790485874215|
|drive thru line           |9           |7400      |-6.606785312203421 |
|quick drive thru          |1           |1472      |-6.601909235902685 |
|drive thru never          |0           |688       |-6.535241271013659 |
|long lines drive          |0           |643       |-6.467698726104354 |
|drive thru fast           |2           |1883      |-6.4425401664681985|
|go wrong n                |0           |605       |-6.406879986069314 |
|employees wearing masks   |0           |579       

                                                                                

#### remove store name top n-grams

In [26]:
import re
from pyspark.sql import functions as F
from pyspark.ml.feature import Tokenizer, StopWordsRemover, NGram

# ----------------------------
# 1. Keep only reviews from permanently closed stores
# ----------------------------
closed_df = combined_df.filter(
    (F.col("text").isNotNull()) &
    (F.col("rating").isNotNull()) &
    (F.col("permanent_closed") == 1)
)


In [27]:
closed_df.count()

                                                                                

1188577

In [28]:
from pyspark.sql import functions as F
from pyspark.sql.types import ArrayType, StringType
from pyspark.ml.feature import Tokenizer, StopWordsRemover, NGram

# ----------------------------
# 1. Keep only reviews from permanently closed stores
# ----------------------------
closed_df = combined_df.filter(
    (F.col("text").isNotNull()) &
    (F.col("rating").isNotNull()) &
    (F.col("permanent_closed") == 1)
)

# Optional: sample 1% for testing
closed_df = closed_df.sample(fraction=1.0, seed=42)

# ----------------------------
# 2. Tokenize
# ----------------------------
tokenizer = Tokenizer(inputCol="text", outputCol="tokens")
closed_df = tokenizer.transform(closed_df)

# ----------------------------
# 3. Remove stopwords
# ----------------------------
stop_remover = StopWordsRemover(inputCol="tokens", outputCol="tokens_clean")
closed_df = stop_remover.transform(closed_df)

# ----------------------------
# 4. Keep only alphabetic English tokens >= 3 chars
# ----------------------------
closed_df = closed_df.withColumn(
    "english_tokens",
    F.expr("filter(transform(tokens_clean, x -> lower(x)), x -> x rlike '^[a-z]{3,}$')")
)

# ----------------------------
# 5. Remove store name words using broadcast UDF
# ----------------------------
store_words_list = (
    combined_df
    .select(F.lower(F.col("store_name")).alias("name"))
    .select(F.split("name", " ").alias("words"))
    .select(F.explode("words").alias("w"))
    .filter(F.length("w") > 2)
    .distinct()
    .rdd.flatMap(lambda x: x)
    .collect()
)

broadcast_store_words = spark.sparkContext.broadcast(set(store_words_list))

def remove_store_words(tokens):
    return [t for t in tokens if t not in broadcast_store_words.value]

remove_store_words_udf = F.udf(remove_store_words, ArrayType(StringType()))

closed_df = closed_df.withColumn("operational_tokens", remove_store_words_udf("english_tokens"))

# ----------------------------
# 6. Create trigrams
# ----------------------------
trigrammer = NGram(n=3, inputCol="operational_tokens", outputCol="trigrams")
closed_df = trigrammer.transform(closed_df)

# ----------------------------
# 7. Explode trigrams and count
# ----------------------------
trigram_exploded = closed_df.select(F.explode("trigrams").alias("ngram"))
trigram_counts = trigram_exploded.groupBy("ngram").count().withColumnRenamed("count", "closed_count")

# ----------------------------
# 8. Rank trigrams by frequency
# ----------------------------
trigram_stats = trigram_counts.orderBy(F.col("closed_count").desc())

# ----------------------------
# 9. Show top 50 trigrams
# ----------------------------
trigram_stats.show(50, truncate=False)




+-----------------------------+------------+
|ngram                        |closed_count|
+-----------------------------+------------+
|highly recommend anyone      |376         |
|definitely recommend anyone  |264         |
|highly recommend try         |208         |
|decided give try             |199         |
|ordered took minutes         |179         |
|highly recommend trying      |168         |
|highly recommend going       |165         |
|customer highly recommend    |157         |
|definitely highly recommend  |145         |
|recommend anyone wants       |143         |
|highly recommend definitely  |128         |
|loved highly recommend       |124         |
|took almost minutes          |103         |
|highly recommend checking    |102         |
|definitely recommend trying  |100         |
|definitely recommend try     |98          |
|definitely going try         |96          |
|waited minutes waitress      |93          |
|try highly recommend         |92          |
|took minu

                                                                                

In [29]:
from pyspark.sql import functions as F
from pyspark.sql.types import ArrayType, StringType
from pyspark.ml.feature import Tokenizer, StopWordsRemover, NGram
from functools import reduce

# ----------------------------
# 1. Filter closed and open store reviews
# ----------------------------
closed_df = combined_df.filter(
    (F.col("text").isNotNull()) &
    (F.col("rating").isNotNull()) &
    (F.col("permanent_closed") == 1)
)

open_df = combined_df.filter(
    (F.col("text").isNotNull()) &
    (F.col("rating").isNotNull()) &
    (F.col("permanent_closed") == 0)
)

# Optional: sample small fraction for testing
closed_df = closed_df.sample(fraction=1.0, seed=42)
open_df   = open_df.sample(fraction=1.0, seed=42)

# ----------------------------
# 2. Tokenize and remove stopwords
# ----------------------------
def clean_tokens(df):
    tokenizer = Tokenizer(inputCol="text", outputCol="tokens")
    df = tokenizer.transform(df)

    stop_remover = StopWordsRemover(inputCol="tokens", outputCol="tokens_clean")
    df = stop_remover.transform(df)

    # Keep only alphabetic English tokens >= 3 chars
    df = df.withColumn(
        "english_tokens",
        F.expr("filter(transform(tokens_clean, x -> lower(x)), x -> x rlike '^[a-z]{3,}$')")
    )
    return df

closed_df = clean_tokens(closed_df)
open_df   = clean_tokens(open_df)

# ----------------------------
# 3. Remove store name words using broadcast
# ----------------------------
store_words_list = (
    combined_df
    .select(F.lower(F.col("store_name")).alias("name"))
    .select(F.split("name", " ").alias("words"))
    .select(F.explode("words").alias("w"))
    .filter(F.length("w") > 2)
    .distinct()
    .rdd.flatMap(lambda x: x)
    .collect()
)

broadcast_store_words = spark.sparkContext.broadcast(set(store_words_list))

def remove_store_words(tokens):
    return [t for t in tokens if t not in broadcast_store_words.value]

remove_store_words_udf = F.udf(remove_store_words, ArrayType(StringType()))

closed_df = closed_df.withColumn("operational_tokens", remove_store_words_udf("english_tokens"))
open_df   = open_df.withColumn("operational_tokens", remove_store_words_udf("english_tokens"))

# ----------------------------
# 4. Create trigrams
# ----------------------------
trigrammer = NGram(n=3, inputCol="operational_tokens", outputCol="trigrams")
closed_df = trigrammer.transform(closed_df)
open_df   = trigrammer.transform(open_df)

# ----------------------------
# 5. Explode trigrams and count
# ----------------------------
def trigram_count(df, count_col):
    exploded = df.select(F.explode("trigrams").alias("ngram"))
    return exploded.groupBy("ngram").count().withColumnRenamed("count", count_col)

closed_counts = trigram_count(closed_df, "closed_count")
open_counts   = trigram_count(open_df, "open_count")

# ----------------------------
# 6. Combine counts for comparison
# ----------------------------
combined_counts = closed_counts.join(open_counts, on="ngram", how="outer").fillna(0)

# Compute log-odds ratio (positive = more frequent in closed stores)
combined_counts = combined_counts.withColumn(
    "log_odds_ratio",
    F.log((F.col("closed_count")+1)/(F.col("open_count")+1))
)

# ----------------------------
# 7. Rank trigrams for comparative insight
# ----------------------------
top_closed_trigrams = combined_counts.orderBy(F.col("log_odds_ratio").desc())
top_open_trigrams   = combined_counts.orderBy(F.col("log_odds_ratio").asc())



                                                                                

In [30]:
# ----------------------------
# 8. Show results
# ----------------------------
print("Top trigrams indicative of closed stores:")
top_closed_trigrams.show(50, truncate=False)

print("Top trigrams indicative of open stores:")
top_open_trigrams.show(50, truncate=False)


Top trigrams indicative of closed stores:


                                                                                

+--------------------------------+------------+----------+------------------+
|ngram                           |closed_count|open_count|log_odds_ratio    |
+--------------------------------+------------+----------+------------------+
|scammers scammers scammers      |62          |0         |4.143134726391533 |
|fluffy fluffy fluffy            |26          |0         |3.295836866004329 |
|uuuuu uuuuu uuuuu               |16          |0         |2.833213344056216 |
|spend courteous recommend       |5           |0         |1.791759469228055 |
|professional decor try          |4           |0         |1.6094379124341003|
|contact person forget           |4           |0         |1.6094379124341003|
|issue fairly fooled             |4           |0         |1.6094379124341003|
|mention recommend almost        |4           |0         |1.6094379124341003|
|customers brought hous          |4           |0         |1.6094379124341003|
|alcohol ordering issue          |4           |0         |1.6094



+---------------------------+------------+----------+-------------------+
|ngram                      |closed_count|open_count|log_odds_ratio     |
+---------------------------+------------+----------+-------------------+
|lolol lolol lolol          |0           |279       |-5.634789603169249 |
|frickin frickin frickin    |0           |236       |-5.4680601411351315|
|wearing masks gloves       |0           |201       |-5.308267697401205 |
|worst customer wait        |0           |179       |-5.19295685089021  |
|wait waited almost         |0           |155       |-5.049856007249537 |
|took minutes cars          |1           |292       |-4.987025428457122 |
|gave someone wait          |0           |145       |-4.983606621708336 |
|stopping costs reasonable  |0           |142       |-4.962844630259907 |
|ordered asked wait         |0           |140       |-4.948759890378168 |
|loved members highly       |0           |137       |-4.927253685157205 |
|forgot give asked          |0        

                                                                                

In [31]:
from pyspark.sql import functions as F
from pyspark.sql.types import ArrayType, StringType
from pyspark.ml.feature import Tokenizer, StopWordsRemover, NGram

# ----------------------------
# 1. Filter closed and open store reviews
# ----------------------------
closed_df = combined_df.filter(
    (F.col("text").isNotNull()) &
    (F.col("rating").isNotNull()) &
    (F.col("permanent_closed") == 1)
)

open_df = combined_df.filter(
    (F.col("text").isNotNull()) &
    (F.col("rating").isNotNull()) &
    (F.col("permanent_closed") == 0)
)

# Optional: sample 1% for testing
closed_df = closed_df.sample(1.0, seed=42)
open_df   = open_df.sample(1.0, seed=42)

# ----------------------------
# 2. Tokenize & clean
# ----------------------------
def clean_tokens(df):
    tokenizer = Tokenizer(inputCol="text", outputCol="tokens")
    df = tokenizer.transform(df)

    stop_remover = StopWordsRemover(inputCol="tokens", outputCol="tokens_clean")
    df = stop_remover.transform(df)

    # Keep only alphabetic tokens of length >=3
    df = df.withColumn(
        "english_tokens",
        F.expr("filter(transform(tokens_clean, x -> lower(x)), x -> x rlike '^[a-z]{3,}$')")
    )

    # Filter out nonsense tokens like repeated letters
    df = df.withColumn(
        "english_tokens",
        F.expr("filter(english_tokens, x -> not x rlike '^(.)\\1+$')")  # removes 'aaaa' or 'uuu'
    )

    return df

closed_df = clean_tokens(closed_df)
open_df   = clean_tokens(open_df)

# ----------------------------
# 3. Remove store name words
# ----------------------------
store_words_list = (
    combined_df
    .select(F.lower(F.col("store_name")).alias("name"))
    .select(F.split("name", " ").alias("words"))
    .select(F.explode("words").alias("w"))
    .filter(F.length("w") > 2)
    .distinct()
    .rdd.flatMap(lambda x: x)
    .collect()
)
broadcast_store_words = spark.sparkContext.broadcast(set(store_words_list))

def remove_store_words(tokens):
    return [t for t in tokens if t not in broadcast_store_words.value]

remove_store_words_udf = F.udf(remove_store_words, ArrayType(StringType()))

closed_df = closed_df.withColumn("operational_tokens", remove_store_words_udf("english_tokens"))
open_df   = open_df.withColumn("operational_tokens", remove_store_words_udf("english_tokens"))

# ----------------------------
# 4. Create trigrams
# ----------------------------
trigrammer = NGram(n=3, inputCol="operational_tokens", outputCol="trigrams")
closed_df = trigrammer.transform(closed_df)
open_df   = trigrammer.transform(open_df)

# ----------------------------
# 5. Explode trigrams and count
# ----------------------------
def trigram_count(df, count_col):
    exploded = df.select(F.explode("trigrams").alias("ngram"))
    return exploded.groupBy("ngram").count().withColumnRenamed("count", count_col)

closed_counts = trigram_count(closed_df, "closed_count")
open_counts   = trigram_count(open_df, "open_count")

# ----------------------------
# 6. Combine counts & compute log-odds
# ----------------------------
combined_counts = closed_counts.join(open_counts, on="ngram", how="outer").fillna(0)
combined_counts = combined_counts.withColumn(
    "log_odds_ratio",
    F.log((F.col("closed_count")+1)/(F.col("open_count")+1))
)

# ----------------------------
# 7. Rank for comparative insight
# ----------------------------
top_closed_trigrams = combined_counts.orderBy(F.col("log_odds_ratio").desc())
top_open_trigrams   = combined_counts.orderBy(F.col("log_odds_ratio").asc())

# ----------------------------
# 8. Show top results
# ----------------------------
print("Trigrams indicative of closed stores:")
top_closed_trigrams.show(50, truncate=False)

print("Trigrams indicative of open stores:")
top_open_trigrams.show(50, truncate=False)


                                                                                

Trigrams indicative of closed stores:


                                                                                

+-------------------------------+------------+----------+------------------+
|ngram                          |closed_count|open_count|log_odds_ratio    |
+-------------------------------+------------+----------+------------------+
|scammers scammers scammers     |55          |0         |4.02535169073515  |
|fluffy fluffy fluffy           |17          |0         |2.8903717578961645|
|uuuuu uuuuu uuuuu              |16          |0         |2.833213344056216 |
|spend courteous recommend      |5           |0         |1.791759469228055 |
|gambar pertama adalah          |4           |0         |1.6094379124341003|
|particular recommend arugula   |4           |0         |1.6094379124341003|
|bathroom toilet prosper        |4           |0         |1.6094379124341003|
|generous alcohol ordering      |4           |0         |1.6094379124341003|
|recommend comparison many      |4           |0         |1.6094379124341003|
|ordered takes prepare          |4           |0         |1.6094379124341003|



+---------------------------------+------------+----------+-------------------+
|ngram                            |closed_count|open_count|log_odds_ratio     |
+---------------------------------+------------+----------+-------------------+
|lolol lolol lolol                |0           |363       |-5.8971538676367405|
|frickin frickin frickin          |0           |220       |-5.3981627015177525|
|wearing masks gloves             |0           |192       |-5.262690188904886 |
|worst customer wait              |0           |177       |-5.181783550292085 |
|wait waited almost               |0           |163       |-5.099866427824199 |
|wrong missing items              |0           |160       |-5.081404364984463 |
|took minutes cars                |1           |298       |-5.0072963928307415|
|gave someone wait                |0           |148       |-5.003946305945459 |
|covid highly recommend           |0           |139       |-4.941642422609305 |
|rude customer skills             |0    

                                                                                

In [32]:
# ----------------------------
# 4. Create 4-grams
# ----------------------------
four_grammer = NGram(n=4, inputCol="operational_tokens", outputCol="four_grams")
closed_df = four_grammer.transform(closed_df)
open_df   = four_grammer.transform(open_df)

# ----------------------------
# 5. Explode 4-grams and count
# ----------------------------
def four_gram_count(df, count_col):
    return df.select(F.explode("four_grams").alias("ngram")) \
             .groupBy("ngram").count().withColumnRenamed("count", count_col)

closed_counts = four_gram_count(closed_df, "closed_count")
open_counts   = four_gram_count(open_df, "open_count")

# ----------------------------
# 6. Combine counts and compute log-odds
# ----------------------------
combined_counts = closed_counts.join(open_counts, on="ngram", how="outer").fillna(0)
combined_counts = combined_counts.withColumn(
    "log_odds_ratio",
    F.log((F.col("closed_count")+1) / (F.col("open_count")+1))
)

# ----------------------------
# 7. Rank for comparative insight
# ----------------------------
top_closed_4_grams = combined_counts.orderBy(F.col("log_odds_ratio").desc())
top_open_4_grams   = combined_counts.orderBy(F.col("log_odds_ratio").asc())

# ----------------------------
# 8. Show top results
# ----------------------------
print("4-grams indicative of closed stores:")
top_closed_4_grams.show(50, truncate=False)

print("4-grams indicative of open stores:")
top_open_4_grams.show(50, truncate=False)


4-grams indicative of closed stores:


                                                                                

+-----------------------------------------+------------+----------+------------------+
|ngram                                    |closed_count|open_count|log_odds_ratio    |
+-----------------------------------------+------------+----------+------------------+
|scammers scammers scammers scammers      |61          |0         |4.127134385045092 |
|fluffy fluffy fluffy fluffy              |16          |0         |2.833213344056216 |
|uuuuu uuuuu uuuuu uuuuu                  |9           |0         |2.302585092994046 |
|gambar pertama adalah facebook           |4           |0         |1.6094379124341003|
|dengan gambar pertama adalah             |4           |0         |1.6094379124341003|
|called said cheating expensive           |4           |0         |1.6094379124341003|
|alcohol ordering issue fairly            |4           |0         |1.6094379124341003|
|said cheating expensive without          |4           |0         |1.6094379124341003|
|ketiga dengan gambar pertama             |



+-----------------------------------------+------------+----------+-------------------+
|ngram                                    |closed_count|open_count|log_odds_ratio     |
+-----------------------------------------+------------+----------+-------------------+
|lolol lolol lolol lolol                  |0           |262       |-5.572154032177765 |
|frickin frickin frickin frickin          |0           |176       |-5.176149732573829 |
|regularly charge convenient rates        |0           |117       |-4.770684624465665 |
|comes instantly many types               |0           |95        |-4.564348191467836 |
|boner boner boner boner                  |0           |93        |-4.543294782270004 |
|trying consistently towards visitors     |1           |182       |-4.516338972281476 |
|constantly charge reasonable rates       |0           |90        |-4.51085950651685  |
|cars ahead took minutes                  |0           |90        |-4.51085950651685  |
|surely recommend visiting custo

                                                                                

In [33]:
from pyspark.sql import functions as F
from pyspark.sql.types import ArrayType, StringType
from pyspark.ml.feature import Tokenizer, StopWordsRemover, NGram

# ----------------------------
# 1. Filter closed and open store reviews
# ----------------------------
closed_df = combined_df.filter(
    (F.col("text").isNotNull()) &
    (F.col("rating").isNotNull()) &
    (F.col("permanent_closed") == 1)
)

open_df = combined_df.filter(
    (F.col("text").isNotNull()) &
    (F.col("rating").isNotNull()) &
    (F.col("permanent_closed") == 0)
)

# Optional: sample small fraction for testing
closed_df = closed_df.sample(fraction=1.0, seed=42)
open_df   = open_df.sample(fraction=1.0, seed=42)

# ----------------------------
# 2. Tokenize and clean
# ----------------------------
def clean_tokens(df):
    tokenizer = Tokenizer(inputCol="text", outputCol="tokens")
    df = tokenizer.transform(df)

    stop_remover = StopWordsRemover(inputCol="tokens", outputCol="tokens_clean")
    df = stop_remover.transform(df)

    # Keep only alphabetic tokens of length >= 3
    df = df.withColumn(
        "english_tokens",
        F.expr("filter(transform(tokens_clean, x -> lower(x)), x -> x rlike '^[a-z]{3,}$')")
    )

    # Remove repeated-letter nonsense tokens like 'aaaa' or 'uuu'
    df = df.withColumn(
        "english_tokens",
        F.expr("filter(english_tokens, x -> not x rlike '^(.)\\1+$')")
    )
    return df

closed_df = clean_tokens(closed_df)
open_df   = clean_tokens(open_df)

# ----------------------------
# 3. Remove store name words using a UDF (safe for special characters)
# ----------------------------
store_words_list = (
    combined_df
    .select(F.lower(F.col("store_name")).alias("name"))
    .select(F.split("name", " ").alias("words"))
    .select(F.explode("words").alias("w"))
    .filter(F.length("w") > 2)
    .distinct()
    .rdd.flatMap(lambda x: x)
    .collect()
)
broadcast_store_words = spark.sparkContext.broadcast(set(store_words_list))

def remove_store_words(tokens):
    if tokens is None:
        return []
    return [t for t in tokens if t not in broadcast_store_words.value]

remove_store_words_udf = F.udf(remove_store_words, ArrayType(StringType()))

closed_df = closed_df.withColumn("operational_tokens", remove_store_words_udf("english_tokens"))
open_df   = open_df.withColumn("operational_tokens", remove_store_words_udf("english_tokens"))

# ----------------------------
# 4. Create trigrams
# ----------------------------
trigrammer = NGram(n=3, inputCol="operational_tokens", outputCol="trigrams")
closed_df = trigrammer.transform(closed_df)
open_df   = trigrammer.transform(open_df)

# ----------------------------
# 5. Explode trigrams and count
# ----------------------------
def trigram_count(df, count_col):
    return df.select(F.explode("trigrams").alias("ngram")) \
             .groupBy("ngram").count().withColumnRenamed("count", count_col)

closed_counts = trigram_count(closed_df, "closed_count")
open_counts   = trigram_count(open_df, "open_count")

# ----------------------------
# 6. Combine counts and compute log-odds
# ----------------------------
combined_counts = closed_counts.join(open_counts, on="ngram", how="outer").fillna(0)
combined_counts = combined_counts.withColumn(
    "log_odds_ratio",
    F.log((F.col("closed_count")+1) / (F.col("open_count")+1))
)

# ----------------------------
# 7. Rank for comparative insight
# ----------------------------
top_closed_trigrams = combined_counts.orderBy(F.col("log_odds_ratio").desc())
top_open_trigrams   = combined_counts.orderBy(F.col("log_odds_ratio").asc())

# ----------------------------
# 8. Show top results
# ----------------------------
print("Trigrams indicative of closed stores:")
top_closed_trigrams.show(50, truncate=False)

print("Trigrams indicative of open stores:")
top_open_trigrams.show(50, truncate=False)


                                                                                

Trigrams indicative of closed stores:


                                                                                

+------------------------------+------------+----------+------------------+
|ngram                         |closed_count|open_count|log_odds_ratio    |
+------------------------------+------------+----------+------------------+
|scammers scammers scammers    |61          |0         |4.127134385045092 |
|fluffy fluffy fluffy          |17          |0         |2.8903717578961645|
|uuuuu uuuuu uuuuu             |10          |0         |2.3978952727983707|
|spend courteous recommend     |5           |0         |1.791759469228055 |
|professional decor try        |4           |0         |1.6094379124341003|
|items waited patiently        |4           |0         |1.6094379124341003|
|including sparkling multitudes|4           |0         |1.6094379124341003|
|unexpected professional decor |4           |0         |1.6094379124341003|
|issue fairly fooled           |4           |0         |1.6094379124341003|
|ordering issue fairly         |4           |0         |1.6094379124341003|
|cheater adv



+---------------------------------+------------+----------+-------------------+
|ngram                            |closed_count|open_count|log_odds_ratio     |
+---------------------------------+------------+----------+-------------------+
|lolol lolol lolol                |0           |312       |-5.746203190540153 |
|frickin frickin frickin          |0           |305       |-5.723585101952381 |
|wearing masks gloves             |0           |192       |-5.262690188904886 |
|worst customer wait              |0           |159       |-5.075173815233827 |
|wait waited almost               |0           |154       |-5.043425116919247 |
|waited minutes cars              |2           |462       |-5.039114765418124 |
|took minutes cars                |1           |300       |-5.01396308418893  |
|ordered asked wait               |0           |144       |-4.976733742420574 |
|gave someone wait                |0           |144       |-4.976733742420574 |
|gave wrong gave                  |1    

                                                                                

In [34]:
# ----------------------------
# 4. Create 5-grams
# ----------------------------
five_grammer = NGram(n=5, inputCol="operational_tokens", outputCol="five_grams")
closed_df = five_grammer.transform(closed_df)
open_df   = five_grammer.transform(open_df)

# ----------------------------
# 5. Explode 5-grams and count
# ----------------------------
def five_gram_count(df, count_col):
    return df.select(F.explode("five_grams").alias("ngram")) \
             .groupBy("ngram").count().withColumnRenamed("count", count_col)

closed_counts = five_gram_count(closed_df, "closed_count")
open_counts   = five_gram_count(open_df, "open_count")

# ----------------------------
# 6. Combine counts and compute log-odds
# ----------------------------
combined_counts = closed_counts.join(open_counts, on="ngram", how="outer").fillna(0)
combined_counts = combined_counts.withColumn(
    "log_odds_ratio",
    F.log((F.col("closed_count")+1) / (F.col("open_count")+1))
)

# ----------------------------
# 7. Rank for comparative insight
# ----------------------------
top_closed_5_grams = combined_counts.orderBy(F.col("log_odds_ratio").desc())
top_open_5_grams   = combined_counts.orderBy(F.col("log_odds_ratio").asc())

# ----------------------------
# 8. Show top results
# ----------------------------
print("5-grams indicative of closed stores:")
top_closed_5_grams.show(50, truncate=False)

print("5-grams indicative of open stores:")
top_open_5_grams.show(50, truncate=False)


5-grams indicative of closed stores:


                                                                                

+--------------------------------------------------+------------+----------+------------------+
|ngram                                             |closed_count|open_count|log_odds_ratio    |
+--------------------------------------------------+------------+----------+------------------+
|scammers scammers scammers scammers scammers      |53          |0         |3.9889840465642745|
|fluffy fluffy fluffy fluffy fluffy                |15          |0         |2.772588722239781 |
|informed refund almost year passed                |4           |0         |1.6094379124341003|
|contacted credit informed refund almost           |4           |0         |1.6094379124341003|
|refund almost year passed received                |4           |0         |1.6094379124341003|
|gambar pertama adalah facebook sebelum            |4           |0         |1.6094379124341003|
|called said cheating expensive without            |4           |0         |1.6094379124341003|
|ketiga dengan gambar pertama adalah    



+---------------------------------------------------+------------+----------+-------------------+
|ngram                                              |closed_count|open_count|log_odds_ratio     |
+---------------------------------------------------+------------+----------+-------------------+
|frickin frickin frickin frickin frickin            |0           |301       |-5.71042701737487  |
|lolol lolol lolol lolol lolol                      |0           |291       |-5.676753802268282 |
|blah blah blah blah blah                           |0           |78        |-4.3694478524670215|
|boner boner boner boner boner                      |0           |70        |-4.2626798770413155|
|trying consistently towards visitors often         |1           |133       |-4.204692619390966 |
|pacer meme deadthe pacer meme                      |0           |65        |-4.189654742026425 |
|meme deadthe pacer meme deadthe                    |0           |65        |-4.189654742026425 |
|deadthe pacer meme 

                                                                                