# Merging ALS Data

In [None]:
! pip install pyspark

Collecting pyspark
  Downloading pyspark-3.5.1.tar.gz (317.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m317.0/317.0 MB[0m [31m2.1 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.5.1-py2.py3-none-any.whl size=317488491 sha256=5ca683bd4467ac3d0f3539a5db5a361641e8829053085dc84a7373815901d640
  Stored in directory: /root/.cache/pip/wheels/80/1d/60/2c256ed38dddce2fdd93be545214a63e02fbd8d74fb0b7f3a6
Successfully built pyspark
Installing collected packages: pyspark
Successfully installed pyspark-3.5.1


In [None]:
import pandas as pd
from pyspark.sql import SparkSession
from pyspark.sql.functions import when
from pyspark.ml.feature import StringIndexer
from pyspark.ml import Pipeline
from pyspark.ml.recommendation import ALS


## Hawaii Data

### 10%

In [None]:
# Build the Hawaii 10% train/test split.

# Load the 10% Hawaii sample from parquet with pandas
hi_10 = pd.read_parquet("/content/hi_10.parquet")

# Reuse the running SparkSession if there is one, otherwise create it
spark = SparkSession.builder.appName("test").getOrCreate()

# Convert the sampled pandas DataFrame to a Spark DataFrame
hi_10_spark = spark.createDataFrame(hi_10)

# Index the "business" and "user" columns into "business_id" / "user_id".
# Both indexers are fitted on the whole sample, before the split, so the
# training and testing sets share one id mapping.
hi_10_transform = StringIndexer(inputCol="business", outputCol="business_id").fit(hi_10_spark).transform(hi_10_spark)
hi_10_transform = StringIndexer(inputCol="user", outputCol="user_id").fit(hi_10_transform).transform(hi_10_transform)

# Randomly split hi_10_transform 90/10 into training and testing sets (fixed seed)
train_10, test_10 = hi_10_transform.randomSplit([0.9, 0.1], seed=42)

# Write the training and testing sets to parquet files.
# The writer's default save mode raises when the target already exists, so
# overwrite explicitly to keep this cell re-runnable.
train_10.write.mode("overwrite").parquet("/train_10.parquet")
test_10.write.mode("overwrite").parquet("/test_10.parquet")





  self.pid = _posixsubprocess.fork_exec(


In [None]:
# Reload the persisted 10% split so the following cells work from the files on disk
train_10, test_10 = (
    spark.read.parquet(split_path)
    for split_path in ("/train_10.parquet", "/test_10.parquet")
)

In [None]:
# Fit ALS on the Hawaii 10% training split and attach the learned factors to both splits.

# Reuse the running SparkSession if there is one, otherwise create it
spark = SparkSession.builder.appName("test").getOrCreate()

# An explicit seed makes the factorization reproducible from run to run,
# in line with the seeded train/test split.
als = ALS(
    rank=60,
    maxIter=20,
    regParam=0.1,
    userCol="user_id",
    itemCol="business_id",
    ratingCol="rating_binary",
    coldStartStrategy="drop",
    seed=42,
)
als_model = als.fit(train_10)

# Extract the ALS user and item factor matrices. Both expose "id" and "features";
# rename "features" so the two columns stay distinguishable after the joins.
user_factors = als_model.userFactors.withColumnRenamed("features", "user_features")
item_factors = als_model.itemFactors.withColumnRenamed("features", "business_features")

# Merge user factors and item factors into the training dataset.
# Left joins keep every row; the factor tables' "id" key is dropped after each join.
train_final_10 = (
    train_10
    .join(user_factors, train_10["user_id"] == user_factors["id"], how="left")
    .drop(user_factors["id"])
    .join(item_factors, train_10["business_id"] == item_factors["id"], how="left")
    .drop(item_factors["id"])
)

# Merge user factors and item factors into the test dataset.
# With a left join, test rows whose user or business has no learned factors
# keep null feature columns instead of being dropped.
test_final_10 = (
    test_10
    .join(user_factors, test_10["user_id"] == user_factors["id"], how="left")
    .drop(user_factors["id"])
    .join(item_factors, test_10["business_id"] == item_factors["id"], how="left")
    .drop(item_factors["id"])
)

# Save the merged DataFrames as parquet. Overwrite explicitly: the default save
# mode raises when the target already exists, which breaks re-running the cell.
train_final_10.write.mode("overwrite").parquet("/train_final_10.parquet")
test_final_10.write.mode("overwrite").parquet("/test_final_10.parquet")

### 20%

In [None]:
# Build the Hawaii 20% train/test split.

# Load the 20% Hawaii sample from parquet with pandas
hi_20 = pd.read_parquet("/content/hi_20.parquet")

# Reuse the running SparkSession if there is one, otherwise create it
spark = SparkSession.builder.appName("test").getOrCreate()

# Convert the sampled pandas DataFrame to a Spark DataFrame
hi_20_spark = spark.createDataFrame(hi_20)

# Index the "business" and "user" columns into "business_id" / "user_id".
# Both indexers are fitted on the whole sample, before the split, so the
# training and testing sets share one id mapping.
hi_20_transform = StringIndexer(inputCol="business", outputCol="business_id").fit(hi_20_spark).transform(hi_20_spark)
hi_20_transform = StringIndexer(inputCol="user", outputCol="user_id").fit(hi_20_transform).transform(hi_20_transform)

# Randomly split hi_20_transform 90/10 into training and testing sets (fixed seed)
train_20, test_20 = hi_20_transform.randomSplit([0.9, 0.1], seed=42)

# Write the training and testing sets to parquet files.
# The writer's default save mode raises when the target already exists, so
# overwrite explicitly to keep this cell re-runnable.
train_20.write.mode("overwrite").parquet("/train_20.parquet")
test_20.write.mode("overwrite").parquet("/test_20.parquet")

In [None]:
# Reload the persisted 20% split so the following cells work from the files on disk
train_20, test_20 = (
    spark.read.parquet(split_path)
    for split_path in ("/train_20.parquet", "/test_20.parquet")
)

In [None]:
# Fit ALS on the Hawaii 20% training split and attach the learned factors to both splits.

# An explicit seed makes the factorization reproducible from run to run,
# in line with the seeded train/test split.
als = ALS(
    rank=60,
    maxIter=20,
    regParam=0.1,
    userCol="user_id",
    itemCol="business_id",
    ratingCol="rating_binary",
    coldStartStrategy="drop",
    seed=42,
)
als_model = als.fit(train_20)

# Extract the ALS user and item factor matrices. Both expose "id" and "features";
# rename "features" so the two columns stay distinguishable after the joins.
user_factors = als_model.userFactors.withColumnRenamed("features", "user_features")
item_factors = als_model.itemFactors.withColumnRenamed("features", "business_features")

# Merge user factors and item factors into the training dataset.
# Left joins keep every row; the factor tables' "id" key is dropped after each join.
train_final_20 = (
    train_20
    .join(user_factors, train_20["user_id"] == user_factors["id"], how="left")
    .drop(user_factors["id"])
    .join(item_factors, train_20["business_id"] == item_factors["id"], how="left")
    .drop(item_factors["id"])
)

# Merge user factors and item factors into the test dataset.
# With a left join, test rows whose user or business has no learned factors
# keep null feature columns instead of being dropped.
test_final_20 = (
    test_20
    .join(user_factors, test_20["user_id"] == user_factors["id"], how="left")
    .drop(user_factors["id"])
    .join(item_factors, test_20["business_id"] == item_factors["id"], how="left")
    .drop(item_factors["id"])
)

# Save the merged DataFrames as parquet. Overwrite explicitly: the default save
# mode raises when the target already exists, which breaks re-running the cell.
train_final_20.write.mode("overwrite").parquet("/train_final_20.parquet")
test_final_20.write.mode("overwrite").parquet("/test_final_20.parquet")


### 50%

In [None]:
# Build the Hawaii 50% train/test split.

# Load the 50% Hawaii sample from parquet with pandas.
# (This previously loaded hi_10.parquet, so the "50%" run silently reused the 10% sample.)
hi_50 = pd.read_parquet("/content/hi_50.parquet")

# Reuse the running SparkSession if there is one, otherwise create it
spark = SparkSession.builder.appName("test").getOrCreate()

# Convert the sampled pandas DataFrame to a Spark DataFrame
hi_50_spark = spark.createDataFrame(hi_50)

# Index the "business" and "user" columns into "business_id" / "user_id".
# Both indexers are fitted on the whole sample, before the split, so the
# training and testing sets share one id mapping.
hi_50_transform = StringIndexer(inputCol="business", outputCol="business_id").fit(hi_50_spark).transform(hi_50_spark)
hi_50_transform = StringIndexer(inputCol="user", outputCol="user_id").fit(hi_50_transform).transform(hi_50_transform)

# Randomly split hi_50_transform 90/10 into training and testing sets (fixed seed).
# The split must come from the 50% frame built above, not from hi_10_transform.
train_50, test_50 = hi_50_transform.randomSplit([0.9, 0.1], seed=42)

# Write the training and testing sets to parquet files.
# The writer's default save mode raises when the target already exists, so
# overwrite explicitly to keep this cell re-runnable.
train_50.write.mode("overwrite").parquet("/train_50.parquet")
test_50.write.mode("overwrite").parquet("/test_50.parquet")


In [None]:
# Reload the persisted 50% split so the following cells work from the files on disk
train_50, test_50 = (
    spark.read.parquet(split_path)
    for split_path in ("/train_50.parquet", "/test_50.parquet")
)


In [None]:
# Fit ALS on the Hawaii 50% training split and attach the learned factors to both splits.

# An explicit seed makes the factorization reproducible from run to run,
# in line with the seeded train/test split.
als = ALS(
    rank=60,
    maxIter=20,
    regParam=0.1,
    userCol="user_id",
    itemCol="business_id",
    ratingCol="rating_binary",
    coldStartStrategy="drop",
    seed=42,
)
als_model = als.fit(train_50)

# Extract the ALS user and item factor matrices. Both expose "id" and "features";
# rename "features" so the two columns stay distinguishable after the joins.
user_factors = als_model.userFactors.withColumnRenamed("features", "user_features")
item_factors = als_model.itemFactors.withColumnRenamed("features", "business_features")

# Merge user factors and item factors into the training dataset.
# Left joins keep every row; the factor tables' "id" key is dropped after each join.
train_final_50 = (
    train_50
    .join(user_factors, train_50["user_id"] == user_factors["id"], how="left")
    .drop(user_factors["id"])
    .join(item_factors, train_50["business_id"] == item_factors["id"], how="left")
    .drop(item_factors["id"])
)

# Merge user factors and item factors into the test dataset.
# With a left join, test rows whose user or business has no learned factors
# keep null feature columns instead of being dropped.
test_final_50 = (
    test_50
    .join(user_factors, test_50["user_id"] == user_factors["id"], how="left")
    .drop(user_factors["id"])
    .join(item_factors, test_50["business_id"] == item_factors["id"], how="left")
    .drop(item_factors["id"])
)

# Save the merged DataFrames as parquet. Overwrite explicitly: the default save
# mode raises when the target already exists, which breaks re-running the cell.
train_final_50.write.mode("overwrite").parquet("/train_final_50.parquet")
test_final_50.write.mode("overwrite").parquet("/test_final_50.parquet")


### 80%

In [None]:
# Build the Hawaii 80% train/test split.

# Load the 80% Hawaii sample from parquet with pandas
hi_80 = pd.read_parquet("/content/hi_80.parquet")

# Reuse the running SparkSession if there is one, otherwise create it
spark = SparkSession.builder.appName("test").getOrCreate()

# Convert the sampled pandas DataFrame to a Spark DataFrame
hi_80_spark = spark.createDataFrame(hi_80)

# Index the "business" and "user" columns into "business_id" / "user_id".
# Both indexers are fitted on the whole sample, before the split, so the
# training and testing sets share one id mapping.
hi_80_transform = StringIndexer(inputCol="business", outputCol="business_id").fit(hi_80_spark).transform(hi_80_spark)
hi_80_transform = StringIndexer(inputCol="user", outputCol="user_id").fit(hi_80_transform).transform(hi_80_transform)

# Randomly split hi_80_transform 90/10 into training and testing sets (fixed seed).
# (This previously split hi_10_transform, so the "80%" files held the 10% sample.)
train_80, test_80 = hi_80_transform.randomSplit([0.9, 0.1], seed=42)

# Write the training and testing sets to parquet files.
# The writer's default save mode raises when the target already exists, so
# overwrite explicitly to keep this cell re-runnable.
train_80.write.mode("overwrite").parquet("/train_80.parquet")
test_80.write.mode("overwrite").parquet("/test_80.parquet")

In [None]:
# Reload the persisted 80% split so the following cells work from the files on disk
train_80, test_80 = (
    spark.read.parquet(split_path)
    for split_path in ("/train_80.parquet", "/test_80.parquet")
)


In [None]:
# Fit ALS on the Hawaii 80% training split and attach the learned factors to both splits.

# An explicit seed makes the factorization reproducible from run to run,
# in line with the seeded train/test split.
als = ALS(
    rank=60,
    maxIter=20,
    regParam=0.1,
    userCol="user_id",
    itemCol="business_id",
    ratingCol="rating_binary",
    coldStartStrategy="drop",
    seed=42,
)
als_model = als.fit(train_80)

# Extract the ALS user and item factor matrices. Both expose "id" and "features";
# rename "features" so the two columns stay distinguishable after the joins.
user_factors = als_model.userFactors.withColumnRenamed("features", "user_features")
item_factors = als_model.itemFactors.withColumnRenamed("features", "business_features")

# Merge user factors and item factors into the training dataset.
# Left joins keep every row; the factor tables' "id" key is dropped after each join.
train_final_80 = (
    train_80
    .join(user_factors, train_80["user_id"] == user_factors["id"], how="left")
    .drop(user_factors["id"])
    .join(item_factors, train_80["business_id"] == item_factors["id"], how="left")
    .drop(item_factors["id"])
)

# Merge user factors and item factors into the test dataset.
# With a left join, test rows whose user or business has no learned factors
# keep null feature columns instead of being dropped.
test_final_80 = (
    test_80
    .join(user_factors, test_80["user_id"] == user_factors["id"], how="left")
    .drop(user_factors["id"])
    .join(item_factors, test_80["business_id"] == item_factors["id"], how="left")
    .drop(item_factors["id"])
)

# Save the merged DataFrames as parquet. Overwrite explicitly: the default save
# mode raises when the target already exists, which breaks re-running the cell.
train_final_80.write.mode("overwrite").parquet("/train_final_80.parquet")
test_final_80.write.mode("overwrite").parquet("/test_final_80.parquet")

### 100%

In [None]:
# Build the Hawaii 100% train/test split.

# Load the full Hawaii dataset from parquet with pandas
hi_100 = pd.read_parquet("/content/hi_100.parquet")

# Reuse the running SparkSession if there is one, otherwise create it
spark = SparkSession.builder.appName("test").getOrCreate()

# Convert the pandas DataFrame to a Spark DataFrame
hi_100_spark = spark.createDataFrame(hi_100)

# Index the "business" and "user" columns into "business_id" / "user_id".
# Both indexers are fitted on the whole dataset, before the split, so the
# training and testing sets share one id mapping.
hi_100_transform = StringIndexer(inputCol="business", outputCol="business_id").fit(hi_100_spark).transform(hi_100_spark)
hi_100_transform = StringIndexer(inputCol="user", outputCol="user_id").fit(hi_100_transform).transform(hi_100_transform)

# Randomly split hi_100_transform 90/10 into training and testing sets (fixed seed)
train_100, test_100 = hi_100_transform.randomSplit([0.9, 0.1], seed=42)

# Write the training and testing sets to parquet files.
# The writer's default save mode raises when the target already exists, so
# overwrite explicitly to keep this cell re-runnable.
train_100.write.mode("overwrite").parquet("/train_100.parquet")
test_100.write.mode("overwrite").parquet("/test_100.parquet")

In [None]:
# Reload the persisted 100% split so the following cells work from the files on disk
train_100, test_100 = (
    spark.read.parquet(split_path)
    for split_path in ("/train_100.parquet", "/test_100.parquet")
)


In [None]:

# Fit ALS on the Hawaii 100% training split and attach the learned factors to both splits.

# An explicit seed makes the factorization reproducible from run to run,
# in line with the seeded train/test split.
als = ALS(
    rank=60,
    maxIter=20,
    regParam=0.1,
    userCol="user_id",
    itemCol="business_id",
    ratingCol="rating_binary",
    coldStartStrategy="drop",
    seed=42,
)
als_model = als.fit(train_100)

# Extract the ALS user and item factor matrices. Both expose "id" and "features";
# rename "features" so the two columns stay distinguishable after the joins.
user_factors = als_model.userFactors.withColumnRenamed("features", "user_features")
item_factors = als_model.itemFactors.withColumnRenamed("features", "business_features")

# Merge user factors and item factors into the training dataset.
# Left joins keep every row; the factor tables' "id" key is dropped after each join.
train_final_100 = (
    train_100
    .join(user_factors, train_100["user_id"] == user_factors["id"], how="left")
    .drop(user_factors["id"])
    .join(item_factors, train_100["business_id"] == item_factors["id"], how="left")
    .drop(item_factors["id"])
)

# Merge user factors and item factors into the test dataset.
# With a left join, test rows whose user or business has no learned factors
# keep null feature columns instead of being dropped.
test_final_100 = (
    test_100
    .join(user_factors, test_100["user_id"] == user_factors["id"], how="left")
    .drop(user_factors["id"])
    .join(item_factors, test_100["business_id"] == item_factors["id"], how="left")
    .drop(item_factors["id"])
)

# Save the merged DataFrames as parquet. Overwrite explicitly: the default save
# mode raises when the target already exists, which breaks re-running the cell.
train_final_100.write.mode("overwrite").parquet("/train_final_100.parquet")
test_final_100.write.mode("overwrite").parquet("/test_final_100.parquet")


## California Data

In [None]:
# Load DataFrame from the already loaded parquet file
ca_10 = pd.read_parquet("/content/ca_10.parquet")

# Initialize SparkSession
spark = SparkSession.builder.appName("test").getOrCreate()

# Convert the sampled pandas DataFrame to Spark DataFrame
ca_10_spark = spark.createDataFrame(ca_10)

# Use StringIndexer to transform the business and user columns
ca_10_transform = StringIndexer(inputCol="business", outputCol="business_id").fit(ca_10_spark).transform(ca_10_spark)
ca_10_transform = StringIndexer(inputCol="user", outputCol="user_id").fit(ca_10_transform).transform(ca_10_transform)

# Randomly split hi_10_transform into training and testing sets
train_10, test_10 = ca_10_transform.randomSplit([0.9, 0.1], seed=42)

# Write the training and testing sets to parquet files
train_10.write.parquet("/train_10.parquet")
test_10.write.parquet("/test_10.parquet")


In [None]:
# Read the training and testing sets from parquet files
train_10 = spark.read.parquet("/train_10.parquet")
test_10 = spark.read.parquet("/test_10.parquet")


In [None]:
# Fit ALS on the California 10% training split and attach the learned factors to both splits.

# An explicit seed makes the factorization reproducible from run to run,
# in line with the seeded train/test split.
als = ALS(
    rank=60,
    maxIter=20,
    regParam=0.1,
    userCol="user_id",
    itemCol="business_id",
    ratingCol="rating_binary",
    coldStartStrategy="drop",
    seed=42,
)
als_model = als.fit(train_10)

# Extract the ALS user and item factor matrices. Both expose "id" and "features";
# rename "features" so the two columns stay distinguishable after the joins.
user_factors = als_model.userFactors.withColumnRenamed("features", "user_features")
item_factors = als_model.itemFactors.withColumnRenamed("features", "business_features")

# Merge user factors and item factors into the training dataset.
# Left joins keep every row; the factor tables' "id" key is dropped after each join.
train_final_10 = (
    train_10
    .join(user_factors, train_10["user_id"] == user_factors["id"], how="left")
    .drop(user_factors["id"])
    .join(item_factors, train_10["business_id"] == item_factors["id"], how="left")
    .drop(item_factors["id"])
)

# Merge user factors and item factors into the test dataset.
# With a left join, test rows whose user or business has no learned factors
# keep null feature columns instead of being dropped.
test_final_10 = (
    test_10
    .join(user_factors, test_10["user_id"] == user_factors["id"], how="left")
    .drop(user_factors["id"])
    .join(item_factors, test_10["business_id"] == item_factors["id"], how="left")
    .drop(item_factors["id"])
)

# Save the merged DataFrames as parquet.
# The Hawaii 10% section already writes /train_final_10.parquet and
# /test_final_10.parquet, so the California results get their own file names
# instead of colliding with them. Overwrite explicitly so the cell stays re-runnable.
train_final_10.write.mode("overwrite").parquet("/ca_train_final_10.parquet")
test_final_10.write.mode("overwrite").parquet("/ca_test_final_10.parquet")