In [1]:
from pyspark.ml import Pipeline
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.evaluation import BinaryClassificationEvaluator, MulticlassClassificationEvaluator
from pyspark.ml.feature import StringIndexer, VectorAssembler
from pyspark.sql import SparkSession
from pyspark.sql.functions import when, col, to_date, unix_timestamp, regexp_replace
from pyspark.sql.types import FloatType

# Create the Spark session for this notebook, or reuse one that already exists.
spark = (
    SparkSession.builder
    .appName("Tokyo Airbnb Analysis")
    # Memory settings for the driver and for each executor.
    .config("spark.driver.memory", "6g")
    .config("spark.executor.memory", "4g")
    # .config("spark.driver.maxResultSize", "4g")  # intentionally left disabled
    # Network timeout and executor heartbeat interval.
    .config("spark.network.timeout", "600s")
    .config("spark.executor.heartbeatInterval", "120s")
    .getOrCreate()
)

# Location of the calendar CSV file.
dataset_path = "/user1/dataset/calendar.csv"

# Read the CSV using its header row for column names and letting Spark
# infer the column types, then print a preview of the first rows.
df = spark.read.options(header=True, inferSchema=True).csv(dataset_path)
df.show()


# --- Column clean-up ---------------------------------------------------------
# Parse `date` into a proper date BEFORE deriving the unix timestamp from it.
# unix_timestamp() with its default pattern ('yyyy-MM-dd HH:mm:ss') does not
# parse plain 'yyyy-MM-dd' strings, so deriving it from the parsed column keeps
# `date_unix` valid whatever type inferSchema assigned to `date`.
df = df.withColumn("date", to_date(df.date, 'yyyy-MM-dd'))

# Strip '$' and ',' from the price text (e.g. "$11,000.00") and cast to float.
# A raw string is used so the regex is passed through unchanged without relying
# on Python tolerating the invalid "\$" escape sequence.
df = df.withColumn("price", regexp_replace(col("price"), r"[\$,]", "").cast(FloatType()))

# Encode the label: 't' -> 1, anything else -> 0; add the numeric date feature.
df = df.withColumn("available", when(col("available") == "t", 1).otherwise(0)) \
    .withColumn("date_unix", unix_timestamp("date"))

df = df.orderBy(col("date"))


# --- Pipeline stages ---------------------------------------------------------
# StringIndexer for `listing_id`. handleInvalid="keep" assigns listing ids that
# were not seen while fitting to an extra index instead of raising when the
# fitted model transforms the test split.
# NOTE: the assembler below consumes the raw `listing_id`, not
# `listing_id_indexed`, so this stage's output is not part of the features.
indexer = StringIndexer(inputCol="listing_id", outputCol="listing_id_indexed",
                        handleInvalid="keep")

# Assemble the feature vector. handleInvalid="skip" drops rows whose inputs are
# null/NaN (e.g. a price that failed the float cast) instead of failing the job.
assembler = VectorAssembler(inputCols=["listing_id", #"year", "month", "day",
                                       'date_unix', "price"], outputCol="features",
                            handleInvalid="skip")

# Drop the columns that are not used as features or label.
df = df.drop(*['adjusted_price', 'minimum_nights', 'maximum_nights', 'date'])

# Keep the first 10000 rows (in date order). The result is cached so the
# sort + limit is evaluated once: randomSplit re-evaluates its input for every
# split, and rows that tie on the sort key could otherwise be selected
# differently per evaluation, letting the train and test sets overlap.
df = df.limit(10000).cache()

# Split the data into training and testing sets
train_data, test_data = df.randomSplit([0.7, 0.3], seed=42)


# Random forest classifier predicting the binary `available` label from the
# assembled `features` vector. The seed is fixed (matching the split seed) so
# that repeated runs of the notebook train the same forest.
classifier = RandomForestClassifier(labelCol="available", featuresCol="features", seed=42)

# Chain indexing, feature assembly and the classifier into one pipeline.
pipeline = Pipeline(stages=[indexer, assembler, classifier])

# Fit every stage on the training split.
model = pipeline.fit(train_data)
# Apply the fitted pipeline to the held-out split to obtain predictions.
predictions = model.transform(test_data)

# Evaluate the model.
# BinaryClassificationEvaluator's default metric is the area under the ROC
# curve, not accuracy, so the metric is named explicitly and reported as AUC.
evaluator = BinaryClassificationEvaluator(labelCol="available", metricName="areaUnderROC")
area_under_roc = evaluator.evaluate(predictions)

# Accuracy (fraction of correctly predicted labels) is computed with the
# multiclass evaluator, which supports the "accuracy" metric.
accuracy_evaluator = MulticlassClassificationEvaluator(
    labelCol="available", predictionCol="prediction", metricName="accuracy"
)
accuracy = accuracy_evaluator.evaluate(predictions)

print(f"Area under ROC: {area_under_roc}")
print(f"Accuracy: {accuracy}")



In [2]:
# Obtain the Spark session. When a session already exists in this kernel,
# getOrCreate() returns it and (per Spark's warning) only runtime SQL
# configurations from this builder take effect.
spark = (
    SparkSession.builder
    .appName("Tokyo Airbnb Analysis")
    # Memory settings for the driver and for each executor.
    .config("spark.driver.memory", "6g")
    .config("spark.executor.memory", "4g")
    # .config("spark.driver.maxResultSize", "4g")  # intentionally left disabled
    # Network timeout and executor heartbeat interval.
    .config("spark.network.timeout", "600s")
    .config("spark.executor.heartbeatInterval", "120s")
    .getOrCreate()
)


24/04/19 02:26:34 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.


In [4]:
# Location of the calendar CSV file.
dataset_path = "/user1/dataset/calendar.csv"

# Read the CSV using its header row for column names and letting Spark
# infer the column types, then print a preview of the first rows.
df = spark.read.options(header=True, inferSchema=True).csv(dataset_path)
df.show()

                                                                                

+----------+----------+---------+----------+--------------+--------------+--------------+
|listing_id|      date|available|     price|adjusted_price|minimum_nights|maximum_nights|
+----------+----------+---------+----------+--------------+--------------+--------------+
|    197677|2023-06-29|        f|$11,000.00|    $11,000.00|             3|          1125|
|    197677|2023-06-30|        f|$11,000.00|    $11,000.00|             3|          1125|
|    197677|2023-07-01|        f|$11,000.00|    $11,000.00|             3|          1125|
|    197677|2023-07-02|        f|$11,000.00|    $11,000.00|             3|          1125|
|    197677|2023-07-03|        f|$11,000.00|    $11,000.00|             3|          1125|
|    197677|2023-07-04|        f|$11,000.00|    $11,000.00|             3|          1125|
|    197677|2023-07-05|        f|$11,000.00|    $11,000.00|             3|          1125|
|    197677|2023-07-06|        f|$11,000.00|    $11,000.00|             3|          1125|
|    19767

In [None]:

# --- Column clean-up ---------------------------------------------------------
# Parse `date` into a proper date BEFORE deriving the unix timestamp from it.
# unix_timestamp() with its default pattern ('yyyy-MM-dd HH:mm:ss') does not
# parse plain 'yyyy-MM-dd' strings, so deriving it from the parsed column keeps
# `date_unix` valid whatever type inferSchema assigned to `date`.
df = df.withColumn("date", to_date(df.date, 'yyyy-MM-dd'))

# Strip '$' and ',' from the price text (e.g. "$11,000.00") and cast to float.
# A raw string is used so the regex is passed through unchanged without relying
# on Python tolerating the invalid "\$" escape sequence.
df = df.withColumn("price", regexp_replace(col("price"), r"[\$,]", "").cast(FloatType()))

# Encode the label: 't' -> 1, anything else -> 0; add the numeric date feature.
df = df.withColumn("available", when(col("available") == "t", 1).otherwise(0)) \
    .withColumn("date_unix", unix_timestamp("date"))

df = df.orderBy(col("date"))


# --- Pipeline stages ---------------------------------------------------------
# StringIndexer for `listing_id`. handleInvalid="keep" assigns listing ids that
# were not seen while fitting to an extra index instead of raising when the
# fitted model transforms the test split.
# NOTE: the assembler below consumes the raw `listing_id`, not
# `listing_id_indexed`, so this stage's output is not part of the features.
indexer = StringIndexer(inputCol="listing_id", outputCol="listing_id_indexed",
                        handleInvalid="keep")

# Assemble the feature vector. handleInvalid="skip" drops rows whose inputs are
# null/NaN (e.g. a price that failed the float cast) instead of failing the job.
assembler = VectorAssembler(inputCols=["listing_id", #"year", "month", "day",
                                       'date_unix', "price"], outputCol="features",
                            handleInvalid="skip")

# Drop the columns that are not used as features or label.
df = df.drop(*['adjusted_price', 'minimum_nights', 'maximum_nights', 'date'])

# Keep the first 10000 rows (in date order). The result is cached so the
# sort + limit is evaluated once: randomSplit re-evaluates its input for every
# split, and rows that tie on the sort key could otherwise be selected
# differently per evaluation, letting the train and test sets overlap.
df = df.limit(10000).cache()

# Split the data into training and testing sets
train_data, test_data = df.randomSplit([0.7, 0.3], seed=42)

In [None]:
# Print the first four rows of the training split as a sanity check.
train_data.show(n=4)

In [None]:
# Random forest classifier predicting the binary `available` label from the
# assembled `features` vector. The seed is fixed (matching the split seed) so
# that repeated runs of the notebook train the same forest.
classifier = RandomForestClassifier(labelCol="available", featuresCol="features", seed=42)

# Chain indexing, feature assembly and the classifier into one pipeline.
pipeline = Pipeline(stages=[indexer, assembler, classifier])

# Fit every stage on the training split.
model = pipeline.fit(train_data)

In [None]:
# Run the fitted pipeline over the held-out split; the result carries the
# model's prediction columns alongside the input columns.
predictions = model.transform(dataset=test_data)

In [None]:
# Evaluate the model.
# BinaryClassificationEvaluator's default metric is the area under the ROC
# curve, not accuracy, so the metric is named explicitly and reported as AUC.
evaluator = BinaryClassificationEvaluator(labelCol="available", metricName="areaUnderROC")
area_under_roc = evaluator.evaluate(predictions)

# Accuracy (fraction of correctly predicted labels) is computed with the
# multiclass evaluator, which supports the "accuracy" metric.
accuracy_evaluator = MulticlassClassificationEvaluator(
    labelCol="available", predictionCol="prediction", metricName="accuracy"
)
accuracy = accuracy_evaluator.evaluate(predictions)

print(f"Area under ROC: {area_under_roc}")
print(f"Accuracy: {accuracy}")
