In [None]:
import json

import matplotlib.pyplot as plt
import pandas as pd
from pyspark.sql import SparkSession
from pyspark.sql.functions import to_date
from pyspark.sql.functions import month, year, col, explode


In [None]:
# Obtain the Spark session used by every cell below (an already-running
# session is reused instead of creating a second one).
spark = (
    SparkSession.builder
    .appName("Tokyo Airbnb Analysis")
    .getOrCreate()
)

In [None]:
# Load the calendar CSV (first row is the header, column types are inferred)
# and normalise the `date` column to a date value using the yyyy-MM-dd pattern.
df_calendar = (
    spark.read
    .option("header", True)
    .option("inferSchema", True)
    .csv("/user1/dataset/calendar.csv")
)
df_calendar = df_calendar.withColumn("date", to_date(col("date"), 'yyyy-MM-dd'))

# Preview the first five rows.
df_calendar.show(n=5)

In [None]:
# Count calendar rows flagged as not available ('f') per (year, month).
# The result is unordered; the following cell rebuilds the same aggregate
# with an explicit chronological ordering.
df_busy_times = (
    df_calendar
    .filter(col("available") == 'f')
    .groupBy(year(col("date")).alias("year"), month(col("date")).alias("month"))
    .count()
)

In [None]:
# Number of calendar rows marked unavailable ('f') for each (year, month),
# sorted chronologically.
df_busy_times = (
    df_calendar
    .filter(col("available") == 'f')
    .groupBy(year(col("date")).alias("year"), month(col("date")).alias("month"))
    .count()
    .sort("year", "month")
)


In [None]:
# Bring the aggregate (one row per (year, month) group) to the driver as a
# pandas DataFrame and make sure it is in chronological order.
# The sort keys are just year then month (the original listed 'year' twice),
# and the sort returns a new frame instead of mutating in place so that
# re-running this cell always yields the same result.
pandas_df = df_busy_times.toPandas().sort_values(["year", "month"], ascending=True)

In [None]:
# Display the pandas copy of the per-(year, month) counts built above.
pandas_df

In [None]:
# Label each bar with "YYYY-MM" rather than the bare month number: the data is
# grouped by (year, month), so the same month number can appear once per year
# and a month-only axis would make those bars indistinguishable.
# The nullable Int64 cast avoids a trailing ".0" in the labels if pandas stored
# the columns as floats (which happens when a group key is missing).
period_labels = (
    pandas_df["year"].astype("Int64").astype(str)
    + "-"
    + pandas_df["month"].astype("Int64").astype(str).str.zfill(2)
)

fig, ax = plt.subplots()
pandas_df.assign(period=period_labels).plot(
    x="period", y="count", kind="bar", legend=False, ax=ax
)
# A figure should be readable on its own, so give it a title and axis labels.
ax.set(
    title="Unavailable calendar entries per month",
    xlabel="Year-month",
    ylabel="Rows with available == 'f'",
)
plt.show()

In [None]:
filename_data = '/user1/dataset/neighbourhoods.geojson'
# Load the neighbourhood boundaries into a PySpark data frame.
# A GeoJSON file is a single JSON document that may be pretty-printed across
# many lines. Spark's JSON reader treats every physical line as a separate
# record by default, and with mode="DROPMALFORMED" a multi-line document would
# be dropped silently. multiLine=True parses the file as one document; a file
# that holds the document on a single line is read the same way as before.
df = spark.read.json(filename_data, multiLine=True, mode="DROPMALFORMED")
print('Data frame type: ' + str(type(df)))

In [None]:
# Print the nested schema, then show a flat table of top-level column names
# and their Spark data types.
print('++Data overview+++')
df.printSchema()

print('++Columns overview++')
pd.DataFrame.from_records(df.dtypes, columns=['Column Name', 'Data type'])

In [None]:
# Replace the `features` array with one row per array element.
# NOTE(review): this rebinds `df`, so the cell is presumably meant to run once
# per fresh load of the file — confirm before re-running it in isolation.
df = df.withColumn("features", explode("features"))

In [None]:
# Re-inspect the schema after `features` was exploded in the previous cell.
df.printSchema()

In [None]:
# Flat table of the top-level column names and their Spark data types.
print('++Columns overview++')
pd.DataFrame.from_records(df.dtypes, columns=['Column Name', 'Data type'])

In [None]:
# Peek at two rows: the neighbourhood name and the geometry type of each.
df.select(
    col("features.properties.neighbourhood"),
    col("features.geometry.type"),
).show(n=2, truncate=False)


In [None]:
# List every distinct geometry type present in the data.
distinct_geometry_types = (
    df
    .select(col("features.geometry.type"))
    .dropDuplicates()
)
distinct_geometry_types.show(truncate=False)

In [None]:
# List every distinct `properties` struct.
# The result gets its own name: the original stored it in
# `distinct_geometry_types`, overwriting the geometry-type result with
# unrelated data.
distinct_properties = df.select("features.properties").distinct()
distinct_properties.show(truncate=False)

In [None]:
# Remove 'neighbourhood_group' from the nested properties struct because it's
# null: build the trimmed struct first, then write it back into `features`.
properties_without_group = col("features.properties").dropFields("neighbourhood_group")
df = df.withColumn(
    "features",
    col("features").withField("properties", properties_without_group),
)

# Show the schema after the field was dropped.
df.printSchema()

In [None]:
# Serialise every row to a JSON string describing one feature.
# "features.*" expands the fields of the `features` struct (e.g. geometry and
# properties) into top-level columns. The original selected "features." — a
# trailing dot is not a valid column reference, so the select could not be
# resolved.
geojson_dict = df.select("features.*").toJSON()

# Pull the JSON strings to the driver and parse them into Python dicts.
json_strings = geojson_dict.collect()
geojson_features = [json.loads(j) for j in json_strings]

# Wrap the parsed features in a GeoJSON FeatureCollection.
geojson_data = {
    "type": "FeatureCollection",
    "features": geojson_features,
}

In [None]:
# Pair every serialised feature with its neighbourhood name.
# Each element of `geojson_dict` is a JSON *string*, so the original key
# `x[0]` was just its first character: every record got the same key and
# collectAsMap() collapsed to a single entry.
# NOTE(review): keying by properties.neighbourhood is the presumed intent —
# confirm. `.get` is used so a record without that field maps to None instead
# of raising.
newrdd = geojson_dict.map(
    lambda x: (json.loads(x).get("properties", {}).get("neighbourhood"), x)
)

# Collect to the driver as {neighbourhood name: feature JSON string}.
a = newrdd.collectAsMap()

In [None]:
# Display the dictionary collected from `newrdd`.
a

In [None]:
import folium

In [None]:
def _neighbourhood_style(_feature):
    """Return the fixed fill/outline style applied to every feature."""
    return {
        'fillColor': 'orange',
        'color': 'black',
        'weight': 2,
        'fillOpacity': 0.5,
    }


# Base map centred on the central location.
m = folium.Map(location=[35.7002, 139.8738], zoom_start=9)

# Overlay the feature collection as a named GeoJSON layer.
neighbourhood_layer = folium.GeoJson(
    geojson_data,
    style_function=_neighbourhood_style,
    name='GeoJSON Layer',
)
neighbourhood_layer.add_to(m)

# Add the layer control so the overlay can be toggled.
folium.LayerControl().add_to(m)

# Render the map as the cell output.
m