## Using Spark Cluster with shared data in Docker

### Handle Spark DataFrames

In [None]:
!pip install pyspark==3.5.3 pandas setuptools packaging

#### Download the CSV file to a local directory (shared with Spark)

In [None]:
import urllib.request
import zipfile
from os import remove

# Kaggle download URL for the movie dataset; the response is saved as a zip.
url = 'https://www.kaggle.com/api/v1/datasets/download/chaitanyahivlekar/large-movie-dataset'
archive_path = 'movies.zip'

# Fetch the archive into the current working directory.
urllib.request.urlretrieve(url, archive_path)

try:
    # Unpack every member of the archive into the current directory.
    with zipfile.ZipFile(archive_path, 'r') as zip_ref:
        zip_ref.extractall('./')
finally:
    # Always delete the downloaded archive, even when extraction fails, so a
    # corrupt or unexpected download is not left behind for the next run.
    remove(archive_path)

#### Connect to Spark Cluster and create Spark Session

In [None]:
from pyspark.sql import SparkSession

# Get (or create) a session named "MyAppSDF" that talks to the master at
# spark://spark-master:7077.
spark = (
    SparkSession.builder
    .appName("MyAppSDF")
    .master("spark://spark-master:7077")
    .getOrCreate()
)



#### Read data
##### Create a Spark DataFrame from a CSV file

To create a Spark DataFrame from a CSV file, we can use the `read.csv` function. Setting `inferSchema=True` lets Spark infer the correct type for each field. However, it can be slow, because Spark has to make an extra pass over the data to work out the types.

In [None]:
# Load the CSV, treating the first line as the header and letting Spark
# infer each column's type.
sdf = spark.read.csv(
    "movies_dataset.csv",
    header=True,
    inferSchema=True,
)

To see the schema, use the `.printSchema()` method.

In [None]:
# Print the schema of the DataFrame loaded from the CSV file.
sdf.printSchema()

Let's show the first rows of the Spark DataFrame with the `.show()` method.

In [None]:
# Display the first rows of the DataFrame.
sdf.show()

Count the total number of rows.

In [None]:
# Count the rows of the DataFrame; as the cell's last expression, the result
# is shown as the cell output.
sdf.count()

#### Filter data


In [None]:
from pyspark.sql.functions import col

# Show only the rows whose Rating is at least 5.
sdf.filter(
    col("Rating") >= 5
).show()

In [None]:


# Show rows that have Rating >= 5 AND whose Genre contains "Comedy".
sdf.filter(
    (col("Rating") >= 5)
    & col("Genre").contains("Comedy")
).show()


In [None]:
# Show rows whose Genre contains "Action" OR "Comedy".
sdf.filter(
    col("Genre").contains("Action")
    | col("Genre").contains("Comedy")
).show()

In [None]:
# Keep the Comedy rows with Rating >= 5 in their own DataFrame for reuse,
# then display them.
sdf_top = sdf.filter(
    (col("Rating") >= 5)
    & col("Genre").contains("Comedy")
)
sdf_top.show()


#### Transform data

In [None]:
from pyspark.sql import functions as sf

# Count how many rows each movie has within sdf_top.
(
    sdf_top
    .groupBy("Movie_name")
    .agg(sf.count("*").alias("total_ratings"))
    .show()
)

In [None]:
# For rows whose Genre contains "Comedy": number of rows and mean Rating per movie.
(
    sdf
    .filter(sf.col("Genre").contains("Comedy"))
    .groupBy("Movie_name")
    .agg(
        sf.count("*").alias("total_ratings"),
        sf.avg("Rating").alias("avg_rating"),
    )
    .show()
)

In [None]:
# For rows whose Genre contains "Comedy": count, mean and sum of Rating per
# movie, sorted so the movies with the most rows come first.
(
    sdf
    .filter(sf.col("Genre").contains("Comedy"))
    .groupBy("Movie_name")
    .agg(
        sf.count("*").alias("total_ratings"),
        sf.avg("Rating").alias("avg_rating"),
        sf.sum("Rating").alias("sum_rating"),
    )
    .orderBy("total_ratings", ascending=False)
    .show()
)

In [None]:
from pyspark.sql import functions as sf

# Aggregate the "Comedy" rows per movie, keep movies whose mean Rating is
# above 4, sort by number of rows (descending) and convert the result to a
# pandas DataFrame.
df = (
    sdf
    .filter(sf.col("Genre").contains("Comedy"))
    .groupBy("Movie_name")
    .agg(
        sf.count("*").alias("total_ratings"),
        sf.avg("Rating").alias("avg_rating"),
        sf.sum("Rating").alias("sum_rating"),
    )
    .filter(sf.col("avg_rating") > 4)
    .orderBy("total_ratings", ascending=False)
    .toPandas()
)

# Display the resulting pandas DataFrame as the cell output.
df