# Spark ML - Exploratory analysis

## Prepare the Spark session


In [None]:
# Locate the Spark installation so that pyspark can be imported below
import findspark

findspark.init()

# Classes needed to configure and open the session
from pyspark import SparkConf
from pyspark.sql import SparkSession

# Build the configuration in a single chained expression (each setter
# returns the SparkConf itself), then reuse or create the session
conf = (
    SparkConf()
    .setAppName('mds-session')
    .setMaster('local[*]')
)
spark = SparkSession.builder.config(conf=conf).getOrCreate()

## Package import

In [None]:
# Import required packages
import pandas as pd
from pyspark.sql import functions as F
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.stat import Correlation
from plotnine import *

## Read a sample CSV

In [None]:
# Load the sample CSV: comma-separated, first row holds the column names,
# and column types are inferred from the data
data = spark.read.csv(
    './data/housing.csv',
    sep=',',
    header=True,
    inferSchema=True,
)

## Structure check

In [None]:
# Schema check: print each column's name, type and nullability as a tree
data.printSchema()

In [None]:
# Types check: list of (column name, type string) pairs
data.dtypes

In [None]:
# Columns check: list of column names
data.columns

In [None]:
# Sample rows: print the first 5 rows
data.show(5)

## Dataset size

In [None]:
# Dataset size as (rows, columns) - equivalent to pandas shape.
# count() is a Spark action, so this runs a job over the data.
(data.count(), len(data.columns))

## Basic statistics

In [None]:
# Basic statistics: compute the summary with describe(), bring the small
# result to pandas and index it by the 'summary' column
data.describe().toPandas().set_index('summary')

## Missing values

In [None]:
# Count missing values per column. Nulls are counted for every column; for
# floating-point columns NaN is counted as well, because Spark keeps NaN
# distinct from null. The flag is cast to int so the totals are whole
# numbers rather than doubles.
missing_counts = [
    F.sum(
        (
            data[name].isNull() | F.isnan(data[name])
            if dtype in ('double', 'float')
            else data[name].isNull()
        ).cast('int')
    ).alias(name)
    for name, dtype in data.dtypes
]
data.select(missing_counts).show()

## Correlations

In [None]:
# Pack every column of the dataset into one Vector column named 'features'.
# NOTE(review): this passes all columns, so it assumes they are all numeric;
# with the default handleInvalid='error' null/NaN values raise - confirm
# against the dataset.
assembler = VectorAssembler(
    outputCol='features',
    inputCols=data.columns,
)
corr_df = assembler.transform(data)

In [None]:
# Compute the correlation matrix of the assembled 'features' vector
# (Pearson is the default method, spelled out here for clarity)
correlations = Correlation.corr(
    dataset=corr_df,
    column='features',
    method='pearson',
)

In [None]:
# Take the matrix from the single-row result, convert it to an array and
# show it as a pandas table labelled with the column names on both axes
pd.DataFrame(
    data=correlations.first()[0].toArray(),
    index=data.columns,
    columns=data.columns,
)

## Data visualization

In [None]:
# Split the columns by Spark type string: 'double' columns are treated as
# numerical and 'int' columns as categorical. Columns of any other type
# (e.g. 'bigint' or 'string') end up in neither list.
numerical_features = [name for name, dtype in data.dtypes if dtype == 'double']
categorical_features = [name for name, dtype in data.dtypes if dtype == 'int']

In [None]:
# One density plot per numerical feature: pull just that column into
# pandas, build the plot, then render it
for feature in numerical_features:
    feature_pdf = data.select(feature).toPandas()
    density_plot = ggplot(feature_pdf, aes(x=feature)) + geom_density()
    density_plot.draw()

In [None]:
# One bar chart per categorical feature: count rows per distinct value in
# Spark, bring the small aggregate to pandas, then plot the counts as-is
for feature in categorical_features:
    level_counts = data.groupby(feature).count().toPandas()
    bar_chart = ggplot(level_counts, aes(x=feature, y='count')) + geom_bar(stat='identity')
    bar_chart.draw()

## Close the session

In [72]:
# Stop the Spark session now that the analysis is finished
spark.stop()