# Pyspark

In [None]:
import pyspark

In [None]:
# Display the version string of the imported pyspark package
pyspark.__version__

## Creating a cluster

In [None]:
from pyspark.sql import SparkSession

# Build a Spark session, or reuse one that already exists, running against
# the 'local[1]' master under the application name 'testing_spark'.
spark = (
    SparkSession.builder
    .master('local[1]')
    .appName('testing_spark')
    .getOrCreate()
)


In [None]:
# Good practice is to stop the cluster 
# spark.stop()

# DataFrames

In [None]:
from pyspark.sql.types import StructType, StructField, IntegerType, FloatType, StringType

# Sample rows in the order (ID, Name, Age, Height)
data = [
    (1, "John", 28, 175.5),
    (2, "Anna", 23, 160.2),
    (3, "Mike", 35, 180.3),
]

# Build the schema one field at a time; every column is declared nullable
schema = (
    StructType()
    .add("ID", IntegerType(), True)
    .add("Name", StringType(), True)
    .add("Age", IntegerType(), True)
    .add("Height", FloatType(), True)
)

# Turn the rows into a Spark DataFrame that follows the schema above
df = spark.createDataFrame(data, schema)

# Display the rows, then the schema to confirm the column types
df.show()
df.printSchema()

## Loading Data 

In [None]:
# Read the CSV, treating its first line as the column names
cars = spark.read.option('header', True).csv('data/cars.csv')

# Preview: pull at most 10 rows to pandas and display the head of that frame
cars.limit(10).toPandas().head()

Loading data accepts the following arguments:
- header: is the first row a header?
- sep: field separator
- schema: explicit column data types
- inferSchema: deduce column data types from the data
- nullValue: placeholder for missing data


In [None]:
# Print the column names and data types Spark assigned to the loaded data
cars.printSchema()

At this point everything is a string in this dataframe!! 

We can let spark infer the data types of each column or explicitly define their types.



In [None]:
# Reload the same file, this time asking Spark to infer each column's type
cars = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv('data/cars.csv')
)

# Preview: pull at most 10 rows to pandas and display the head of that frame
cars.limit(10).toPandas().head()

In [None]:
# Print the schema again to see the column types after loading with inferSchema=True
cars.printSchema()

Placeholders for missing values, such as 'NA' or similar strings, can mislead Spark's type inference: a column that contains them may be inferred as a string column.

If automatic type inference is not applicable we can explicitly declare the data types like: 

In [None]:
from pyspark.sql.types import StructType, StructField, IntegerType, FloatType, StringType

# Three example records: (ID, Name, Age, Height)
data = [
    (1, "John", 28, 175.5),
    (2, "Anna", 23, 160.2),
    (3, "Mike", 35, 180.3),
]

# Explicit schema: pair each column name with its Spark type and mark it nullable
schema = StructType([
    StructField(column, dtype, True)
    for column, dtype in (
        ("ID", IntegerType()),
        ("Name", StringType()),
        ("Age", IntegerType()),
        ("Height", FloatType()),
    )
])

# Build the DataFrame so its columns take the declared types
df = spark.createDataFrame(data, schema=schema)

# Show the contents, then print the schema to check the types
df.show()
df.printSchema()

In [None]:
from pyspark.sql.types import StructType, StructField, IntegerType, StringType

# Declare the three columns and their types (fields are nullable by default)
schema = (
    StructType()
    .add("id", IntegerType())
    .add("text", StringType())
    .add("label", IntegerType())
)

# Read the semicolon-delimited, header-less file using the explicit schema
sms = (
    spark.read
    .schema(schema)
    .csv('data/sms.csv', sep=';', header=False)
)

# Confirm the column types that were applied
sms.printSchema()

In [None]:
# Load the flights CSV: comma-separated, with a header row, inferred column
# types, and the literal 'NA' treated as a missing value
flights = (
    spark.read
    .options(sep=',', header=True, inferSchema=True, nullValue='NA')
    .csv('data/flights.csv')
)

# Report how many rows were loaded
print("The data contain %d records." % flights.count())

# Display the first five rows
flights.show(5)

# List each column together with its data type
print(flights.dtypes)

# Data Preparation

In [None]:
# Load the auto data set with a header row and inferred column types
cars = spark.read.csv(
    'data/auto.csv',
    header=True,
    inferSchema=True,
)

# Preview: pull at most 10 rows to pandas and display the head of that frame
cars.limit(10).toPandas().head()

In [None]:
# List the column names directly from the DataFrame. This avoids running a
# Spark job and converting rows to pandas just to read the names.
cars.columns

In [None]:
# Drop the 'name' column and rebind `cars` to the result
cars = cars.drop('name')

In [None]:
# Keep only the listed columns, in this order
cars = cars.select(
    'origin',
    'cylinders',
    'weight',
    'horsepower',
    'mpg',
)

In [None]:
# Count the rows whose 'cylinders' value is missing
cars.where('cylinders IS NULL').count()

In [None]:
# Count the rows whose 'weight' value is missing
cars.where('weight IS NULL').count()

In [None]:
# Count the rows whose 'mpg' value is missing
cars.where('mpg IS NULL').count()

In [None]:
from pyspark.sql.functions import round

# Add a 'mass' column: weight divided by 2.205, rounded to 0 decimal places.
# NOTE(review): 2.205 looks like a pounds-to-kilograms factor — confirm the
# units of 'weight'.
# `round` here is the Spark column function imported in this cell, which
# rebinds the name of Python's builtin `round` in the notebook namespace.
cars = cars.withColumn(
    'mass',
    round(cars['weight'] / 2.205, 0),
)

In [None]:
# Preview: pull at most 10 rows to pandas and display the head of that frame
cars.limit(10).toPandas().head()

In [None]:
from pyspark.ml.feature import StringIndexer 

# Fit a string indexer on 'origin'. After this line `indexer` is the fitted
# model, not the unfitted estimator.
indexer = StringIndexer(inputCol='origin', outputCol='origin_idx').fit(cars)

# Append the numeric 'origin_idx' column produced by the fitted indexer
cars = indexer.transform(cars)

In [None]:
# Preview: pull at most 100 rows to pandas and display the first 20 of them
cars.limit(100).toPandas().head(20)

The final step in preparing the cars data is to consolidate the various input columns into a single column. This is necessary because the Machine Learning algorithms in Spark operate on a single vector of predictors, although each element in that vector may consist of multiple values.

In [None]:
from pyspark.ml.feature import VectorAssembler 

# Combine the 'cylinders' and 'origin_idx' columns into one 'features' vector column
assembler = VectorAssembler(
    inputCols=['cylinders', 'origin_idx'],
    outputCol='features',
)

# Append the assembled 'features' column to the cars data
cars = assembler.transform(cars)

In [None]:
# Preview: pull at most 100 rows to pandas and display the first 20 of them
cars.limit(100).toPandas().head(20)

In [None]:
# Remove the 'flight' column
flights_drop_column = flights.drop('flight')

# Number of records with missing 'delay' values.
# Printed explicitly: a notebook cell only echoes its last expression, so a
# bare expression in the middle of the cell is computed and then discarded.
print(flights_drop_column.filter('delay IS NULL').count())

# Remove records with missing 'delay' values
flights_valid_delay = flights_drop_column.filter('delay IS NOT NULL')

# Remove records with missing values in any column and get the number of remaining rows
flights_none_missing = flights_valid_delay.dropna()
print(flights_none_missing.count())

In [None]:
# Import the required function
from pyspark.sql.functions import round

# Start from the cleaned data (`flights_none_missing`: 'flight' column dropped,
# rows with missing values removed) rather than the raw `flights` DataFrame.
# Otherwise the cleaning done in the previous cell is discarded, and rows with
# a missing 'delay' would get a null 'label' below.

# Convert 'mile' to 'km' and drop 'mile' column (1 mile is equivalent to 1.60934 km)
flights_km = flights_none_missing.withColumn(
    'km', round(flights_none_missing.mile * 1.60934, 0)
).drop('mile')

# Create 'label' column indicating whether flight delayed (1) or not (0);
# a flight counts as delayed when 'delay' is at least 15
flights_km = flights_km.withColumn('label', (flights_km.delay >= 15).cast('integer'))

# Check first five records
flights_km.show(5)

In [None]:
from pyspark.ml.feature import StringIndexer

# Configure an indexer that maps 'carrier' strings to numeric indices
indexer = (
    StringIndexer()
    .setInputCol('carrier')
    .setOutputCol('carrier_idx')
)

# Fit it on the data, then append the 'carrier_idx' column
indexer_model = indexer.fit(flights_km)
flights_indexed = indexer_model.transform(flights_km)

# Do the same for the 'org' column, fitting and transforming in one chain
flights_indexed = (
    StringIndexer(inputCol='org', outputCol='org_idx')
    .fit(flights_indexed)
    .transform(flights_indexed)
)
flights_indexed.show(5)

In [None]:
# Import the necessary class
from pyspark.ml.feature import VectorAssembler

# Gather the predictor columns into a single 'features' vector column
assembler = VectorAssembler(
    inputCols=[
        'mon',
        'dom',
        'dow',
        'carrier_idx',
        'org_idx',
        'km',
        'depart',
        'duration',
    ],
    outputCol='features',
)

# Append the assembled 'features' column to the indexed flights data
flights_assembled = assembler.transform(flights_indexed)

# Show 'features' next to 'delay' for five rows without truncating the output
flights_assembled.select('features', 'delay').show(5, truncate=False)