# Pyspark

In [None]:
import pyspark

In [None]:
# Check which PySpark version is installed
pyspark.__version__

## Creating a cluster

In [None]:
from pyspark.sql import SparkSession

# Build a session against a local master ('local[1]') under the
# application name 'testing_spark', reusing an existing session if any.
spark = (
    SparkSession.builder
    .master('local[1]')
    .appName('testing_spark')
    .getOrCreate()
)


In [None]:
# Good practice is to stop the cluster once all work is done
# NOTE(review): later cells still reference `spark` after this call --
# confirm a live session is obtained again before they run.
spark.stop()

# DataFrames

In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, IntegerType, FloatType, StringType

# The session created earlier in this notebook was stopped by `spark.stop()`,
# so obtain a live one before building DataFrames. `getOrCreate()` reuses an
# active session when there is one and otherwise starts a new one.
spark = SparkSession.builder.master('local[1]').appName('testing_spark').getOrCreate()

# Sample rows, one tuple per record: (ID, Name, Age, Height)
data = [
    (1, "John", 28, 175.5),
    (2, "Anna", 23, 160.2),
    (3, "Mike", 35, 180.3)
]

# Define the schema with specific data types; the third StructField
# argument (True) marks each column as nullable.
schema = StructType([
    StructField("ID", IntegerType(), True),
    StructField("Name", StringType(), True),
    StructField("Age", IntegerType(), True),
    StructField("Height", FloatType(), True)
])

# Create the DataFrame with the defined schema
df = spark.createDataFrame(data, schema=schema)

# Show the DataFrame
df.show()

# Print the schema to verify data types
df.printSchema()

## Loading Data 

In [None]:
# Read the cars CSV, treating the first line as the column names.
cars = spark.read.csv(path='data/cars.csv', header=True)

# Pull at most ten rows into pandas and display the top of that frame.
(
    cars
    .limit(10)
    .toPandas()
    .head()
)

`spark.read.csv` accepts, among others, the following arguments:
- header: is the first row a header?
- sep: field separator
- schema: explicit column data types
- inferSchema: deduce column data types from the data
- nullValue: placeholder for missing data


In [None]:
# Print the schema to inspect the column data types
cars.printSchema()

At this point everything is a string in this dataframe!! 

We can let spark infer the data types of each column or explicitly define their types.



In [None]:
# Reload the cars CSV, this time asking Spark to infer each column's type.
cars = spark.read.csv(
    path='data/cars.csv',
    header=True,
    inferSchema=True,
)

# Pull at most ten rows into pandas and display the top of that frame.
(
    cars
    .limit(10)
    .toPandas()
    .head()
)

In [None]:
# Print the schema again to inspect the column data types after reloading
cars.printSchema()

Placeholder strings for missing values, such as 'NA', can mislead Spark's type inference: a column that contains them may be inferred as a string even when its real values are numeric.

If automatic type inference is not applicable we can explicitly declare the data types like: 

In [None]:
from pyspark.sql.types import StructType, StructField, IntegerType, FloatType, StringType

# Three sample records, each laid out as (ID, Name, Age, Height).
data = [
    (1, "John", 28, 175.5),
    (2, "Anna", 23, 160.2),
    (3, "Mike", 35, 180.3)
]

# Explicit schema: one nullable field per column, paired with its type.
schema = StructType([
    StructField(column, dtype, True)
    for column, dtype in (
        ("ID", IntegerType()),
        ("Name", StringType()),
        ("Age", IntegerType()),
        ("Height", FloatType()),
    )
])

# Build the DataFrame from the rows using that schema.
df = spark.createDataFrame(data, schema=schema)

# Display the rows, then the schema so the column types can be checked.
df.show()
df.printSchema()

In [None]:
from pyspark.sql.types import StructType, StructField, IntegerType, StringType

# Column names and types for the SMS file, declared up front.
schema = StructType([
    StructField(name="id", dataType=IntegerType()),
    StructField(name="text", dataType=StringType()),
    StructField(name="label", dataType=IntegerType()),
])

# Read the semicolon-delimited file (no header row) using that schema.
sms = spark.read.csv(
    'data/sms.csv',
    sep=';',
    header=False,
    schema=schema,
)

# Show the resulting column names and types.
sms.printSchema()

In [None]:
# Load the flights CSV: comma-separated, first row as header, column types
# inferred from the data, and 'NA' used as the placeholder for missing data.
flights = spark.read.csv(
    'data/flights.csv',
    sep=',',
    header=True,
    inferSchema=True,
    nullValue='NA',
)

# Report how many rows were loaded.
print("The data contain %d records." % (flights.count(),))

# Preview the first five rows.
flights.show(5)

# List each column together with its data type.
print(flights.dtypes)