# Defined vs. Inferred Schemas in PySpark

# Inferred Schema

In [0]:
from pyspark.sql import SparkSession

# Get the Spark session for this notebook (reuses an existing session if
# one is already running, otherwise creates one).
spark = (
    SparkSession.builder
    .appName("InferSchemaCSV")
    .getOrCreate()
)

### Reading in a CSV without defining or inferring the schema

In [0]:
# inferSchema is left at its default (False), so only the header row is
# interpreted; every column is loaded as a string.
df = spark.read.option("header", True).csv('dbfs:/FileStore/data.csv')
df.display()

Name,Age,Salary
Alice,25,50000.0
Bob,30,60000.5
Charlie,35,70000.0
David,40,80000.2
Eve,28,55000.3


In [0]:
# Print the column types of the un-inferred read: Name, Age and Salary
# are all reported as string.
df.printSchema()

root
 |-- Name: string (nullable = true)
 |-- Age: string (nullable = true)
 |-- Salary: string (nullable = true)



### Reading in the same CSV with inferSchema = True 

In [0]:
# Same file, but let Spark work out the column types from the data
# (Age is detected as integer and Salary as double).
inferred_df = (
    spark.read
    .option("header", True)
    .option("inferSchema", True)
    .csv('dbfs:/FileStore/data.csv')
)
inferred_df.display()
inferred_df.printSchema()

Name,Age,Salary
Alice,25,50000.0
Bob,30,60000.5
Charlie,35,70000.0
David,40,80000.2
Eve,28,55000.3


root
 |-- Name: string (nullable = true)
 |-- Age: integer (nullable = true)
 |-- Salary: double (nullable = true)



### Reading in the CSV and creating a Spark DataFrame with a predefined schema

In [0]:
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, DoubleType

# Explicit schema: one (name, type, nullable) entry per CSV column.
# NOTE(review): Age is declared as double here, whereas inference yields
# integer -- presumably to show that a defined schema overrides what Spark
# would infer; confirm this is intentional.
custom_schema = (
    StructType()
    .add("Name", StringType(), True)
    .add("Age", DoubleType(), True)
    .add("Salary", DoubleType(), True)
)

# Apply the explicit schema instead of inferring one; the header row is
# still skipped as column names.
defined_df = (
    spark.read
    .schema(custom_schema)
    .option("header", True)
    .csv('dbfs:/FileStore/data.csv')
)

defined_df.display()
defined_df.printSchema()

Name,Age,Salary
Alice,25.0,50000.0
Bob,30.0,60000.5
Charlie,35.0,70000.0
David,40.0,80000.2
Eve,28.0,55000.3


root
 |-- Name: string (nullable = true)
 |-- Age: double (nullable = true)
 |-- Salary: double (nullable = true)

