In [0]:
from pyspark.sql.types import StructType, StructField, StringType, DoubleType
from pyspark.sql import functions as F

# Explicit iris schema: four nullable double measurements followed by a
# nullable string label. The measurement fields differ only by name, so they
# are generated from a tuple of column names.
schema = StructType(
    [
        StructField(measurement, DoubleType(), True)
        for measurement in ("sepal_length", "sepal_width", "petal_length", "petal_width")
    ]
    + [StructField("species", StringType(), True)]
)

# Load the CSV using the schema declared above (rather than an inferred one),
# with the reader's header option switched on.
iris_csv_path = "/Volumes/newone/day3/files/iris.csv"
data = spark.read.schema(schema).option("header", True).csv(iris_csv_path)

# Report name, data type and nullability of every declared field.
# Iterating a StructType walks its fields in declaration order.
for field in schema:
    print(f"Column: {field.name}, Type: {field.dataType}, Nullable: {field.nullable}")

# Derived column: petal_area = petal_length * petal_width.
petal_area = F.col("petal_length") * F.col("petal_width")
data = data.withColumn("petal_area", petal_area)

data.show()

In [0]:
from pyspark.sql.types import StructType, StructField, StringType, ArrayType

# Sample records. Each one is:
#   (first, middle, last) name tuple, list of languages, (state, country) tuple.
# NOTE: this rebinds `data`, which the iris cell earlier in this notebook
# bound to a DataFrame; from here on `data` is a plain Python list.
data = [
    (("John", "A", "Doe"), ["Python", "Java", "Scala"], ("California", "USA")),
    (("Jane", "B", "Smith"), ["C++", "Go"], ("Texas", "USA")),
    (("Raj", "K", "Verma"), ["Python", "JavaScript"], ("Maharashtra", "India")),
]

# Nested struct describing the three parts of a person's name.
name_struct = StructType([
    StructField("first", StringType(), True),
    StructField("middle", StringType(), True),
    StructField("last", StringType(), True),
])

# Nested struct describing where the person is located.
location_struct = StructType([
    StructField("state", StringType(), True),
    StructField("country", StringType(), True),
])

# Top-level schema: two struct columns with an array-of-strings column between them.
schema = StructType([
    StructField("full_name", name_struct, True),
    StructField("programming_languages", ArrayType(StringType()), True),
    StructField("location", location_struct, True),
])

# Build the DataFrame from the records and display it.
df = spark.createDataFrame(data, schema)
df.show()

In [0]:
from pyspark.sql.types import StructField, StructType, StringType, MapType
# Rows pair a person's name with a one-entry properties dict keyed by "hair".
dataDictionary = [
    (person, {"hair": hair_colour})
    for person, hair_colour in (
        ("James", "black"),
        ("Michael", "brown"),
        ("Robert", "red"),
        ("Washington", "grey"),
        ("Jefferson", "brown"),
    )
]

# Schema: a nullable string `name` plus a nullable string -> string map `properties`.
properties_type = MapType(StringType(), StringType())
schema = StructType([
    StructField("name", StringType(), True),
    StructField("properties", properties_type, True),
])

# Build the DataFrame from the rows and display it.
df = spark.createDataFrame(dataDictionary, schema)
df.show()


In [0]:
from pyspark.sql import SparkSession, Row

# Two Row objects with `name` and `age` fields; no explicit schema is passed.
people = [
    Row(name="James", age=43),
    Row(name="Anna", age=33),
]

# `rdd` holds the result of spark.createDataFrame (a DataFrame, despite the
# variable's name). The bare collect() call is the cell's displayed output.
rdd = spark.createDataFrame(people)
rdd.collect()