In [None]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as f
from pyspark.sql.types import *
from utils.helper import *


In [195]:
# `spark` is not created in any earlier cell of this notebook, so bind it here.
# getOrCreate() hands back the already-running session when one exists, which
# keeps this cell working after "Restart Kernel -> Run All".
spark = SparkSession.builder.getOrCreate()
print("Spark Version:", spark.version)

Spark Version: 3.5.1


In [196]:
# Method 1: start from a plain Python list and rely on schema inference.
# Each tuple is one record, in the order (id, name, age, department, salary).
data = [
    (1, "Alice", 28, "Engineering", 75000),
    (2, "Bob", 35, "Sales", 65000),
    (3, "Charlie", 32, "Engineering", 80000),
    (4, "Diana", 29, "Marketing", 70000),
    (5, "Eve", 41, "Sales", 72000),
]

In [197]:
# Only column names are supplied here; the column types are left to inference
# (the schema printed below reports the integer columns as `long`).
df = spark.createDataFrame(
    data,
    schema=["id", "name", "age", "department", "salary"],
)
df.printSchema()

root
 |-- id: long (nullable = true)
 |-- name: string (nullable = true)
 |-- age: long (nullable = true)
 |-- department: string (nullable = true)
 |-- salary: long (nullable = true)



In [198]:

# Explicit schema expressed as a DDL-style string: "<column> <TYPE>, ...".
# The schema printed below reports the INT columns as `integer`.
schema = "id INT, name STRING, age INT, department STRING, salary INT"
df = spark.createDataFrame(data, schema=schema)
df.printSchema()

root
 |-- id: integer (nullable = true)
 |-- name: string (nullable = true)
 |-- age: integer (nullable = true)
 |-- department: string (nullable = true)
 |-- salary: integer (nullable = true)



In [199]:
# Explicit schema built programmatically: one StructField per column, each
# declared nullable, wrapped in a StructType.
schema = StructType(
    [
        StructField("id", IntegerType(), nullable=True),
        StructField("name", StringType(), nullable=True),
        StructField("age", IntegerType(), nullable=True),
        StructField("department", StringType(), nullable=True),
        StructField("salary", IntegerType(), nullable=True),
    ]
)
df_schema = spark.createDataFrame(data, schema=schema)
df_schema.printSchema()

root
 |-- id: integer (nullable = true)
 |-- name: string (nullable = true)
 |-- age: integer (nullable = true)
 |-- department: string (nullable = true)
 |-- salary: integer (nullable = true)



# DataFrame Introduction


In [200]:
# Build a DataFrame from a list of dicts; the column names match the dict keys.
# NOTE: in the output below the columns appear in alphabetical order
# (age, department, id, name, salary), not in the order the keys are written.
dict_data = [
    {"id": 1, "name": "Alice", "age": 28, "department": "Engineering", "salary": 75000},
    {"id": 2, "name": "Bob", "age": 35, "department": "Sales", "salary": 65000},
    {"id": 3, "name": "Charlie", "age": 32, "department": "Engineering", "salary": 80000},
    {"id": 4, "name": "Diana", "age": 29, "department": "Marketing", "salary": 70000},
    {"id": 5, "name": "Eve", "age": 41, "department": "Sales", "salary": 72000},
]
print_rich_table(dict_data, "Dict Data")

df = spark.createDataFrame(dict_data)

print_section("DataFrame")
df.show()

print_section("Schema")
df.printSchema()


 DataFrame




+---+-----------+---+-------+------+
|age| department| id|   name|salary|
+---+-----------+---+-------+------+
| 28|Engineering|  1|  Alice| 75000|
| 35|      Sales|  2|    Bob| 65000|
| 32|Engineering|  3|Charlie| 80000|
| 29|  Marketing|  4|  Diana| 70000|
| 41|      Sales|  5|    Eve| 72000|
+---+-----------+---+-------+------+


 Schema
root
 |-- age: long (nullable = true)
 |-- department: string (nullable = true)
 |-- id: long (nullable = true)
 |-- name: string (nullable = true)
 |-- salary: long (nullable = true)



                                                                                

In [201]:
# Load the employees CSV, treating the first row as the header and asking the
# reader to infer column types from the data.
df = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv("/opt/spark-data/input/employees.csv")
)
df.show()
print_section("Schema")
df.printSchema()

                                                                                

+---+-------+---+-----------+------+
| id|   name|age| department|salary|
+---+-------+---+-----------+------+
|  1|  Alice| 28|Engineering| 75000|
|  2|    Bob| 35|      Sales| 65000|
|  3|Charlie| 32|Engineering| 80000|
|  4|  Diana| 29|  Marketing| 70000|
|  5|    Eve| 41|      Sales| 72000|
+---+-------+---+-----------+------+


 Schema
root
 |-- id: integer (nullable = true)
 |-- name: string (nullable = true)
 |-- age: integer (nullable = true)
 |-- department: string (nullable = true)
 |-- salary: integer (nullable = true)

