In [0]:
from pyspark.sql.types import StructType, StructField, StringType, IntegerType
from pyspark.sql import SparkSession

# Build (or reuse) the SparkSession that the rest of the notebook relies on.
spark = (
    SparkSession.builder
    .appName("create_df_with_datatype")
    .getOrCreate()
)

# Sample rows; each tuple supplies values in the same order as the schema below
# (five strings followed by one integer).
data = [
    ("James", "", "Smith", "36636", "M", 3000),
    ("Michael", "Rose", "", "40288", "M", 4000),
    ("Robert", "", "Williams", "42114", "M", 4000),
    ("Maria", "Anne", "Jones", "39192", "F", 4000),
    ("Jen", "Mary", "Brown", "", "F", -1),
]

# Explicit schema: the first five columns are nullable strings and "salary"
# is a nullable integer.
# NOTE(review): "middele name" is presumably a typo for "middle name"; it is
# kept unchanged because the column name is runtime data shown in the outputs.
# NOTE(review): the "age" values look like identifiers rather than ages
# (the nested example later in this notebook labels the same values "id") — confirm.
schema = StructType(
    [
        StructField(column_name, StringType(), True)
        for column_name in ("name", "middele name", "last name", "age", "gender")
    ]
    + [StructField("salary", IntegerType(), True)]
)

df = spark.createDataFrame(data, schema)
# Render the DataFrame with the notebook's display helper.
display(df)

name,middele name,last name,age,gender,salary
James,,Smith,36636.0,M,3000
Michael,Rose,,40288.0,M,4000
Robert,,Williams,42114.0,M,4000
Maria,Anne,Jones,39192.0,F,4000
Jen,Mary,Brown,,F,-1


In [0]:
# Show each column's (name, type) pair to confirm the explicit schema was applied.
df.dtypes

Out[3]: [('name', 'string'),
 ('middele name', 'string'),
 ('last name', 'string'),
 ('age', 'string'),
 ('gender', 'string'),
 ('salary', 'int')]

In [0]:
# Read the Iris CSV, using the first line as column names and letting Spark
# infer each column's type from the data.
iris_df = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv("/FileStore/tables/Iris.csv")
)

# Inspect the inferred column types.
iris_df.dtypes

Out[4]: [('Id', 'int'),
 ('SepalLengthCm', 'double'),
 ('SepalWidthCm', 'double'),
 ('PetalLengthCm', 'double'),
 ('PetalWidthCm', 'double'),
 ('Species', 'string')]

In [0]:
# A StructType is a collection of StructFields and defines the structure of a DataFrame.
# Each StructField gives a column name, a data type, and a nullable flag.
# Passing an explicit schema to the reader overrides the types Spark would infer for the CSV.
#
# NOTE(review): schema inference reported SepalLengthCm as double; reading it as
# IntegerType presumably turns non-integer values into nulls under the reader's
# default parse mode — confirm this is intended for the demo.
# NOTE(review): PetalLengthCm is declared non-nullable here, but the recorded
# printSchema output for iris_df1 shows it as nullable = true, i.e. the CSV read
# does not keep this flag.
new_schema = (
    StructType()
    .add("Id", IntegerType(), True)
    .add("SepalLengthCm", IntegerType(), True)
    .add("SepalWidthCm", StringType(), True)
    .add("PetalLengthCm", StringType(), False)
    .add("PetalWidthCm", StringType(), True)
    .add("Species", StringType(), True)
)

iris_df1 = (
    spark.read
    .schema(new_schema)
    .option("header", True)
    .csv("/FileStore/tables/Iris.csv")
)

# Confirm the column types now follow new_schema rather than inference.
iris_df1.dtypes


Out[5]: [('Id', 'int'),
 ('SepalLengthCm', 'int'),
 ('SepalWidthCm', 'string'),
 ('PetalLengthCm', 'string'),
 ('PetalWidthCm', 'string'),
 ('Species', 'string')]

In [0]:
# Schema as produced by the CSV read: every column is reported as nullable,
# including PetalLengthCm, which new_schema declares as non-nullable.
iris_df1.printSchema()

# Rebuild the DataFrame from the underlying RDD with the same schema; the
# second printSchema shows PetalLengthCm with nullable = false as declared.
iris_df2 = spark.createDataFrame(data=iris_df1.rdd, schema=new_schema)
iris_df2.printSchema()

root
 |-- Id: integer (nullable = true)
 |-- SepalLengthCm: integer (nullable = true)
 |-- SepalWidthCm: string (nullable = true)
 |-- PetalLengthCm: string (nullable = true)
 |-- PetalWidthCm: string (nullable = true)
 |-- Species: string (nullable = true)

root
 |-- Id: integer (nullable = true)
 |-- SepalLengthCm: integer (nullable = true)
 |-- SepalWidthCm: string (nullable = true)
 |-- PetalLengthCm: string (nullable = false)
 |-- PetalWidthCm: string (nullable = true)
 |-- Species: string (nullable = true)



In [0]:
# The "name" column is a nested struct made of first_name, middle_name and last_name.
# Each row is (name_tuple, id, gender, salary); the inner tuple maps onto the nested struct.
data1 = [
    (("James", "", "Smith"), "36636", "M", 3100),
    (("Michael", "Rose", ""), "40288", "M", 4300),
    (("Robert", "", "Williams"), "42114", "M", 1400),
    (("Maria", "Anne", "Jones"), "39192", "F", 5500),
    (("Jen", "Mary", "Brown"), "", "F", -1),
]

# Nested schema: a StructType used as the data type of the "name" field.
schema1 = StructType(
    [
        StructField(
            name="name",
            dataType=StructType(
                [
                    StructField(name="first_name", dataType=StringType(), nullable=True),
                    StructField(name="middle_name", dataType=StringType(), nullable=True),
                    StructField(name="last_name", dataType=StringType(), nullable=True),
                ]
            ),
            nullable=True,  # same as the StructField default
        ),
        StructField(name="id", dataType=StringType(), nullable=True),
        StructField(name="gender", dataType=StringType(), nullable=True),
        StructField(name="salary", dataType=IntegerType(), nullable=True),
    ]
)

df2 = spark.createDataFrame(data1, schema1)
# Print the nested schema tree, then the rows (the struct renders as {first, middle, last}).
df2.printSchema()
df2.show()

root
 |-- name: struct (nullable = true)
 |    |-- first_name: string (nullable = true)
 |    |-- middle_name: string (nullable = true)
 |    |-- last_name: string (nullable = true)
 |-- id: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- salary: integer (nullable = true)

+--------------------+-----+------+------+
|                name|   id|gender|salary|
+--------------------+-----+------+------+
|    {James, , Smith}|36636|     M|  3100|
|   {Michael, Rose, }|40288|     M|  4300|
|{Robert, , Williams}|42114|     M|  1400|
|{Maria, Anne, Jones}|39192|     F|  5500|
|  {Jen, Mary, Brown}|     |     F|    -1|
+--------------------+-----+------+------+

