### Creating DataFrames Manually

### 1. Using the range() method of SparkSession

In [0]:
from pyspark.sql.functions import col,rand
from pyspark.sql.types import IntegerType,DoubleType,StringType,BooleanType,StructType,StructField
from pyspark.sql import Row

In [0]:
# Build a 1,000,000-row demo DataFrame:
#   * `id` - the generated row number bucketed by 1000, giving values 0..999
#            (1000 consecutive rows share each id).
#   * `v`  - a pseudo-random integer in [0, 100).
df = (
    spark.range(0, 10 * 100 * 1000)
    # Dividing yields a double; the integer cast truncates it to the bucket number.
    .withColumn('id', (col('id') / 1000).cast('integer'))
    # Fixed seed so reruns give repeatable values (for an unchanged partitioning);
    # an unseeded rand() produces different data on every execution.
    .withColumn('v', (rand(seed=42) * 100).cast('integer'))
)
# limit() keeps the preview as a DataFrame rather than collecting Row objects
# to the driver first, as take() does.
display(df.limit(10))

id,v
0,78
0,24
0,70
0,47
0,6
0,96
0,49
0,56
0,47
0,43


### 2. Using the toDF() method of an RDD

In [0]:
# Local (id, name) tuples used as the source records.
seq_data = [(2, 'Neo'), (3, 'Alice'), (4, 'John'), (5, 'Mark')]

# Distribute the local list as an RDD, then convert it to a DataFrame,
# supplying the column names explicitly.
df2 = (
    spark.sparkContext
    .parallelize(seq_data)
    .toDF(['id', 'name'])
)
display(df2)

id,name
2,Neo
3,Alice
4,John
5,Mark


In [0]:
# Print the column names and data types of df2, which was built via toDF().
df2.printSchema()

root
 |-- id: long (nullable = true)
 |-- name: string (nullable = true)



### 3. Creating a DataFrame from a collection (list or set)

In [0]:
# Build a single-column DataFrame from a flat list of Python ints.
# Passing an atomic type as the schema makes each element one row.
df3 = spark.createDataFrame(
    [1, 2, 3, 4, 56, 7, 32, 64, 22],
    schema=IntegerType(),
)
display(df3)

value
1
2
3
4
56
7
32
64
22


### 4. Creating DataFrame from collection of Rows

In [0]:
# Row called with field names acts as a reusable factory: calling it with
# values produces a Row carrying those field names.
Employee = Row('id', 'name', 'department')

# One Employee row per (id, name, department) record.
employees = [
    Employee(*fields)
    for fields in [
        (322, 'Roger Daniel', 'IT'),
        (654, 'Rosa Fell', 'HR'),
        (644, 'Lisa Barber', 'Sales'),
    ]
]

# Column names come from the Row fields.
df4 = spark.createDataFrame(employees)
display(df4)

id,name,department
322,Roger Daniel,IT
654,Rosa Fell,HR
644,Lisa Barber,Sales


### 5. Creating a DataFrame with an explicit schema

In [0]:
# Explicit schema: column names and types are declared up front instead of
# being inferred from the data. Fields are nullable by default.
schema_1 = (
    StructType()
    .add('id', IntegerType())
    .add('sku_no', IntegerType())
    .add('desc', StringType())
    .add('unit', DoubleType())
)

# Each tuple supplies values in the same order as the schema fields.
lst_data = [
    (1, 432, 'service - warranty', 1.00),
    (2, 454, 'product - hard disk', 43.00),
    (4, 411, 'product - printer', 32.00),
]

df5_1 = spark.createDataFrame(lst_data, schema=schema_1)
display(df5_1)

id,sku_no,desc,unit
1,432,service - warranty,1.0
2,454,product - hard disk,43.0
4,411,product - printer,32.0


### 6. Creating a DataFrame from a collection of dictionaries

In [0]:
# One dictionary per record; the dictionary keys become the column names.
data = [
    {'code': code, 'description': description}
    for code, description in [
        ('INFT', 'Internal Fund Transfer'),
        ('RCHG', 'Recharge'),
        ('LNPY', 'linked loan payment'),
    ]
]

# No schema is passed, so column types are inferred from the values.
df6 = spark.createDataFrame(data)
display(df6)

code,description
INFT,Internal Fund Transfer
RCHG,Recharge
LNPY,linked loan payment
