In [1]:
import findspark
findspark.init()

In [2]:
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName('test1').getOrCreate()
spark

In [3]:
from pyspark.sql.types import *

In [4]:
dept = [('Finance',10), ('Marketing', 20), ("Sales", 30), ('It', 40)]
rdd = spark.sparkContext.parallelize(dept)

In [5]:
df = rdd.toDF()
df.printSchema()

root
 |-- _1: string (nullable = true)
 |-- _2: long (nullable = true)



In [6]:
df.show(truncate = False)

+---------+---+
|_1       |_2 |
+---------+---+
|Finance  |10 |
|Marketing|20 |
|Sales    |30 |
|It       |40 |
+---------+---+



In [7]:
deptColumns = ['dept_name', 'dept_id']
df2 = rdd.toDF(deptColumns)
df2.printSchema()
df2.show(truncate = False)

root
 |-- dept_name: string (nullable = true)
 |-- dept_id: long (nullable = true)

+---------+-------+
|dept_name|dept_id|
+---------+-------+
|Finance  |10     |
|Marketing|20     |
|Sales    |30     |
|It       |40     |
+---------+-------+



In [8]:
deptDF = spark.createDataFrame(rdd, schema= deptColumns)
deptDF.printSchema()
deptDF.show(truncate=False)

root
 |-- dept_name: string (nullable = true)
 |-- dept_id: long (nullable = true)

+---------+-------+
|dept_name|dept_id|
+---------+-------+
|Finance  |10     |
|Marketing|20     |
|Sales    |30     |
|It       |40     |
+---------+-------+



In [9]:
deptSchema = StructType([StructField('dept_name', StringType(), True),
                         StructField('dept_name', StringType(), True)])
deptDF = spark.createDataFrame(rdd, schema= deptSchema)
deptDF.printSchema()
deptDF.show(truncate=False)

root
 |-- dept_name: string (nullable = true)
 |-- dept_name: string (nullable = true)

+---------+---------+
|dept_name|dept_name|
+---------+---------+
|Finance  |10       |
|Marketing|20       |
|Sales    |30       |
|It       |40       |
+---------+---------+



In [10]:
data = [('James', 3000),('Anna', 4001), ('Robert', 6200)]
df = spark.createDataFrame(data, ['name', 'salary'])
df.show()

+------+------+
|  name|salary|
+------+------+
| James|  3000|
|  Anna|  4001|
|Robert|  6200|
+------+------+



In [11]:
rdd = df.rdd
rdd.collect()

[Row(name='James', salary=3000),
 Row(name='Anna', salary=4001),
 Row(name='Robert', salary=6200)]

In [12]:
# Appy map() transformation 
rdd2 = df.rdd.map(lambda x: [x[0], x[1]*20/100])
print(rdd2.collect())

[['James', 600.0], ['Anna', 800.2], ['Robert', 1240.0]]


In [13]:
# Convert back to dataframe

df2 = rdd2.toDF(['name', 'bonus'])
df2.show()

+------+------+
|  name| bonus|
+------+------+
| James| 600.0|
|  Anna| 800.2|
|Robert|1240.0|
+------+------+



In [14]:
spark.stop()