In [1]:
from pyspark.sql import SparkSession
# Start (or reuse) the Spark session for this notebook and keep a handle on
# its SparkContext, which the RDD examples below use via sc.parallelize().
spark = (
    SparkSession.builder
    .appName('rdd')
    .getOrCreate()
)
sc = spark.sparkContext

In [4]:
# Echo the SparkContext to confirm it is available before building RDDs.
sc

In [5]:
# A flat list of scalars: each value becomes its own RDD element.
data = [1, 'Alice', 50]
rdd = sc.parallelize(data)

In [6]:
# first() returns only the first element; here that is the scalar 1.
rdd.first()

1

In [7]:
# collect() returns every element of the RDD as a Python list.
rdd.collect()

[1, 'Alice', 50]

In [8]:
# take(n) returns the first n elements as a list.
rdd.take(2)

[1, 'Alice']

In [10]:
# rdd.toDF()
# Left commented out on purpose: it fails with
#   TypeError: Can not infer schema for type: <class 'int'>
# because the elements of this RDD are bare scalars (1, 'Alice', 50) rather
# than row-like records, so there is nothing to infer columns from.

In [11]:
# Each inner list is one record, so every RDD element is row-like.
rdd2 = sc.parallelize([
    [1, 'Alice', 50],
    [2, 'Ritu', 25],
])
rdd2.collect()

[[1, 'Alice', 50], [2, 'Ritu', 25]]

In [13]:
# The first element is now a whole record (a list), not a single scalar.
rdd2.first()

[1, 'Alice', 50]

In [15]:
# With row-like elements toDF() succeeds. No column names are supplied, so the
# columns get the default names _1, _2, _3 (see the output).
df = rdd2.toDF()
df.show()

+---+-----+---+
| _1|   _2| _3|
+---+-----+---+
|  1|Alice| 50|
|  2| Ritu| 25|
+---+-----+---+



In [16]:
# count() returns the number of elements (records) in the RDD.
rdd2.count()


2

## RDD of two-field records (harry, sally)

In [17]:
# Same pattern as before, this time with two fields per record.
rdd3 = sc.parallelize([
    [1, 'harry'],
    [2, 'sally'],
])
rdd3.collect()

[[1, 'harry'], [2, 'sally']]

In [19]:
# Two fields per record, so the DataFrame has two default-named columns (_1, _2).
df2 = rdd3.toDF()
df2.show()

+---+-----+
| _1|   _2|
+---+-----+
|  1|harry|
|  2|sally|
+---+-----+



In [12]:
from pyspark.sql.types import Row
# Row objects carry field names along with the values.
data = [
    Row(no=1, name='Piku'),
    Row(no=2, name='Pupu'),
]

In [13]:
# NOTE: this rebinds `rdd`, replacing the scalar RDD created earlier in the
# notebook. collect() shows that the Row field names survive parallelize().
rdd = sc.parallelize(data)
rdd.collect()

[Row(name='Piku', no=1), Row(name='Pupu', no=2)]

In [14]:
# Column names come from the Row field names (name, no) rather than _1, _2.
rdd.toDF().show()

+----+---+
|name| no|
+----+---+
|Piku|  1|
|Pupu|  2|
+----+---+



In [15]:
# A Row field may itself hold a list; the lists here have different lengths.
data = [
    Row(no=1, name=['Piku', 'kilu']),
    Row(no=2, name=['Pupu', 'sally', 'pups']),
]

In [16]:
# The list-valued field is shown as a single column whose cells hold whole lists.
rdd = sc.parallelize(data)
rdd.toDF().show()

+-------------------+---+
|               name| no|
+-------------------+---+
|       [Piku, kilu]|  1|
|[Pupu, sally, pups]|  2|
+-------------------+---+



## SQLContext

In [17]:
from pyspark.sql import SQLContext
# NOTE(review): SQLContext is the legacy entry point (documented as deprecated
# since Spark 3.0). The SparkSession created at the top of this notebook
# (`spark`) also provides range() and createDataFrame() — consider using it.
sqlcontext = SQLContext(sc)

In [18]:
# range(5) builds a one-column DataFrame (column `id`) holding 0 through 4,
# as shown by the next cell. NOTE: this rebinds `df` from the earlier example.
df = sqlcontext.range(5)

In [19]:
# Print the DataFrame as a text table.
df.show()

+---+
| id|
+---+
|  0|
|  1|
|  2|
|  3|
|  4|
+---+



In [20]:
# createDataFrame() takes the list of Row objects directly, with no RDD step.
# `data` is the list of Rows with list-valued `name` fields defined in an
# earlier cell, so this cell depends on that cell having been run.
df = sqlcontext.createDataFrame(data)
df.show()

+-------------------+---+
|               name| no|
+-------------------+---+
|       [Piku, kilu]|  1|
|[Pupu, sally, pups]|  2|
+-------------------+---+



## RDD Map function

In [33]:
# Plain nested lists again: these records carry no field names.
data = [
    [1, 'krishan'],
    [2, 'Sweta'],
]
data

[[1, 'krishan'], [2, 'Sweta']]

In [34]:
# Rebind `rdd` to the [id, name] records used by the map() examples below.
rdd = sc.parallelize(data)
rdd.collect()

[[1, 'krishan'], [2, 'Sweta']]

In [35]:
# Each element is a two-item list: [id, name].
rdd.first()

[1, 'krishan']

In [37]:
# Row called with only field names acts as a reusable template: calling the
# template with values produces a Row carrying those field names.
col = Row('id', 'name')

In [39]:
# Unpack each [id, name] list into the Row template so every record gets
# named fields. NOTE: this rebinds `rdd2` from the earlier example.
rdd2 = rdd.map(lambda fields: col(*fields))

In [40]:
# The mapped records are now Row objects with id and name fields.
rdd2.collect()

[Row(id=1, name='krishan'), Row(id=2, name='Sweta')]

In [41]:
# Echoing the RDD shows only its repr, not its contents; use collect() or
# take() to see the data.
rdd2

PythonRDD[49] at collect at <ipython-input-40-83517eaf6d43>:1

In [43]:
# Build new single-field Rows whose name has '_Subudhi' appended; the id field
# is not carried over into the result.
(
    rdd2
    .map(lambda row: Row(name=row.name + '_Subudhi'))
    .collect()
)

[Row(name='krishan_Subudhi'), Row(name='Sweta_Subudhi')]