In [1]:
import os
import json
import time
from datetime import datetime
import requests

import pyspark

# for creating schemas
# from pyspark.sql.types import * 
import pyspark.sql.types as pst

In [2]:
# Build (or reuse) a local Spark session: 16 worker threads, 8 GB of driver memory.
spark = (
    pyspark.sql.SparkSession.builder
    .master('local[16]')
    .appName('test1')
    .config('spark.driver.memory', '8g')
    .getOrCreate()
)

In [3]:
# Small in-memory sample of string records used throughout the notebook.
recs = [
    'AAAA',
    'BBBB',
    'CCCCC',
    'DDD222',
    'EEEE',
    'FFFFF',
]

In [4]:
# Turn the local Python list into an RDD via the session's SparkContext.
rdd = spark.sparkContext.parallelize(recs)

In [5]:
# Return every element of the RDD as a Python list (fine here: only 6 records).
rdd.collect()

['AAAA', 'BBBB', 'CCCCC', 'DDD222', 'EEEE', 'FFFFF']

In [6]:
# take(n) returns at most n elements; only 6 exist, so asking for 20 returns them all.
rdd.take(20)

['AAAA', 'BBBB', 'CCCCC', 'DDD222', 'EEEE', 'FFFFF']

In [7]:
# Number of elements in the RDD.
rdd.count()

6

In [8]:
# Lower-case every record (coerced to str first) and show up to 10 results.
(
    rdd
    .map(lambda item: str(item).lower())
    .take(10)
)

['aaaa', 'bbbb', 'ccccc', 'ddd222', 'eeee', 'fffff']

In [9]:
# Attach a running index to each record. zipWithIndex() yields (value, index)
# pairs, so swap them to get (index, value) rows.
idx_table_rdd = (
    rdd
    .zipWithIndex()
    .map(lambda value_and_idx: (value_and_idx[1], value_and_idx[0]))
)

In [10]:
# Peek at the (index, value) pairs.
idx_table_rdd.take(10)

[(0, 'AAAA'),
 (1, 'BBBB'),
 (2, 'CCCCC'),
 (3, 'DDD222'),
 (4, 'EEEE'),
 (5, 'FFFFF')]

# DataFrame

In [11]:
# A DataFrame is essentially an RDD plus a schema. No schema is supplied here,
# so the column types are inferred and the columns get the default names _1 and _2.
mydf = idx_table_rdd.toDF()

In [12]:
# Print the DataFrame as a text table; note the default column names _1 and _2.
mydf.show()

+---+------+
| _1|    _2|
+---+------+
|  0|  AAAA|
|  1|  BBBB|
|  2| CCCCC|
|  3|DDD222|
|  4|  EEEE|
|  5| FFFFF|
+---+------+



In [13]:
# Since we didn't provide a schema, inspect the one that was inferred
# (default column names, nullable columns).
mydf.printSchema()

root
 |-- _1: long (nullable = true)
 |-- _2: string (nullable = true)



In [14]:
# Project a single column by name.
mydf.select('_1').show()

+---+
| _1|
+---+
|  0|
|  1|
|  2|
|  3|
|  4|
|  5|
+---+



In [15]:
# Keep both columns, then retain only the rows whose index is below 4.
(
    mydf
    .select(['_1', '_2'])
    .filter(mydf['_1'] < 4)
    .show()
)

+---+------+
| _1|    _2|
+---+------+
|  0|  AAAA|
|  1|  BBBB|
|  2| CCCCC|
|  3|DDD222|
+---+------+



In [16]:
# Alternatively, register the DataFrame as a temporary view and query it with SQL passed as a string.

## Define Schema

In [17]:
# Each column is described by a StructField with three parts:
#   column name, column type, and whether the column is nullable.
# 'idx' is declared as LongType because zipWithIndex() produces 64-bit indices
# (mydf.printSchema() reports the same column as `long`); with IntegerType the
# schema check would reject any index beyond the 32-bit range.
fields = [
    pst.StructField('idx', pst.LongType(), False),
    pst.StructField('val', pst.StringType(), True),
]

In [18]:
# Wrap the list of fields in a StructType to form the complete schema.
schema = pst.StructType(fields)

In [19]:
# Build a DataFrame from the (index, value) RDD, this time with the explicit
# schema so the columns get proper names and declared types.
better_df = spark.createDataFrame(
    idx_table_rdd,
    schema=schema,
)

In [20]:
# Same rows as mydf, now with the column names taken from the schema.
better_df.show()

+---+------+
|idx|   val|
+---+------+
|  0|  AAAA|
|  1|  BBBB|
|  2| CCCCC|
|  3|DDD222|
|  4|  EEEE|
|  5| FFFFF|
+---+------+



In [21]:
# Select the original columns plus a derived one computed from 'idx'.
(
    better_df
    .select(
        'idx',
        better_df['idx'] + 100,
        'val',
    )
    .show()
)

+---+-----------+------+
|idx|(idx + 100)|   val|
+---+-----------+------+
|  0|        100|  AAAA|
|  1|        101|  BBBB|
|  2|        102| CCCCC|
|  3|        103|DDD222|
|  4|        104|  EEEE|
|  5|        105| FFFFF|
+---+-----------+------+

