In [1]:
##WithColumn example
from pyspark.sql import Row
from pyspark.sql.types import *
rdd=sc.parallelize([1,2,3,4])
rdd.collect()
rowRdd=rdd.map(lambda x: Row(name=x))
schema = StructType([StructField("name", IntegerType(), False)])
df=sqlContext.createDataFrame(data=rowRdd,schema=schema)
df2=df.select('name')
df3=df.withColumn('id',df.name+2)
df3.show()

+----+---+
|name| id|
+----+---+
|   1|  3|
|   2|  4|
|   3|  5|
|   4|  6|
+----+---+



In [2]:
datafrm=sc.parallelize([Row(id='a',value1=1,value2=2),Row(id='b',value1=2,value2=3),Row(id='a',value1=3,value2=4)]).toDF()
computedOne=datafrm.withColumn('newCol',datafrm.value1*datafrm.value2)
averagedDataFrame=datafrm.groupBy('id').avg('value1')
averagedDataFrame.show()
sqlContext.registerDataFrameAsTable(df=datafrm,tableName='values')
avgDF=sqlContext.sql('select avg(value1),id from values group by id')
avgDF.show()

+---+-----------+
| id|avg(value1)|
+---+-----------+
|  b|        2.0|
|  a|        2.0|
+---+-----------+

+-----------+---+
|avg(value1)| id|
+-----------+---+
|        2.0|  b|
|        2.0|  a|
+-----------+---+



In [3]:
##Calculating previous Date in dataFrame
from datetime import datetime,timedelta
from pyspark.sql.functions import when,date_add,col
dateDF=sc.parallelize([Row(name='a',dat=datetime(2018,4,20)),
                       Row(name='b',dat=datetime(2018,4,20)),
                       Row(name='a',dat=datetime(2018,4,20))]).toDF()
addedColumn=dateDF.withColumn('prevDate',when(dateDF.dat.isNotNull(),date_add(col('dat'),-1)).otherwise(dateDF['dat']))
addedColumn.show()

+-------------------+----+-------------------+
|                dat|name|           prevDate|
+-------------------+----+-------------------+
|2018-04-20 00:00:00|   a|2018-04-19 00:00:00|
|2018-04-20 00:00:00|   b|2018-04-19 00:00:00|
|2018-04-20 00:00:00|   a|2018-04-19 00:00:00|
+-------------------+----+-------------------+



In [4]:
from pyspark.sql import functions as F
##range to generate df
rangeDF=spark.range(1,10,1)
rangeDF.show()
##data frame select and where function
filteredDF=rangeDF.select('id').where(rangeDF.id>5)
filteredDF.show()
#agg function
maxId=filteredDF.agg(F.avg(filteredDF.id))
maxId.show()
##approxquantile
#quartDF=rangeDF.approxQuantile('id',[0,1],0.5)
#quartDF.show()
rangeDF.columns

##cross join
crossDF=rangeDF.crossJoin(filteredDF)
crossDF.show()
crossDF.describe().show()
crossDF.drop('id').show()

+---+
| id|
+---+
|  1|
|  2|
|  3|
|  4|
|  5|
|  6|
|  7|
|  8|
|  9|
+---+

+---+
| id|
+---+
|  6|
|  7|
|  8|
|  9|
+---+

+-------+
|avg(id)|
+-------+
|    7.5|
+-------+

+---+---+
| id| id|
+---+---+
|  1|  6|
|  1|  7|
|  1|  8|
|  1|  9|
|  2|  6|
|  2|  7|
|  2|  8|
|  2|  9|
|  3|  6|
|  3|  7|
|  3|  8|
|  3|  9|
|  4|  6|
|  4|  7|
|  4|  8|
|  4|  9|
|  5|  6|
|  5|  7|
|  5|  8|
|  5|  9|
+---+---+
only showing top 20 rows

+-------+------------------+------------------+
|summary|                id|                id|
+-------+------------------+------------------+
|  count|                36|                36|
|   mean|               5.0|               7.5|
| stddev|2.6186146828319083|1.1338934190276817|
|    min|                 1|                 6|
|    max|                 9|                 9|
+-------+------------------+------------------+

++
||
++
||
||
||
||
||
||
||
||
||
||
||
||
||
||
||
||
||
||
||
||
++
only showing top 20 rows



In [5]:
from pyspark.sql.window import Window
import pyspark.sql.functions as func
from pyspark.sql import Row

def rowKeyFunction(x):
    return x.Salary

def ranking(x):
    y=[]
    for index,element in enumerate(x):
        dictEle=element.asDict()
        if(index==0):
            dictEle['rank']=1
        elif element.Salary==x[index-1].Salary:
            dictEle['rank']=y[index-1]['rank']
        else:
            dictEle['rank']=y[index-1]['rank']+1
        y.append(Row(**dictEle))
    return y
        
empDf=sqlContext.read.csv(path='/home/kapil/software-apps/input-files/employee-input.csv',header=True,inferSchema=True)
empDf.registerTempTable('employee')
##rdd approach
empRdd=empDf.rdd;
windowPartitonRdd=empRdd.groupBy(lambda x:x['EmpNo'])
sortedRdd=windowPartitonRdd.mapValues(lambda y:sorted(y,key=rowKeyFunction))
rankedRdd=sortedRdd.mapValues(ranking)
rowRdd=rankedRdd.values().flatMap(lambda x:x)
createdDf=rowRdd.toDF()
createdDf.show()


##by usig window function
windowSpec=Window.partitionBy(empDf['EmpNo']).orderBy(empDf['Salary'])
##with joining two dataframes
windowedDF=empDf.distinct().select(func.rank().over(windowSpec).alias('EmpRank'),empDf.EmpNo,empDf.Salary)
joinedDF=empDf.join(other=windowedDF,on=([windowedDF.EmpNo==empDf.EmpNo,windowedDF.Salary==empDf.Salary]),how='inner')
joinedDF.show()



## using with column function and dense rank
withColumnDf=empDf.withColumn(col=func.dense_rank().over(windowSpec),colName='rank')
withColumnDf.show()


##using sql
sqlDf=sqlContext.sql('select EmpNo,EmpName,Salary, dense_rank() OVER (PARTITION BY EmpNo ORDER BY Salary ASC) as rank from employee')
sqlDf.show() 



+-------+-----+------+----+
|EmpName|EmpNo|Salary|rank|
+-------+-----+------+----+
|    Som| E125|  4000|   1|
|    Som| E125|  6000|   2|
|    Tom| E123|  2000|   1|
|    Tom| E123|  2000|   1|
|    Tom| E123|  8000|   2|
|    Rom| E124|  3000|   1|
|    Rom| E124|  7000|   2|
|    Pom| E126|  5000|   1|
+-------+-----+------+----+

+-----+-------+------+-------+-----+------+
|EmpNo|EmpName|Salary|EmpRank|EmpNo|Salary|
+-----+-------+------+-------+-----+------+
| E123|    Tom|  2000|      1| E123|  2000|
| E124|    Rom|  3000|      1| E124|  3000|
| E125|    Som|  4000|      1| E125|  4000|
| E126|    Pom|  5000|      1| E126|  5000|
| E125|    Som|  6000|      2| E125|  6000|
| E124|    Rom|  7000|      2| E124|  7000|
| E123|    Tom|  8000|      2| E123|  8000|
| E123|    Tom|  2000|      1| E123|  2000|
+-----+-------+------+-------+-----+------+

+-----+-------+------+----+
|EmpNo|EmpName|Salary|rank|
+-----+-------+------+----+
| E126|    Pom|  5000|   1|
| E125|    Som|  4000|

 # Date Functions On DataFrame

In [79]:
from pyspark.sql.functions import date_format,unix_timestamp,to_date
from pyspark.sql.functions import add_months,date_add,datediff,current_date,months_between

import datetime

class Employee():
    
    def __init__(self):
        self.name=''
        self.id=0
        self.birthdate=datetime.date

    def setName(self,name):
        self.name=name
        
    def setId(self,id):
        self.id=id

    def setbirthDate(self,bd):
        self.birthdate=bd


def mapEmployeeRow(x):
    y=x.split(',')
    d=y[2].split('/')
    print(y)
    emp=Employee()
    emp.setId(y[0])
    emp.setName(y[1])
    emp.setbirthDate(datetime.date(int(d[0]),int(d[1]),int(d[2])))
    return emp

dataRdd=sc.parallelize(['111,kapil,1991/3/21','222,Amit,1986/12/10'])
dataEmployeeRdd=dataRdd.map(mapEmployeeRow)
##Need to import created class as separate file
##dataEmployeeRdd.collect()

##Data
rows=sc.parallelize([Row(id=123,name='kapil',birthdate='21-Mar-1991'),
                     Row(id=124,name='Amit',birthdate='10-Dec-1986')])
##Schema with StringType and LongType
schema=StructType([StructField('birthdate',StringType(),True),
                       StructField('id',LongType(),True),
                       StructField('name',StringType(),True)])
rowsDf=spark.createDataFrame(data=rows,schema=schema)
rowsDf.show()
##adding months after converting String to date
dateDf=rowsDf.select(add_months(to_date(unix_timestamp(rowsDf.birthdate,'dd-MMM-yyyy').cast("timestamp"),'dd-MMM-yyyy'),12).alias('date'))
dateDf.show()


rdd=sc.parallelize(['111,kapil,1991/3/21','222,Amit,1986/12/10'])
def mapRow(x):
    y=x.split(',')
    d=y[2].split('/')
    return Row(id=y[0],name=y[1],birthdate=datetime.date(int(d[0]),int(d[1]),int(d[2])))

dateDfs=spark.createDataFrame(rdd.map(mapRow))
dateDfs.show()

##Having DateType Column in Dataframe
dateRows=sc.parallelize([Row(id=123,name='kapil',birthdate=datetime.date(1991,3,21)),
                     Row(id=124,name='Amit',birthdate=datetime.date(1986,12,10))])
schema2=StructType([StructField('birthdate',DateType(),True),
                       StructField('id',LongType(),True),
                       StructField('name',StringType(),True)])

dateRowsDf=spark.createDataFrame(data=dateRows,schema=schema2)

##add months
dateRowsDf.select(add_months(dateRowsDf.birthdate,12).alias('Months_added_12')).show()
## add days
dateRowsDf.select(date_add(dateRowsDf.birthdate,30).alias('Days_added_30')).show()
## substract days
dateRowsDf.select(date_add(dateRowsDf.birthdate,-30).alias('Days_substracted_30')).show()
## diffrence between dates
dateRowsDf.select(dateRowsDf.name,(datediff(current_date(),dateRowsDf.birthdate)/365).alias('age')).show()
##months between
dateRowsDf.select(dateRowsDf.name,(months_between(current_date(),dateRowsDf.birthdate)/12).alias('age')).show()

Py4JJavaError: An error occurred while calling z:org.apache.spark.api.python.PythonRDD.collectAndServe.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 3 in stage 454.0 failed 1 times, most recent failure: Lost task 3.0 in stage 454.0 (TID 2042, localhost, executor driver): org.apache.spark.api.python.PythonException: Traceback (most recent call last):
  File "/home/kapil/software-apps/spark-2.3.0-bin-hadoop2.7/python/lib/pyspark.zip/pyspark/worker.py", line 229, in main
    process()
  File "/home/kapil/software-apps/spark-2.3.0-bin-hadoop2.7/python/lib/pyspark.zip/pyspark/worker.py", line 224, in process
    serializer.dump_stream(func(split_index, iterator), outfile)
  File "/home/kapil/software-apps/spark-2.3.0-bin-hadoop2.7/python/lib/pyspark.zip/pyspark/serializers.py", line 376, in dump_stream
    bytes = self.serializer.dumps(vs)
  File "/home/kapil/software-apps/spark-2.3.0-bin-hadoop2.7/python/lib/pyspark.zip/pyspark/serializers.py", line 555, in dumps
    return pickle.dumps(obj, protocol)
_pickle.PicklingError: Can't pickle <class '__main__.Employee'>: attribute lookup Employee on __main__ failed

	at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.handlePythonException(PythonRunner.scala:298)
	at org.apache.spark.api.python.PythonRunner$$anon$1.read(PythonRunner.scala:438)
	at org.apache.spark.api.python.PythonRunner$$anon$1.read(PythonRunner.scala:421)
	at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.hasNext(PythonRunner.scala:252)
	at org.apache.spark.InterruptibleIterator.hasNext(InterruptibleIterator.scala:37)
	at scala.collection.Iterator$class.foreach(Iterator.scala:893)
	at org.apache.spark.InterruptibleIterator.foreach(InterruptibleIterator.scala:28)
	at scala.collection.generic.Growable$class.$plus$plus$eq(Growable.scala:59)
	at scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:104)
	at scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:48)
	at scala.collection.TraversableOnce$class.to(TraversableOnce.scala:310)
	at org.apache.spark.InterruptibleIterator.to(InterruptibleIterator.scala:28)
	at scala.collection.TraversableOnce$class.toBuffer(TraversableOnce.scala:302)
	at org.apache.spark.InterruptibleIterator.toBuffer(InterruptibleIterator.scala:28)
	at scala.collection.TraversableOnce$class.toArray(TraversableOnce.scala:289)
	at org.apache.spark.InterruptibleIterator.toArray(InterruptibleIterator.scala:28)
	at org.apache.spark.rdd.RDD$$anonfun$collect$1$$anonfun$12.apply(RDD.scala:939)
	at org.apache.spark.rdd.RDD$$anonfun$collect$1$$anonfun$12.apply(RDD.scala:939)
	at org.apache.spark.SparkContext$$anonfun$runJob$5.apply(SparkContext.scala:2067)
	at org.apache.spark.SparkContext$$anonfun$runJob$5.apply(SparkContext.scala:2067)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:87)
	at org.apache.spark.scheduler.Task.run(Task.scala:109)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:345)
	at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
	at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
	at java.lang.Thread.run(Thread.java:748)

Driver stacktrace:
	at org.apache.spark.scheduler.DAGScheduler.org$apache$spark$scheduler$DAGScheduler$$failJobAndIndependentStages(DAGScheduler.scala:1599)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1587)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1586)
	at scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)
	at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:48)
	at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:1586)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:831)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:831)
	at scala.Option.foreach(Option.scala:257)
	at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:831)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:1820)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1769)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1758)
	at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:48)
	at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:642)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2027)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2048)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2067)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2092)
	at org.apache.spark.rdd.RDD$$anonfun$collect$1.apply(RDD.scala:939)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
	at org.apache.spark.rdd.RDD.withScope(RDD.scala:363)
	at org.apache.spark.rdd.RDD.collect(RDD.scala:938)
	at org.apache.spark.api.python.PythonRDD$.collectAndServe(PythonRDD.scala:153)
	at org.apache.spark.api.python.PythonRDD.collectAndServe(PythonRDD.scala)
	at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.lang.reflect.Method.invoke(Method.java:498)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.GatewayConnection.run(GatewayConnection.java:214)
	at java.lang.Thread.run(Thread.java:748)
Caused by: org.apache.spark.api.python.PythonException: Traceback (most recent call last):
  File "/home/kapil/software-apps/spark-2.3.0-bin-hadoop2.7/python/lib/pyspark.zip/pyspark/worker.py", line 229, in main
    process()
  File "/home/kapil/software-apps/spark-2.3.0-bin-hadoop2.7/python/lib/pyspark.zip/pyspark/worker.py", line 224, in process
    serializer.dump_stream(func(split_index, iterator), outfile)
  File "/home/kapil/software-apps/spark-2.3.0-bin-hadoop2.7/python/lib/pyspark.zip/pyspark/serializers.py", line 376, in dump_stream
    bytes = self.serializer.dumps(vs)
  File "/home/kapil/software-apps/spark-2.3.0-bin-hadoop2.7/python/lib/pyspark.zip/pyspark/serializers.py", line 555, in dumps
    return pickle.dumps(obj, protocol)
_pickle.PicklingError: Can't pickle <class '__main__.Employee'>: attribute lookup Employee on __main__ failed

	at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.handlePythonException(PythonRunner.scala:298)
	at org.apache.spark.api.python.PythonRunner$$anon$1.read(PythonRunner.scala:438)
	at org.apache.spark.api.python.PythonRunner$$anon$1.read(PythonRunner.scala:421)
	at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.hasNext(PythonRunner.scala:252)
	at org.apache.spark.InterruptibleIterator.hasNext(InterruptibleIterator.scala:37)
	at scala.collection.Iterator$class.foreach(Iterator.scala:893)
	at org.apache.spark.InterruptibleIterator.foreach(InterruptibleIterator.scala:28)
	at scala.collection.generic.Growable$class.$plus$plus$eq(Growable.scala:59)
	at scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:104)
	at scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:48)
	at scala.collection.TraversableOnce$class.to(TraversableOnce.scala:310)
	at org.apache.spark.InterruptibleIterator.to(InterruptibleIterator.scala:28)
	at scala.collection.TraversableOnce$class.toBuffer(TraversableOnce.scala:302)
	at org.apache.spark.InterruptibleIterator.toBuffer(InterruptibleIterator.scala:28)
	at scala.collection.TraversableOnce$class.toArray(TraversableOnce.scala:289)
	at org.apache.spark.InterruptibleIterator.toArray(InterruptibleIterator.scala:28)
	at org.apache.spark.rdd.RDD$$anonfun$collect$1$$anonfun$12.apply(RDD.scala:939)
	at org.apache.spark.rdd.RDD$$anonfun$collect$1$$anonfun$12.apply(RDD.scala:939)
	at org.apache.spark.SparkContext$$anonfun$runJob$5.apply(SparkContext.scala:2067)
	at org.apache.spark.SparkContext$$anonfun$runJob$5.apply(SparkContext.scala:2067)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:87)
	at org.apache.spark.scheduler.Task.run(Task.scala:109)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:345)
	at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
	at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
	... 1 more
