In [None]:
import pyspark
from pyspark import SparkConf
from pyspark import SparkContext
from pyspark.sql import HiveContext
from pyspark.sql.types import *
from pyspark.sql.functions import udf

sparkContext = SparkContext(conf=SparkConf())
hiveContext = HiveContext(sparkContext)

In [None]:
rectangleList = [('RectangleA', 10, 20, 30), ('RectangleB', 40, 50, 60), ('RectangleC', 70, 80, 90)]

In [None]:
rectangleDataFrame = hiveContext.createDataFrame(rectangleList, ['RectangleName', 'Height', 'Width', 'DegreeOfRotation'])

In [47]:
#User defined function for calculating the diagonal of a rectangle

from math import *

def calculateDiagonal(height, width):
    
    return sqrt(height * height + width * width)

#Define UDF for calculating the length of the diagonal

diagonalCalculator = udf(calculateDiagonal, FloatType())

#Append the length of the diagonal as a new column to the original data frame 

rectangleDataFrame = rectangleDataFrame.withColumn("Diagonal", diagonalCalculator(rectangleDataFrame['Height'],
                                                                                  rectangleDataFrame['Width']))

rectangleDataFrame.take(1)

[Row(RectangleName=u'RectangleA', Height=10, Width=20, DegreeOfRotation=30, Diagonal=22.360679626464844)]

In [None]:
#User defined function to generate clockwise rotation matrix from the degree of rotation

from pyspark.mllib.linalg import Matrix, Matrices

#User defined function for generating the clockwise rotation matrix

def generateClockwiseRotationMatrix(theta):
    
    return Matrices.dense(2, 2, [cos(theta), sin(theta), -1.0 * sin(theta), cos(theta)])


In [48]:
#Append the clockwise rotation matrix as a new column to the original data frame

#Procedural programmer's way of thinking

collectedRows = rectangleDataFrame.select('RectangleName', 'DegreeOfRotation').rdd.collect()

from pyspark.sql import Row

newColumnName = 'ClockwiseRotationMatrix'

#Add the new clockwise rotation matrix column to each collected row

newRows = [row + Row(newColumnName=generateClockwiseRotationMatrix(row[1])) for row in collectedRows]

#Parallelize the new rows to get a new RDD. Number of partition can be specified optionally but skipped here.

newRDD = sparkContext.parallelize(newRows)

#Create a new data frame with three columns

newDataFrame = hiveContext.createDataFrame(newRDD, ['RectangleName', 'DegreeOfRotation', newColumnName])

#Join the new data frame to the original data frame and drop the extra columns which got repeated.

newRectangleDataFrame = rectangleDataFrame.join(
    newDataFrame, rectangleDataFrame['RectangleName'] == newDataFrame['RectangleName'], 'inner').drop(
    newDataFrame['RectangleName']).drop(newDataFrame['DegreeOfRotation'])

newRectangleDataFrame.take(1)

[Row(RectangleName=u'RectangleA', Height=10, Width=20, DegreeOfRotation=30, Diagonal=22.360679626464844, ClockwiseRotationMatrix=DenseMatrix(2, 2, [0.1543, -0.988, 0.988, 0.1543], False))]

In [49]:
#Spark programmer's way of thinking

columnIndex = rectangleDataFrame.columns.index('DegreeOfRotation')

newColumnList = rectangleDataFrame.columns
newColumnList.insert(len(newColumnList), newColumnName)

#Append the clockwise rotation matrix to each row of the original data frame which returns a pipeline RDD, convert the
#pipeline RDD to a new data frame and impose the updated column names.

newRectangleDataFrame = rectangleDataFrame.map(lambda row:
        (row + Row(generateClockwiseRotationMatrix(row[columnIndex])))).toDF(newColumnList)

newRectangleDataFrame.take(1)

[Row(RectangleName=u'RectangleA', Height=10, Width=20, DegreeOfRotation=30, Diagonal=22.360679626464844, ClockwiseRotationMatrix=DenseMatrix(2, 2, [0.1543, -0.988, 0.988, 0.1543], False))]