In [1]:
import findspark
findspark.init()
findspark.find()
import pyspark
findspark.find()

'H:\\Spark\\spark-3.0.0-bin-hadoop2.7'

In [2]:
from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession, SQLContext

In [3]:
conf = pyspark.SparkConf().setAppName('appName').setMaster('local')
sc = pyspark.SparkContext(conf=conf)
spark = SparkSession(sc)
sqlcontext = SQLContext(sc)

# Work with Spark Dataframe

All operations done in dataframe gets transformed into RDD operation at the backend.

## 1.Importing csv files as a dataframe

- Using SparkSession: `spark.read.csv()`
- Using SQLContext: `sqlcontext.read.csv()`

In [4]:
df1 = spark.read.csv('iris/iris.csv', header =  True, sep = ',')

In [5]:
print(df1)   # metadata about the the imported of objects. 

DataFrame[Sepal_Length: string, Sepal_Width: string, Petal_Length: string, Petal_Width: string, Species: string]


In [6]:
df1.show(5)

+------------+-----------+------------+-----------+-------+
|Sepal_Length|Sepal_Width|Petal_Length|Petal_Width|Species|
+------------+-----------+------------+-----------+-------+
|         5.1|        3.5|         1.4|        0.2| setosa|
|         4.9|        3.0|         1.4|        0.2| setosa|
|         4.7|        3.2|         1.3|        0.2| setosa|
|         4.6|        3.1|         1.5|        0.2| setosa|
|         5.0|        3.6|         1.4|        0.2| setosa|
+------------+-----------+------------+-----------+-------+
only showing top 5 rows



## 2. Convert RDD to dataframe

In [7]:
iris1 = sc.textFile('iris/iris_site.csv').map(lambda line: line.split(','))
iris1.take(5)

[['5.1', '3.5', '1.4', '0.2', 'setosa'],
 ['4.9', '3.0', '1.4', '0.2', 'setosa'],
 ['4.7', '3.2', '1.3', '0.2', 'setosa'],
 ['4.6', '3.1', '1.5', '0.2', 'setosa'],
 ['5.0', '3.6', '1.4', '0.2', 'setosa']]

In [8]:
temp = spark.createDataFrame(iris1)
temp.show(5)

+---+---+---+---+------+
| _1| _2| _3| _4|    _5|
+---+---+---+---+------+
|5.1|3.5|1.4|0.2|setosa|
|4.9|3.0|1.4|0.2|setosa|
|4.7|3.2|1.3|0.2|setosa|
|4.6|3.1|1.5|0.2|setosa|
|5.0|3.6|1.4|0.2|setosa|
+---+---+---+---+------+
only showing top 5 rows



## 3. Convert Dataframe to RDD

Dataframe can be converted back to RDD by combining __RDD__ and __map__ object. Dataframe is converted to __rdd__ by RDD object and __map__ object places each row inside a tuple. If tuple is replaced by keyword __list__ then each row would be placed inside a list.



In [9]:
temp.rdd.map(tuple).take(5)

[('5.1', '3.5', '1.4', '0.2', 'setosa'),
 ('4.9', '3.0', '1.4', '0.2', 'setosa'),
 ('4.7', '3.2', '1.3', '0.2', 'setosa'),
 ('4.6', '3.1', '1.5', '0.2', 'setosa'),
 ('5.0', '3.6', '1.4', '0.2', 'setosa')]

In [10]:
temp.rdd.map(list).take(5)

[['5.1', '3.5', '1.4', '0.2', 'setosa'],
 ['4.9', '3.0', '1.4', '0.2', 'setosa'],
 ['4.7', '3.2', '1.3', '0.2', 'setosa'],
 ['4.6', '3.1', '1.5', '0.2', 'setosa'],
 ['5.0', '3.6', '1.4', '0.2', 'setosa']]

## 4. Display contents of a dataframe

##### a. In table format

In [11]:
df1.show(5)

+------------+-----------+------------+-----------+-------+
|Sepal_Length|Sepal_Width|Petal_Length|Petal_Width|Species|
+------------+-----------+------------+-----------+-------+
|         5.1|        3.5|         1.4|        0.2| setosa|
|         4.9|        3.0|         1.4|        0.2| setosa|
|         4.7|        3.2|         1.3|        0.2| setosa|
|         4.6|        3.1|         1.5|        0.2| setosa|
|         5.0|        3.6|         1.4|        0.2| setosa|
+------------+-----------+------------+-----------+-------+
only showing top 5 rows



#### b. all contents of a dataframe as a list of rows

In [12]:
df1.take(5)   # or use collect() to show all the elements 

[Row(Sepal_Length='5.1', Sepal_Width='3.5', Petal_Length='1.4', Petal_Width='0.2', Species='setosa'),
 Row(Sepal_Length='4.9', Sepal_Width='3.0', Petal_Length='1.4', Petal_Width='0.2', Species='setosa'),
 Row(Sepal_Length='4.7', Sepal_Width='3.2', Petal_Length='1.3', Petal_Width='0.2', Species='setosa'),
 Row(Sepal_Length='4.6', Sepal_Width='3.1', Petal_Length='1.5', Petal_Width='0.2', Species='setosa'),
 Row(Sepal_Length='5.0', Sepal_Width='3.6', Petal_Length='1.4', Petal_Width='0.2', Species='setosa')]

In [13]:
df1.head(5) 

[Row(Sepal_Length='5.1', Sepal_Width='3.5', Petal_Length='1.4', Petal_Width='0.2', Species='setosa'),
 Row(Sepal_Length='4.9', Sepal_Width='3.0', Petal_Length='1.4', Petal_Width='0.2', Species='setosa'),
 Row(Sepal_Length='4.7', Sepal_Width='3.2', Petal_Length='1.3', Petal_Width='0.2', Species='setosa'),
 Row(Sepal_Length='4.6', Sepal_Width='3.1', Petal_Length='1.5', Petal_Width='0.2', Species='setosa'),
 Row(Sepal_Length='5.0', Sepal_Width='3.6', Petal_Length='1.4', Petal_Width='0.2', Species='setosa')]

## 5. Data selection (i.e., selecting particular column)

In [14]:
df1.select('Sepal_Length', 'Species').show(5)

+------------+-------+
|Sepal_Length|Species|
+------------+-------+
|         5.1| setosa|
|         4.9| setosa|
|         4.7| setosa|
|         4.6| setosa|
|         5.0| setosa|
+------------+-------+
only showing top 5 rows



## 6. Joining

In [15]:
iris1_df1 = spark.read.csv(path = 'iris\\merge\\iris_merge1.csv', sep = ',', header = True)
iris1_df1.take(5)

[Row(Sepal_Length='5.1', Sepal_Width='3.5', ID='1'),
 Row(Sepal_Length='4.9', Sepal_Width='3', ID='2'),
 Row(Sepal_Length='4.7', Sepal_Width='3.2', ID='3'),
 Row(Sepal_Length='4.6', Sepal_Width='3.1', ID='4'),
 Row(Sepal_Length='5', Sepal_Width='3.6', ID='5')]

In [16]:
iris2_df2 = spark.read.csv(path = 'iris\\merge\\iris_merge2.csv', sep = ',', header = True)
iris2_df2.take(5)

[Row(ID='1', Petal_Length='1.4', Petal_Width='0.2', Species='setosa'),
 Row(ID='2', Petal_Length='1.4', Petal_Width='0.2', Species='setosa'),
 Row(ID='3', Petal_Length='1.3', Petal_Width='0.2', Species='setosa'),
 Row(ID='4', Petal_Length='1.5', Petal_Width='0.2', Species='setosa'),
 Row(ID='5', Petal_Length='1.4', Petal_Width='0.2', Species='setosa')]

In [17]:
iris1_df1.join(other = iris2_df2, on = 'ID', how = 'inner').take(5)

[Row(ID='1', Sepal_Length='5.1', Sepal_Width='3.5', Petal_Length='1.4', Petal_Width='0.2', Species='setosa'),
 Row(ID='2', Sepal_Length='4.9', Sepal_Width='3', Petal_Length='1.4', Petal_Width='0.2', Species='setosa'),
 Row(ID='3', Sepal_Length='4.7', Sepal_Width='3.2', Petal_Length='1.3', Petal_Width='0.2', Species='setosa'),
 Row(ID='4', Sepal_Length='4.6', Sepal_Width='3.1', Petal_Length='1.5', Petal_Width='0.2', Species='setosa'),
 Row(ID='5', Sepal_Length='5', Sepal_Width='3.6', Petal_Length='1.4', Petal_Width='0.2', Species='setosa')]

## 7. Union

Two data frames with similar structures can be joined row-wise using the union function.

In [18]:
iris1_df1 = spark.read.csv(path = 'iris\\union\\iris_union1.csv', sep = ',', header = True)
iris1_df1.take(5)

[Row(Sepal.Length='5', Sepal.Width='3', Petal.Length='1', Petal.Width='0'),
 Row(Sepal.Length='4.6', Sepal.Width=None, Petal.Length='2', Petal.Width='0.1'),
 Row(Sepal.Length='7.2', Sepal.Width='3.1', Petal.Length='5.1', Petal.Width='1'),
 Row(Sepal.Length='8', Sepal.Width='4', Petal.Length='7', Petal.Width='2')]

In [19]:
iris2_df2 = spark.read.csv(path = 'iris/union/iris_union2.csv', sep = ',', header = True)
iris2_df2.take(5)

[Row(Sepal.Length='10', Sepal.Width='6', Petal.Length='2', Petal.Width='0'),
 Row(Sepal.Length='9.2', Sepal.Width='0', Petal.Length='4', Petal.Width='0.2'),
 Row(Sepal.Length='14.4', Sepal.Width='6.2', Petal.Length='10.2', Petal.Width='2'),
 Row(Sepal.Length='16', Sepal.Width='8', Petal.Length='14', Petal.Width='4')]

In [20]:
iris1_df1.union(iris2_df2).show()

+------------+-----------+------------+-----------+
|Sepal.Length|Sepal.Width|Petal.Length|Petal.Width|
+------------+-----------+------------+-----------+
|           5|          3|           1|          0|
|         4.6|       null|           2|        0.1|
|         7.2|        3.1|         5.1|          1|
|           8|          4|           7|          2|
|          10|          6|           2|          0|
|         9.2|          0|           4|        0.2|
|        14.4|        6.2|        10.2|          2|
|          16|          8|          14|          4|
+------------+-----------+------------+-----------+



## 8. Get column names of dataframe

In [21]:
df1.columns

['Sepal_Length', 'Sepal_Width', 'Petal_Length', 'Petal_Width', 'Species']

### 9. Retrieve Schema of the Data Frame

In [22]:
df1.schema

StructType(List(StructField(Sepal_Length,StringType,true),StructField(Sepal_Width,StringType,true),StructField(Petal_Length,StringType,true),StructField(Petal_Width,StringType,true),StructField(Species,StringType,true)))

Structure for data frame can be defined with the help of StructField and StructType function. 

__StructType__ is the data type representing a Row. It consisting of a list of StructField. StructField is a field in StructType. It's arguments are

- __name__ - name of the columns
- __datatype__ - data type of the column
- __nullable__ - boolean value defining if the column is nullable or not

### 10. Display Datatype

In [23]:
df1.dtypes

[('Sepal_Length', 'string'),
 ('Sepal_Width', 'string'),
 ('Petal_Length', 'string'),
 ('Petal_Width', 'string'),
 ('Species', 'string')]

## 11. Default structure of a dataframe

When the data frame is created from an RDD it can be observed from the below result that the Data Frame has no column header.

In addition, when data is being imported from a csv file, there might be situations when a float column is defined as a string column. To overcome this issue, a structure needs to impose on the data to be imported.

In [24]:
iris1 = sc.textFile("iris/iris_site.csv").map(lambda line: line.split(","))
iris1_split = iris1.map(lambda var1: [float(var1[0]), float(var1[1]), float(var1[2]), float(var1[3]), var1[4]])
df1=spark.createDataFrame(iris1_split)
df1.show()

+---+---+---+---+------+
| _1| _2| _3| _4|    _5|
+---+---+---+---+------+
|5.1|3.5|1.4|0.2|setosa|
|4.9|3.0|1.4|0.2|setosa|
|4.7|3.2|1.3|0.2|setosa|
|4.6|3.1|1.5|0.2|setosa|
|5.0|3.6|1.4|0.2|setosa|
|5.4|3.9|1.7|0.4|setosa|
|4.6|3.4|1.4|0.3|setosa|
|5.0|3.4|1.5|0.2|setosa|
|4.4|2.9|1.4|0.2|setosa|
|4.9|3.1|1.5|0.1|setosa|
|5.4|3.7|1.5|0.2|setosa|
|4.8|3.4|1.6|0.2|setosa|
|4.8|3.0|1.4|0.1|setosa|
|4.3|3.0|1.1|0.1|setosa|
|5.8|4.0|1.2|0.2|setosa|
|5.7|4.4|1.5|0.4|setosa|
|5.4|3.9|1.3|0.4|setosa|
|5.1|3.5|1.4|0.3|setosa|
|5.7|3.8|1.7|0.3|setosa|
|5.1|3.8|1.5|0.3|setosa|
+---+---+---+---+------+
only showing top 20 rows



### 12. Defining structure for dataframe

A table contains multiple fields. So, while defining the structure of a table we need to define
- number of columns
- name of each column
- data type of each column

For a structured data, all the rows have the same structure. Here, we define the structure of the row using StructType class. It takes as an argument, a collection of StructField class objects, which is used to define the metadata about the columns in each row.

StructField takes as input
- name - name of the columns
- datatype - data type of the column
- nullable - boolean value defining if the column is nullable or not

In the below example, the structure for the iris data set is defined. It can be observed that there is a single StructType function, for which five StructField objects are passed as arguments, where each StructField corresponds to one column.



In [25]:
from pyspark.sql.types import StructType, StructField, FloatType, StringType

iris_schema = pyspark.sql.types.StructType([
    StructField('Sepal_Length', FloatType(), True),
    StructField('Sepal_Width', FloatType(), True),
    StructField('Petal_Length', FloatType(), True),
    StructField('Petal_Width', FloatType(), True),
    StructField('Species', StringType(), True),
])

print(iris_schema)

StructType(List(StructField(Sepal_Length,FloatType,true),StructField(Sepal_Width,FloatType,true),StructField(Petal_Length,FloatType,true),StructField(Petal_Width,FloatType,true),StructField(Species,StringType,true)))


### Assigning Defined Structure to Data Frame

When the data frame is created using an RDD, the defined schema can be assigned as highlighted in the below code snippet.

In [26]:
df1=spark.createDataFrame(iris1_split, iris_schema)
df1.show()

+------------+-----------+------------+-----------+-------+
|Sepal_Length|Sepal_Width|Petal_Length|Petal_Width|Species|
+------------+-----------+------------+-----------+-------+
|         5.1|        3.5|         1.4|        0.2| setosa|
|         4.9|        3.0|         1.4|        0.2| setosa|
|         4.7|        3.2|         1.3|        0.2| setosa|
|         4.6|        3.1|         1.5|        0.2| setosa|
|         5.0|        3.6|         1.4|        0.2| setosa|
|         5.4|        3.9|         1.7|        0.4| setosa|
|         4.6|        3.4|         1.4|        0.3| setosa|
|         5.0|        3.4|         1.5|        0.2| setosa|
|         4.4|        2.9|         1.4|        0.2| setosa|
|         4.9|        3.1|         1.5|        0.1| setosa|
|         5.4|        3.7|         1.5|        0.2| setosa|
|         4.8|        3.4|         1.6|        0.2| setosa|
|         4.8|        3.0|         1.4|        0.1| setosa|
|         4.3|        3.0|         1.1| 

### Importing with the defined schema

In [27]:
iris1_df1 = spark.read.csv(path = 'iris/iris.csv', sep = ',', header = True,schema=iris_schema)
iris1_df1.take(5)

[Row(Sepal_Length=5.099999904632568, Sepal_Width=3.5, Petal_Length=1.399999976158142, Petal_Width=0.20000000298023224, Species='setosa'),
 Row(Sepal_Length=4.900000095367432, Sepal_Width=3.0, Petal_Length=1.399999976158142, Petal_Width=0.20000000298023224, Species='setosa'),
 Row(Sepal_Length=4.699999809265137, Sepal_Width=3.200000047683716, Petal_Length=1.2999999523162842, Petal_Width=0.20000000298023224, Species='setosa'),
 Row(Sepal_Length=4.599999904632568, Sepal_Width=3.0999999046325684, Petal_Length=1.5, Petal_Width=0.20000000298023224, Species='setosa'),
 Row(Sepal_Length=5.0, Sepal_Width=3.5999999046325684, Petal_Length=1.399999976158142, Petal_Width=0.20000000298023224, Species='setosa')]

In [28]:
iris1_df1.dtypes

[('Sepal_Length', 'float'),
 ('Sepal_Width', 'float'),
 ('Petal_Length', 'float'),
 ('Petal_Width', 'float'),
 ('Species', 'string')]

## 13. Converting Datatype

Data type of a particular column can be changed by first selecting those columns using select function and then changing its type using the cast function.

In [29]:
iris1_df1 = spark.read.csv(path = 'iris/iris.csv', sep = ',', header = True)
iris1_df1.dtypes

[('Sepal_Length', 'string'),
 ('Sepal_Width', 'string'),
 ('Petal_Length', 'string'),
 ('Petal_Width', 'string'),
 ('Species', 'string')]

In [30]:
iris2 = iris1_df1.select(iris1_df1['Petal_Length'].cast('float'), iris1_df1['Petal_Width'].cast('float'))
iris2.show(5)

+------------+-----------+
|Petal_Length|Petal_Width|
+------------+-----------+
|         1.4|        0.2|
|         1.4|        0.2|
|         1.3|        0.2|
|         1.5|        0.2|
|         1.4|        0.2|
+------------+-----------+
only showing top 5 rows



### 14. Drop columns

In [31]:
iris1_df1 = spark.read.csv(path = 'iris/iris.csv', sep = ',', header = True,schema=iris_schema)
iris1_df1.show(5)

+------------+-----------+------------+-----------+-------+
|Sepal_Length|Sepal_Width|Petal_Length|Petal_Width|Species|
+------------+-----------+------------+-----------+-------+
|         5.1|        3.5|         1.4|        0.2| setosa|
|         4.9|        3.0|         1.4|        0.2| setosa|
|         4.7|        3.2|         1.3|        0.2| setosa|
|         4.6|        3.1|         1.5|        0.2| setosa|
|         5.0|        3.6|         1.4|        0.2| setosa|
+------------+-----------+------------+-----------+-------+
only showing top 5 rows



Dropping column Species

In [32]:
iris1_df1.drop('Speices').show(5)

+------------+-----------+------------+-----------+-------+
|Sepal_Length|Sepal_Width|Petal_Length|Petal_Width|Species|
+------------+-----------+------------+-----------+-------+
|         5.1|        3.5|         1.4|        0.2| setosa|
|         4.9|        3.0|         1.4|        0.2| setosa|
|         4.7|        3.2|         1.3|        0.2| setosa|
|         4.6|        3.1|         1.5|        0.2| setosa|
|         5.0|        3.6|         1.4|        0.2| setosa|
+------------+-----------+------------+-----------+-------+
only showing top 5 rows



## 15. Sorting Data

Data can be sorted based on a particular column of the data frame using sort function. 

In [33]:
iris1_df1 = spark.read.csv(path = 'iris/iris.csv', sep = ',', header = True, schema=iris_schema)
iris1_df1.sort('Petal_Length', ascending = False).show()

+------------+-----------+------------+-----------+---------+
|Sepal_Length|Sepal_Width|Petal_Length|Petal_Width|  Species|
+------------+-----------+------------+-----------+---------+
|         7.7|        2.6|         6.9|        2.3|virginica|
|         7.7|        2.8|         6.7|        2.0|virginica|
|         7.7|        3.8|         6.7|        2.2|virginica|
|         7.6|        3.0|         6.6|        2.1|virginica|
|         7.9|        3.8|         6.4|        2.0|virginica|
|         7.3|        2.9|         6.3|        1.8|virginica|
|         7.4|        2.8|         6.1|        1.9|virginica|
|         7.7|        3.0|         6.1|        2.3|virginica|
|         7.2|        3.6|         6.1|        2.5|virginica|
|         6.3|        3.3|         6.0|        2.5|virginica|
|         7.2|        3.2|         6.0|        1.8|virginica|
|         7.1|        3.0|         5.9|        2.1|virginica|
|         6.8|        3.2|         5.9|        2.3|virginica|
|       

## 16. Filtering Data Based on a Condition

In [34]:
iris1_df1.select('Sepal_Length', 'Species').filter("Species == 'setosa'").show(5)

+------------+-------+
|Sepal_Length|Species|
+------------+-------+
|         5.1| setosa|
|         4.9| setosa|
|         4.7| setosa|
|         4.6| setosa|
|         5.0| setosa|
+------------+-------+
only showing top 5 rows



To filter only the flowers where 'Species' is either 'setosa' or 'versicolor'  isin function can be used as shown below.

In [35]:
iris1_df1[iris1_df1['Species'].isin(['setosa', 'versicolor'])].show()

+------------+-----------+------------+-----------+-------+
|Sepal_Length|Sepal_Width|Petal_Length|Petal_Width|Species|
+------------+-----------+------------+-----------+-------+
|         5.1|        3.5|         1.4|        0.2| setosa|
|         4.9|        3.0|         1.4|        0.2| setosa|
|         4.7|        3.2|         1.3|        0.2| setosa|
|         4.6|        3.1|         1.5|        0.2| setosa|
|         5.0|        3.6|         1.4|        0.2| setosa|
|         5.4|        3.9|         1.7|        0.4| setosa|
|         4.6|        3.4|         1.4|        0.3| setosa|
|         5.0|        3.4|         1.5|        0.2| setosa|
|         4.4|        2.9|         1.4|        0.2| setosa|
|         4.9|        3.1|         1.5|        0.1| setosa|
|         5.4|        3.7|         1.5|        0.2| setosa|
|         4.8|        3.4|         1.6|        0.2| setosa|
|         4.8|        3.0|         1.4|        0.1| setosa|
|         4.3|        3.0|         1.1| 

In [36]:
buy = spark.read.csv(path = 'iris/buy.csv', sep = ',', header = True)
buy.show(5)

+---+------+------+-------+----+
|age|income|gender|marital|buys|
+---+------+------+-------+----+
| 24|130000|Female|Married|  No|
| 23|140000|Female| Single|  No|
| 27|150000|Female|Married| Yes|
| 51| 70000|Female|Married| Yes|
| 53| 50000|  Male|Married| Yes|
+---+------+------+-------+----+
only showing top 5 rows



In [37]:
# the count of customers whose age is greater than 50
buy.filter("age > 50").show()

+---+------+------+-------+----+
|age|income|gender|marital|buys|
+---+------+------+-------+----+
| 51| 70000|Female|Married| Yes|
| 53| 50000|  Male|Married| Yes|
| 56| 40000|  Male| Single|  No|
| 61| 90000|  Male|Married| Yes|
| 55|120000|Female| Single|  No|
+---+------+------+-------+----+



In [38]:
buy.filter(buy.age > 50).show()

+---+------+------+-------+----+
|age|income|gender|marital|buys|
+---+------+------+-------+----+
| 51| 70000|Female|Married| Yes|
| 53| 50000|  Male|Married| Yes|
| 56| 40000|  Male| Single|  No|
| 61| 90000|  Male|Married| Yes|
| 55|120000|Female| Single|  No|
+---+------+------+-------+----+



## 17. distinct() and Distinct Count

In [39]:
iris1_df1.select('Species').distinct().show()

+----------+
|   Species|
+----------+
| virginica|
|versicolor|
|    setosa|
+----------+



Count of values in a particular data frame can be retrieved using count function. In the below example, the count of distinct values in 'Species' column is calculated.

In [40]:
iris1_df1.select('Species').distinct().count()

3

## 18. Aggregation

Values of a particular column inside a data frame can be aggregated using `agg` function.

In [41]:
iris1_df1.agg({'Sepal_Length': "sum"}).show()

+-----------------+
|sum(Sepal_Length)|
+-----------------+
|876.4999990463257|
+-----------------+



### Aggregating Grouped Data

Aggregation for values of a particular column can be performed for several groups by combining `groupBy` with `agg` function.

In [42]:
iris1_df1.groupBy('Species').agg({'Sepal_Length':'mean'}).show()

+----------+-----------------+
|   Species|avg(Sepal_Length)|
+----------+-----------------+
| virginica|6.588000001907349|
|versicolor|5.935999975204468|
|    setosa|5.006000003814697|
+----------+-----------------+



### 19. Statistical summary of dataframe

In [44]:
iris1_df1.describe(['Sepal_length', 'Sepal_Width', 'Petal_Length', 'Petal_Width', 'Species']).show()

+-------+------------------+------------------+------------------+------------------+---------+
|summary|      Sepal_length|       Sepal_Width|      Petal_Length|       Petal_Width|  Species|
+-------+------------------+------------------+------------------+------------------+---------+
|  count|               150|               150|               150|               150|      150|
|   mean| 5.843333326975505|3.0573333358764647|3.7579999883969624|1.1993333247800668|     null|
| stddev|0.8280661128539085|0.4358662838657101|1.7652982279508533|0.7622376591453995|     null|
|    min|               4.3|               2.0|               1.0|               0.1|   setosa|
|    max|               7.9|               4.4|               6.9|               2.5|virginica|
+-------+------------------+------------------+------------------+------------------+---------+



### 20. Calculating Quantiles of data

Quantiles of a column in a dataframe can be calculated using `approxQuantile` function.

Here calculating the 490th, 60th and 80th quantile is calculated.

In [45]:
iris1_df1.approxQuantile(col = 'Sepal_Length', probabilities = [0.4, 0.6, 0.8], relativeError=0)

[5.599999904632568, 6.099999904632568, 6.5]

### 21. Multi-dimensional view of data

Looking at data in several dimensions, for eg: sales by region, sales by sales_representative, sales by month, etc. Such capability is provided in numerous decision support applications under various function names.

In [47]:
iris1_df1.cube('Species').mean().show()

+----------+-----------------+------------------+------------------+-------------------+
|   Species|avg(Sepal_Length)|  avg(Sepal_Width)| avg(Petal_Length)|   avg(Petal_Width)|
+----------+-----------------+------------------+------------------+-------------------+
|      null|5.843333326975505|3.0573333358764647|3.7579999883969624| 1.1993333247800668|
|    setosa|5.006000003814697|3.4280000066757204|1.4619999957084655|0.24600000485777854|
| virginica|6.588000001907349|2.9739999914169313| 5.551999988555909| 2.0259999775886537|
|versicolor|5.935999975204468| 2.770000009536743| 4.259999980926514| 1.3259999918937684|
+----------+-----------------+------------------+------------------+-------------------+



### 22. Correlation and covariance

In [49]:
print("Covariance: ", iris1_df1.cov('Sepal_Length', 'Petal_Length'))

Covariance:  1.274315421293779


In [50]:
print("Correlation: ", iris1_df1.corr('Sepal_Length', 'Petal_Length'))

Correlation:  0.871753784204251


### 23. Confusion Matrix

In [51]:
iris1_df1.crosstab('Species', 'Species').show()

+---------------+------+----------+---------+
|Species_Species|setosa|versicolor|virginica|
+---------------+------+----------+---------+
|      virginica|     0|         0|       50|
|         setosa|    50|         0|        0|
|     versicolor|     0|        50|        0|
+---------------+------+----------+---------+



In [15]:
sc.stop()