# PySpark: Data Exploratory
Once a Spark session is created, its running jobs can be monitored at the address: [Spark UI](http://localhost:4040/jobs/).

In [None]:
import findspark; findspark.init()
from pyspark.sql import SparkSession
spark = SparkSession.builder.enableHiveSupport().getOrCreate()
spark.conf.set('spark.sql.repl.eagerEval.enabled', True)

import pyspark.sql.functions as F
import pyspark.sql.types as T
from pyspark.mllib.stat import Statistics

import numpy as np
import pandas as pd

In [2]:
%%html
<style>
th {font-size:12px}
td {font-size:12px}
p {font-size:14px}
div.highlight {font-size:14px}
</style>

## 1. PySpark dataframes

### 1.1. Creating dataframes

In [1]:
import findspark; findspark.init()
from pyspark.sql import SparkSession
spark = SparkSession.builder.enableHiveSupport().getOrCreate()
spark.conf.set('spark.sql.repl.eagerEval.enabled', True)

import pyspark.sql.functions as F
import pyspark.sql.types as T

import numpy as np
import pandas as pd

#### Manually

In [2]:
data = [
    ('Laptop', '$1000', 15),
    ('Mouse', '$20', 100),
    ('Headphone', '$50', 50),
    ('USB', None, 100)
]

schema = ['product', 'price', 'stock']

spark.createDataFrame(data, schema)

product,price,stock
Laptop,$1000,15
Mouse,$20,100
Headphone,$50,50
USB,,100


#### Reading CSV files

In [3]:
spark.read.csv('../data/finance_charts_apple.csv', header=True).limit(5)

Date,AAPL.Open,AAPL.High,AAPL.Low,AAPL.Close,AAPL.Volume,AAPL.Adjusted,dn,mavg,up,direction
2015-02-17,127.489998,128.880005,126.919998,127.830002,63152400,122.905254,106.7410523,117.9276669,129.1142814,Increasing
2015-02-18,127.629997,128.779999,127.449997,128.720001,44891700,123.760965,107.842423,118.9403335,130.0382439,Increasing
2015-02-19,128.479996,129.029999,128.330002,128.449997,37362400,123.501363,108.8942449,119.8891668,130.8840887,Decreasing
2015-02-20,128.619995,129.5,128.050003,129.5,48948400,124.510914,109.7854494,120.7635001,131.7415509,Increasing
2015-02-23,130.020004,133.0,129.660004,133.0,70974100,127.876074,110.3725162,121.7201668,133.0678174,Increasing


#### Reading Excel files
PySpark does not support reading Excel files. This can only be done indirectly with some help from Pandas.

In [4]:
df = pd.read_excel('../data/world_population.xlsx')
spark.createDataFrame(df.astype(str)).limit(5)

year,country,population
1960,Afghanistan,8996351
1961,Afghanistan,9166764
1962,Afghanistan,9345868
1963,Afghanistan,9533954
1964,Afghanistan,9731361


#### From SQL query

In [5]:
spark.sql('SELECT 7 AS number')

number
7


### 1.2. Data types
The `pyspark.sql.types` sub-module provides a various number of [data types](https://spark.apache.org/docs/latest/sql-ref-datatypes.html). The table below summarizes the most notable ones.

|Function     |Alias          |Data type  |
|-------------|---------------|-----------|
|BooleanType  |`boolean`      |Boolean    |
|IntegerType  |`int`          |Integer    |
|FloatType    |`float`        |Decimal    |
|LongType     |`long`         |Big integer|
|DecimalType  |`decimal(10,2)`|Big decimal|
|StringType   |`string`       |String     |
|DateType     |`date`         |Date       |
|TimestampType|`timestamp`    |Time       |

In [6]:
import findspark; findspark.init()
from pyspark.sql import SparkSession
spark = SparkSession.builder.enableHiveSupport().getOrCreate()
spark.conf.set('spark.sql.repl.eagerEval.enabled', True)

import pyspark.sql.functions as F
import pyspark.sql.types as T

import numpy as np
import pandas as pd

In [7]:
dfYoutube = spark.read.csv('../data/youtube_trending.csv', header=True)
dfYoutube.limit(5)

video_id,trending_date,channel_title,category_id,publish_time,views,likes,dislikes,comment_count,comments_disabled,ratings_disabled
2kyS6SvSYSE,2017-11-14,CaseyNeistat,22,2017-11-13T17:13:...,748374,57527,2966,15954,False,False
1ZAPwfrtAFY,2017-11-14,LastWeekTonight,24,2017-11-13T07:30:...,2418783,97185,6146,12703,False,False
5qpjK5DgCt4,2017-11-14,Rudy Mancuso,23,2017-11-12T19:05:...,3191434,146033,5339,8181,False,False
puqaWrEC7tY,2017-11-14,Good Mythical Mor...,24,2017-11-13T11:00:...,343168,10172,666,2146,False,False
d380meD0W0M,2017-11-14,nigahiga,24,2017-11-12T18:01:...,2095731,132235,1989,17518,False,False


In [8]:
dfYoutube.dtypes

[('video_id', 'string'),
 ('trending_date', 'string'),
 ('channel_title', 'string'),
 ('category_id', 'string'),
 ('publish_time', 'string'),
 ('views', 'string'),
 ('likes', 'string'),
 ('dislikes', 'string'),
 ('comment_count', 'string'),
 ('comments_disabled', 'string'),
 ('ratings_disabled', 'string')]

#### String

In [9]:
dfYoutube.select(F.col('video_id').cast('string')).dtypes

[('video_id', 'string')]

#### Numeric

In [10]:
dfYoutube.select(
    F.col('views').cast('int'),
    F.col('likes').cast('decimal(10,0)')
).printSchema()

root
 |-- views: integer (nullable = true)
 |-- likes: decimal(10,0) (nullable = true)



#### Boolean

In [11]:
dfYoutube.select(F.col('comments_disabled').cast('boolean')).dtypes

[('comments_disabled', 'boolean')]

#### Date and time

In [12]:
dfYoutube.select(
    F.to_date('trending_date', 'yyyy-MM-dd').alias('trending_date'),
    F.to_timestamp('publish_time').alias('publish_time'),
    F.unix_timestamp('trending_date', 'yyyy-MM-dd').alias('unix')
).limit(3)

trending_date,publish_time,unix
2017-11-14,2017-11-14 00:13:01,1510592400
2017-11-14,2017-11-13 14:30:00,1510592400
2017-11-14,2017-11-13 02:05:24,1510592400


## 2. Data exploratory

### 2.1. Overview

In [13]:
import findspark; findspark.init()
from pyspark.sql import SparkSession
spark = SparkSession.builder.enableHiveSupport().getOrCreate()
spark.conf.set('spark.sql.repl.eagerEval.enabled', True)

import pyspark.sql.functions as F
import pyspark.sql.types as T

import numpy as np
import pandas as pd

In [14]:
dfYoutube = spark.read.csv('../data/youtube_trending.csv', header=True)
dfYoutube.limit(5)

video_id,trending_date,channel_title,category_id,publish_time,views,likes,dislikes,comment_count,comments_disabled,ratings_disabled
2kyS6SvSYSE,2017-11-14,CaseyNeistat,22,2017-11-13T17:13:...,748374,57527,2966,15954,False,False
1ZAPwfrtAFY,2017-11-14,LastWeekTonight,24,2017-11-13T07:30:...,2418783,97185,6146,12703,False,False
5qpjK5DgCt4,2017-11-14,Rudy Mancuso,23,2017-11-12T19:05:...,3191434,146033,5339,8181,False,False
puqaWrEC7tY,2017-11-14,Good Mythical Mor...,24,2017-11-13T11:00:...,343168,10172,666,2146,False,False
d380meD0W0M,2017-11-14,nigahiga,24,2017-11-12T18:01:...,2095731,132235,1989,17518,False,False


#### Features analysis

In [15]:
dfYoutube.count()

9881

In [16]:
dfYoutube.columns

['video_id',
 'trending_date',
 'channel_title',
 'category_id',
 'publish_time',
 'views',
 'likes',
 'dislikes',
 'comment_count',
 'comments_disabled',
 'ratings_disabled']

In [17]:
dfYoutube.dtypes

[('video_id', 'string'),
 ('trending_date', 'string'),
 ('channel_title', 'string'),
 ('category_id', 'string'),
 ('publish_time', 'string'),
 ('views', 'string'),
 ('likes', 'string'),
 ('dislikes', 'string'),
 ('comment_count', 'string'),
 ('comments_disabled', 'string'),
 ('ratings_disabled', 'string')]

#### Statistical summary

In [18]:
dfYoutube.summary()

summary,video_id,trending_date,channel_title,category_id,publish_time,views,likes,dislikes,comment_count,comments_disabled,ratings_disabled
count,9791,9881,9881,9881.0,9881,9881.0,9881.0,9881.0,9881.0,9881,9881
mean,,,,19.9703471308572,,1288899.8444489425,47460.88563910536,3066.993320514118,5817.3166683534055,,
stddev,,,,7.544083075116899,,5110157.363651556,170513.18455936198,38892.08334635203,30624.86865832553,,
min,-0NYY8cqdiQ,2017-11-14,12 News,1.0,2008-06-17T00:07:...,10001.0,0.0,0.0,0.0,False,False
25%,,,,17.0,,89594.0,2031.0,85.0,273.0,,
50%,,,,24.0,,311117.0,9002.0,324.0,1031.0,,
75%,,,,25.0,,1000754.0,29424.0,1130.0,3374.0,,
max,zy0b9e40tK8,2018-02-01,wdwmagic,43.0,2018-01-31T17:44:...,99999.0,99980.0,998.0,99980.0,True,True


In [19]:
dfYoutube = dfYoutube.withColumn('likes', F.col('likes').cast('int')).withColumn('views', F.col('views').cast('int'))
dfYoutube.corr('likes', 'views')

0.8804327774530825

In [20]:
from pyspark.mllib.stat import Statistics
x = dfYoutube.select('comment_count', 'views', 'likes', 'dislikes').rdd.map(lambda row: row[0:])
Statistics.corr(x)

array([[1.        , 0.77526237, 0.85024311, 0.75684277],
       [0.77526237, 1.        , 0.88043278, 0.6734562 ],
       [0.85024311, 0.88043278, 1.        , 0.53002173],
       [0.75684277, 0.6734562 , 0.53002173, 1.        ]])

In [21]:
from pyspark.mllib.stat import Statistics
import pandas as pd

def computeCorrelationMatrix(df, method='pearson'):
    dfRdd = df.rdd.map(lambda row: row[0:])
    corr_mat = Statistics.corr(dfRdd, method=method)
    corr_mat_df = pd.DataFrame(corr_mat, columns=df.columns,  index=df.columns)
    return corr_mat_df

In [22]:
computeCorrelationMatrix(dfYoutube.select('comment_count', 'views', 'likes', 'dislikes'))

Unnamed: 0,comment_count,views,likes,dislikes
comment_count,1.0,0.775262,0.850243,0.756843
views,0.775262,1.0,0.880433,0.673456
likes,0.850243,0.880433,1.0,0.530022
dislikes,0.756843,0.673456,0.530022,1.0


#### Missing values

In [23]:
data = [
    ('Laptop', '$1000', None),
    ('Mouse', '$20', 100),
    ('Headphone', '$50', 50),
    ('USB', None, None)
]

schema = ['product', 'price', 'stock']

dfProduct = spark.createDataFrame(data, schema)

In [23]:
dfProduct

product,price,stock
Laptop,$1000,
Mouse,$20,100.0
Headphone,$50,50.0
USB,,


In [24]:
dfProduct.where(F.col('stock').isNull())

product,price,stock
Laptop,$1000,
USB,,


In [25]:
dfProduct.select(*(F.sum(F.col(c).isNull().cast('int')).alias(c) for c in dfProduct.columns))

product,price,stock
0,1,2


### 2.2. In-depth exploring

In [26]:
import findspark; findspark.init()
from pyspark.sql import SparkSession
spark = SparkSession.builder.enableHiveSupport().getOrCreate()
spark.conf.set('spark.sql.repl.eagerEval.enabled', True)

import pyspark.sql.functions as F
import pyspark.sql.types as T

import numpy as np
import pandas as pd

In [27]:
dfYoutube = spark.read.csv('../data/youtube_trending.csv', header=True)
dfYoutube.limit(5)

video_id,trending_date,channel_title,category_id,publish_time,views,likes,dislikes,comment_count,comments_disabled,ratings_disabled
2kyS6SvSYSE,2017-11-14,CaseyNeistat,22,2017-11-13T17:13:...,748374,57527,2966,15954,False,False
1ZAPwfrtAFY,2017-11-14,LastWeekTonight,24,2017-11-13T07:30:...,2418783,97185,6146,12703,False,False
5qpjK5DgCt4,2017-11-14,Rudy Mancuso,23,2017-11-12T19:05:...,3191434,146033,5339,8181,False,False
puqaWrEC7tY,2017-11-14,Good Mythical Mor...,24,2017-11-13T11:00:...,343168,10172,666,2146,False,False
d380meD0W0M,2017-11-14,nigahiga,24,2017-11-12T18:01:...,2095731,132235,1989,17518,False,False


#### Selection

In [28]:
dfYoutube.likes

Column<'likes'>

In [29]:
dfYoutube.select(F.col('views')).limit(3)

views
748374
2418783
3191434


In [30]:
dfYoutube.select('likes', 'dislikes').limit(3)

likes,dislikes
57527,2966
97185,6146
146033,5339


In [31]:
dfYoutube.selectExpr('likes - dislikes AS like_dislike_diff').limit(3)

like_dislike_diff
54561.0
91039.0
140694.0


In [32]:
dfYoutube.collect()[5]

Row(video_id='gHZ1Qz0KiKM', trending_date='2017-11-14', channel_title='iJustine', category_id='28', publish_time='2017-11-13T19:07:23.000Z', views='119180', likes='9763', dislikes='511', comment_count='1434', comments_disabled='False', ratings_disabled='False')

In [33]:
dfYoutube.collect()[2].channel_title

'Rudy Mancuso'

#### Filtering

In [34]:
dfYoutube.filter(F.col('views').cast('int')>100e6)

video_id,trending_date,channel_title,category_id,publish_time,views,likes,dislikes,comment_count,comments_disabled,ratings_disabled
TyHvyGVs42U,2017-11-26,LuisFonsiVEVO,10,2017-11-17T05:00:...,102012605,2376636,117196,134224,False,False
FlsCjmMhFmw,2017-12-10,YouTube Spotlight,24,2017-12-06T17:58:...,100911567,2656659,1353647,682890,False,False
FlsCjmMhFmw,2017-12-11,YouTube Spotlight,24,2017-12-06T17:58:...,113874632,2811215,1470383,787174,False,False
FlsCjmMhFmw,2017-12-12,YouTube Spotlight,24,2017-12-06T17:58:...,125432237,2912702,1545015,807558,False,False
FlsCjmMhFmw,2017-12-13,YouTube Spotlight,24,2017-12-06T17:58:...,137843120,3014471,1602383,817582,False,False
FlsCjmMhFmw,2017-12-14,YouTube Spotlight,24,2017-12-06T17:58:...,149376127,3093544,1643059,810698,False,False


In [35]:
dfYoutube.filter(
    F.col('channel_title').contains('VEVO') &
    (F.col('views').cast('int')>80e6)
)

video_id,trending_date,channel_title,category_id,publish_time,views,likes,dislikes,comment_count,comments_disabled,ratings_disabled
TyHvyGVs42U,2017-11-24,LuisFonsiVEVO,10,2017-11-17T05:00:...,80605857,2173715,104121,122511,False,False
TyHvyGVs42U,2017-11-25,LuisFonsiVEVO,10,2017-11-17T05:00:...,91552137,2277789,110752,128071,False,False
TyHvyGVs42U,2017-11-26,LuisFonsiVEVO,10,2017-11-17T05:00:...,102012605,2376636,117196,134224,False,False


In [36]:
dfYoutube.filter('channel_title LIKE "%VEVO%" AND views > 80e6').limit(3)

video_id,trending_date,channel_title,category_id,publish_time,views,likes,dislikes,comment_count,comments_disabled,ratings_disabled
TyHvyGVs42U,2017-11-24,LuisFonsiVEVO,10,2017-11-17T05:00:...,80605857,2173715,104121,122511,False,False
TyHvyGVs42U,2017-11-25,LuisFonsiVEVO,10,2017-11-17T05:00:...,91552137,2277789,110752,128071,False,False
TyHvyGVs42U,2017-11-26,LuisFonsiVEVO,10,2017-11-17T05:00:...,102012605,2376636,117196,134224,False,False


In [37]:
dfYoutube.filter(F.col('video_id').isNull()).limit(3)

video_id,trending_date,channel_title,category_id,publish_time,views,likes,dislikes,comment_count,comments_disabled,ratings_disabled
,2017-11-14,Animal Adventure ...,15,2017-11-12T00:18:...,45455,2282,35,17,False,False
,2017-11-15,Animal Adventure ...,15,2017-11-12T00:18:...,45965,2284,35,17,False,False
,2017-11-16,Mental Floss,27,2017-11-15T16:00:...,21740,979,5,93,False,False


#### Sorting

In [38]:
dfYoutube.sort(
    F.col('trending_date').cast('date').asc(),
    F.col('views').cast('int').asc()
).limit(5)

video_id,trending_date,channel_title,category_id,publish_time,views,likes,dislikes,comment_count,comments_disabled,ratings_disabled
qg0GdM60syI,2017-11-14,90s Commercials,27,2017-03-31T21:46:...,773,2,0,0,False,False
NZFhMSgbKKM,2017-11-14,Ben Rohrbach,17,2017-11-13T15:11:...,945,7,5,8,False,False
4d07RXYLsJE,2017-11-14,Jenny Hanell,28,2017-11-04T20:48:...,1827,3,0,2,False,False
wRGldR_SQAA,2017-11-14,Steve Kovach,22,2017-11-09T18:01:...,2259,0,0,0,False,False
zNqCVTs38nU,2017-11-14,D3sports.com,17,2017-11-13T02:15:...,4569,35,18,19,False,False


#### Unique values

In [39]:
dfYoutube.select('comments_disabled', 'ratings_disabled').distinct()

comments_disabled,ratings_disabled
False,True
True,True
True,False
False,False


In [40]:
dfYoutube.select(F.countDistinct('comments_disabled', 'ratings_disabled'))

"count(DISTINCT comments_disabled, ratings_disabled)"
4


#### Data aggregation

In [41]:
dfYoutube.limit(5)

video_id,trending_date,channel_title,category_id,publish_time,views,likes,dislikes,comment_count,comments_disabled,ratings_disabled
2kyS6SvSYSE,2017-11-14,CaseyNeistat,22,2017-11-13T17:13:...,748374,57527,2966,15954,False,False
1ZAPwfrtAFY,2017-11-14,LastWeekTonight,24,2017-11-13T07:30:...,2418783,97185,6146,12703,False,False
5qpjK5DgCt4,2017-11-14,Rudy Mancuso,23,2017-11-12T19:05:...,3191434,146033,5339,8181,False,False
puqaWrEC7tY,2017-11-14,Good Mythical Mor...,24,2017-11-13T11:00:...,343168,10172,666,2146,False,False
d380meD0W0M,2017-11-14,nigahiga,24,2017-11-12T18:01:...,2095731,132235,1989,17518,False,False


In [42]:
columns = ['views', 'likes', 'dislikes', 'comment_count']
for c in columns:
    youtube = dfYoutube.withColumn(c, F.col(c).cast('int'))

In [43]:
dfYoutube.groupby('category_id').count().limit(5)

category_id,count
15,199
29,15
22,798
28,546
43,10


In [44]:
dfYoutube.groupby('category_id').agg(
    F.mean('likes'), F.sum('likes'), F.max('likes'),
    F.stddev('views'), F.variance('views'), F.skewness('views'), F.kurtosis('views')
).limit(5)

category_id,avg(likes),sum(likes),max(likes),stddev_samp(views),var_samp(views),skewness(views),kurtosis(views)
15,20438.45226130653,4067252.0,9939,1053910.929171636,1.110728246627421...,3.743903644091936,14.581943331932449
29,133141.86666666667,1997128.0,986,6267908.628900739,3.928667858024835E13,3.4743735320694227,10.071331820156791
22,32485.075187969924,25923090.0,9857,1536250.770096244,2.360066428621303E12,4.172345303282912,21.43860099370683
28,25856.66483516484,14117739.0,99783,1611070.241800707,2.595547324015788...,4.020001503753627,18.47179940851185
43,4410.8,44108.0,4739,21735.7763506774,4.724439735666669E8,-1.0559781645594817,0.1217798216966752


In [45]:
dfYoutube.groupby('category_id').agg({
    'likes': 'mean', 'likes': 'sum', 'likes': 'max',
    'views': 'stddev', 'views': 'variance', 'views': 'skewness', 'views': 'kurtosis'
}).limit(5)

category_id,kurtosis(views),max(likes)
15,14.581943331932449,9939
29,10.071331820156791,986
22,21.43860099370683,9857
28,18.47179940851185,99783
43,0.1217798216966752,4739


In [46]:
dfYoutube\
    .groupby('comments_disabled', 'ratings_disabled')\
    .pivot('category_id')\
    .count()

comments_disabled,ratings_disabled,1,10,15,17,19,2,20,22,23,24,25,26,27,28,29,43
False,True,,,,,,7.0,,10.0,,,,,,,,
True,True,7.0,8.0,,7.0,,,,,,4.0,1.0,,,1.0,4.0,
True,False,,,,,,,7.0,24.0,2.0,43.0,57.0,,5.0,21.0,,
False,False,533.0,1606.0,199.0,475.0,98.0,102.0,78.0,764.0,925.0,2362.0,726.0,913.0,347.0,524.0,11.0,10.0
