In [1]:
import findspark
findspark.init()
import pyspark
from pyspark.sql import SparkSession
# Reuse the active session if one already exists; otherwise start a new one.
session_builder = SparkSession.builder
spark = session_builder.getOrCreate()
# Use 5 shuffle partitions instead of Spark's default of 200.
spark.conf.set("spark.sql.shuffle.partitions", "5")
print("SparkSession created with name as 'spark'")
# Call spark.stop() once the notebook is finished to shut the session down.

SparkSession created with name as 'spark'


In [2]:
# Load the sample file that contains nulls: the first row supplies the column
# names and Spark infers each column's type from the data.
null_csv_path = '../../data/jose_portilla/Spark_DataFrames/ContainsNull.csv'
df = spark.read.csv(null_csv_path, header=True, inferSchema=True)
df.show()

+----+-----+-----+
|  Id| Name|Sales|
+----+-----+-----+
|emp1| John| null|
|emp2| null| null|
|emp3| null|345.0|
|emp4|Cindy|456.0|
+----+-----+-----+



## Drop and Fill

### Drop the missing data

You can use the .na functions for missing data. The drop command has the following parameters:

    df.na.drop(how='any', thresh=None, subset=None)
    
    * param how: 'any' or 'all'.    
        If 'any', drop a row if it contains any nulls.
        If 'all', drop a row only if all its values are null.    
    * param thresh: int, default None    
        If specified, drop rows that have fewer than `thresh` non-null values.
        This overrides the `how` parameter.
    * param subset: 
        optional list of column names to consider.

In [3]:
# With no arguments, drop() removes every row that contains at least one null.
complete_rows = df.na.drop()
complete_rows.show()

+----+-----+-----+
|  Id| Name|Sales|
+----+-----+-----+
|emp4|Cindy|456.0|
+----+-----+-----+



In [4]:
# Keep only the rows that have at least this many non-null values.
min_non_null = 2
df.na.drop(thresh=min_non_null).show()

+----+-----+-----+
|  Id| Name|Sales|
+----+-----+-----+
|emp1| John| null|
|emp3| null|345.0|
|emp4|Cindy|456.0|
+----+-----+-----+



In [5]:
# Only a null in the Sales column causes a row to be dropped.
sales_present = df.na.drop(subset=["Sales"])
sales_present.show()

+----+-----+-----+
|  Id| Name|Sales|
+----+-----+-----+
|emp3| null|345.0|
|emp4|Cindy|456.0|
+----+-----+-----+



In [6]:
# Explicit form of the default mode: any null in a row drops that row.
drop_mode = 'any'
df.na.drop(how=drop_mode).show()

+----+-----+-----+
|  Id| Name|Sales|
+----+-----+-----+
|emp4|Cindy|456.0|
+----+-----+-----+



### Fill the missing values

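We can also fill the missing values with new values. When nulls are spread across columns of different data types, Spark applies the fill value only to the columns whose type matches it: a string value fills nulls in string columns only, and a numeric value fills nulls in numeric columns only. For example: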

In [8]:
# A string fill value only replaces nulls in string columns (Name here);
# nulls in the numeric Sales column are left as they are.
string_filled = df.na.fill('Null was here')
string_filled.show()

+----+-------------+-----+
|  Id|         Name|Sales|
+----+-------------+-----+
|emp1|         John| null|
|emp2|Null was here| null|
|emp3|Null was here|345.0|
|emp4|        Cindy|456.0|
+----+-------------+-----+



In [9]:
# A numeric fill value only replaces nulls in numeric columns (Sales here);
# nulls in the string Name column remain.
numeric_filled = df.na.fill(0)
numeric_filled.show()

+----+-----+-----+
|  Id| Name|Sales|
+----+-----+-----+
|emp1| John|  0.0|
|emp2| null|  0.0|
|emp3| null|345.0|
|emp4|Cindy|456.0|
+----+-----+-----+



In [10]:
# Restrict the fill to specific columns with `subset`.
name_filled = df.na.fill('No Name', subset=['Name'])
name_filled.show()

+----+-------+-----+
|  Id|   Name|Sales|
+----+-------+-----+
|emp1|   John| null|
|emp2|No Name| null|
|emp3|No Name|345.0|
|emp4|  Cindy|456.0|
+----+-------+-----+



In [12]:
from pyspark.sql.functions import mean
# Compute the mean of Sales and bring the single result row back to the driver,
# then use that scalar as the fill value for the Sales column only.
mean_row = df.select(mean(df['Sales'])).collect()[0]
mean_sales = mean_row[0]
df.na.fill(mean_sales, ['Sales']).show()

+----+-----+-----+
|  Id| Name|Sales|
+----+-----+-----+
|emp1| John|400.5|
|emp2| null|400.5|
|emp3| null|345.0|
|emp4|Cindy|456.0|
+----+-----+-----+



## Dates and Timestamps

In [13]:
# NOTE: this rebinds `df`; from here on it refers to the stock data, not the
# null-handling sample loaded earlier.
stock_csv_path = "../../data/jose_portilla/Spark_DataFrames/appl_stock.csv"
df = spark.read.csv(stock_csv_path, inferSchema=True, header=True)

In [14]:
# Preview only the first few rows instead of the default page of output.
preview_rows = 4
df.show(preview_rows)

+----------+----------+----------+------------------+----------+---------+------------------+
|      Date|      Open|      High|               Low|     Close|   Volume|         Adj Close|
+----------+----------+----------+------------------+----------+---------+------------------+
|2010-01-04|213.429998|214.499996|212.38000099999996|214.009998|123432400|         27.727039|
|2010-01-05|214.599998|215.589994|        213.249994|214.379993|150476200|27.774976000000002|
|2010-01-06|214.379993|    215.23|        210.750004|210.969995|138040000|27.333178000000004|
|2010-01-07|    211.75|212.000006|        209.050005|    210.58|119282800|          27.28265|
+----------+----------+----------+------------------+----------+---------+------------------+
only showing top 4 rows



In [19]:
from pyspark.sql.functions import col,format_number,dayofmonth,hour,dayofyear,month,year,weekofyear,date_format

In [29]:
# Without an alias the output column is named after the expression itself.
day_default = dayofmonth(df['Date'])
df.select(day_default).show(2)
# Same value, referenced through col() and given a readable name with alias().
day_named = dayofmonth(col('Date')).alias('DayOfMonth')
df.select(day_named).show(2)

+----------------+
|dayofmonth(Date)|
+----------------+
|               4|
|               5|
+----------------+
only showing top 2 rows

+----------+
|DayOfMonth|
+----------+
|         4|
|         5|
+----------+
only showing top 2 rows



In [30]:
# The displayed Date values carry no time-of-day part, and hour() shows 0 for them.
hour_of_date = hour(df['Date'])
df.select(hour_of_date).show(2)

+----------+
|hour(Date)|
+----------+
|         0|
|         0|
+----------+
only showing top 2 rows



In [31]:
# Show the data with an extra Year column derived from Date; the result is only
# displayed here, `df` itself is not reassigned.
year_column = year(df['Date'])
df.withColumn("Year", year_column).show(2)

+----------+----------+----------+------------------+----------+---------+------------------+----+
|      Date|      Open|      High|               Low|     Close|   Volume|         Adj Close|Year|
+----------+----------+----------+------------------+----------+---------+------------------+----+
|2010-01-04|213.429998|214.499996|212.38000099999996|214.009998|123432400|         27.727039|2010|
|2010-01-05|214.599998|215.589994|        213.249994|214.379993|150476200|27.774976000000002|2010|
+----------+----------+----------+------------------+----------+---------+------------------+----+
only showing top 2 rows



In [32]:
# Keep the Year-augmented frame under its own name so later cells can reuse it.
newdf = df.withColumn("Year", year(df['Date']))
# Average the numeric columns per year, then display only the two of interest.
yearly_means = newdf.groupBy("Year").mean()
yearly_means[['avg(Year)', 'avg(Close)']].show()

+---------+------------------+
|avg(Year)|        avg(Close)|
+---------+------------------+
|   2010.0| 259.8424600000002|
|   2011.0|364.00432532142867|
|   2012.0| 576.0497195640002|
|   2013.0| 472.6348802857143|
|   2014.0| 295.4023416507935|
|   2015.0|120.03999980555547|
|   2016.0|104.60400786904763|
+---------+------------------+



In [33]:
# Per-year averages, narrowed to the year and the closing price.
result = newdf.groupBy("Year").mean()[['avg(Year)','avg(Close)']]
# Give the averaged year column a plain name.
result = result.withColumnRenamed("avg(Year)","Year")
# Round the mean closing price to 2 decimals for display.
result = result.select('Year',format_number('avg(Close)',2).alias("Mean Close"))
# show() only prints and returns None, so it is called separately; chaining it
# onto the assignment would leave `result` bound to None instead of the DataFrame.
result.show()

+------+----------+
|  Year|Mean Close|
+------+----------+
|2010.0|    259.84|
|2011.0|    364.00|
|2012.0|    576.05|
|2013.0|    472.63|
|2014.0|    295.40|
|2015.0|    120.04|
|2016.0|    104.60|
+------+----------+



In [6]:
# `df` was rebound to the stock data earlier in the notebook, which has no Name
# column, so reload the null-handling sample under its own name. This keeps the
# cell working when the notebook is run top to bottom on a fresh kernel.
null_df = spark.read.csv('../../data/jose_portilla/Spark_DataFrames/ContainsNull.csv',inferSchema=True,header=True)
# na.replace swaps one existing value for another; it does not touch nulls.
# Name holds nulls rather than empty strings, so the output is unchanged here.
# Use na.fill (as in the cells above) to substitute a value for nulls.
null_df.na.replace([""], ["UNKNOWN"], "Name").show()

+----+-----+-----+
|  Id| Name|Sales|
+----+-----+-----+
|emp1| John| null|
|emp2| null| null|
|emp3| null|345.0|
|emp4|Cindy|456.0|
+----+-----+-----+

