In [1]:
# findspark is initialised before pyspark is imported; keep this order.
import findspark
findspark.init()
import pyspark
from pyspark.sql import SparkSession
# Reuse the active SparkSession if there is one, otherwise build a new one.
spark = SparkSession.builder.getOrCreate()
# Use 5 partitions for shuffles triggered by DataFrame/SQL operations in this session.
spark.conf.set("spark.sql.shuffle.partitions", "5")
print("SparkSession created with name as 'spark'")
# spark.stop()  # uncomment to shut the session down when finished

SparkSession created with name as 'spark'


In [2]:
# Read the stock-price CSV: the first row supplies column names and the
# column types are inferred from the data.
APPL_STOCK_CSV = '../../data/jose_portilla/Spark_DataFrames/appl_stock.csv'
df = spark.read.csv(APPL_STOCK_CSV, header=True, inferSchema=True)

In [3]:
# Show the column names and the types inferred while reading the CSV.
df.printSchema()

root
 |-- Date: string (nullable = true)
 |-- Open: double (nullable = true)
 |-- High: double (nullable = true)
 |-- Low: double (nullable = true)
 |-- Close: double (nullable = true)
 |-- Volume: integer (nullable = true)
 |-- Adj Close: double (nullable = true)



#### Filter

In [4]:
# Filter with a SQL-style string predicate, then preview two matching rows.
closes_below_500 = df.filter("Close<500")
closes_below_500.show(2)

+----------+----------+----------+------------------+----------+---------+------------------+
|      Date|      Open|      High|               Low|     Close|   Volume|         Adj Close|
+----------+----------+----------+------------------+----------+---------+------------------+
|2010-01-04|213.429998|214.499996|212.38000099999996|214.009998|123432400|         27.727039|
|2010-01-05|214.599998|215.589994|        213.249994|214.379993|150476200|27.774976000000002|
+----------+----------+----------+------------------+----------+---------+------------------+
only showing top 2 rows



In [5]:
# Same string predicate, but keep only the open and close columns.
below_500 = df.filter('Close<500')
below_500.select('open', 'close').show(2)

+----------+----------+
|      open|     close|
+----------+----------+
|213.429998|214.009998|
|214.599998|214.379993|
+----------+----------+
only showing top 2 rows



In [6]:
# Column-expression predicate instead of a SQL string; select() is given
# the column names as a list here.
close_under_500 = df['close'] < 500
df.filter(close_under_500).select(['open', 'close']).show(2)

+----------+----------+
|      open|     close|
+----------+----------+
|213.429998|214.009998|
|214.599998|214.379993|
+----------+----------+
only showing top 2 rows



In [7]:
# Rows that opened below 200 and closed above 200. Column predicates are
# combined with & (not `and`).
opened_below_200 = df['Open'] < 200
closed_above_200 = df['close'] > 200
df.filter(opened_below_200 & closed_above_200).show(2)

+----------+----------+------------------+----------+------------------+---------+------------------+
|      Date|      Open|              High|       Low|             Close|   Volume|         Adj Close|
+----------+----------+------------------+----------+------------------+---------+------------------+
|2010-02-12|198.109995|        201.639996|195.500002|200.37999299999998|163867200|25.961142000000002|
|2010-02-24|198.229998|201.44000400000002|197.840002|            200.66|115141600|         25.997419|
+----------+----------+------------------+----------+------------------+---------+------------------+
only showing top 2 rows



In [8]:
from pyspark.sql.functions import expr, col, column

# col() names a column without going through the DataFrame object.
close_price = col('Close')
df.filter((close_price < 500) & (close_price > 200)).show(2)

+----------+----------+----------+------------------+----------+---------+------------------+
|      Date|      Open|      High|               Low|     Close|   Volume|         Adj Close|
+----------+----------+----------+------------------+----------+---------+------------------+
|2010-01-04|213.429998|214.499996|212.38000099999996|214.009998|123432400|         27.727039|
|2010-01-05|214.599998|215.589994|        213.249994|214.379993|150476200|27.774976000000002|
+----------+----------+----------+------------------+----------+---------+------------------+
only showing top 2 rows



In [9]:
# collect() returns the matching rows as a Python list of Row objects.
close_between_498_and_500 = (col('Close') < 500) & (col('Close') > 498)
result = df.filter(close_between_498_and_500).collect()
result  # List of Rows

[Row(Date='2013-08-14', Open=497.88002, High=504.249992, Low=493.40002400000003, Close=498.500008, Volume=189093100, Adj Close=66.408129),
 Row(Date='2013-09-04', Open=499.560005, High=502.240013, Low=496.279984, Close=498.690025, Volume=86258200, Adj Close=66.43344300000001),
 Row(Date='2013-09-06', Open=498.43998, High=499.379974, Low=489.950012, Close=498.22000099999997, Volume=89881400, Adj Close=66.370828),
 Row(Date='2013-10-15', Open=497.510025, High=502.000008, Low=495.52002000000005, Close=498.679985, Volume=80018400, Adj Close=66.432105),
 Row(Date='2014-01-30', Open=502.539993, High=506.49997699999994, Low=496.70002, Close=499.779984, Volume=169625400, Adj Close=66.967353)]

In [10]:
# Take the first Row from the collected list; the bare name below displays it.
row=result[0] # Row
row

Row(Date='2013-08-14', Open=497.88002, High=504.249992, Low=493.40002400000003, Close=498.500008, Volume=189093100, Adj Close=66.408129)

In [11]:
# A Row can be indexed by field name directly; this yields the same value
# as row.asDict()['Volume'].
row['Volume']

189093100

#### Aggregation

In [12]:
# Read the sales CSV: header row supplies column names, types are inferred.
SALES_INFO_CSV = '../../data/jose_portilla/Spark_DataFrames/sales_info.csv'
df2 = spark.read.csv(SALES_INFO_CSV, header=True, inferSchema=True)

In [13]:
# Preview the first two rows of the sales data.
df2.show(2)

+-------+-------+-----+
|Company| Person|Sales|
+-------+-------+-----+
|   GOOG|    Sam|200.0|
|   GOOG|Charlie|120.0|
+-------+-------+-----+
only showing top 2 rows



In [14]:
# mean() with no arguments averages the numeric columns within each company.
sales_by_company = df2.groupBy('company')
sales_by_company.mean().show()

+-------+-----------------+
|company|       avg(Sales)|
+-------+-----------------+
|   GOOG|            220.0|
|   MSFT|322.3333333333333|
|     FB|            610.0|
|   APPL|            370.0|
+-------+-----------------+



In [15]:
# sum() with no arguments aggregates numeric columns only. Sales is the only
# numeric column here, so it is the only one that appears in the result.
company_groups = df2.groupBy('company')
company_groups.sum().show()

+-------+----------+
|company|sum(Sales)|
+-------+----------+
|   GOOG|     660.0|
|   MSFT|     967.0|
|     FB|    1220.0|
|   APPL|    1480.0|
+-------+----------+



In [18]:
# Sum every numeric column per Date. Row order after a groupBy is not
# guaranteed, so sort by Date to make the five previewed rows deterministic.
df.groupBy('Date').sum().orderBy('Date').show(5)

+----------+----------+----------+------------------+------------------+-----------+------------------+
|      Date| sum(Open)| sum(High)|          sum(Low)|        sum(Close)|sum(Volume)|    sum(Adj Close)|
+----------+----------+----------+------------------+------------------+-----------+------------------+
|2010-01-04|213.429998|214.499996|212.38000099999996|        214.009998|  123432400|         27.727039|
|2010-01-05|214.599998|215.589994|        213.249994|        214.379993|  150476200|27.774976000000002|
|2010-01-06|214.379993|    215.23|        210.750004|        210.969995|  138040000|27.333178000000004|
|2010-01-07|    211.75|212.000006|        209.050005|            210.58|  119282800|          27.28265|
|2010-01-08|210.299994|212.000006|209.06000500000002|211.98000499999998|  111902700|         27.464034|
+----------+----------+----------+------------------+------------------+-----------+------------------+
only showing top 5 rows



In [16]:
# Aggregate over the whole DataFrame (no grouping): the {column: function}
# dict requests the max of sales.
max_sales_spec = {'sales': 'max'}
df2.agg(max_sales_spec).show()

+----------+
|max(sales)|
+----------+
|     870.0|
+----------+



In [24]:
# Per-date maximum of open, using the {column: function} dict form of agg().
# groupBy output order is not guaranteed, so sort by date to keep the
# five-row preview reproducible.
df.groupBy('date').agg({'open': 'max'}).orderBy('date').show(5)

+----------+----------+
|      date| max(open)|
+----------+----------+
|2010-01-04|213.429998|
|2010-01-05|214.599998|
|2010-01-06|214.379993|
|2010-01-07|    211.75|
|2010-01-08|210.299994|
+----------+----------+
only showing top 5 rows



In [36]:
from pyspark.sql.functions import countDistinct, avg, stddev, format_number

# Standard deviation of sales; the result column is headed stddev_samp(sales).
sales_stddev = stddev('sales')
df2.select(sales_stddev).show()

+------------------+
|stddev_samp(sales)|
+------------------+
|250.08742410799007|
+------------------+



In [44]:
# alias() gives the result column a short name instead of the generated one.
std_col = stddev('sales').alias('std')
df2.select(std_col).show()

+------------------+
|               std|
+------------------+
|250.08742410799007|
+------------------+



In [43]:
# Two-step version: compute the standard deviation, then format it to two
# decimals. The intermediate column is aliased explicitly so the second
# select does not depend on the auto-generated 'stddev_samp(sales)' name.
sales_std = df2.select(stddev('sales').alias('sales_std'))
sales_std.select(format_number('sales_std', 2).alias('std')).show()

+------+
|   std|
+------+
|250.09|
+------+



In [42]:
# Single select: format the standard deviation to two decimals and name the
# column in one expression.
formatted_std = format_number(stddev('sales'), 2).alias('std')
df2.select(formatted_std).show()

+------+
|   std|
+------+
|250.09|
+------+



In [55]:
from pyspark.sql.functions import desc, expr

# orderBy() on a column name sorts ascending, so the two smallest sales show.
sales_ascending = df2.orderBy("sales")
sales_ascending.show(2)

+-------+-------+-----+
|Company| Person|Sales|
+-------+-------+-----+
|   GOOG|Charlie|120.0|
|   MSFT|    Amy|124.0|
+-------+-------+-----+
only showing top 2 rows



In [58]:
# Two equivalent ways to sort descending: a column taken from the DataFrame
# and a free-standing col() reference, each with .desc().
for sales_desc in (df2['sales'].desc(), col('sales').desc()):
    df2.orderBy(sales_desc).show(2)

+-------+------+-----+
|Company|Person|Sales|
+-------+------+-----+
|     FB|  Carl|870.0|
|   APPL|  Mike|750.0|
+-------+------+-----+
only showing top 2 rows

+-------+------+-----+
|Company|Person|Sales|
+-------+------+-----+
|     FB|  Carl|870.0|
|   APPL|  Mike|750.0|
+-------+------+-----+
only showing top 2 rows

