In [20]:
pip install pyspark

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


### Rank, Dense Rank and Percent Rank

In [21]:
from pyspark.sql import SparkSession
import pyspark.sql.functions as F
from pyspark.sql.functions import unix_timestamp, from_unixtime

In [16]:
from google.colab import files
uploads = files.upload()

Saving WMT.csv to WMT.csv


In [22]:
spark = SparkSession.builder.getOrCreate()

In [23]:
df = spark.read.csv('/content/WMT.csv', inferSchema=True, header = True)

In [24]:
df.show()

+-------------------+---------+---------+---------+---------+---------+--------+
|               Date|     Open|     High|      Low|    Close|Adj Close|  Volume|
+-------------------+---------+---------+---------+---------+---------+--------+
|2016-01-20 00:00:00|61.799999|62.330002|60.200001|    60.84|53.990601|17369100|
|2016-01-21 00:00:00|    60.98|62.790001|    60.91|61.880001|54.913509|12089200|
|2016-01-22 00:00:00|62.439999|63.259998|62.130001|62.689999|55.632324| 9197500|
|2016-01-25 00:00:00|62.779999|    63.82|62.549999|63.450001|56.306763|12823400|
|2016-01-26 00:00:00|63.360001|64.470001|63.259998|     64.0|56.794834| 9441200|
|2016-01-27 00:00:00|64.099998|    65.18|63.889999|63.950001|56.750477|10214300|
|2016-01-28 00:00:00|64.029999|64.510002|    63.43|64.220001| 56.99007|11278300|
|2016-01-29 00:00:00|    64.75|66.529999|64.739998|66.360001|58.889149|16439100|
|2016-02-01 00:00:00|65.910004|    67.93|65.889999|     67.5| 59.90081|14728400|
|2016-02-02 00:00:00|67.3000

In [25]:
df.printSchema()

root
 |-- Date: timestamp (nullable = true)
 |-- Open: double (nullable = true)
 |-- High: double (nullable = true)
 |-- Low: double (nullable = true)
 |-- Close: double (nullable = true)
 |-- Adj Close: double (nullable = true)
 |-- Volume: integer (nullable = true)



In [29]:
df.withColumn('today_date', F.current_date()).show()

+-------------------+---------+---------+---------+---------+---------+--------+----------+
|               Date|     Open|     High|      Low|    Close|Adj Close|  Volume|today_date|
+-------------------+---------+---------+---------+---------+---------+--------+----------+
|2016-01-20 00:00:00|61.799999|62.330002|60.200001|    60.84|53.990601|17369100|2022-12-18|
|2016-01-21 00:00:00|    60.98|62.790001|    60.91|61.880001|54.913509|12089200|2022-12-18|
|2016-01-22 00:00:00|62.439999|63.259998|62.130001|62.689999|55.632324| 9197500|2022-12-18|
|2016-01-25 00:00:00|62.779999|    63.82|62.549999|63.450001|56.306763|12823400|2022-12-18|
|2016-01-26 00:00:00|63.360001|64.470001|63.259998|     64.0|56.794834| 9441200|2022-12-18|
|2016-01-27 00:00:00|64.099998|    65.18|63.889999|63.950001|56.750477|10214300|2022-12-18|
|2016-01-28 00:00:00|64.029999|64.510002|    63.43|64.220001| 56.99007|11278300|2022-12-18|
|2016-01-29 00:00:00|    64.75|66.529999|64.739998|66.360001|58.889149|16439100|

In [30]:
df.withColumn('today_date_time', F.current_timestamp()).show()

+-------------------+---------+---------+---------+---------+---------+--------+--------------------+
|               Date|     Open|     High|      Low|    Close|Adj Close|  Volume|     today_date_time|
+-------------------+---------+---------+---------+---------+---------+--------+--------------------+
|2016-01-20 00:00:00|61.799999|62.330002|60.200001|    60.84|53.990601|17369100|2022-12-18 17:39:...|
|2016-01-21 00:00:00|    60.98|62.790001|    60.91|61.880001|54.913509|12089200|2022-12-18 17:39:...|
|2016-01-22 00:00:00|62.439999|63.259998|62.130001|62.689999|55.632324| 9197500|2022-12-18 17:39:...|
|2016-01-25 00:00:00|62.779999|    63.82|62.549999|63.450001|56.306763|12823400|2022-12-18 17:39:...|
|2016-01-26 00:00:00|63.360001|64.470001|63.259998|     64.0|56.794834| 9441200|2022-12-18 17:39:...|
|2016-01-27 00:00:00|64.099998|    65.18|63.889999|63.950001|56.750477|10214300|2022-12-18 17:39:...|
|2016-01-28 00:00:00|64.029999|64.510002|    63.43|64.220001| 56.99007|11278300|20

In [31]:
df.printSchema()

root
 |-- Date: timestamp (nullable = true)
 |-- Open: double (nullable = true)
 |-- High: double (nullable = true)
 |-- Low: double (nullable = true)
 |-- Close: double (nullable = true)
 |-- Adj Close: double (nullable = true)
 |-- Volume: integer (nullable = true)



In [33]:
df2 = df.select('Date', from_unixtime(unix_timestamp('Date','dd-MM-yyyy')).alias('new_date'))

In [34]:
df2.show()

+-------------------+-------------------+
|               Date|           new_date|
+-------------------+-------------------+
|2016-01-20 00:00:00|2016-01-20 00:00:00|
|2016-01-21 00:00:00|2016-01-21 00:00:00|
|2016-01-22 00:00:00|2016-01-22 00:00:00|
|2016-01-25 00:00:00|2016-01-25 00:00:00|
|2016-01-26 00:00:00|2016-01-26 00:00:00|
|2016-01-27 00:00:00|2016-01-27 00:00:00|
|2016-01-28 00:00:00|2016-01-28 00:00:00|
|2016-01-29 00:00:00|2016-01-29 00:00:00|
|2016-02-01 00:00:00|2016-02-01 00:00:00|
|2016-02-02 00:00:00|2016-02-02 00:00:00|
|2016-02-03 00:00:00|2016-02-03 00:00:00|
|2016-02-04 00:00:00|2016-02-04 00:00:00|
|2016-02-05 00:00:00|2016-02-05 00:00:00|
|2016-02-08 00:00:00|2016-02-08 00:00:00|
|2016-02-09 00:00:00|2016-02-09 00:00:00|
|2016-02-10 00:00:00|2016-02-10 00:00:00|
|2016-02-11 00:00:00|2016-02-11 00:00:00|
|2016-02-12 00:00:00|2016-02-12 00:00:00|
|2016-02-16 00:00:00|2016-02-16 00:00:00|
|2016-02-17 00:00:00|2016-02-17 00:00:00|
+-------------------+-------------

In [36]:
# date_add
# generateing new dates with a offset.
df2.select(F.date_add(df2.new_date,5).alias('next_date')).show()

+----------+
| next_date|
+----------+
|2016-01-25|
|2016-01-26|
|2016-01-27|
|2016-01-30|
|2016-01-31|
|2016-02-01|
|2016-02-02|
|2016-02-03|
|2016-02-06|
|2016-02-07|
|2016-02-08|
|2016-02-09|
|2016-02-10|
|2016-02-13|
|2016-02-14|
|2016-02-15|
|2016-02-16|
|2016-02-17|
|2016-02-21|
|2016-02-22|
+----------+
only showing top 20 rows



In [37]:
# data_sub , to get previous date
df2.select(F.date_sub(df2.new_date,5).alias('previous_date')).show()

+-------------+
|previous_date|
+-------------+
|   2016-01-15|
|   2016-01-16|
|   2016-01-17|
|   2016-01-20|
|   2016-01-21|
|   2016-01-22|
|   2016-01-23|
|   2016-01-24|
|   2016-01-27|
|   2016-01-28|
|   2016-01-29|
|   2016-01-30|
|   2016-01-31|
|   2016-02-03|
|   2016-02-04|
|   2016-02-05|
|   2016-02-06|
|   2016-02-07|
|   2016-02-11|
|   2016-02-12|
+-------------+
only showing top 20 rows



In [39]:
# date_format
df2.select(F.date_format('new_date','MM/dd/yyyy').alias('new_format_date')).show()

+---------------+
|new_format_date|
+---------------+
|     01/20/2016|
|     01/21/2016|
|     01/22/2016|
|     01/25/2016|
|     01/26/2016|
|     01/27/2016|
|     01/28/2016|
|     01/29/2016|
|     02/01/2016|
|     02/02/2016|
|     02/03/2016|
|     02/04/2016|
|     02/05/2016|
|     02/08/2016|
|     02/09/2016|
|     02/10/2016|
|     02/11/2016|
|     02/12/2016|
|     02/16/2016|
|     02/17/2016|
+---------------+
only showing top 20 rows



In [42]:
df2= df2.withColumn('current_date', F.current_timestamp())
df2.show()

+-------------------+-------------------+--------------------+--------------------+
|               Date|           new_date|        curretn_date|        current_date|
+-------------------+-------------------+--------------------+--------------------+
|2016-01-20 00:00:00|2016-01-20 00:00:00|2022-12-18 17:47:...|2022-12-18 17:47:...|
|2016-01-21 00:00:00|2016-01-21 00:00:00|2022-12-18 17:47:...|2022-12-18 17:47:...|
|2016-01-22 00:00:00|2016-01-22 00:00:00|2022-12-18 17:47:...|2022-12-18 17:47:...|
|2016-01-25 00:00:00|2016-01-25 00:00:00|2022-12-18 17:47:...|2022-12-18 17:47:...|
|2016-01-26 00:00:00|2016-01-26 00:00:00|2022-12-18 17:47:...|2022-12-18 17:47:...|
|2016-01-27 00:00:00|2016-01-27 00:00:00|2022-12-18 17:47:...|2022-12-18 17:47:...|
|2016-01-28 00:00:00|2016-01-28 00:00:00|2022-12-18 17:47:...|2022-12-18 17:47:...|
|2016-01-29 00:00:00|2016-01-29 00:00:00|2022-12-18 17:47:...|2022-12-18 17:47:...|
|2016-02-01 00:00:00|2016-02-01 00:00:00|2022-12-18 17:47:...|2022-12-18 17:

In [43]:
# difference in the dates
df2.select(F.datediff(df2.current_date, df2.new_date)).alias('date_diff').show()

+--------------------------------+
|datediff(current_date, new_date)|
+--------------------------------+
|                            2524|
|                            2523|
|                            2522|
|                            2519|
|                            2518|
|                            2517|
|                            2516|
|                            2515|
|                            2512|
|                            2511|
|                            2510|
|                            2509|
|                            2508|
|                            2505|
|                            2504|
|                            2503|
|                            2502|
|                            2501|
|                            2497|
|                            2496|
+--------------------------------+
only showing top 20 rows



In [46]:
df2.select(F.last_day(df2.new_date).alias('last_day')).show()

+----------+
|  last_day|
+----------+
|2016-01-31|
|2016-01-31|
|2016-01-31|
|2016-01-31|
|2016-01-31|
|2016-01-31|
|2016-01-31|
|2016-01-31|
|2016-02-29|
|2016-02-29|
|2016-02-29|
|2016-02-29|
|2016-02-29|
|2016-02-29|
|2016-02-29|
|2016-02-29|
|2016-02-29|
|2016-02-29|
|2016-02-29|
|2016-02-29|
+----------+
only showing top 20 rows

