In [1]:
#Importation
import pandas as pd
import numpy as np

from pyspark.sql import SparkSession
from pyspark.sql import functions as F

#Intanciation
spark = SparkSession.builder.master("local").appName("FolksDF").getOrCreate()

In [2]:
#Importation des fichiers
Walmart = spark.read.option("header", 'true').option("delimiter", ",").csv("walmart_stock.csv")
Walmart.show()

+----------+------------------+------------------+------------------+------------------+--------+------------------+
|      Date|              Open|              High|               Low|             Close|  Volume|         Adj Close|
+----------+------------------+------------------+------------------+------------------+--------+------------------+
|2012-01-03|         59.970001|         61.060001|         59.869999|         60.330002|12668800|52.619234999999996|
|2012-01-04|60.209998999999996|         60.349998|         59.470001|59.709998999999996| 9593300|         52.078475|
|2012-01-05|         59.349998|         59.619999|         58.369999|         59.419998|12768200|         51.825539|
|2012-01-06|         59.419998|         59.450001|         58.869999|              59.0| 8069400|          51.45922|
|2012-01-09|         59.029999|         59.549999|         58.919998|             59.18| 6679300|51.616215000000004|
|2012-01-10|             59.43|59.709998999999996|             5

In [3]:
#Afficher les colonnes
Walmart.columns

['Date', 'Open', 'High', 'Low', 'Close', 'Volume', 'Adj Close']

In [4]:
#Le schéma
Walmart.printSchema()
#nullable = true : autorise les valeurs nulles

root
 |-- Date: string (nullable = true)
 |-- Open: string (nullable = true)
 |-- High: string (nullable = true)
 |-- Low: string (nullable = true)
 |-- Close: string (nullable = true)
 |-- Volume: string (nullable = true)
 |-- Adj Close: string (nullable = true)



In [10]:
#Créer une nvlle colonne HV_Ratio, division de High_Price par Volume
Walmart2 = Walmart.withColumn("HV_Ratio", F.col("High")/F.col("Volume"))
Walmart2.show(5)

+----------+------------------+---------+---------+------------------+--------+------------------+--------------------+
|      Date|              Open|     High|      Low|             Close|  Volume|         Adj Close|            HV_Ratio|
+----------+------------------+---------+---------+------------------+--------+------------------+--------------------+
|2012-01-03|         59.970001|61.060001|59.869999|         60.330002|12668800|52.619234999999996|4.819714653321546E-6|
|2012-01-04|60.209998999999996|60.349998|59.470001|59.709998999999996| 9593300|         52.078475|6.290848613094555E-6|
|2012-01-05|         59.349998|59.619999|58.369999|         59.419998|12768200|         51.825539|4.669412994783916E-6|
|2012-01-06|         59.419998|59.450001|58.869999|              59.0| 8069400|          51.45922|7.367338463826307E-6|
|2012-01-09|         59.029999|59.549999|58.919998|             59.18| 6679300|51.616215000000004|8.915604778943901E-6|
+----------+------------------+---------

In [15]:
#Quel jour a eu le Peak High in "Price" ?
Walmart2.createOrReplaceTempView("WalmartSQL")
#transformation du dataframe en table !

#spark.sql("""select Date from WalmartSQL order by High desc""").first() #solution 1
spark.sql("""select Date from WalmartSQL order by High desc limit 1""").show() #solution 2

+----------+
|      Date|
+----------+
|2015-01-13|
+----------+



In [18]:
#DSL en Spark
Walmart2.select(F.col("Date")) \
        .orderBy(F.col("High").desc()) \
        .head()

Row(Date='2015-01-13')

In [19]:
#Moyenne de la colonne Close
#SQL
spark.sql("""select mean(Close) as Moyenne from WalmartSQL""").show()

+--------------------------+
|avg(CAST(Close AS DOUBLE))|
+--------------------------+
|         72.38844998012726|
+--------------------------+



In [20]:
#DSL
Walmart2.select("Close") \
        .summary("mean") \
        .show()

+-------+-----------------+
|summary|            Close|
+-------+-----------------+
|   mean|72.38844998012726|
+-------+-----------------+



In [21]:
#2eme solution
Walmart2.agg(F.mean("Close").alias("Moyenne")).show()

+-----------------+
|          Moyenne|
+-----------------+
|72.38844998012726|
+-----------------+



In [22]:
#Quelles sont les valeurs max et min de Colonne ?
#SQL
spark.sql("""select max(Volume), min(Volume) from WalmartSQL""").show()

+-----------+-----------+
|max(Volume)|min(Volume)|
+-----------+-----------+
|    9994400|   10010500|
+-----------+-----------+



In [25]:
#DSL
Walmart2.agg(F.max("Volume"), F.min("Volume")) \
        .show()

+-----------+-----------+
|max(Volume)|min(Volume)|
+-----------+-----------+
|    9994400|   10010500|
+-----------+-----------+



In [26]:
#Combien de jours la val de Close était < à 60$ ?
#SQL
spark.sql("""select count(Date) from WalmartSQL where Close < 60""").show()

+-----------+
|count(Date)|
+-----------+
|         81|
+-----------+



In [27]:
#DSL
Walmart2.filter(F.col("Close") < '60') \
        .agg(F.count("Date")) \
        .show()
#faire l'aggregation avant le filter !

+-----------+
|count(Date)|
+-----------+
|         81|
+-----------+



In [28]:
#solution 2
Walmart2.filter(F.col("Close") < '60') \
        .count()

81

In [30]:
#Quel est le % de fois ou la val de High était > a 80$ ?
spark.sql("""select round(
                (select count(*) from WalmartSQL
                where High>80)
                /
                (count(*))
                * 100
                , 2
            )
as Percentage 
from WalmartSQL
""").show()


+----------+
|Percentage|
+----------+
|      8.43|
+----------+



In [33]:
#DSL
Temp = Walmart2.filter(F.col("High") > '80') \
        .agg(F.count("*").alias("Comptage")) \
        .collect()[0][0]

Walmart2.agg(F.round((Temp/F.count("*")*100), 2).alias("Pourcentage")) \
        .show()

+-----------+
|Pourcentage|
+-----------+
|       9.14|
+-----------+



In [34]:
#Quelle est la val max de High par année ?
#SQL
spark.sql("""select max(High), substr(Date, 1, 4) as Annee from WalmartSQL group by Annee""").show()

+---------+-----+
|max(High)|Annee|
+---------+-----+
|75.190002| 2016|
|77.599998| 2012|
|88.089996| 2014|
|81.370003| 2013|
|90.970001| 2015|
+---------+-----+



In [36]:
#DSL
#Partie 1 : creer une nvlle var Annee qui extrait l'annee depuis la var Date
Walmart3 = Walmart2.withColumn("Annee", F.substring("Date", 1, 4))
Walmart3.select("Date", "Annee").show(4)
#Partie 2 : pour chq annee , trouver le max du prix correspondant
Walmart3.groupBy(F.col("Annee")) \
        .agg(F.max("High")) \
        .show()

+----------+-----+
|      Date|Annee|
+----------+-----+
|2012-01-03| 2012|
|2012-01-04| 2012|
|2012-01-05| 2012|
|2012-01-06| 2012|
+----------+-----+
only showing top 4 rows

+-----+---------+
|Annee|max(High)|
+-----+---------+
| 2016|75.190002|
| 2012|77.599998|
| 2014|88.089996|
| 2013|81.370003|
| 2015|90.970001|
+-----+---------+

