## Bibliotecas

In [34]:
from pyspark.sql import SparkSession
from pyspark.sql.types import *
from pyspark.sql import functions as F

## Criação da Sessão do PySpark

In [4]:
# Build (or reuse) the SparkSession for this notebook under the app name "Treinamento".
spark = (
    SparkSession.builder
    .appName("Treinamento")
    .getOrCreate()
)

## Leitura da Base

In [16]:
# Explicit schema for the spot-price CSV files: (column name, Spark type) in file order.
# Every field is declared nullable.
_spot_fields = [
    ("datetime", TimestampType()),
    ("instance_type", StringType()),
    ("os", StringType()),
    ("region", StringType()),
    ("price", DoubleType()),
]

schema = StructType(
    [StructField(field_name, field_type, True) for field_name, field_type in _spot_fields]
)

In [24]:
%%time
# Read every file matching ../datasets/*.csv with the explicit schema; the files have no header row.
df = spark.read.csv(
    path='../datasets/*.csv',
    schema=schema,
    header=False,
)

CPU times: user 0 ns, sys: 0 ns, total: 0 ns
Wall time: 23.6 ms


In [27]:
# Preview the first five rows of the loaded data.
df.show(n=5)

+-------------------+-------------+----------+---------------+------+
|           datetime|instance_type|        os|         region| price|
+-------------------+-------------+----------+---------------+------+
|2017-05-08 18:46:36|   c3.8xlarge|   Windows|ap-northeast-1a|1.6503|
|2017-05-08 18:46:36|   c3.8xlarge|   Windows|ap-northeast-1c|1.7461|
|2017-05-08 18:46:34|     i3.large|SUSE Linux|ap-northeast-1c|0.1223|
|2017-05-08 18:46:34|     i3.large|Linux/UNIX|ap-northeast-1c|0.0223|
|2017-05-08 18:46:17|   c4.8xlarge|SUSE Linux|ap-northeast-1a| 0.789|
+-------------------+-------------+----------+---------------+------+
only showing top 5 rows



In [28]:
# Total number of rows loaded from the CSV files.
row_total = df.count()
row_total

27410309

In [29]:
# Register the DataFrame as the temporary view "Spots" so it can be queried through spark.sql.
df.createOrReplaceTempView(name="Spots")

## Conhecendo o negócio

In [30]:
# How many distinct instance types are there?
instance_types = df.select(F.col('instance_type')).distinct()
instance_types.count()

68

In [31]:
# Print up to 20 of the distinct instance types.
df.select(F.col('instance_type')).distinct().show(n=20)

+-------------+
|instance_type|
+-------------+
|     c4.large|
|   g2.2xlarge|
|    r4.xlarge|
|   r3.4xlarge|
|     t1.micro|
|    c1.medium|
|    r3.xlarge|
|    m1.xlarge|
|     m1.small|
|   i3.2xlarge|
|   g2.8xlarge|
|   c4.2xlarge|
|  cr1.8xlarge|
|     r4.large|
|   f1.2xlarge|
|    m3.xlarge|
|  p2.16xlarge|
|  i3.16xlarge|
|    p2.xlarge|
|   c4.8xlarge|
+-------------+
only showing top 20 rows



In [32]:
# Which operating systems does AWS offer in this data set?
os_values = df.select(F.col('os')).distinct()
os_values.show()

+----------+
|        os|
+----------+
|Linux/UNIX|
|SUSE Linux|
|   Windows|
+----------+



In [33]:
# Distinct operating systems, this time through Spark SQL on the "Spots" temporary view.
distinct_os_query = """
    SELECT DISTINCT os

    FROM Spots

    """

spark.sql(distinct_os_query).show()

+----------+
|        os|
+----------+
|Linux/UNIX|
|SUSE Linux|
|   Windows|
+----------+



In [38]:
# What are the average, maximum and minimum prices of c3-family instances?
# The family is matched on the "c3." prefix: a substring test such as contains("c3")
# would also select any instance type that merely has "c3" somewhere in its name.
c3_rows = df.filter(F.col('instance_type').startswith('c3.'))

c3_rows.agg(
    F.round(F.avg('price'), 2).alias('price_avg'),  # average rounded to 2 decimals
    F.max('price').alias('price_max'),
    F.min('price').alias('price_min'),
).show()

+---------+---------+---------+
|price_avg|price_max|price_min|
+---------+---------+---------+
|     0.45|    39.28|   0.0156|
+---------+---------+---------+



In [40]:
# Average (rounded to 2 decimals), maximum and minimum price of c3-family instances,
# computed with Spark SQL on the "Spots" view.
# LIKE 'c3.%' anchors the match at the start of instance_type, so only types whose
# family is exactly "c3" are selected; '%c3%' would match "c3" anywhere in the name.
spark.sql(
    """
    SELECT round(avg(price),2) price_avg,
           max(price) price_max,
           min(price) price_min

    FROM Spots

    WHERE instance_type LIKE 'c3.%'

    """
).show()

+---------+---------+---------+
|price_avg|price_max|price_min|
+---------+---------+---------+
|     0.45|    39.28|   0.0156|
+---------+---------+---------+



In [52]:
# Are Windows machines more expensive on average than the other operating systems?
# Average price per OS, most expensive first.
avg_price_by_os = df.groupBy('os').agg(F.avg('price').alias('avg'))
avg_price_by_os.sort(F.col('avg').desc()).show()

+----------+------------------+
|        os|               avg|
+----------+------------------+
|   Windows|1.1430238399887047|
|SUSE Linux|0.3825099062942484|
|Linux/UNIX|0.3333808063383933|
+----------+------------------+



## Aplicando regras

In [62]:
# Derive the instance family ("tipo") as the text before the first dot of instance_type,
# e.g. "c3.8xlarge" -> "c3".
# F.split takes a regular expression, so the dot has to be escaped. The pattern is a raw
# string because "\." in a normal string literal is an invalid Python escape sequence
# (a DeprecationWarning on older versions, a SyntaxWarning on Python 3.12+).
df_tratamento = df.withColumn("tipo", F.split(df['instance_type'], r"\.")[0])

In [63]:
# Preview the first 20 rows, now including the derived "tipo" column.
df_tratamento.show(n=20)

+-------------------+-------------+----------+---------------+------+----+
|           datetime|instance_type|        os|         region| price|tipo|
+-------------------+-------------+----------+---------------+------+----+
|2017-05-08 18:46:36|   c3.8xlarge|   Windows|ap-northeast-1a|1.6503|  c3|
|2017-05-08 18:46:36|   c3.8xlarge|   Windows|ap-northeast-1c|1.7461|  c3|
|2017-05-08 18:46:34|     i3.large|SUSE Linux|ap-northeast-1c|0.1223|  i3|
|2017-05-08 18:46:34|     i3.large|Linux/UNIX|ap-northeast-1c|0.0223|  i3|
|2017-05-08 18:46:17|   c4.8xlarge|SUSE Linux|ap-northeast-1a| 0.789|  c4|
|2017-05-08 18:46:17|   c4.8xlarge|Linux/UNIX|ap-northeast-1a| 0.689|  c4|
|2017-05-08 18:46:17|   m2.4xlarge|SUSE Linux|ap-northeast-1c|0.2782|  m2|
|2017-05-08 18:46:17|   m2.4xlarge|Linux/UNIX|ap-northeast-1c|0.1782|  m2|
|2017-05-08 18:46:10|   r3.2xlarge|SUSE Linux|ap-northeast-1c|0.2282|  r3|
|2017-05-08 18:46:10|   r3.2xlarge|Linux/UNIX|ap-northeast-1c|0.1282|  r3|
|2017-05-08 18:46:09|    

In [71]:
# Average price per instance family ("tipo"), highest average first.
# The price column is referenced by name so that it is resolved against df_tratamento,
# the frame actually being aggregated, instead of through a Column object that belongs
# to the parent DataFrame `df`.
df_type = (
    df_tratamento
    .groupBy('tipo')
    .agg(F.avg('price').alias('Avg'))
    .sort(F.col('Avg').desc())
)

In [72]:
# Add a constant placeholder column "os" (value "TESTE") to the per-family averages.
# df_tratamento already has an "os" column, so after this both frames share that name.
df_type = df_type.withColumn(colName='os', col=F.lit("TESTE"))

In [86]:
# Attach each family's average price to every row with a left join on "tipo".
# Both sides are aliased and only A's columns plus B's average are selected, because
# df_type also carries an "os" column that would clash with the one in df_tratamento.
#
# This cell rebinds df_tratamento, so any "avg" column left by an earlier run is removed
# from A's column list first; otherwise a re-run would yield two ambiguous "avg" columns.
base_columns = [name for name in df_tratamento.columns if name.lower() != 'avg']

df_tratamento = (
    df_tratamento.alias('A')
    .join(
        # df_type holds one row per family, so hint Spark to broadcast it to the executors
        # instead of shuffling the large left side.
        F.broadcast(df_type).alias('B'),
        F.col('A.tipo') == F.col('B.tipo'),
        how='left',
    )
    .select([F.col('A.' + name) for name in base_columns] + [F.col('B.avg')])
)

In [90]:
# Classify each row's price as "ALTO" when it is above its family's average price
# ("avg"), and as "BAIXO" otherwise.
# Both columns are referenced by name so they resolve against the joined df_tratamento,
# rather than mixing a Column object from the pre-join DataFrame `df` with attribute
# access on the joined frame.
# Rows where price or avg is null do not satisfy the comparison and so fall to "BAIXO".
df_tratamento = df_tratamento.withColumn(
    'classificacao',
    F.when(F.col('price') > F.col('avg'), "ALTO").otherwise("BAIXO"),
)


In [93]:
# Number of rows in each classification.
classification_counts = df_tratamento.groupBy(F.col('classificacao')).count()
classification_counts.show()

+-------------+--------+
|classificacao|   count|
+-------------+--------+
|        BAIXO|20028455|
|         ALTO| 7381854|
+-------------+--------+



In [102]:
# Write the treated DataFrame to Parquet, partitioned by "classificacao".
# Only a 1000-row sample is written (limit(1000)), and no ordering is applied before the
# limit. Anything already at the target path is overwritten.
sample_rows = df_tratamento.limit(1000)

sample_rows.write.parquet(
    '../datasets/tratamento.parquet',
    mode='overwrite',
    partitionBy='classificacao',
)