In [1]:
from pyspark import SparkContext
from pyspark.sql import SparkSession

In [2]:
import os
import sys

# Point the PySpark environment variables at the interpreter running this
# kernel; all three are set before any SparkContext is created below.
kernel_python = sys.executable
os.environ.update({
    'PYSPARK_DRIVER_PYTHON_OPTS': "notebook",
    'PYSPARK_DRIVER_PYTHON': kernel_python,
    'PYSPARK_PYTHON': kernel_python,
})

In [3]:
# Obtain the SparkContext used for the RDD examples: reuse the one already
# running in this process if there is one, otherwise start a new one.
sc = SparkContext.getOrCreate()

In [4]:
# Configure the session builder with the application name, then obtain the
# SparkSession that serves as the entry point for the DataFrame API.
session_builder = SparkSession.builder.appName('Pyspark Aula 2')
spark = session_builder.getOrCreate()

# Create DataFrame from an existing RDD

In [5]:
# Four 5-field tuples (one string label followed by four integers),
# distributed as an RDD over 4 partitions.
records = [
    ('C', 85, 76, 7, 91),
    ('B', 85, 76, 87, 91),
    ('A', 85, 78, 96, 92),
    ('A', 92, 76, 89, 96),
]
rdd = sc.parallelize(records, numSlices=4)

In [6]:
# Display the Python type of the object returned by parallelize().
type(rdd)

pyspark.rdd.RDD

In [7]:
# Column names for the DataFrame: the person id followed by v_1 .. v_4.
sub = ['id_person'] + [f'v_{idx}' for idx in range(1, 5)]

In [8]:
# Build a DataFrame from the RDD; the entries of `sub` become the column
# names and the column types are inferred from the tuple values.
df = spark.createDataFrame(data=rdd, schema=sub)

In [9]:
# Print the DataFrame rows as a text table.
df.show()

+---------+---+---+---+---+
|id_person|v_1|v_2|v_3|v_4|
+---------+---+---+---+---+
|        C| 85| 76|  7| 91|
|        B| 85| 76| 87| 91|
|        A| 85| 78| 96| 92|
|        A| 92| 76| 89| 96|
+---------+---+---+---+---+



In [10]:
# Display the Python type of the object returned by createDataFrame().
type(df)

pyspark.sql.dataframe.DataFrame

In [11]:
# Print every column with its type and nullability.
df.printSchema()

root
 |-- id_person: string (nullable = true)
 |-- v_1: long (nullable = true)
 |-- v_2: long (nullable = true)
 |-- v_3: long (nullable = true)
 |-- v_4: long (nullable = true)



# Importing data

In [12]:
# Load the cereal CSV: comma-separated, first row holds the column names,
# and Spark infers each column's type from the data.
# Note: this rebinds `df`, replacing the DataFrame built from the RDD.
df = (
    spark.read
    .options(sep=',', inferSchema=True, header=True)
    .csv('./cereal.csv')
)

In [13]:
# Preview the loaded CSV as a text table.
df.show()

+--------------------+---+----+--------+-------+---+------+-----+-----+------+------+--------+-----+------+----+---------+
|                name|mfr|type|calories|protein|fat|sodium|fiber|carbo|sugars|potass|vitamins|shelf|weight|cups|   rating|
+--------------------+---+----+--------+-------+---+------+-----+-----+------+------+--------+-----+------+----+---------+
|           100% Bran|  N|   C|      70|      4|  1|   130| 10.0|  5.0|     6|   280|      25|    3|   1.0|0.33|68.402973|
|   100% Natural Bran|  Q|   C|     120|      3|  5|    15|  2.0|  8.0|     8|   135|       0|    3|   1.0| 1.0|33.983679|
|            All-Bran|  K|   C|      70|      4|  1|   260|  9.0|  7.0|     5|   320|      25|    3|   1.0|0.33|59.425505|
|All-Bran with Ext...|  K|   C|      50|      4|  0|   140| 14.0|  8.0|     0|   330|      25|    3|   1.0| 0.5|93.704912|
|      Almond Delight|  R|   C|     110|      2|  2|   200|  1.0| 14.0|     8|    -1|      25|    3|   1.0|0.75|34.384843|
|Apple Cinnamon 

In [16]:
# Report the number of rows, then the inferred schema.
print(f'N de linhas: {df.count()}')
# printSchema() prints the schema itself and returns None, so it is called
# directly; wrapping it in print() would emit an extra "None" line.
df.printSchema()

N de linhas: 77
root
 |-- name: string (nullable = true)
 |-- mfr: string (nullable = true)
 |-- type: string (nullable = true)
 |-- calories: integer (nullable = true)
 |-- protein: integer (nullable = true)
 |-- fat: integer (nullable = true)
 |-- sodium: integer (nullable = true)
 |-- fiber: double (nullable = true)
 |-- carbo: double (nullable = true)
 |-- sugars: integer (nullable = true)
 |-- potass: integer (nullable = true)
 |-- vitamins: integer (nullable = true)
 |-- shelf: integer (nullable = true)
 |-- weight: double (nullable = true)
 |-- cups: double (nullable = true)
 |-- rating: double (nullable = true)

None


# Selecting

In [18]:
# Project the DataFrame onto a handful of columns and show the first 6 rows.
preview_columns = ['name', 'calories', 'weight', 'rating']
df.select(*preview_columns).show(6)

+--------------------+--------+------+---------+
|                name|calories|weight|   rating|
+--------------------+--------+------+---------+
|           100% Bran|      70|   1.0|68.402973|
|   100% Natural Bran|     120|   1.0|33.983679|
|            All-Bran|      70|   1.0|59.425505|
|All-Bran with Ext...|      50|   1.0|93.704912|
|      Almond Delight|     110|   1.0|34.384843|
|Apple Cinnamon Ch...|     110|   1.0|29.509541|
+--------------------+--------+------+---------+
only showing top 6 rows



# withColumn()

In [19]:
# withColumn() returns a new DataFrame; the result is only inspected here,
# `df` itself is left unchanged.
# The target name differs from the existing column only by case, so the
# existing column is replaced (and displayed as `Calories`) rather than a
# new column being appended.
calories_as_int = df.calories.cast('integer')
df.withColumn('Calories', calories_as_int).printSchema()

root
 |-- name: string (nullable = true)
 |-- mfr: string (nullable = true)
 |-- type: string (nullable = true)
 |-- Calories: integer (nullable = true)
 |-- protein: integer (nullable = true)
 |-- fat: integer (nullable = true)
 |-- sodium: integer (nullable = true)
 |-- fiber: double (nullable = true)
 |-- carbo: double (nullable = true)
 |-- sugars: integer (nullable = true)
 |-- potass: integer (nullable = true)
 |-- vitamins: integer (nullable = true)
 |-- shelf: integer (nullable = true)
 |-- weight: double (nullable = true)
 |-- cups: double (nullable = true)
 |-- rating: double (nullable = true)



# groupBy()

In [20]:
# Count how many rows each manufacturer code (`mfr`) has.
rows_per_mfr = df.groupBy('mfr').count()
rows_per_mfr.show()

+---+-----+
|mfr|count|
+---+-----+
|  K|   23|
|  Q|    8|
|  A|    1|
|  N|    6|
|  R|    8|
|  G|   22|
|  P|    9|
+---+-----+



# orderBy()

In [22]:
# Sort from the highest to the lowest calorie value and preview the result.
calories_descending = df['calories'].desc()
df.orderBy(calories_descending).show()

+--------------------+---+----+--------+-------+---+------+-----+-----+------+------+--------+-----+------+----+---------+
|                name|mfr|type|calories|protein|fat|sodium|fiber|carbo|sugars|potass|vitamins|shelf|weight|cups|   rating|
+--------------------+---+----+--------+-------+---+------+-----+-----+------+------+--------+-----+------+----+---------+
|Mueslix Crispy Blend|  K|   C|     160|      3|  2|   150|  3.0| 17.0|    13|   160|      25|    3|   1.5|0.67|30.313351|
|Muesli Raisins; P...|  R|   C|     150|      4|  3|   150|  3.0| 16.0|    11|   170|      25|    3|   1.0| 1.0|34.139765|
|Muesli Raisins; D...|  R|   C|     150|      4|  3|    95|  3.0| 16.0|    11|   170|      25|    3|   1.0| 1.0|37.136863|
|Just Right Fruit ...|  K|   C|     140|      3|  1|   170|  2.0| 20.0|     9|    95|     100|    3|   1.3|0.75|36.471512|
|   Total Raisin Bran|  G|   C|     140|      3|  1|   190|  4.0| 15.0|    14|   230|     100|    3|   1.5| 1.0|28.592785|
|Nutri-Grain Alm

# Case When

In [23]:
from pyspark.sql.functions import when

In [29]:
# Label cereals whose `vitamins` value is at least 20.
# There is no otherwise() branch, so rows that fail the condition get null.
vitamins_status = when(df.vitamins >= 20, 'rich in vitamins').alias('vitamins_status')
df.select('name', vitamins_status).show()

+--------------------+----------------+
|                name| vitamins_status|
+--------------------+----------------+
|           100% Bran|rich in vitamins|
|   100% Natural Bran|            null|
|            All-Bran|rich in vitamins|
|All-Bran with Ext...|rich in vitamins|
|      Almond Delight|rich in vitamins|
|Apple Cinnamon Ch...|rich in vitamins|
|         Apple Jacks|rich in vitamins|
|             Basic 4|rich in vitamins|
|           Bran Chex|rich in vitamins|
|         Bran Flakes|rich in vitamins|
|        Cap'n'Crunch|rich in vitamins|
|            Cheerios|rich in vitamins|
|Cinnamon Toast Cr...|rich in vitamins|
|            Clusters|rich in vitamins|
|         Cocoa Puffs|rich in vitamins|
|           Corn Chex|rich in vitamins|
|         Corn Flakes|rich in vitamins|
|           Corn Pops|rich in vitamins|
|       Count Chocula|rich in vitamins|
|  Cracklin' Oat Bran|rich in vitamins|
+--------------------+----------------+
only showing top 20 rows



# Filter

In [31]:
# Keep only the cereals with more than 110 calories.
above_110_calories = df['calories'] > 110
df.where(above_110_calories).show()

+--------------------+---+----+--------+-------+---+------+-----+-----+------+------+--------+-----+------+----+---------+
|                name|mfr|type|calories|protein|fat|sodium|fiber|carbo|sugars|potass|vitamins|shelf|weight|cups|   rating|
+--------------------+---+----+--------+-------+---+------+-----+-----+------+------+--------+-----+------+----+---------+
|   100% Natural Bran|  Q|   C|     120|      3|  5|    15|  2.0|  8.0|     8|   135|       0|    3|   1.0| 1.0|33.983679|
|             Basic 4|  G|   C|     130|      3|  2|   210|  2.0| 18.0|     8|   100|      25|    3|  1.33|0.75|37.038562|
|        Cap'n'Crunch|  Q|   C|     120|      1|  2|   220|  0.0| 12.0|    12|    35|      25|    2|   1.0|0.75|18.042851|
|Cinnamon Toast Cr...|  G|   C|     120|      1|  3|   210|  0.0| 13.0|     9|    45|      25|    2|   1.0|0.75|19.823573|
|Fruit & Fibre Dat...|  P|   C|     120|      3|  2|   160|  5.0| 12.0|    10|   200|      25|    3|  1.25|0.67|40.917047|
|       Fruitful

# isNull(), isNotNull()

In [32]:
# NOTE(review): this wildcard import puts every public name of
# pyspark.sql.functions into the notebook namespace, which includes names
# that shadow Python builtins (e.g. sum, min, max, round). The cells visible
# here only use Column methods and explicitly imported names — confirm
# whether later cells need it and, if so, prefer explicit imports.
from pyspark.sql.functions import *

In [34]:
# Show the rows whose `name` is null (none exist in this dataset, so only
# the header is printed).
name_is_missing = df['name'].isNull()
df.where(name_is_missing).show()

+----+---+----+--------+-------+---+------+-----+-----+------+------+--------+-----+------+----+------+
|name|mfr|type|calories|protein|fat|sodium|fiber|carbo|sugars|potass|vitamins|shelf|weight|cups|rating|
+----+---+----+--------+-------+---+------+-----+-----+------+------+--------+-----+------+----+------+
+----+---+----+--------+-------+---+------+-----+-----+------+------+--------+-----+------+----+------+



# Parte 2 da aula

In [35]:
from pyspark.sql.functions import col

In [36]:
# Obtain the SparkSession for the second part of the lesson; getOrCreate()
# hands back the already-running session when there is one.
part2_builder = SparkSession.builder.appName('exemplos pyspark')
spark = part2_builder.getOrCreate()

In [37]:
# Load the data from a CSV file: the first row provides the column names
# and the column types are inferred from the values.
csv_reader = spark.read.options(header=True, inferSchema=True)
df = csv_reader.csv("./cereal.csv")

In [38]:
# Count the rows of the full dataset and report the total.
count = df.count()
print(f'N linhas: {count}')

N linhas: 77


In [42]:
# Draw a random sample of roughly 15% of the rows; the fixed seed makes the
# draw repeatable. The fraction is a target, not an exact size (the run
# recorded below returned 9 of the 77 rows).
df_sample = df.sample(fraction=0.15, seed=42)
# The sample is itself a DataFrame.
print(type(df_sample))
# `count` is rebound here to the size of the sample.
count = df_sample.count()
print(f'N linhas: {count}')
df_sample.show()

<class 'pyspark.sql.dataframe.DataFrame'>
N linhas: 9
+-----------------+---+----+--------+-------+---+------+-----+-----+------+------+--------+-----+------+----+---------+
|             name|mfr|type|calories|protein|fat|sodium|fiber|carbo|sugars|potass|vitamins|shelf|weight|cups|   rating|
+-----------------+---+----+--------+-------+---+------+-----+-----+------+------+--------+-----+------+----+---------+
|          Basic 4|  G|   C|     130|      3|  2|   210|  2.0| 18.0|     8|   100|      25|    3|  1.33|0.75|37.038562|
|      Corn Flakes|  K|   C|     100|      2|  0|   290|  1.0| 21.0|     2|    35|      25|    1|   1.0| 1.0|45.863324|
|    Count Chocula|  G|   C|     110|      1|  1|   180|  0.0| 12.0|    13|    65|      25|    2|   1.0| 1.0|22.396513|
| Nut&Honey Crunch|  K|   C|     120|      2|  1|   190|  0.0| 15.0|     9|    40|      25|    2|   1.0|0.67|29.924285|
|Nutri-grain Wheat|  K|   C|      90|      3|  0|   170|  3.0| 18.0|     2|    90|      25|    3|   1.0| 1

In [47]:
# Mean of the `rating` column over the whole dataset: an empty groupBy()
# aggregates all rows into a single row, whose only field is the average.
average = df.select('rating').groupBy().avg().collect()[0][0]
# The label names the column that is actually aggregated (`rating`).
print(f'Valor médio de rating: {average}')

Valor médio de calorias: 42.66570498701299
