# DataFrame

http://spark.apache.org/docs/2.2.0/api/python/pyspark.sql.html#pyspark-sql-module

In [1]:
import findspark
findspark.init()

from pyspark.sql import SparkSession
from pyspark.sql import Row

# Create (or reuse an already running) Spark session for this notebook,
# using the 'local[*]' master.
spark = (
    SparkSession.builder
    .appName('DataFrame_1')
    .master('local[*]')
    .getOrCreate()
)

In [2]:
# Keep a handle to the SparkContext that belongs to the session.
sc = spark.sparkContext

In [3]:
# Prefix prepended to every input file name (empty = paths are used as-is).
data_path = ''

# Load DataFrames directly from JSON and CSV files.
people = spark.read.json(f'{data_path}people.json')
employees = spark.read.json(f'{data_path}employees.json')
# For the CSV file let Spark infer the column types instead of reading everything as strings.
csv_reader = spark.read.option("inferSchema", "true")
people_txt = csv_reader.csv(f'{data_path}people.txt')

In [4]:
# Show the CSV-loaded frame; its columns carry the auto-generated names _c0 and _c1.
people_txt.show()

+-------+----+
|    _c0| _c1|
+-------+----+
|Michael|29.0|
|   Andy|30.0|
| Justin|19.0|
+-------+----+



****

### DataFrame to lista wierszy

In [5]:
# A Row can be built from keyword arguments: field name -> value.
newPerson1 = Row(name='Greg', age=32)

In [6]:
# Evaluate the Row to display its repr.
newPerson1

Row(name='Greg', age=32)

In [7]:
# Row fields can be read as attributes.
newPerson1.name

'Greg'

In [8]:
# Attribute access works for the other field as well.
newPerson1.age

32

In [9]:
# Fields can also be looked up by name with [] indexing.
newPerson1['age']

32

In [10]:
# `in` checks whether the Row has a field with the given name.
'age' in newPerson1

True

In [11]:
# A Row built from bare field names acts as a template for rows with those fields.
newPerson = Row("age", "name")

In [12]:
# Calling the template with values (in field order) produces an actual Row.
newPerson2 = newPerson(24, 'Alice')

In [13]:
# Display the Row created from the template.
newPerson2

Row(age=24, name='Alice')

In [14]:
# More sample rows built from the template, some with missing (None) fields.
newPerson3, newPerson4, newPerson5, newPerson6, newPerson7 = (
    newPerson(age, name)
    for age, name in [
        (None, None),
        (33, None),
        (None, 'Peter'),
        (32, 'Peter'),
        (40, 'Greg'),
    ]
)

In [15]:
# Build a DataFrame from a plain Python list of Row objects.
newPeopleDF = spark.createDataFrame([
    newPerson2, newPerson3, newPerson4,
    newPerson5, newPerson6, newPerson7,
])

In [16]:
# Missing values (None) are rendered as null.
newPeopleDF.show()

+----+-----+
| age| name|
+----+-----+
|  24|Alice|
|null| null|
|  33| null|
|null|Peter|
|  32|Peter|
|  40| Greg|
+----+-----+



### Przechodzenie RDD <-> DF

In [17]:
# Confirm that spark.read.json returned a DataFrame.
type(people)

pyspark.sql.dataframe.DataFrame

In [18]:
# The DataFrame repr lists only column names and types; no rows are shown.
people

DataFrame[age: bigint, name: string]

In [19]:
# collect() returns all rows as a Python list of Row objects.
people.collect()

[Row(age=None, name='Michael'),
 Row(age=30, name='Andy'),
 Row(age=19, name='Justin')]

DF -> RDD 

In [20]:
# The .rdd attribute exposes the DataFrame as an RDD.
type(people.rdd)

pyspark.rdd.RDD

In [21]:
# Collecting the RDD yields the same rows as people.collect().
people.rdd.collect()

[Row(age=None, name='Michael'),
 Row(age=30, name='Andy'),
 Row(age=19, name='Justin')]

In [22]:
# Map each Row to a plain tuple, which drops the field names.
people.rdd.map(tuple).collect()

[(None, 'Michael'), (30, 'Andy'), (19, 'Justin')]

RDD -> DF

In [23]:
# An RDD of Rows converts back with toDF(); the result has the same columns as `people`.
people.rdd.toDF()

DataFrame[age: bigint, name: string]

In [24]:
# Plain tuples carry no field names, so the columns get the default names _1, _2.
people.rdd.map(tuple).toDF().collect()

[Row(_1=None, _2='Michael'), Row(_1=30, _2='Andy'), Row(_1=19, _2='Justin')]

Podanie schematu wprost przy tworzeniu DF

Typy danych: http://spark.apache.org/docs/2.1.0/api/python/pyspark.sql.html#module-pyspark.sql.types <br>
https://spark.apache.org/docs/latest/sql-programming-guide.html#data-types <br>
Kilka podstawowych: IntegerType, DoubleType, FloatType, StringType, BooleanType, NullType

In [25]:
from pyspark.sql.types import IntegerType, StringType, StructType, StructField

# Explicit schema: V1 is an integer column, V2 is a string column.
schema = StructType([
    StructField("V1", IntegerType()),
    StructField("V2", StringType()),
])
# V2 values are passed as strings so the data matches the declared schema
# instead of relying on implicit int -> string coercion.
df = spark.createDataFrame([[1, '2'], [3, '4']], schema)

In [26]:
# A DataFrame is still a list of rows
df.show()

+---+---+
| V1| V2|
+---+---+
|  1|  2|
|  3|  4|
+---+---+



> **TODO**: Z podanego RDD utworz DataFrame. <br>
Wskazowka: DataFrame latwo utworzyc z RDD obiektow Row, np. `rdd.toDF()` lub `spark.createDataFrame(rdd)`.<br>

#### Ogolne wiadomosci na temat danych
printSchema, show, columns, dtypes <br>
Znane z RDD, np.: count, take, head

In [27]:
# Display the contents of `people`.
people.show()

+----+-------+
| age|   name|
+----+-------+
|null|Michael|
|  30|   Andy|
|  19| Justin|
+----+-------+



In [28]:
people.printSchema()  # Print the schema in a tree format

root
 |-- age: long (nullable = true)
 |-- name: string (nullable = true)



In [29]:
# Schema of the employees frame.
employees.printSchema()

root
 |-- name: string (nullable = true)
 |-- salary: long (nullable = true)



In [30]:
# Schema inferred for the CSV file; the column names are still the defaults _c0, _c1.
people_txt.printSchema()

root
 |-- _c0: string (nullable = true)
 |-- _c1: double (nullable = true)



In [31]:
# show(n) limits the display to the first n rows.
people_txt.show(1)

+-------+----+
|    _c0| _c1|
+-------+----+
|Michael|29.0|
+-------+----+
only showing top 1 row



In [32]:
# Only the first row of `people`.
people.show(1)

+----+-------+
| age|   name|
+----+-------+
|null|Michael|
+----+-------+
only showing top 1 row



In [33]:
# First row of people_txt once more (identical to the earlier show(1) call).
people_txt.show(1)

+-------+----+
|    _c0| _c1|
+-------+----+
|Michael|29.0|
+-------+----+
only showing top 1 row



Liczba wierszy

In [34]:
# Number of rows in the DataFrame.
people.count()

3

Lista kolumn

In [35]:
# Column names as a list of strings.
people.columns

['age', 'name']

Lista kolumn wraz z typami danych

In [36]:
# (column name, type name) pairs.
people.dtypes

[('age', 'bigint'), ('name', 'string')]

#### Odwolania do poszczegolnych kolumn

In [37]:
# Attribute access returns a Column expression, not the column's data.
people.age

Column<'age'>

In [38]:
# Indexing by name returns the same Column.
people['age']

Column<'age'>

In [39]:
# Indexing by position: 0 selects the first column (age).
people[0]

Column<'age'>

#### Dodanie/usuniecie kolumny
withColumn, drop

In [40]:
# Starting point before a column is added.
people.show(2)

+----+-------+
| age|   name|
+----+-------+
|null|Michael|
|  30|   Andy|
+----+-------+
only showing top 2 rows



In [41]:
# Derive a new column from an existing one; a null age yields a null result.
people.withColumn('ageNextYear', people.age + 1).show(2)

+----+-------+-----------+
| age|   name|ageNextYear|
+----+-------+-----------+
|null|Michael|       null|
|  30|   Andy|         31|
+----+-------+-----------+
only showing top 2 rows



In [42]:
# drop() returns a new DataFrame without the column; `people` itself is not modified.
people.drop('age').show()

+-------+
|   name|
+-------+
|Michael|
|   Andy|
| Justin|
+-------+



#### Zmiany nazwy kolumny

DataFrame people_txt ma domyslne nazwy kolumn ze Sparka. Niewiele nam one mowia.

In [43]:
# Replace Spark's default column names with meaningful ones.
people_txt = (
    people_txt
    .withColumnRenamed('_c0', 'name')
    .withColumnRenamed('_c1', 'age')
)
people_txt.show(1)

+-------+----+
|   name| age|
+-------+----+
|Michael|29.0|
+-------+----+
only showing top 1 row



#### Podstawowe statystyki kolumn w DataFrames
describe

In [45]:
# describe() returns a new DataFrame with the summary statistics; nothing is
# displayed here until .show() is called on it.
people.describe()

DataFrame[summary: string, age: string, name: string]

In [46]:
# describe() limited to one column; .show() renders the count/mean/stddev/min/max rows.
employees.describe('salary').show()

+-------+-----------------+
|summary|           salary|
+-------+-----------------+
|  count|                4|
|   mean|           3750.0|
| stddev|645.4972243679028|
|    min|             3000|
|    max|             4500|
+-------+-----------------+



#### Braki danych
isNull, isNotNull<br>
fillna, dropna, replace

In [47]:
# Sample frame containing nulls in both columns.
newPeopleDF.show()

+----+-----+
| age| name|
+----+-----+
|  24|Alice|
|null| null|
|  33| null|
|null|Peter|
|  32|Peter|
|  40| Greg|
+----+-----+



In [48]:
# Keep only the rows whose age is null.
newPeopleDF.filter(newPeopleDF.age.isNull()).show()

+----+-----+
| age| name|
+----+-----+
|null| null|
|null|Peter|
+----+-----+



In [49]:
# Keep only the rows whose age is not null.
newPeopleDF.filter(newPeopleDF.age.isNotNull()).show()

+---+-----+
|age| name|
+---+-----+
| 24|Alice|
| 33| null|
| 32|Peter|
| 40| Greg|
+---+-----+



In [50]:
# A numeric fill value only replaces nulls in the numeric column; the string
# column `name` keeps its nulls.
newPeopleDF.fillna(-1).show()

+---+-----+
|age| name|
+---+-----+
| 24|Alice|
| -1| null|
| 33| null|
| -1|Peter|
| 32|Peter|
| 40| Greg|
+---+-----+



In [51]:
# A dict supplies a separate fill value for each column.
newPeopleDF.fillna({'age':-1, 'name':'unknown'}).show()

+---+-------+
|age|   name|
+---+-------+
| 24|  Alice|
| -1|unknown|
| 33|unknown|
| -1|  Peter|
| 32|  Peter|
| 40|   Greg|
+---+-------+



In [52]:
# Fill missing names with a placeholder, then swap that placeholder for 'NN'.
(
    newPeopleDF
    .fillna({'name': 'unknown'})
    .replace('unknown', 'NN')
    .show()
)

+----+-----+
| age| name|
+----+-----+
|  24|Alice|
|null|   NN|
|  33|   NN|
|null|Peter|
|  32|Peter|
|  40| Greg|
+----+-----+



In [53]:
# With no arguments, dropna() removes every row that contains a null in any column.
newPeopleDF.dropna().show()

+---+-----+
|age| name|
+---+-----+
| 24|Alice|
| 32|Peter|
| 40| Greg|
+---+-----+



In [54]:
# Only nulls in the listed columns cause a row to be dropped; a null name is kept.
newPeopleDF.dropna(subset=['age']).show()

+---+-----+
|age| name|
+---+-----+
| 24|Alice|
| 33| null|
| 32|Peter|
| 40| Greg|
+---+-----+



In [55]:
# IPython help syntax: a trailing ? shows the documentation of dropna
# (works only in IPython/Jupyter, not in plain Python).
newPeopleDF.dropna?

#### Funkcje wprost ze skladni SQL 
select, where (wymienne z filter), orderBy

In [56]:
# Reference view of `people` before querying it.
people.show()

+----+-------+
| age|   name|
+----+-------+
|null|Michael|
|  30|   Andy|
|  19| Justin|
+----+-------+



In [57]:
# Names containing the letter 'n', ordered by ascending age.
(
    people
    .select('name')
    .where(people.name.like('%n%'))
    .orderBy(people.age.asc())
    .show()
)

+------+
|  name|
+------+
|Justin|
|  Andy|
+------+



In [63]:
# Reference view of `people` for the exercise below.
people.show()

+----+-------+
| age|   name|
+----+-------+
|null|Michael|
|  30|   Andy|
|  19| Justin|
+----+-------+



> **TODO**: Wyswietl imiona ludzi ze zbioru 'people' starszych niz 29 lat

In [64]:
# Names of people older than 29; a row with a null age does not pass the comparison.
(
    people
    .select('name')
    .where(people.age > 29)
    .show()
)

+----+
|name|
+----+
|Andy|
+----+



#### Operacje na zbiorach
union - dziala jak UNION ALL w SQL (nie usuwa duplikatow; kolumny sa dopasowywane po pozycji, a nie po nazwie). <br>
intersect (INTERSECT z SQLa), subtract (EXCEPT z SQLa)

In [58]:
# IPython help syntax: a trailing ? shows the documentation of union
# (works only in IPython/Jupyter, not in plain Python).
people.union?

In [59]:
# people: columns in (age, name) order.
people.show()

+----+-------+
| age|   name|
+----+-------+
|null|Michael|
|  30|   Andy|
|  19| Justin|
+----+-------+



In [60]:
# people_txt: columns in (name, age) order, with age stored as a double.
people_txt.show()

+-------+----+
|   name| age|
+-------+----+
|Michael|29.0|
|   Andy|30.0|
| Justin|19.0|
+-------+----+



In [61]:
# newPeopleDF: columns in (age, name) order, with nulls in both columns.
newPeopleDF.show()

+----+-----+
| age| name|
+----+-----+
|  24|Alice|
|null| null|
|  33| null|
|null|Peter|
|  32|Peter|
|  40| Greg|
+----+-----+

