[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://githubtocolab.com/jkanclerz/data-science-workshop-2021/blob/main/99--exercises/41--spark-nested.ipynb)

In [None]:
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
!wget https://dlcdn.apache.org/spark/spark-3.2.0/spark-3.2.0-bin-hadoop3.2.tgz -O spark-3.2.0-bin-hadoop3.2.tgz
!tar xf spark-3.2.0-bin-hadoop3.2.tgz

In [None]:
import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["SPARK_HOME"] = "/content/spark-3.2.0-bin-hadoop3.2"

# dataset

```json
{
  imie,
  nazwisko,
  wiek,
  plec,
  dom: {
    ulica,
    numer_domu,
    kod_pocztowy,
    miasto,
    wojewodztwo,
    telefon
  },
  praca: {
    firma,
    stanowisko,
    ulica,
    numer_domu,
    kod_pocztowy,
    miasto,
    telefon,
    wynagrodzenie
  }
}
```

In [3]:
pip install -q pyspark findspark faker

Note: you may need to restart the kernel to use updated packages.


In [4]:
import json
import faker
import random
 
import pandas as pd

N_PROBS = 2000
  
def generate_person(f):
    
    plec = f.random_element(elements=("K", "M"))
    if plec == 'M':
        imie = f.first_name_male()
        nazwisko = f.last_name_male()
    else:
        imie = f.first_name_female()
        nazwisko = f.last_name_female()
    wiek = random.randint(25, 65)
    person = {
        'imie': imie,
        'nazwisko': nazwisko,
        'wiek': wiek,
        'plec': plec,
        'dom': {
            'ulica': f.street_name(),
            'numer_domu': f.building_number(),
            'kod_pocztowy': f.postcode(),
            'miasto': f.city(),
            'wojewodztwo': f.region(),
            'telefon': f.phone_number()
        },
        'praca': {
            'firma': f.company(),
            'stanowisko': f.job(),
            'ulica': f.street_name(),
            'numer_domu': f.building_number(),
            'kod_pocztowy': f.postcode(),
            'miasto': f.city(),
            'telefon': f.phone_number(),
            'wynagrodzenie': 200 * wiek + 100 * random.randint(-5, 5)
        }
    }
 
    return person

In [6]:
!mkdir -p employees_dataset

In [7]:
fake = faker.Faker(['pl_PL'])
for i in range(N_PROBS):
    osoba = generate_person(fake)
    with open("employees_dataset/%04d.json" % (i+1), "w") as file:
        json.dump(osoba, file)
        file.close()

# spark

In [8]:
from pyspark.sql import SparkSession

spark = SparkSession.builder\
        .master("local")\
        .appName("Test it")\
        .config('spark.ui.port', '4050')\
        .getOrCreate()
sc = spark.sparkContext

21/12/11 07:47:00 WARN Utils: Your hostname, Jakubs-MacBook-Pro.local resolves to a loopback address: 127.0.0.1; using 192.168.8.8 instead (on interface en0)
21/12/11 07:47:00 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
21/12/11 07:47:01 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
21/12/11 07:47:02 WARN Utils: Service 'SparkUI' could not bind on port 4050. Attempting port 4051.


# single

In [9]:
df = spark.read.json("employees_dataset/0001.json")
df.show()

+--------------------+-----+--------+----+--------------------+----+
|                 dom| imie|nazwisko|plec|               praca|wiek|
+--------------------+-----+--------+----+--------------------+----+
|{52-454, Koło, 46...|Liwia|  Łudzik|   K|{Grupa Morys, 24-...|  41|
+--------------------+-----+--------+----+--------------------+----+



In [10]:
df.printSchema()

root
 |-- dom: struct (nullable = true)
 |    |-- kod_pocztowy: string (nullable = true)
 |    |-- miasto: string (nullable = true)
 |    |-- numer_domu: string (nullable = true)
 |    |-- telefon: string (nullable = true)
 |    |-- ulica: string (nullable = true)
 |    |-- wojewodztwo: string (nullable = true)
 |-- imie: string (nullable = true)
 |-- nazwisko: string (nullable = true)
 |-- plec: string (nullable = true)
 |-- praca: struct (nullable = true)
 |    |-- firma: string (nullable = true)
 |    |-- kod_pocztowy: string (nullable = true)
 |    |-- miasto: string (nullable = true)
 |    |-- numer_domu: string (nullable = true)
 |    |-- stanowisko: string (nullable = true)
 |    |-- telefon: string (nullable = true)
 |    |-- ulica: string (nullable = true)
 |    |-- wynagrodzenie: long (nullable = true)
 |-- wiek: long (nullable = true)



## all

In [11]:
df = spark.read.json("employees_dataset/")

                                                                                

In [12]:
df.show(3)

+--------------------+--------+----------+----+--------------------+----+
|                 dom|    imie|  nazwisko|plec|               praca|wiek|
+--------------------+--------+----------+----+--------------------+----+
|{01-253, Świętoch...|Krystyna|  Wylegała|   K|{Dziedzina-Polito...|  27|
|{77-216, Grudziąd...|  Fabian|Szczubełek|   M|{Spółdzielnia Gra...|  51|
|{58-990, Sandomie...|   Kamil|    Konrad|   M|{Spółdzielnia Klę...|  26|
+--------------------+--------+----------+----+--------------------+----+
only showing top 3 rows



In [13]:
from pyspark.sql import functions as F

In [14]:
df \
    .select(
        F.col('dom.kod_pocztowy'),
        F.col('praca.kod_pocztowy'),
    )\
.show(5)

+------------+------------+
|kod_pocztowy|kod_pocztowy|
+------------+------------+
|      01-253|      21-443|
|      77-216|      38-365|
|      58-990|      75-408|
|      53-177|      59-578|
|      15-943|      78-660|
+------------+------------+
only showing top 5 rows



# writing

In [15]:
all_we_need_for_modeling_df = df.select('wiek',F.col('praca.wynagrodzenie').alias('wynagrodzenie'))

In [20]:
!rm -rf employees.parquet employees-by-age.parquet

In [21]:
all_we_need_for_modeling_df.write.parquet('employees.parquet')

                                                                                

In [22]:
all_we_need_for_modeling_df\
    .repartition(1)\
    .write\
    .partitionBy("wiek")\
    .parquet('employees-by-age.parquet')

                                                                                

In [23]:
sc.stop()