# dataset

```json
{
  imie,
  nazwisko,
  wiek,
  plec,
  dom: {
    ulica,
    numer_domu,
    kod_pocztowy,
    miasto,
    wojewodztwo,
    telefon
  },
  praca: {
    firma,
    stanowisko,
    ulica,
    numer_domu,
    kod_pocztowy,
    miasto,
    telefon,
    wynagrodzenie
  }
}
```

In [36]:
pip install faker

Note: you may need to restart the kernel to use updated packages.


In [37]:
import json
import faker
import random
 
import pandas as pd

N_PROBS = 2000
  
def generate_person(f):
    
    plec = f.random_element(elements=("K", "M"))
    if plec == 'M':
        imie = f.first_name_male()
        nazwisko = f.last_name_male()
    else:
        imie = f.first_name_female()
        nazwisko = f.last_name_female()
    wiek = random.randint(25, 65)
    person = {
        'imie': imie,
        'nazwisko': nazwisko,
        'wiek': wiek,
        'plec': plec,
        'dom': {
            'ulica': f.street_name(),
            'numer_domu': f.building_number(),
            'kod_pocztowy': f.postcode(),
            'miasto': f.city(),
            'wojewodztwo': f.region(),
            'telefon': f.phone_number()
        },
        'praca': {
            'firma': f.company(),
            'stanowisko': f.job(),
            'ulica': f.street_name(),
            'numer_domu': f.building_number(),
            'kod_pocztowy': f.postcode(),
            'miasto': f.city(),
            'telefon': f.phone_number(),
            'wynagrodzenie': 200 * wiek + 100 * random.randint(-5, 5)
        }
    }
 
    return person

In [38]:
!mkdir employees_dataset

mkdir: cannot create directory ‘employees_dataset’: File exists


In [39]:
fake = faker.Faker(['pl_PL'])
for i in range(N_PROBS):
    osoba = generate_person(fake)
    with open("employees_dataset/%04d.json" % (i+1), "w") as file:
        json.dump(osoba, file)
        file.close()

# spark

In [40]:
from pyspark import SparkContext
from pyspark import SparkConf, SparkContext
from pyspark.sql import SparkSession
import os

spark = SparkSession.builder \
    .master("spark://spark-master:7077") \
    .appName("Spark master") \
    .getOrCreate()
sc = spark.sparkContext

# single

In [41]:
df = spark.read.json("employees_dataset/0001.json")
df.show()

+--------------------+---------+--------+----+--------------------+----+
|                 dom|     imie|nazwisko|plec|               praca|wiek|
+--------------------+---------+--------+----+--------------------+----+
|[00-087, Tychy, 9...|Agnieszka| Wlaźlak|   K|[FPUH Goworek i s...|  35|
+--------------------+---------+--------+----+--------------------+----+



In [42]:
df.printSchema()

root
 |-- dom: struct (nullable = true)
 |    |-- kod_pocztowy: string (nullable = true)
 |    |-- miasto: string (nullable = true)
 |    |-- numer_domu: string (nullable = true)
 |    |-- telefon: string (nullable = true)
 |    |-- ulica: string (nullable = true)
 |    |-- wojewodztwo: string (nullable = true)
 |-- imie: string (nullable = true)
 |-- nazwisko: string (nullable = true)
 |-- plec: string (nullable = true)
 |-- praca: struct (nullable = true)
 |    |-- firma: string (nullable = true)
 |    |-- kod_pocztowy: string (nullable = true)
 |    |-- miasto: string (nullable = true)
 |    |-- numer_domu: string (nullable = true)
 |    |-- stanowisko: string (nullable = true)
 |    |-- telefon: string (nullable = true)
 |    |-- ulica: string (nullable = true)
 |    |-- wynagrodzenie: long (nullable = true)
 |-- wiek: long (nullable = true)



## all

In [43]:
df = spark.read.json("employees_dataset/")

In [44]:
df.show(3)

+--------------------+--------+---------+----+--------------------+----+
|                 dom|    imie| nazwisko|plec|               praca|wiek|
+--------------------+--------+---------+----+--------------------+----+
|[38-713, Żyrardów...|Apolonia|Kamieniak|   K|[Spółdzielnia Kle...|  43|
|[28-613, Wodzisła...|  Ernest|  Breguła|   M|[Fundacja Palusza...|  49|
|[60-196, Świebodz...| Tadeusz|    Hajda|   M|[Drożdżal-Wrześni...|  29|
+--------------------+--------+---------+----+--------------------+----+
only showing top 3 rows



In [45]:
from pyspark.sql import functions as F

In [46]:
df \
    .select(
        F.col('dom.kod_pocztowy'),
        F.col('praca.kod_pocztowy'),
    )\
.show(5)

+------------+------------+
|kod_pocztowy|kod_pocztowy|
+------------+------------+
|      38-713|      97-248|
|      28-613|      78-521|
|      60-196|      38-550|
|      56-892|      96-325|
|      88-302|      37-918|
+------------+------------+
only showing top 5 rows



# writing

In [49]:
all_we_need_for_modeling_df = df.select('wiek',F.col('praca.wynagrodzenie').alias('wynagrodzenie'))

In [50]:
all_we_need_for_modeling_df.write.parquet('employees.parquet')

In [51]:
all_we_need_for_modeling_df\
    .repartition(1)\
    .write\
    .partitionBy("wiek")\
    .parquet('employees-by-age.parquet')

In [52]:
sc.stop()