In [1]:
# Example: learning about spark and data preparation using MLlib.
# Datasets: salary.json, designation.json
# Author: Humberto Bianchini

In [2]:
# 1) Importing all necessary libraries and Spark session creation.
import math
import random
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.feature import PCA
from pyspark.mllib.stat import Statistics
from pyspark.sql import functions as F
from pyspark.sql import SparkSession
from pyspark.sql.types import LongType

# Obtain the Spark session for this notebook (an already-running session is reused).
spark = (
    SparkSession.builder
    .appName("Data Preparation")
    .getOrCreate()
)

In [3]:
# 2) Build the employees DataFrame: in-memory tuples -> RDD (Resilient
#    Distributed Dataset) -> DataFrame with named columns.
employee_rows = [
    (1, "Joao", 25),
    (2, "Ricardo", 35),
    (3, "Marcio", 24),
    (4, "Janete", 28),
    (5, "Kely", 26),
    (6, "Vicente", 35),
    (7, "Jander", 38),
    (8, "Maria", 32),
    (9, "Gabriel", 29),
    (10, "Kimberly", 29),
    (11, "Alex", 28),
    (12, "Gustavo", 25),
    (13, "Rafael", 31),
]
employers = spark.sparkContext.parallelize(employee_rows).toDF(["emp_id", "name", "age"])
employers.show(5)

+------+-------+---+
|emp_id|   name|age|
+------+-------+---+
|     1|   Joao| 25|
|     2|Ricardo| 35|
|     3| Marcio| 24|
|     4| Janete| 28|
|     5|   Kely| 26|
+------+-------+---+
only showing top 5 rows



In [4]:
# 3) Load the salary and designation (position) tables from their JSON files;
#    the first rows of each are displayed in the following cells.
salary = spark.read.json(path="salary.json")
position = spark.read.json(path="designation.json")

In [5]:
# Preview the first five salary records.
salary.show(n=5)

+----+------+
|e_id|salary|
+----+------+
|   1| 10000|
|   2| 12000|
|   3| 12000|
|   4|  null|
|   5|   120|
+----+------+
only showing top 5 rows



In [6]:
# Preview the first five position records.
position.show(n=5)

+---+----------+
| id|  position|
+---+----------+
|  1|Associated|
|  2| Tech Lead|
|  3| Tech Lead|
|  4|Associated|
|  5| Tech Lead|
+---+----------+
only showing top 5 rows



In [7]:
# 4) Build the final dataframe: join employees to their salary and position
#    rows by id, then keep only the columns used in the rest of the notebook.
df_final = (
    employers
    .join(salary, employers.emp_id == salary.e_id)
    .join(position, employers.emp_id == position.id)
    .select("e_id", "name", "age", "position", "salary")
)

In [8]:
# Display the joined dataframe; note the null salary for e_id 4, handled in step 5.
df_final.show()

+----+--------+---+----------------+------+
|e_id|    name|age|        position|salary|
+----+--------+---+----------------+------+
|   1|    Joao| 25|      Associated| 10000|
|   2| Ricardo| 35|       Tech Lead| 12000|
|   3|  Marcio| 24|       Tech Lead| 12000|
|   4|  Janete| 28|      Associated|  null|
|   5|    Kely| 26|       Tech Lead|   120|
|   6| Vicente| 35|Senior Tech Lead| 22000|
|   7|  Jander| 38|Senior Tech Lead| 20000|
|   8|   Maria| 32|       Tech Lead| 12000|
|   9| Gabriel| 29|       Tech Lead| 10000|
|  10|Kimberly| 29|      Associated|  8000|
|  11|    Alex| 28|       Tech Lead| 12000|
|  12| Gustavo| 25|       Tech Lead| 12000|
|  13|  Rafael| 31|       Tech Lead|120000|
+----+--------+---+----------------+------+



In [9]:
# 5) Handling missing values. Option A: discard every row that contains a null.
#    (Option B, filling the null salary with the mean, is done in the next cell,
#    which reassigns clean_data.)
clean_data = df_final.dropna()
clean_data.show()

+----+--------+---+----------------+------+
|e_id|    name|age|        position|salary|
+----+--------+---+----------------+------+
|   1|    Joao| 25|      Associated| 10000|
|   2| Ricardo| 35|       Tech Lead| 12000|
|   3|  Marcio| 24|       Tech Lead| 12000|
|   5|    Kely| 26|       Tech Lead|   120|
|   6| Vicente| 35|Senior Tech Lead| 22000|
|   7|  Jander| 38|Senior Tech Lead| 20000|
|   8|   Maria| 32|       Tech Lead| 12000|
|   9| Gabriel| 29|       Tech Lead| 10000|
|  10|Kimberly| 29|      Associated|  8000|
|  11|    Alex| 28|       Tech Lead| 12000|
|  12| Gustavo| 25|       Tech Lead| 12000|
|  13|  Rafael| 31|       Tech Lead|120000|
+----+--------+---+----------------+------+



In [10]:
# Option B: replace the missing salary with the mean salary, rounded down to an
# integer. This reassigns clean_data, superseding the null-dropping version.
salary_mean_row = salary.agg(F.mean("salary")).first()
mean_salary = math.floor(salary_mean_row[0])
print(mean_salary)
clean_data = df_final.fillna({"salary": mean_salary})
clean_data.show()

20843
+----+--------+---+----------------+------+
|e_id|    name|age|        position|salary|
+----+--------+---+----------------+------+
|   1|    Joao| 25|      Associated| 10000|
|   2| Ricardo| 35|       Tech Lead| 12000|
|   3|  Marcio| 24|       Tech Lead| 12000|
|   4|  Janete| 28|      Associated| 20843|
|   5|    Kely| 26|       Tech Lead|   120|
|   6| Vicente| 35|Senior Tech Lead| 22000|
|   7|  Jander| 38|Senior Tech Lead| 20000|
|   8|   Maria| 32|       Tech Lead| 12000|
|   9| Gabriel| 29|       Tech Lead| 10000|
|  10|Kimberly| 29|      Associated|  8000|
|  11|    Alex| 28|       Tech Lead| 12000|
|  12| Gustavo| 25|       Tech Lead| 12000|
|  13|  Rafael| 31|       Tech Lead|120000|
+----+--------+---+----------------+------+



In [11]:
# 6) Data transformation: derive a "name_age" column such as "Joao_25".
def concat_func(name, age):
    """Return a Column that joins ``name`` and ``age`` with an underscore.

    Uses Spark's built-in ``concat_ws`` instead of a Python UDF, so the
    expression is evaluated by Spark itself without sending each row through
    a Python worker.

    Args:
        name: Column (or column name) holding the person's name.
        age: Column (or column name) holding the age; cast to string before joining.

    Returns:
        A string Column of the form "<name>_<age>". ``concat_ws`` skips null
        inputs rather than failing on them.
    """
    # Accept plain column names too, as the previous UDF-based callable did.
    name_col = F.col(name) if isinstance(name, str) else name
    age_col = F.col(age) if isinstance(age, str) else age
    return F.concat_ws("_", name_col, age_col.cast("string"))

concat_df = clean_data.withColumn("name_age", concat_func(clean_data.name, clean_data.age))
concat_df.show()

+----+--------+---+----------------+------+-----------+
|e_id|    name|age|        position|salary|   name_age|
+----+--------+---+----------------+------+-----------+
|   1|    Joao| 25|      Associated| 10000|    Joao_25|
|   2| Ricardo| 35|       Tech Lead| 12000| Ricardo_35|
|   3|  Marcio| 24|       Tech Lead| 12000|  Marcio_24|
|   4|  Janete| 28|      Associated| 20843|  Janete_28|
|   5|    Kely| 26|       Tech Lead|   120|    Kely_26|
|   6| Vicente| 35|Senior Tech Lead| 22000| Vicente_35|
|   7|  Jander| 38|Senior Tech Lead| 20000|  Jander_38|
|   8|   Maria| 32|       Tech Lead| 12000|   Maria_32|
|   9| Gabriel| 29|       Tech Lead| 10000| Gabriel_29|
|  10|Kimberly| 29|      Associated|  8000|Kimberly_29|
|  11|    Alex| 28|       Tech Lead| 12000|    Alex_28|
|  12| Gustavo| 25|       Tech Lead| 12000| Gustavo_25|
|  13|  Rafael| 31|       Tech Lead|120000|  Rafael_31|
+----+--------+---+----------------+------+-----------+



In [12]:
# 7) Using a simple function to convert a salary from Real (R$) to US dollars (US$).
def realDolar(salary, rate=0.25):
    """Convert an amount in Real (R$) to US dollars.

    Args:
        salary: Amount in R$. ``None`` (e.g. a null cell handed to a Spark UDF)
            is passed through unchanged instead of raising a TypeError.
        rate: Multiplier applied to ``salary``. Defaults to 0.25, the fixed
            rate this notebook uses.

    Returns:
        ``salary * rate``, or ``None`` when ``salary`` is ``None``.
    """
    if salary is None:
        return None
    return salary * rate
# realDolar multiplies by a float, so the UDF must declare a floating-point
# return type. Declaring LongType here made Spark return null for every row,
# because the Python float result does not match the declared integer type.
real_dolar = F.udf(realDolar, "double")
df_real_dolar = clean_data.withColumn("US$ Salary", real_dolar(clean_data.salary))
# Uncomment to inspect the converted column:
#df_real_dolar.show()

Correlations

In [13]:
# 8) Using random series to see how correlation works.
# A locally seeded generator makes the samples (and the correlations computed
# from them) identical on every run, without touching the global random state.
rng = random.Random(42)
serie_1 = spark.sparkContext.parallelize(rng.sample(range(1, 101), 10))
serie_2 = spark.sparkContext.parallelize(rng.sample(range(1, 101), 10))
# serie_3 is serie_1 scaled by a constant factor, i.e. perfectly linearly related to it.
serie_3 = serie_1.map(realDolar)

In [14]:
# Pearson correlation between the two independently sampled random series.
correlation = Statistics.corr(
    serie_1,
    serie_2,
    method="pearson",
)
print(correlation)

0.6052812063142593


In [15]:
# Pearson correlation between serie_1 and serie_3, its realDolar-scaled copy.
correlation = Statistics.corr(
    serie_1,
    serie_3,
    method="pearson",
)
print(correlation)

1.0
