# SPARK

# Task 1.
### Initialize the Spark session

In [1]:
import findspark
findspark.init()
import pyspark 
from pyspark.sql import SparkSession
spark = SparkSession.builder.getOrCreate()

print(spark.version)

3.3.0


# Task 2.
- generate collection with number from 1 to 20
- print this collection
- print collection size
- sum all numbers
- calculate average
- print numbers greater than average

In [2]:
numbers = spark.range(1,21).rdd.map(lambda el: el.id)

# print collection
print(f'Collection: {numbers.collect()}')

# collection size
collectionSize = numbers.count()
print(f'Collection size: {collectionSize}')

# sum of elements
sum = numbers.reduce(lambda x,y: x+y)
print(f'Sum numbers in collection: {sum}')

# calc. average
average = sum/collectionSize
print(f'Average: {average}')

# numbers greater than average
greater = numbers.filter(lambda el: el > average).collect()
print(f'Numbers greater than average: {greater}')

Collection: [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20]
Collection size: 20
Sum numbers in collection: 210
Average: 10.5
Numbers greater than average: [11, 12, 13, 14, 15, 16, 17, 18, 19, 20]


# Task 3.
- read file and load content to collection
- print collection size
- show first five lines
- create new collection with lines containing the word **Spark** and print first three lines from this collection

In [3]:
data = spark.sparkContext.textFile("textfile.md")

# collection size
print(data.count())

# show first five lines
print(data.take(5))

# create new collection with lines containing the word "Spark"
dataSpark = data.filter(lambda el: el.find("Spark") != -1)

# new collection size
print(dataSpark.count())

# print first three lines from this collection
print(dataSpark.take(3))

108
['# Apache Spark', '', 'Spark is a unified analytics engine for large-scale data processing. It provides', 'high-level APIs in Scala, Java, Python, and R, and an optimized engine that', 'supports general computation graphs for data analysis. It also supports a']
19
['# Apache Spark', 'Spark is a unified analytics engine for large-scale data processing. It provides', 'rich set of higher-level tools including Spark SQL for SQL and DataFrames,']


# Task 4.
- count all characters in the "data" collection
- count all characters in the "dataSpark" collection

In [4]:
print(data.map(lambda el: len(el)).reduce(lambda x,y: x+y))

print(dataSpark.map(lambda el: len(el)).reduce(lambda x,y: x+y))

4380
1507


# Task 5.
- create new collection with lines containing the word **example**
- print this collection

In [5]:
dataExample = data.filter(lambda el: el.find("example") != -1)

print(dataExample.collect())

['To build Spark and its example programs, run:', 'Spark also comes with several sample programs in the `examples` directory.', 'To run one of them, use `./bin/run-example <class> [params]`. For example:', '    ./bin/run-example SparkPi', 'will run the Pi example locally.', 'You can set the MASTER environment variable when running examples to submit', 'examples to a cluster. This can be a mesos:// or spark:// URL,', 'can also use an abbreviated class name if the class is in the `examples`', '    MASTER=spark://host:7077 ./bin/run-example SparkPi', 'Many of the example programs print usage help if no params are given.']


# Task 6.
- read file **people.json** and load content to dataset
- print dataset

In [6]:
ds = spark.read.json("people.json")
ds.show()

+---+-------+
|age|   name|
+---+-------+
| 18| Michał|
| 30|Andrzej|
| 19| Tomasz|
| 24|   Ania|
| 32|  Basia|
| 19|  Iwona|
| 22| Monika|
| 17| Marcin|
| 31| Łukasz|
+---+-------+



# Task 7.
- print only "name" column using SQL and HiveSQL

In [7]:
from pyspark.sql.functions import *

ds.createOrReplaceTempView("ds")

# using SQL dialect
spark.sql("SELECT name FROM ds").show()

# using HiveSQL dialect
ds.select("name").show()

+-------+
|   name|
+-------+
| Michał|
|Andrzej|
| Tomasz|
|   Ania|
|  Basia|
|  Iwona|
| Monika|
| Marcin|
| Łukasz|
+-------+

+-------+
|   name|
+-------+
| Michał|
|Andrzej|
| Tomasz|
|   Ania|
|  Basia|
|  Iwona|
| Monika|
| Marcin|
| Łukasz|
+-------+



# Task 8.

### 8.1. Display people who who are older than 21 years old

In [8]:
ds.filter(ds.age>21).show()

+---+-------+
|age|   name|
+---+-------+
| 30|Andrzej|
| 24|   Ania|
| 32|  Basia|
| 22| Monika|
| 31| Łukasz|
+---+-------+



### 8.2. Display average age.

In [9]:
ds.select(avg("age")).show()

+------------------+
|          avg(age)|
+------------------+
|23.555555555555557|
+------------------+



### 8.3. Display names that begin with the letter A.

In [10]:
ds.filter(col("name").like("A%")).select("name").show()

+-------+
|   name|
+-------+
|Andrzej|
|   Ania|
+-------+



### 8.4. Display age of the youngest and oldest person.

In [11]:
ds.select(min("age").alias("Youngest"), max("age").alias("Oldest")).show()

+--------+------+
|Youngest|Oldest|
+--------+------+
|      17|    32|
+--------+------+



### 8.5. Calculate average age, show people younger and older than average age.

In [12]:
averageAge = ds.select(avg('age').cast("int")).collect()[0][0]

print(f'Average age: {averageAge}')

print("People younger than average age:")
ds.filter(ds.age < averageAge).show()

print("People older than average age:")
ds.filter(ds.age > averageAge).show()

Average age: 23
People younger than average age:
+---+------+
|age|  name|
+---+------+
| 18|Michał|
| 19|Tomasz|
| 19| Iwona|
| 22|Monika|
| 17|Marcin|
+---+------+

People older than average age:
+---+-------+
|age|   name|
+---+-------+
| 30|Andrzej|
| 24|   Ania|
| 32|  Basia|
| 31| Łukasz|
+---+-------+



# Task 9.
- import class Person from file "Person.py"
- import data from 'adult.data'
- count elements
- print first element

In [13]:
from person import Person
lines = spark.sparkContext.textFile("adult.data").filter(lambda l: len(l)>0 and l.find("?")<0)
data = lines.map(lambda line: Person(line.split(", ")))

# testing
print(data.count())
print(data.first())

30162
{'age': 39, 'workclass': 'State-gov', 'fnlwgt': '77516', 'education': 'Bachelors', 'educationNum': 13, 'maritalStatus': 'Never-married', 'occupation': 'Adm-clerical', 'relationship': 'Not-in-family', 'race': 'White', 'sex': 'Male', 'capitalGain': '2174', 'capitalLoss': '0', 'hoursPerWeek': 40, 'nativeCountry': 'United-States', 'gainClass': '<=50K'}


# Task 10.

### 10.1. Womens from Mexico with HS-grad education.

In [14]:
data.filter(lambda o: o.sex=="Female" and o.nativeCountry=="Mexico" and o.education=="HS-grad").collect()

[{23, Mexico, HS-grad...},
 {31, Mexico, HS-grad...},
 {32, Mexico, HS-grad...},
 {53, Mexico, HS-grad...},
 {29, Mexico, HS-grad...},
 {40, Mexico, HS-grad...},
 {20, Mexico, HS-grad...},
 {26, Mexico, HS-grad...},
 {20, Mexico, HS-grad...},
 {25, Mexico, HS-grad...},
 {56, Mexico, HS-grad...},
 {24, Mexico, HS-grad...},
 {20, Mexico, HS-grad...},
 {29, Mexico, HS-grad...},
 {25, Mexico, HS-grad...},
 {20, Mexico, HS-grad...},
 {22, Mexico, HS-grad...},
 {31, Mexico, HS-grad...},
 {30, Mexico, HS-grad...},
 {26, Mexico, HS-grad...},
 {26, Mexico, HS-grad...},
 {27, Mexico, HS-grad...},
 {25, Mexico, HS-grad...},
 {34, Mexico, HS-grad...},
 {37, Mexico, HS-grad...},
 {41, Mexico, HS-grad...},
 {22, Mexico, HS-grad...},
 {51, Mexico, HS-grad...},
 {20, Mexico, HS-grad...},
 {22, Mexico, HS-grad...},
 {28, Mexico, HS-grad...},
 {33, Mexico, HS-grad...},
 {24, Mexico, HS-grad...}]

### 10.2. Number of womens and men who took part in the census.

In [15]:
df = spark.createDataFrame(data)
df.groupBy("sex").agg(count("sex")).show()

+------+----------+
|   sex|count(sex)|
+------+----------+
|Female|      9782|
|  Male|     20380|
+------+----------+



### 10.3. Average age of people with marital status "Never-married" and workclass "Private".

In [16]:
df.filter((df.maritalStatus=="Never-married") & (df.workclass=="Private")).agg(avg("age").alias("Average age")).show()

+------------------+
|       Average age|
+------------------+
|27.633520249221185|
+------------------+



### 10.4. Number of people with a numerical education of 9 to 13 working 20 to 30 hours a week grouped by workclass.

In [17]:
df.filter(((df.educationNum>=9)&(df.educationNum<=13))&((df.hoursPerWeek>=20)&(df.hoursPerWeek<=30))).groupBy("workclass").agg(count("workclass").alias("Number of people")).show()

+----------------+----------------+
|       workclass|Number of people|
+----------------+----------------+
|Self-emp-not-inc|             223|
|       Local-gov|             104|
|       State-gov|              85|
|         Private|            1842|
|     Without-pay|               5|
|     Federal-gov|              27|
|    Self-emp-inc|              47|
+----------------+----------------+



### 10.5. Numbers of people from the US with the same education.

In [18]:
df.filter(df.nativeCountry=="United-States").groupBy("education").agg(count("education").alias("Number of people")).show()

+------------+----------------+
|   education|Number of people|
+------------+----------------+
|     Masters|            1484|
|        10th|             752|
|     5th-6th|              78|
|  Assoc-acdm|             939|
|   Assoc-voc|            1233|
|     7th-8th|             437|
|         9th|             350|
|     HS-grad|            9209|
|   Bachelors|            4618|
|        11th|             957|
|     1st-4th|              39|
|   Preschool|              15|
|        12th|             331|
|   Doctorate|             314|
|Some-college|            6260|
| Prof-school|             488|
+------------+----------------+



### 10.6. Average age of people with gain class ">50K" in each occupation group.

In [19]:
df.filter(df.gainClass==">50K").groupBy("occupation").agg(avg("age").alias("Average age")).show()

+-----------------+------------------+
|       occupation|       Average age|
+-----------------+------------------+
|            Sales| 44.35876288659794|
|  Exec-managerial|44.893649974186886|
|   Prof-specialty|43.600220872446165|
|Handlers-cleaners| 43.24096385542169|
|  Farming-fishing| 47.06086956521739|
|     Craft-repair| 43.73568281938326|
| Transport-moving| 44.51724137931034|
|  Protective-serv| 41.48095238095238|
|    Other-service|41.371212121212125|
|     Tech-support| 43.14028776978417|
|Machine-op-inspct| 42.29795918367347|
|     Adm-clerical|43.299196787148595|
|  Priv-house-serv|              47.0|
|     Armed-Forces|              46.0|
+-----------------+------------------+



# Close spark session

In [20]:
spark.stop()