# Podstawy Apache Spark - obiekty RDD

In [None]:
!pip install findspark

In [9]:
import findspark
# If the SPARK_HOME environment variable is set, init() can be called without arguments:
findspark.init()
# If it is not set, pass the Spark directory explicitly instead:
# findspark.init("spark-directory")

In [None]:
import pyspark
# getOrCreate() returns the already-running SparkContext when there is one,
# so re-executing this cell no longer fails with
# "Cannot run multiple SparkContexts at once".
sc = pyspark.SparkContext.getOrCreate(pyspark.SparkConf().setAppName("HelloSpark"))

In [12]:
sc

In [72]:
arr = [1,2,3,4,5,6]
arr

[1, 2, 3, 4, 5, 6]

In [34]:
rdd = sc.parallelize(arr)
rdd

ParallelCollectionRDD[13] at readRDDFromFile at PythonRDD.scala:274

In [35]:
rdd.filter(lambda x: x%2==0).collect()

[2, 4, 6]

In [18]:
rdd.collect()

[1, 2, 3, 4, 5, 6]

In [19]:
rdd_power = rdd.map(lambda x: x*x)

In [20]:
rdd_power.collect()

[1, 4, 9, 16, 25, 36]

In [21]:
def power(x):
    """Return the square of ``x``, i.e. ``x`` multiplied by itself."""
    squared = x * x
    return squared

In [23]:
rdd_power = rdd.map(power)

In [24]:
rdd_power2 = rdd_power.map(power)

In [25]:
rdd_power2.collect()

[1, 16, 81, 256, 625, 1296]

In [26]:
rdd_power.map(power).map(power).map(power).map(power).collect()

[1,
 4294967296,
 1853020188851841,
 18446744073709551616,
 23283064365386962890625,
 7958661109946400884391936]

In [27]:
rdd.count()

6

In [28]:
rdd.take(2)

[1, 2]

In [73]:
import os
from pathlib import Path

# Locate README.md inside the Spark installation instead of hard-coding one
# machine's path. The original location is kept as the fallback when
# SPARK_HOME is unset or empty. as_posix() keeps forward slashes, matching
# the form of the original path.
spark_home = os.environ.get("SPARK_HOME") or "c:/opt/spark-3.1.1-bin-hadoop2.7"
file_rdd = sc.textFile(Path(spark_home, "README.md").as_posix())

In [75]:
file_rdd.count()

108

In [76]:
file_rdd.first()

'# Apache Spark'

In [77]:
# Keep only the non-empty lines (strings of length greater than zero).
filtered_rdd = file_rdd.filter(lambda linia: len(linia) > 0)

In [79]:
liczba_niepustych_lini = filtered_rdd.count()
liczba_niepustych_lini

67

In [80]:
filtered_rdd.filter(lambda linia: "Python" in linia).collect()

['high-level APIs in Scala, Java, Python, and R, and an optimized engine that',
 '## Interactive Python Shell',
 'Alternatively, if you prefer Python, you can use the Python shell:']

In [43]:
# map() produces one element per line: the list of whitespace-separated tokens of that line.
words = filtered_rdd.map(lambda linia: linia.split())

In [50]:
words.map(lambda slowa: len(slowa)).take(10)

[3, 12, 13, 11, 12, 8, 6, 1, 2, 2]

In [52]:
# flatMap() splits every line on whitespace and flattens the per-line lists
# into a single RDD whose elements are the individual words.
words_flat_rdd = filtered_rdd.flatMap(lambda linia: linia.split())

In [54]:
words_flat_rdd.take(10)

['#',
 'Apache',
 'Spark',
 'Spark',
 'is',
 'a',
 'unified',
 'analytics',
 'engine',
 'for']

In [55]:
words_flat_rdd.count()

495

In [56]:
words_flat_rdd.distinct().count()

288

In [60]:
stop_words = ["for", "is", "the", "of"]

In [57]:
distinct_words = words_flat_rdd.distinct()

In [61]:
# Keep words longer than one character that are not on the stop-word list.
ok_words = distinct_words.filter(
    lambda word: len(word) > 1 and word not in stop_words
)

In [62]:
ok_words.count()

279

In [64]:
ok_words.take(5)

['Apache', 'Spark', 'unified', 'analytics', 'engine']

In [65]:
import shutil

# saveAsTextFile() writes a directory of part-files and raises when the target
# already exists, so remove the output of a previous run first to keep this
# cell re-runnable. ignore_errors=True makes the removal a no-op on first run.
output_dir = "c:/opt/spark-3.1.1-bin-hadoop2.7/words_ok.txt"
shutil.rmtree(output_dir, ignore_errors=True)
ok_words.saveAsTextFile(output_dir)

In [66]:
# Pair every word with the count 1, producing (word, 1) tuples that can be aggregated by key.
word_tuples = words_flat_rdd.map(lambda word: (word, 1))

In [67]:
word_tuples.take(5)

[('#', 1), ('Apache', 1), ('Spark', 1), ('Spark', 1), ('is', 1)]

In [69]:
# Add up the 1s attached to each word to obtain the number of occurrences per word.
# NOTE(review): collect() returns every (word, count) pair, so the displayed
# output grows with the vocabulary; consider take(n) if a sample is enough.
word_tuples.reduceByKey(lambda a, b: a + b).collect()

[('#', 1),
 ('Apache', 1),
 ('Spark', 14),
 ('is', 7),
 ('unified', 1),
 ('analytics', 1),
 ('engine', 2),
 ('It', 2),
 ('provides', 1),
 ('high-level', 1),
 ('APIs', 1),
 ('in', 5),
 ('Scala,', 1),
 ('Java,', 1),
 ('an', 4),
 ('optimized', 1),
 ('supports', 2),
 ('computation', 1),
 ('analysis.', 1),
 ('set', 2),
 ('of', 5),
 ('tools', 1),
 ('SQL', 2),
 ('MLlib', 1),
 ('machine', 1),
 ('learning,', 1),
 ('GraphX', 1),
 ('graph', 1),
 ('processing,', 1),
 ('Structured', 1),
 ('<https://spark.apache.org/>', 1),
 ('Build](https://img.shields.io/appveyor/ci/ApacheSoftwareFoundation/spark/master.svg?style=plastic&logo=appveyor)](https://ci.appveyor.com/project/ApacheSoftwareFoundation/spark)',
  1),
 ('Coverage](https://img.shields.io/badge/dynamic/xml.svg?label=pyspark%20coverage&url=https%3A%2F%2Fspark-test.github.io%2Fpyspark-coverage-site&query=%2Fhtml%2Fbody%2Fdiv%5B1%5D%2Fdiv%2Fh1%2Fspan&colorB=brightgreen&style=plastic)](https://spark-test.github.io/pyspark-coverage-site)',
  1),
 (