In [1]:
import pyspark.sql.functions as f
from pyspark.sql.session import SparkSession , DataFrame


# Architecture
## Standalone
![image](jupyter-files/standalone.png)
## Kubernetes
![image](jupyter-files/kuber.png)
## Yarn

![image](jupyter-files/yarn.png)






# Make a SparkSession


In [2]:
# Configure a builder for an application named "Presentation", then obtain the
# session from it via getOrCreate().
session_builder = SparkSession.builder.appName("Presentation")
spark = session_builder.getOrCreate()
# Leave the session as the last expression so the notebook renders it.
spark

23/05/19 20:27:59 WARN Utils: Your hostname, hoseins-MacBook-Pro.local resolves to a loopback address: 127.0.0.1; using 192.168.1.102 instead (on interface en0)
23/05/19 20:27:59 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
23/05/19 20:27:59 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


# Read Data

In [None]:
# Read and create DataFrames from different file formats.
# People come from a JSON file.
df_people = spark.read.json('data/people.json')
# Sales come from a CSV file that has a header row; column types are inferred.
df_sales = spark.read.csv(
    'data/sales_info.csv',
    header=True,
    inferSchema=True,
)



In [None]:
# Print the schema of each DataFrame, people first and then sales.
for frame in (df_people, df_sales):
    frame.printSchema()


In [None]:
# Show the rows of each DataFrame, people first and then sales.
for frame in (df_people, df_sales):
    frame.show()


# Basics (Spark SQL)

In [None]:
# Evaluate the bare DataFrame so the notebook renders the object itself
# (no .show() is called here).
df_people

In [None]:
# Look up a single column by name and render the resulting object
# (no rows are shown here).
age_column = df_people['age']
age_column

In [None]:
# Check the Spark UI while running this cell: what is a job?
# Compare with the bare `df_people.select('name')`, i.e. without `.show()`.
names_only = df_people.select('name')
names_only.show()

In [None]:
# DataFrames, like RDDs, are immutable: the result of withColumn is shown here
# but not assigned, and df_people itself is not reassigned.
age_expr = f.col("age")
df_people.withColumn("new_col", age_expr).show()

In [None]:
# Spark is lazy
new_people_df = df_people.withColumn("new_col", f.col("age"))

# Match each person to their sales rows: people.name == sales.PersonName.
name_matches = new_people_df.name == df_sales.PersonName
people_with_sales = new_people_df.join(df_sales, on=name_matches)
people_with_sales.show()


In [None]:
# PersonName was the sales-side join key; show the joined data without it.
person_name_col = f.col("PersonName")
without_person_name = people_with_sales.drop(person_name_col)
without_person_name.show()


In [None]:
# GroupBy: sum the Sales column within each Company group.
grouped_by_company = people_with_sales.groupBy("Company")
total_sales = f.sum("Sales")
grouped_by_company.agg(total_sales).show()

In [None]:
# Use SQL syntax: register the joined DataFrame as a temporary view, then
# query that view by name through spark.sql.
people_with_sales.createOrReplaceTempView("temp_table")
all_rows = spark.sql("select * from temp_table")
all_rows.show()

# Spark MLlib

In [3]:
# NOTE: importing pyspark.ml requires NumPy in the kernel's environment;
# without it this cell fails with "ModuleNotFoundError: No module named 'numpy'".
from pyspark.ml.clustering import KMeans

# Loads data.
dataset = spark.read.format("libsvm").load("data/sample_kmeans_data.txt")

# Trains a k-means model with k=2 and a fixed seed so reruns are repeatable.
kmeans = KMeans().setK(2).setSeed(1)
model = kmeans.fit(dataset)

# Evaluate clustering by computing Within Set Sum of Squared Errors.
# KMeansModel.computeCost() was deprecated in Spark 2.4 and removed in 3.0.
# The training summary exposes the same cost for the dataset the model was
# fitted on, which is the dataset evaluated here.
wssse = model.summary.trainingCost
print("Within Set Sum of Squared Errors = " + str(wssse))

# Shows the result.
centers = model.clusterCenters()
print("Cluster Centers: ")
for center in centers:
    print(center)

ModuleNotFoundError: No module named 'numpy'

# Spark Streaming
![image](jupyter-files/streaming2.png)


### WordCount | SparkStreaming with windowing

In [None]:
from pyspark import SparkContext
from pyspark.streaming import StreamingContext

# Create a StreamingContext on the existing SparkContext with a batch interval
# of 3 seconds.
ssc = StreamingContext(spark.sparkContext, 3)

# Read text lines from a socket server on localhost:9999.
lines = ssc.socketTextStream("localhost", 9999)
words = lines.flatMap(lambda line: line.split(" "))
pairs = words.map(lambda word: (word, 1))

# Count each word over a 6-second window (two 3-second batches).
# Applying window() to per-batch counts would only union those batches, so a
# word seen in both would be printed twice with partial counts.
# reduceByKeyAndWindow merges them into one total per word; the inverse
# function is None, so each window is simply recomputed.
wordCounts = pairs.reduceByKeyAndWindow(lambda x, y: x + y, None, 6)
wordCounts.pprint()

print(type(lines))


# Start the computation; awaitTermination() blocks this cell until the
# streaming context is stopped.
ssc.start()
ssc.awaitTermination()


## Spark Structured Streaming

In [None]:
# Create DataFrame representing the stream of input lines from connection to localhost:9999
# Start a test server first with: `nc -lk 9999`
# checkpointLocation is a writeStream option, so it is not set on the reader.
lines = spark \
    .readStream \
    .format("socket") \
    .option("host", "localhost") \
    .option("port", 9999) \
    .load()

# Split the lines into words
words = lines.select(
   f.explode(
       f.split(lines.value, " ")
   ).alias("word")
)

# Generate running word count
wordCounts = words.groupBy("word").count()

# Configure the sink: print the complete counts table to the console and
# keep streaming state under the "checkpoint" directory.
writer = wordCounts \
    .writeStream \
    .outputMode("complete") \
    .option("checkpointLocation", "checkpoint")\
    .format("console")

# start() returns the StreamingQuery handle; keep it so the stream can be
# inspected or stopped later with query.stop().
query = writer.start()
query
