## SPARK documentation
https://spark.apache.org/docs/latest/api/python/index.html

In [None]:
# Import Spark bindings: run the bindings script inside this notebook's namespace.
# `execfile` is a Python 2 builtin, so this cell needs a Python 2 kernel.
# NOTE(review): the file name reads "binings" (not "bindings") - confirm it matches the file on disk.
execfile("/etc/spark/conf/spark_1.6.0_binings.py")

In [None]:
# Deployment mode, application name, and the resources to claim.
master = 'yarn-client'  # 'yarn-client' runs distributed on YARN, 'local' runs locally
AppName = "Spark Template"
num_executors = 1
exec_memory = 1    # gigabytes per executor; total executor memory = num_executors * exec_memory
driver_memory = 1  # gigabytes for the driver


#############--==DO NOT EDIT==--###############
from pyspark import SparkConf, SparkContext

# Build the configuration as one chained expression (SparkConf.set returns the conf).
# NOTE(review): the Spark configuration docs say spark.driver.memory may have no
# effect when set through SparkConf in client mode - confirm for this cluster.
sconf = (
    SparkConf()
    .set('spark.master', master)
    .set('spark.executor.instances', str(num_executors))  # number of executors
    .set('spark.executor.memory', str(exec_memory) + 'g')
    .set('spark.driver.memory', str(driver_memory) + 'g')
    .set('spark.app.name', AppName)  # application name
    .set('spark.app.id', AppName)
)
# Optional settings, left disabled:
#   sconf.set('spark.shuffle.service.enabled', True)
#   sconf.set('spark.dynamicAllocation.enabled', True)
#   sconf.set('spark.executor.cores', '2')  # number of cores on the same worker

# Start the Spark context with the configuration above.
sc = SparkContext(conf=sconf)
###############################################

## After running the first two cells, go to the YARN application manager to verify that your job / Spark Context is running:

http://10.10.110.1:8088/cluster/scheduler

In [None]:
# Spark DataFrame API: context class, column types, column functions and windows.
from pyspark import HiveContext
from pyspark.sql.types import *
from pyspark.sql.window import Window
import pyspark.sql.functions as F

# SQL/DataFrame entry point built on top of the running SparkContext.
sqlContext = HiveContext(sc)

## IMPORTANT: when you are finished with your Spark notebook, remember to run "sc.stop()" to release resources.

## Spark example using LinkedIn data

In [None]:
# Load the raw text file into Spark and preview the first five records.
raw_data_path = "/tmp/hadoop_nasjon_messy.txt"
raw_data_spark = sc.textFile(raw_data_path)
raw_data_spark.take(5)

In [None]:
# Drop header lines (those starting with '#') and split every remaining line
# into its comma-separated fields.
# Blank lines are skipped too: indexing the first character of an empty string
# raises IndexError, which would abort the whole job on a single empty line.
raw_data_spark1 = (
    raw_data_spark
    .filter(lambda line: line.strip() != '' and not line.startswith('#'))
    .map(lambda line: line.split(','))
)
raw_data_spark1.take(2)

In [None]:
# Split the first field into country and count, and put the count first.
# Assumed format (inferred from the original comment and the later paren
# stripping - confirm against the data): "Norway (273)" -> "(273)", "Norway".
def _count_before_country(fields):
    """Return the row with its first field split into [count, country].

    The field is split once from the right on whitespace, so a multi-word
    country name such as "United States (5)" stays intact. Raises ValueError
    when the first field does not contain two whitespace-separated parts.
    """
    country, count = fields[0].strip().rsplit(None, 1)
    return [count, country] + fields[1:]


raw_data_spark2 = raw_data_spark1.map(_count_before_country)

raw_data_spark2.take(2)

In [None]:
# Remove unwanted characters, then set the data types.
def _strip_parens(fields):
    """Strip leading and trailing parentheses from every field of a row."""
    return [field.strip('()') for field in fields]


def _digits_to_int(fields):
    """Convert all-digit fields to int and leave every other field untouched."""
    converted = []
    for field in fields:
        converted.append(int(field) if field.isdigit() else field)
    return converted


raw_data_spark3 = raw_data_spark2.map(_strip_parens).map(_digits_to_int)

raw_data_spark3.take(2)

In [None]:
# The data is parsed: describe its columns, then build the DataFrame.
# Each entry is (column name, column type); every column is nullable.
schema_columns = [
    ("num_country", IntegerType),
    ("country", StringType),
    ("num_top1", IntegerType),
    ("top1_company", StringType),
    ("num_top2", IntegerType),
    ("top2_company", StringType),
    ("num_top3", IntegerType),
    ("top3_company", StringType),
    ("num_top4", IntegerType),
    ("top4_company", StringType),
    ("num_top5", IntegerType),
    ("top5_company", StringType),
]
schema = StructType(
    [StructField(col_name, col_type(), True) for col_name, col_type in schema_columns]
)

hadoop_users = sqlContext.createDataFrame(raw_data_spark3, schema)
hadoop_users.show()

In [None]:
# Register the DataFrame as a table so it can be queried through the Spark SQL API.
norway_query = "SELECT * FROM hadoop_users WHERE country = 'Norway'"
sqlContext.registerDataFrameAsTable(hadoop_users, "hadoop_users")
norway_rows = sqlContext.sql(norway_query)
norway_rows.show()

In [None]:
# Stop the SparkContext created at the top of the notebook to release its resources.
sc.stop()