## SPARK documentation
https://spark.apache.org/docs/latest/api/python/index.html

In [1]:
# Import Spark bindings: execute the cluster-provided bindings script inside
# this notebook's namespace (execfile is a Python 2 builtin).
# NOTE(review): presumably this script creates the SparkContext `sc` that the
# later cells use -- confirm against the script.
# NOTE(review): the filename is spelled "binings"; verify it matches the file on disk.
execfile("/etc/spark/conf/spark_2.0.1_binings.py")

In [2]:
-

## After running the first two cells, go to the YARN application manager to verify that your job/Spark Context is running:

http://10.10.110.1:8088/cluster/scheduler

In [4]:
# Spark DataFrame APIs: Hive-aware SQL context, column types,
# column functions (as F) and window specifications.
from pyspark import HiveContext
from pyspark.sql.types import *
import pyspark.sql.functions as F
from pyspark.sql.window import Window

# SQL entry point built on top of the already-running SparkContext `sc`.
sqlContext = HiveContext(sc)

## IMPORTANT: when you are finished with your Spark notebook, remember to run "sc.stop()" to release resources.

## Spark example using LinkedIn data

In [3]:
# Read the raw text file into an RDD with one element per line,
# then peek at the first five lines.
raw_data_spark = sc.textFile(
    "/tmp/hadoop_nasjon_messy.txt"
)
raw_data_spark.take(5)

[u'# Number of users with "Hadoop" in their profile',
 u'Norway 273,(8),Telenor,(8),DNV GL,(5),Affecto,(5),Tata Consultancy Services,(4),Microsoft',
 u'Sweden 863,(55),Ericsson,(44),Spotify,(28),Klarna,(25),King,(18),KTH Royal Institute of Technology',
 u'France 6170,(137),Capgemini,(131),Amadeus IT Group,(117),Orange,(99),Criteo,(87),Sopra Steria',
 u'Denmark 419,(20),Nordea,(15),Danske Bank,(8),Microsoft,(8),SKAT,(7),Maersk Line']

In [6]:
# Drop the '#' header line and split every remaining record on commas.
# Each row is a plain string, so test its prefix with startswith(); blank
# lines are skipped as well, because indexing into an empty string would
# raise IndexError and fail the whole job.
raw_data_spark1 = raw_data_spark\
.filter(lambda row: row.strip() and not row.startswith('#'))\
.map(lambda row: row.split(','))
raw_data_spark1.take(2)

[[u'Norway 273',
  u'(8)',
  u'Telenor',
  u'(8)',
  u'DNV GL',
  u'(5)',
  u'Affecto',
  u'(5)',
  u'Tata Consultancy Services',
  u'(4)',
  u'Microsoft'],
 [u'Sweden 863',
  u'(55)',
  u'Ericsson',
  u'(44)',
  u'Spotify',
  u'(28)',
  u'Klarna',
  u'(25)',
  u'King',
  u'(18)',
  u'KTH Royal Institute of Technology']]

In [7]:
# Parse the leading "<country> <total>" field, e.g. "Norway 273",
# into two separate fields: "273", "Norway".
def split_country_total(row):
    """Return the row with its first field split into [total, country].

    The split is done once, from the right, on whitespace, so country names
    containing spaces (e.g. "United Kingdom 1234") stay intact and the total
    is always the last token. Raises ValueError if the first field does not
    contain both a country and a total.
    """
    country, total = row[0].rsplit(None, 1)
    return [total, country] + row[1:]

raw_data_spark2 = raw_data_spark1.map(split_country_total)

raw_data_spark2.take(2)

[[u'273',
  u'Norway',
  u'(8)',
  u'Telenor',
  u'(8)',
  u'DNV GL',
  u'(5)',
  u'Affecto',
  u'(5)',
  u'Tata Consultancy Services',
  u'(4)',
  u'Microsoft'],
 [u'863',
  u'Sweden',
  u'(55)',
  u'Ericsson',
  u'(44)',
  u'Spotify',
  u'(28)',
  u'Klarna',
  u'(25)',
  u'King',
  u'(18)',
  u'KTH Royal Institute of Technology']]

In [8]:
# Remove the parentheses around the counts and set dtypes.
def clean_fields(row):
    """Convert the count fields of a parsed row to int, leaving names untouched.

    After the previous step the fields alternate count, name, count, name, ...
    so only even positions hold counts such as "273" or "(8)". Restricting the
    clean-up to those positions keeps names that end in a parenthesis or that
    consist only of digits as unchanged strings. A count that is not purely
    digits is left as a (stripped) string, as before.
    """
    cleaned = []
    for position, field in enumerate(row):
        if position % 2 == 0:
            digits = field.strip().strip('()')
            cleaned.append(int(digits) if digits.isdigit() else digits)
        else:
            cleaned.append(field)
    return cleaned

raw_data_spark3 = raw_data_spark2.map(clean_fields)

raw_data_spark3.take(2)

[[273,
  u'Norway',
  8,
  u'Telenor',
  8,
  u'DNV GL',
  5,
  u'Affecto',
  5,
  u'Tata Consultancy Services',
  4,
  u'Microsoft'],
 [863,
  u'Sweden',
  55,
  u'Ericsson',
  44,
  u'Spotify',
  28,
  u'Klarna',
  25,
  u'King',
  18,
  u'KTH Royal Institute of Technology']]

In [9]:
# The rows are now clean: describe the columns, then build the DataFrame.
# Each entry is (column name, column type); every column is nullable.
column_specs = [
    ("num_country", IntegerType()),
    ("country", StringType()),
    ("num_top1", IntegerType()),
    ("top1_company", StringType()),
    ("num_top2", IntegerType()),
    ("top2_company", StringType()),
    ("num_top3", IntegerType()),
    ("top3_company", StringType()),
    ("num_top4", IntegerType()),
    ("top4_company", StringType()),
    ("num_top5", IntegerType()),
    ("top5_company", StringType()),
]
schema = StructType(
    [StructField(col_name, col_type, True) for col_name, col_type in column_specs]
)

hadoop_users = sqlContext.createDataFrame(raw_data_spark3, schema)
hadoop_users.show()

+-----------+-------+--------+--------------------+--------+----------------+--------+--------------------+--------+--------------------+--------+--------------------+
|num_country|country|num_top1|        top1_company|num_top2|    top2_company|num_top3|        top3_company|num_top4|        top4_company|num_top5|        top5_company|
+-----------+-------+--------+--------------------+--------+----------------+--------+--------------------+--------+--------------------+--------+--------------------+
|        273| Norway|       8|             Telenor|       8|          DNV GL|       5|             Affecto|       5|Tata Consultancy ...|       4|           Microsoft|
|        863| Sweden|      55|            Ericsson|      44|         Spotify|      28|              Klarna|      25|                King|      18|KTH Royal Institu...|
|       6170| France|     137|           Capgemini|     131|Amadeus IT Group|     117|              Orange|      99|              Criteo|      87|        Sopra 

In [10]:
# Registering the DataFrame under a table name makes it queryable
# through the Spark SQL API.
sqlContext.registerDataFrameAsTable(hadoop_users, "hadoop_users")

# Example query: the row for Norway only.
norway_users = sqlContext.sql(
    "SELECT * FROM hadoop_users WHERE country = 'Norway'"
)
norway_users.show()

+-----------+-------+--------+------------+--------+------------+--------+------------+--------+--------------------+--------+------------+
|num_country|country|num_top1|top1_company|num_top2|top2_company|num_top3|top3_company|num_top4|        top4_company|num_top5|top5_company|
+-----------+-------+--------+------------+--------+------------+--------+------------+--------+--------------------+--------+------------+
|        273| Norway|       8|     Telenor|       8|      DNV GL|       5|     Affecto|       5|Tata Consultancy ...|       4|   Microsoft|
+-----------+-------+--------+------------+--------+------------+--------+------------+--------+--------------------+--------+------------+



In [5]:
# Stop the SparkContext once the notebook is finished so the cluster
# resources held by this job are released.
sc.stop()