# SparkSQL assignment with Python
## José Vicente Mellado

### Reading file

In [None]:
## No need to run these lines, everything has been configured in spark-env.sh

#import findspark
#findspark.init('/spark_dir')

## Configure the SparkSession
#import pyspark
#from pyspark.sql import SparkSession

#spark = (SparkSession.builder
#         .master('local[*]')
#         .config('spark.driver.cores', 1)
#         .appName('estudio_spark')
#         .getOrCreate()
#        )
## Get the SparkContext from the SparkSession
#sc = spark.sparkContext

In [1]:
from pyspark.sql.types import *

# Schema of tweets.csv, one .add() per column in file order.
# StructType.add() defaults to nullable=True, so every column is nullable.
st = (StructType()
      .add('ID', LongType())
      .add('PARENT-SYS-ID', StringType())
      .add('Source', StringType())
      .add('Mentions', StringType())
      .add('Target', StringType())
      .add('NAME Source', StringType())
      .add('BODY', StringType())
      .add('PUBDATE', TimestampType())
      .add('URLs coma separated', StringType())
      .add('Type TW-RT-MT', StringType())
      .add('LINK', StringType())
      .add('n1 Link', ByteType())
      .add('n1 Picture', ByteType())
      .add('PERSONAL-WEBSITE', StringType())
      .add('COUNTRY', StringType())
      .add('ALL-NICK-ACTIVITY-EVER', LongType())
      .add('NICK-FOLLOWERS', LongType())
      .add('FRIENDS-FOLLOWING-AUDIENCE', LongType())
      .add('LOCATION', StringType()))

# Reader options are documented at:
# https://spark.apache.org/docs/2.0.0-preview/api/python/pyspark.sql.html#pyspark.sql.DataFrameReader
# The file is tab-separated with a header row; timestamps (the PUBDATE column)
# are parsed with the day-first pattern given in timestampFormat.
df = (spark.read
      .schema(st)
      .csv('tweets.csv',
           header=True,
           sep='\t',
           timestampFormat='dd/MM/yyyy HH:mm',
           mode='PERMISSIVE'))

### a) Contabilizar el número total de menciones a los pilotos Marc Márquez, Valentino Rossi y Dani Pedrosa.



In [51]:
# Number of tweets mentioning each of the three riders.
# The LIKE pre-filter is a substring match, so the exact comparison inside
# flatMap is still needed; set() makes a rider count at most once per tweet.
(df.select('Mentions')
 .where(df['Mentions'].like('%marcmarquez93%') |
        df['Mentions'].like('%valeyellow46%') |
        df['Mentions'].like('%26_danipedrosa%'))
 .rdd
 .flatMap(lambda row: [handle
                       for handle in set(row[0].split(','))
                       if handle in ('marcmarquez93', 'valeyellow46', '26_danipedrosa')])
 .map(lambda handle: (handle, 1))
 .reduceByKey(lambda left, right: left + right)
 .collect()
)

# Alternative query
#from pyspark.sql.functions import explode, split
#
#(df.select('ID', 'Mentions')
# .filter(df['Mentions'].like('%marcmarquez93%') | 
#         df['Mentions'].like('%valeyellow46%') | 
#         df['Mentions'].like('%26_danipedrosa%'))
# .withColumn('Mentions', explode((split('Mentions', ','))))
# .distinct()
# .filter('''Mentions = 'marcmarquez93' or 
#            Mentions = 'valeyellow46' or 
#            Mentions = '26_danipedrosa'
#         ''')
# .groupBy('Mentions')
# .count() 
# .show())

[('26_danipedrosa', 12341), ('marcmarquez93', 58027), ('valeyellow46', 61103)]

### b) Contabilizar los 5 países que más tweets han publicado (considerando los tweets que contengan dicha información).


In [15]:
# Tweets per country, largest first, keeping the first five rows.
# Rows whose COUNTRY is the 'not public' placeholder are excluded.
(df.where(df['COUNTRY'] != 'not public')
 .select('COUNTRY')
 .groupBy('COUNTRY')
 .count()
 .sort('count', ascending=False)
 .limit(5)
 .collect()
)

[Row(COUNTRY='es', count=172577),
 Row(COUNTRY='us', count=12722),
 Row(COUNTRY='gb', count=12588),
 Row(COUNTRY='id', count=8725),
 Row(COUNTRY='it', count=1843)]

### c) Contabilizar los 3 hashtags más utilizados (que aparezcan el mayor número de veces) en el cuerpo de los tweets (campo "body").


In [52]:
# Three most frequent hashtags in BODY.
# A hashtag is any space-separated token starting with '#'; set() makes a
# hashtag count at most once per tweet.
(df.select('BODY')
 .where(df['BODY'].like('%#%'))
 .rdd
 .flatMap(lambda row: [token
                       for token in set(row[0].split(' '))
                       if token.startswith('#')])
 .map(lambda tag: (tag, 1))
 .reduceByKey(lambda left, right: left + right)
 # Negated count so that takeOrdered (ascending) yields the largest counts.
 .takeOrdered(3, key=lambda pair: -pair[1])
)

# Alternative query
#(df.select('ID', 'BODY')
# .filter(df['BODY'].like('%#%'))
# .withColumn('BODY', explode((split('BODY', ' '))))
# .distinct()
# .filter("BODY like '#%'")
# .groupBy('BODY')
# .count()
# .orderBy('count', ascending=False)
# .take(3)
#)

[('#motogp', 51911), ('#qatar', 9974), ('#moto3', 5793)]

# Spark Streaming

## Before running the streaming cells, start ZooKeeper, Kafka and the tweet producer from a terminal:



cd ./kafka_2.11-0.10.2.0/
./bin/zookeeper-server-start.sh ./config/zookeeper.properties

./bin/kafka-server-start.sh ./config/server.properties

cd ./sparksql-sparkstreaming-kafka

source activate conda_env_name

python ./timestamp_kafka_producer.py Quatar_GP_2014 ./tweets.csv 

### Note: tweets.csv is sorted by date

### The following lines were provided by the teachers

In [2]:
from __future__ import print_function
from pyspark.streaming import StreamingContext
from pyspark.streaming.kafka import KafkaUtils
from operator import add
from operator import sub

# Streaming context built on the existing SparkContext, with 5-second batches.
ssc = StreamingContext(sc, batchDuration=5)

# host:port of the Kafka broker, used by both the input stream and the producer.
kafkaBrokerIPPort = "127.0.0.1:9092"

import kafka

class KafkaProducerWrapper(object):
  """Holds a single kafka.KafkaProducer that is created on first use.

  The instance is cached on the class, so every later call to getProducer()
  returns the same object.
  """
  # Cached producer; stays None until getProducer() is called for the first time.
  producer = None

  @staticmethod
  def getProducer(brokerList):
    """Return the cached producer, creating it on the first call.

    brokerList is passed as bootstrap_servers when the producer is created.
    Later calls return the cached instance and ignore the argument.
    Keys and values are serialized with str.encode.
    """
    # Identity test: `!= None` would dispatch to the cached object's own __ne__.
    if KafkaProducerWrapper.producer is None:
      KafkaProducerWrapper.producer = kafka.KafkaProducer(
          bootstrap_servers=brokerList,
          key_serializer=str.encode,
          value_serializer=str.encode)
    return KafkaProducerWrapper.producer
 
def sendMetrics(itr):
  """Publish every item of `itr` to the "metrics" Kafka topic.

  Each item is indexed as (key, count): the message key is item[0] and the
  message value is "<item[0]>,<item[1]>". The producer is flushed once after
  all items have been sent.
  """
  producer = KafkaProducerWrapper.getProducer([kafkaBrokerIPPort])
  for metric in itr:
    name = metric[0]
    payload = ",".join((name, str(metric[1])))
    producer.send("metrics", key=name, value=payload)
  producer.flush()

In [3]:
import ujson

# Topic name; must match the one passed to timestamp_kafka_producer.py.
topic_name = 'Quatar_GP_2014'

kafkaParams = {"metadata.broker.list": kafkaBrokerIPPort}

# Element 1 of each Kafka record is parsed as JSON
# (presumably the message value of a (key, value) pair -- see the KafkaUtils docs).
stream = (KafkaUtils
          .createDirectStream(ssc, topics=[topic_name], kafkaParams=kafkaParams)
          .map(lambda record: ujson.loads(record[1])))

### a) Calcular el número total de menciones recibidas por cada cuenta de usuario durante el intervalo de 5 segundos.


In [135]:
ssc.checkpoint("checkpoint")

# Mentions per user within each batch: split the comma-separated Mentions
# field, drop empty entries, then sum one per occurrence.
counts = (stream
          .flatMap(lambda tweet: [user for user in tweet['Mentions'].split(',') if user])
          .map(lambda user: (user, 1))
          .reduceByKey(add))

counts.pprint()

# To publish the result to the "metrics" Kafka topic instead, uncomment the next line
#counts.foreachRDD(lambda rdd: rdd.foreachPartition(sendMetrics))

In [131]:
# Start the streaming computation registered on ssc.
ssc.start()

### b) Calcular la frecuencia total acumulada de apariciones de cada hashtag en el campo body, actualizando un ranking con los 5 hashtags con mayor frecuencia de aparición.


In [4]:
# Checkpoint directory for the stateful transformation below.
ssc.checkpoint("checkpoint")

# Running hashtag totals across batches, sorted by total (largest first).
# A hashtag is any space-separated token of BODY starting with '#';
# set() makes a hashtag count at most once per tweet.
top_5 = (stream
         .flatMap(lambda tweet: [word
                                 for word in set(tweet['BODY'].split(' '))
                                 if word.startswith('#')])
         .map(lambda tag: (tag, 1))
         # State is None the first time a hashtag is seen.
         .updateStateByKey(lambda batch_ones, running:
                           sum(batch_ones) + (0 if running is None else running))
         .transform(lambda rdd: rdd.sortBy(lambda pair: pair[1], ascending=False)))

# Print only the first five entries of the sorted ranking for each batch.
top_5.pprint(5)

In [5]:
# Start the streaming computation registered on ssc.
ssc.start()

-------------------------------------------
Time: 2017-06-16 17:44:25
-------------------------------------------
('#motogp', [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1])
('#qatar', [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1])
('#unleashthebeast...', [1, 1, 1, 1, 1, 1, 1])
('#moto2', [1, 1, 1, 1, 1, 1])
('#valentinorossi', [1, 1, 1, 1])
('#moto3', [1, 1, 1, 1])
('#motogp:', [1, 1, 1, 1])
('#gobrad!', [1, 1, 1, 1])
('#losail', [1, 1, 1, 1])
('#marquez', [1, 1, 1])
...

-------------------------------------------
Time: 2017-06-16 17:44:30
-------------------------------------------
('#motogp', [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 

In [6]:
# Stop streaming but leave the underlying SparkContext running.
ssc.stop(stopSparkContext=False)