# SparkSQL assignment with Python
## José Vicente Mellado

### Reading file

In [None]:
## No need to run these lines, everything has been configured in spark-env.sh

#import findspark
#findspark.init('/spark_dir')

## Configure the SparkSession
#import pyspark
#from pyspark.sql import SparkSession

#spark = (SparkSession.builder
#         .master('local[*]')
#         .config('spark.driver.cores', 1)
#         .appName('estudio_spark')
#         .getOrCreate()
#        )
## Get the SparkContext from the SparkSession
#sc = spark.sparkContext

In [2]:
from pyspark.sql.types import *

# Column layout of tweets.csv as (column name, Spark SQL type) pairs, in file
# order. Every column is declared nullable when the schema is built below.
_tweet_columns = [
    ('ID', LongType()),
    ('PARENT-SYS-ID', StringType()),
    ('Source', StringType()),
    ('Mentions', StringType()),
    ('Target', StringType()),
    ('NAME Source', StringType()),
    ('BODY', StringType()),
    ('PUBDATE', TimestampType()),
    ('URLs coma separated', StringType()),
    ('Type TW-RT-MT', StringType()),
    ('LINK', StringType()),
    ('n1 Link', ByteType()),
    ('n1 Picture', ByteType()),
    ('PERSONAL-WEBSITE', StringType()),
    ('COUNTRY', StringType()),
    ('ALL-NICK-ACTIVITY-EVER', LongType()),
    ('NICK-FOLLOWERS', LongType()),
    ('FRIENDS-FOLLOWING-AUDIENCE', LongType()),
    ('LOCATION', StringType()),
]

# Explicit schema handed to the CSV reader (third StructField argument = nullable).
st = StructType(
    [StructField(column_name, column_type, True)
     for column_name, column_type in _tweet_columns]
)

# DataFrameReader.csv reference:
# https://spark.apache.org/docs/2.0.0-preview/api/python/pyspark.sql.html#pyspark.sql.DataFrameReader
#
# Load tweets.csv with the explicit schema `st`: the first line is a header,
# fields are tab-separated and PUBDATE is parsed as 'dd/MM/yyyy HH:mm'.
# The field separator keyword of DataFrameReader.csv is `sep`; `delimiter` is
# not part of its signature and makes the call fail with a TypeError.
df = spark.read.csv('tweets.csv',
                    header=True,
                    schema=st,
                    timestampFormat='dd/MM/yyyy HH:mm',
                    sep='\t',
                    mode='PERMISSIVE')

### Unique mentions

In [78]:
from pyspark.sql.functions import udf

def list_to_string(l):
    """Join the items of ``l`` into a comma-separated string without spaces.

    Each item is converted with ``str`` and has its space characters removed,
    then the items are joined with ``','``. Joining the items directly (rather
    than post-processing ``str(l)``) keeps characters such as ``[``, ``]`` and
    ``'`` that belong to an item intact.

    Args:
        l: list of items to serialise, or ``None``.

    Returns:
        The comma-separated string (``''`` for an empty list), or ``None``
        when ``l`` is ``None``.
    """
    if l is None:
        return None
    return ','.join(str(item).replace(' ', '') for item in l)

def remove_duplicates(list_string):
    """Split a comma-separated string into its unique, non-empty items.

    Items are trimmed of surrounding whitespace before being compared, so
    ``'a, a'`` yields a single ``'a'``. Empty and whitespace-only items are
    dropped. The result keeps first-occurrence order, which makes the output
    deterministic across runs.

    Args:
        list_string: comma-separated string, or ``None``.

    Returns:
        A list with the unique items, or ``None`` when ``list_string`` is
        ``None``.
    """
    if list_string is None:
        return None

    seen = set()
    unique_items = []
    for raw_item in list_string.split(','):
        item = raw_item.strip()
        if item and item not in seen:
            seen.add(item)
            unique_items.append(item)
    return unique_items

def _unique_mentions_or_none(mentions):
    """Return the de-duplicated, comma-separated mentions, or None for a null cell."""
    if mentions is None:
        return None
    return list_to_string(remove_duplicates(mentions))


# Spark UDF (string -> string) wrapping the helper above.
only_uniques = udf(_unique_mentions_or_none, StringType())


# Overwrite the 'Mentions' column with its de-duplicated version.
df = df.withColumn('Mentions', only_uniques(df.Mentions))

### a) Contabilizar el número total de menciones a los pilotos Marc Márquez, Valentino Rossi y Dani Pedrosa.



In [79]:
from pyspark.sql.functions import explode, split

# SQL predicate that keeps only the three rider accounts after exploding.
rider_accounts_predicate = '''Accounts = 'marcmarquez93' or 
            Accounts = 'valeyellow46' or 
            Accounts = '26_danipedrosa'
         '''

# Pre-filter: keep only rows whose 'Mentions' string contains at least one of
# the rider handles, so fewer rows have to be exploded.
mentions_column = df['Mentions']
mentions_any_rider = (mentions_column.like('%marcmarquez93%')
                      | mentions_column.like('%valeyellow46%')
                      | mentions_column.like('%26_danipedrosa%'))
candidate_mentions = df.select('Mentions').filter(mentions_any_rider)

# One row per mentioned account, then count the rows of each rider account.
mentioned_accounts = candidate_mentions.select(
    explode(split('Mentions', ',')).alias('Accounts')
)
(mentioned_accounts
 .filter(rider_accounts_predicate)
 .groupBy('Accounts')
 .count()
 .show())


+--------------+-----+
|      Accounts|count|
+--------------+-----+
| marcmarquez93|58027|
|26_danipedrosa|12341|
|  valeyellow46|61103|
+--------------+-----+



### b) Contabilizar los 5 países que más tweets han publicado (considerando los tweets que contengan dicha información).


In [60]:
# Tweets per country, ignoring rows whose COUNTRY is the 'not public' placeholder.
tweets_per_country = (df.select('COUNTRY')
                      .where(df['COUNTRY'] != 'not public')
                      .groupBy('COUNTRY')
                      .count())

# The five countries with the most tweets, largest first.
tweets_per_country.sort('count', ascending=False).take(5)

[Row(COUNTRY='es', count=172577),
 Row(COUNTRY='us', count=12722),
 Row(COUNTRY='gb', count=12588),
 Row(COUNTRY='id', count=8725),
 Row(COUNTRY='it', count=1843)]

### c) Contabilizar los 3 hashtags más utilizados (que aparezcan el mayor número de veces) en el cuerpo de los tweets (campo "body").


In [61]:
# One row per space-separated word of every tweet body that contains a '#'.
body_words = (df.select('BODY')
              .where(df['BODY'].like('%#%'))
              .select(explode(split('BODY', ' ')).alias('Words')))

# Keep only the words that start with '#' and count each distinct one.
hashtag_counts = body_words.where("Words like '#%'").groupBy('Words').count()

# The three most frequent hashtags, largest first.
hashtag_counts.sort('count', ascending=False).take(3)

[Row(Words='#motogp', count=51961),
 Row(Words='#qatar', count=9977),
 Row(Words='#moto3', count=5797)]

## Spark Streaming

In [None]:
# Please run first

### The following lines were provided by the teachers

In [None]:
from __future__ import print_function
from pyspark.streaming import StreamingContext
from pyspark.streaming.kafka import KafkaUtils
from operator import add
from operator import sub

# Create the Spark Streaming context on top of the existing SparkContext `sc`,
# using a batch interval of 5 seconds.
ssc = StreamingContext(sc, 5)

# Kafka broker address ("host:port") used by the metrics producer and by the
# direct stream created further down.
kafkaBrokerIPPort = "127.0.0.1:9092"

import kafka

class KafkaProducerWrapper(object):
  """Holder for a single, lazily created ``kafka.KafkaProducer``."""

  # Cached producer instance; stays None until getProducer() is first called.
  producer = None

  @staticmethod
  def getProducer(brokerList):
    """Return the cached producer, creating it on the first call.

    ``brokerList`` is only used when the producer is created; later calls
    return the cached instance whatever ``brokerList`` they pass. Keys and
    values are serialised with ``str.encode``.
    """
    if KafkaProducerWrapper.producer is None:
      KafkaProducerWrapper.producer = kafka.KafkaProducer(
          bootstrap_servers=brokerList,
          key_serializer=str.encode,
          value_serializer=str.encode)
    return KafkaProducerWrapper.producer
 
def sendMetrics(itr):
  """Publish every record of ``itr`` to the "metrics" Kafka topic.

  Each record is indexed as ``record[0]`` (used as the message key) and
  ``record[1]``; the message value is ``"<record[0]>,<record[1]>"``. The
  producer is flushed once after all records have been sent.
  """
  producer = KafkaProducerWrapper.getProducer([kafkaBrokerIPPort])
  for record in itr:
    metric_key = record[0]
    metric_line = metric_key + "," + str(record[1])
    producer.send("metrics", key=metric_key, value=metric_line)
  producer.flush()

In [None]:
# Kafka topic the tweets are read from.
topic_name = 'Quatar_GP_2014'

# Direct (receiver-less) Kafka stream; each record is indexed at position 1 and
# that element is converted to str, so `stream` carries plain strings.
kafkaParams = {"metadata.broker.list": kafkaBrokerIPPort}
stream = (KafkaUtils
          .createDirectStream(ssc, [topic_name], kafkaParams)
          .map(lambda record: str(record[1])))

### a) Calcular el número total de menciones recibidas por cada cuenta de usuario durante el intervalo de 5 segundos.
