# Exemplo do cálculo da frequência de palavras 

## Verificar que existe o SparkContext:

In [1]:
# Evaluate `sc` as the cell's last expression so the notebook displays it.
# Nothing in this notebook creates `sc`; presumably the PySpark launcher
# injects it -- a NameError here means the kernel has no SparkContext.
sc

<pyspark.context.SparkContext at 0x7f4f75f97860>

## Escolha do dataset: local ou no HDFS

### Ficheiro local:

In [43]:
# Path of the input text on the local filesystem, read later by sc.textFile(file).
# The commented-out line is an alternative dataset; swap the comments to use it.
#file = '/home/ABD/datasets/other/pg100.txt'
file = '/home/ABD/datasets/ch6/wiki_01'

In [41]:
# Shell escape: list the selected local dataset to confirm it exists and to
# see its size. The commented-out line checks the alternative dataset instead.
#!ls -l '/home/ABD/datasets/other/pg100.txt'
!ls -l '/home/ABD/datasets/ch6/wiki_01'

-rw-rw-r-- 1 spark spark 524283134 May 19  2017 /home/ABD/datasets/ch6/wiki_01


### Ficheiro no HDFS:

In [23]:
# hdfs:// URI of the wiki_01 dataset (host `master`, port 9000), used by the
# HDFS-based counting cell further down.
file_hdfs = "hdfs://master:9000/home/ABD/datasets/ch6/wiki_01"

## Ler o ficheiro:

In [24]:
#text_file = sc.textFile(file_hdfs)
import time

## Contar a frequência de palavras localmente

In [47]:
# Word count over the local file, timed end to end.
# The transformations below only describe the computation; the count() action
# is what makes Spark run it, so it has to sit inside the timed region.
t0 = time.time()
text_file = sc.textFile(file)

# One element per space-separated token of every line.
words = text_file.flatMap(lambda line: line.split(" "))
# Pair each token with an initial count of 1 ...
pairs = words.map(lambda word: (word, 1))
# ... and add the counts up per token: one (token, total) pair per distinct token.
counts = pairs.reduceByKey(lambda left, right: left + right)

# Number of pairs in `counts`, i.e. the number of distinct tokens.
number_of_words = counts.count()
t1 = time.time()
elapsed = t1 - t0

print("Número de palavras no ficheiro: %d" % number_of_words)
print("O cálculo demorou %.2fs." % elapsed)


Número de palavras no ficheiro: 2482193
O cálculo demorou 18.23s.


## Contar a frequência de palavras localmente (ficheiro no HDFS)

In [48]:
# Same word count as for the local file, but reading the input from HDFS.
# The transformations below only describe the computation; the count() action
# is what makes Spark run it, so it has to sit inside the timed region.
t0 = time.time()
text_file = sc.textFile(file_hdfs)

# One element per space-separated token of every line.
words = text_file.flatMap(lambda line: line.split(" "))
# Pair each token with an initial count of 1 ...
pairs = words.map(lambda word: (word, 1))
# ... and add the counts up per token: one (token, total) pair per distinct token.
counts = pairs.reduceByKey(lambda left, right: left + right)

# Number of pairs in `counts`, i.e. the number of distinct tokens.
number_of_words = counts.count()
t1 = time.time()
elapsed = t1 - t0

print("Número de palavras no ficheiro: %d" % number_of_words)
print("O cálculo demorou %.2fs." % elapsed)


Número de palavras no ficheiro: 2482193
O cálculo demorou 30.86s.


## Contar a frequência de palavras no cluster

In [52]:
%%file words_frequency.py
import pyspark
import time

conf = pyspark.SparkConf()
sc = pyspark.SparkContext(conf=conf)
file_hdfs = "hdfs://master:9000/home/ABD/datasets/ch6/wiki_01"
t0 = time.time()
text_file = sc.textFile(file_hdfs)
counts = text_file.flatMap(lambda line: line.split(" ")) \
             .map(lambda word: (word, 1)) \
             .reduceByKey(lambda a, b: a + b)
number_of_words = counts.count()
t1 = time.time()
print ("Número de palavras no ficheiro: %d" % (number_of_words))
print ("O cálculo demorou %.2fs." % (t1-t0))

Overwriting words_frequency.py


In [53]:
# Shell escape: print words_frequency.py to verify what the %%file cell wrote to disk.
!cat words_frequency.py

import pyspark
import time

conf = pyspark.SparkConf()
sc = pyspark.SparkContext(conf=conf)
file_hdfs = "hdfs://master:9000/home/ABD/datasets/ch6/wiki_01"
t0 = time.time()
text_file = sc.textFile(file_hdfs)
counts = text_file.flatMap(lambda line: line.split(" ")) \
             .map(lambda word: (word, 1)) \
             .reduceByKey(lambda a, b: a + b)
number_of_words = counts.count()
t1 = time.time()
print ("Número de palavras no ficheiro: %d" % (number_of_words))
print ("O cálculo demorou %.2fs." % (t1-t0))

In [54]:
# Submit words_frequency.py to the Spark master at spark://192.168.1.105:7077.
# NOTE(review): the spark-submit location and the master URL are hard-coded
# for this cluster; adjust both when running elsewhere.
!/usr/share/spark/bin/spark-submit \
--master spark://192.168.1.105:7077 \
words_frequency.py

Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
17/12/14 10:25:34 INFO SparkContext: Running Spark version 2.1.0
17/12/14 10:25:35 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
17/12/14 10:25:35 INFO SecurityManager: Changing view acls to: joao
17/12/14 10:25:35 INFO SecurityManager: Changing modify acls to: joao
17/12/14 10:25:35 INFO SecurityManager: Changing view acls groups to: 
17/12/14 10:25:35 INFO SecurityManager: Changing modify acls groups to: 
17/12/14 10:25:35 INFO SecurityManager: SecurityManager: authentication disabled; ui acls disabled; users  with view permissions: Set(joao); groups with view permissions: Set(); users  with modify permissions: Set(joao); groups with modify permissions: Set()
17/12/14 10:25:35 INFO Utils: Successfully started service 'sparkDriver' on port 39487.
17/12/14 10:25:35 INFO SparkEnv: Registering MapOutputTracker
17/12/14 10:25:35 INF

17/12/14 10:25:39 INFO BlockManagerMasterEndpoint: Registering block manager 192.168.1.100:33820 with 366.3 MB RAM, BlockManagerId(2, 192.168.1.100, 33820, None)
17/12/14 10:25:39 INFO BlockManagerMasterEndpoint: Registering block manager 192.168.1.102:33729 with 366.3 MB RAM, BlockManagerId(0, 192.168.1.102, 33729, None)
17/12/14 10:25:39 INFO BlockManagerInfo: Added broadcast_1_piece0 in memory on 192.168.1.101:46705 (size: 6.1 KB, free: 366.3 MB)
17/12/14 10:25:39 INFO BlockManagerInfo: Added broadcast_0_piece0 in memory on 192.168.1.101:46705 (size: 22.9 KB, free: 366.3 MB)
17/12/14 10:26:11 INFO TaskSetManager: Finished task 3.0 in stage 0.0 (TID 3) in 32595 ms on 192.168.1.101 (executor 1) (1/4)
17/12/14 10:26:14 INFO TaskSetManager: Finished task 2.0 in stage 0.0 (TID 2) in 35557 ms on 192.168.1.101 (executor 1) (2/4)
17/12/14 10:26:16 INFO TaskSetManager: Finished task 0.0 in stage 0.0 (TID 0) in 37317 ms on 192.168.1.101 (executor 1) (3/4)
17/12/14 10:26:16 INFO TaskSetManager

## Opcional: gravar para ficheiro (local ou hdfs)

In [None]:
# Write the (word, total) pairs held in `counts` out as text.
# "hdfs://..." is a placeholder: replace it with a real output URI before
# running this cell.
# NOTE(review): `counts` is whatever the most recently executed counting cell
# assigned; Spark is expected to refuse an output path that already exists --
# confirm before re-running.
counts.saveAsTextFile("hdfs://...")