### Apache Beam 예제 - Windows용

In [1]:
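# Apache Beam project site: https://beam.apache.org/
# To install Beam into this kernel's environment, uncomment the next line:
# %pip install apache-beam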

In [2]:
import re
import warnings
warnings.filterwarnings('ignore')

import apache_beam as beam
from apache_beam.io import ReadFromText, WriteToText
from apache_beam.options.pipeline_options import PipelineOptions, SetupOptions
import urllib

import os

# Pipeline input file and the prefix Beam uses when naming its output shards.
input_file = "kinglear.txt"
output_file = "output"

# Optional: download kinglear.txt from GitHub if it is not already present.
# NOTE(review): the URL below looks like a GitHub "blob" page rather than a
# raw-file link -- presumably the raw URL is what should be fetched; confirm
# before enabling. `urllib.request` may also need an explicit import.
# url = "https://github.com/cs109/2015/blob/master/Lectures/Lecture15b/sparklect/shakes/kinglear.txt"
# urllib.request.urlretrieve(url, input_file)

# Name of the shard file that is cleaned up here and displayed later.
# It is derived from the prefix so the two cannot drift apart.
output_text_file = f"{output_file}-00000-of-00001"

# Delete the shard left behind by a previous run, if there is one.
if os.path.isfile(output_text_file):
    os.remove(output_text_file)

def format_result(word_count):
  """Convert one ``(word, count)`` tuple into the text line ``"word: count"``."""
  word, count = word_count
  return f"{word}: {count}"


# Pipeline options object, created without explicit flags.
pipeline_options = PipelineOptions()

# Apache Beam pipeline: read lines, split into words, count each word,
# format the counts as text, and write them out.
with beam.Pipeline(options=pipeline_options) as p:
  lines = p | 'Read' >> beam.io.ReadFromText(input_file)

  counts = (
    lines
    # A "word" is any run of ASCII letters and apostrophes.
    | 'Split' >> beam.FlatMap(lambda x: re.findall(r'[A-Za-z\']+', x))
    | 'PairWithOne' >> beam.Map(lambda x: (x, 1))
    | 'GroupAndSum' >> beam.CombinePerKey(sum)
    )

  output = counts | 'Format' >> beam.Map(format_result)

  # Force a single output shard so the result is always written to
  # "<output_file>-00000-of-00001", the file name the rest of the notebook
  # removes and displays. Without this the runner picks the shard count.
  output | 'Write' >> beam.io.WriteToText(output_file, num_shards=1)







In [3]:
# Print the contents of the output shard. Reading the file in Python works on
# any platform, so no OS-specific shell command (`type` on Windows, `cat` on
# Colab) has to be toggled by hand.
with open(output_text_file, encoding="utf-8") as shard:
    print(shard.read(), end="")

THE: 7
TRAGEDY: 1
OF: 16
KING: 1
LEAR: 1
by: 69
William: 1
Shakespeare: 1
Dramatis: 1
Personae: 1
Lear: 228
King: 54
of: 456
Britain: 2
France: 32
Duke: 30
Burgundy: 15
Cornwall: 22
Albany: 14
Earl: 11
Kent: 173
Gloucester: 50
Edgar: 36
son: 29
Edmund: 52
bastard: 7
to: 430
Curan: 3
a: 351
courtier: 1
Old: 13
Man: 11
tenant: 3
Doctor: 4
Lear's: 4
Fool: 73
Oswald: 18
steward: 1
Goneril: 31
A: 55
Captain: 6
under: 9
Edmund's: 1
command: 4
Gentlemen: 1
Herald: 2
Servants: 10
daughter: 32
Regan: 32
Cordelia: 31
Knights: 2
attending: 1
on: 91
Officers: 1
Messengers: 1
Soldiers: 6
Attendants: 8
THIS: 5
ELECTRONIC: 10
VERSION: 5
COMPLETE: 5
WORKS: 5
WILLIAM: 5
SHAKESPEARE: 5
IS: 10
COPYRIGHT: 5
BY: 15
WORLD: 5
LIBRARY: 5
INC: 5
AND: 15
PROVIDED: 5
PROJECT: 5
GUTENBERG: 5
ETEXT: 5
CARNEGIE: 5
MELLON: 5
UNIVERSITY: 5
WITH: 5
PERMISSION: 5
MACHINE: 5
READABLE: 5
COPIES: 10
MAY: 5
BE: 5
DISTRIBUTED: 10
SO: 5
LONG: 5
AS: 5
SUCH: 5
ARE: 10
FOR: 15
YOUR: 5
OR: 15
OTHERS: 5
PERSONAL: 5
USE: 5
ONLY: 5

In [4]:
# Input file and the prefix Beam uses when naming its output shards.
inputs_pattern = "kinglear.txt"
outputs_prefix = 'output'

# Name of the shard file that is cleaned up here and displayed later.
# It is derived from the prefix so the two cannot drift apart.
output_text_file = f"{outputs_prefix}-00000-of-00001"

# Delete the shard left behind by a previous run, if there is one.
if os.path.isfile(output_text_file):
    os.remove(output_text_file)
# Same word count as a single chained pipeline; each result line is the
# string form of the (word, count) tuple.
with beam.Pipeline() as pipeline:
  (
      pipeline
      | 'Read lines' >> beam.io.ReadFromText(inputs_pattern)
      # A "word" is any run of ASCII letters and apostrophes.
      | 'Find words' >> beam.FlatMap(lambda line: re.findall(r"[a-zA-Z']+", line))
      | 'Pair words with 1' >> beam.Map(lambda word: (word, 1))
      | 'Group and sum' >> beam.CombinePerKey(sum)
      | 'Format results' >> beam.Map(lambda word_count: str(word_count))
      # Force a single output shard so the result is always written to
      # "<outputs_prefix>-00000-of-00001", the file name the rest of the
      # notebook removes and displays.
      | 'Write results' >> beam.io.WriteToText(outputs_prefix, num_shards=1)
  )

In [5]:
# Print the contents of the output shard. Reading the file in Python works on
# any platform, so no OS-specific shell command (`type` on Windows, `cat` on
# Colab) has to be toggled by hand.
with open(output_text_file, encoding="utf-8") as shard:
    print(shard.read(), end="")

('THE', 7)
('TRAGEDY', 1)
('OF', 16)
('KING', 1)
('LEAR', 1)
('by', 69)
('William', 1)
('Shakespeare', 1)
('Dramatis', 1)
('Personae', 1)
('Lear', 228)
('King', 54)
('of', 456)
('Britain', 2)
('France', 32)
('Duke', 30)
('Burgundy', 15)
('Cornwall', 22)
('Albany', 14)
('Earl', 11)
('Kent', 173)
('Gloucester', 50)
('Edgar', 36)
('son', 29)
('Edmund', 52)
('bastard', 7)
('to', 430)
('Curan', 3)
('a', 351)
('courtier', 1)
('Old', 13)
('Man', 11)
('tenant', 3)
('Doctor', 4)
("Lear's", 4)
('Fool', 73)
('Oswald', 18)
('steward', 1)
('Goneril', 31)
('A', 55)
('Captain', 6)
('under', 9)
("Edmund's", 1)
('command', 4)
('Gentlemen', 1)
('Herald', 2)
('Servants', 10)
('daughter', 32)
('Regan', 32)
('Cordelia', 31)
('Knights', 2)
('attending', 1)
('on', 91)
('Officers', 1)
('Messengers', 1)
('Soldiers', 6)
('Attendants', 8)
('THIS', 5)
('ELECTRONIC', 10)
('VERSION', 5)
('COMPLETE', 5)
('WORKS', 5)
('WILLIAM', 5)
('SHAKESPEARE', 5)
('IS', 10)
('COPYRIGHT', 5)
('BY', 15)
('WORLD', 5)
('LIBRARY', 5)
