## Пишем код

In [63]:
%%file wordcount.py
# %%file is an IPython cell magic (an alias of %%writefile) that saves the contents of this cell to wordcount.py

from mrjob.job import MRJob # import the mrjob library

from mrjob.step import MRStep

class MRDialogueWordCount(MRJob):
    """Count dialogue lines per character and report the most frequent speakers.

    Step 1 counts how many input lines belong to each character; step 2
    gathers the surviving totals under a single key and emits them in
    descending order of count. Only characters with strictly more than
    ``MIN_COUNT`` lines are reported.
    """

    # A character must have strictly more than this many lines to be reported.
    MIN_COUNT = 20

    def mapper(self, _, line):
        """Emit ``(character, 1)`` for every well-formed input line.

        A line is split on the literal three-character sequence ``" "``;
        only lines that split into exactly three pieces are counted, and the
        middle piece is used as the character name.
        NOTE(review): presumably each line looks like
        ``"<id>" "<CHARACTER>" "<dialogue>"`` — confirm against the data files.
        """
        columns = line.strip().split('" "')
        if len(columns) == 3:
            character = columns[1]
            yield character, 1

    def combiner_count(self, key, values):
        """Pre-sum the counts for one character before the shuffle.

        Summation is associative, so partial sums do not change the final
        totals; they only reduce how many pairs step 1 has to shuffle.
        """
        yield key, sum(values)

    def reducer_count(self, key, values):
        """Emit ``(None, (total, character))`` for characters above the threshold.

        Filtering here (instead of after sorting) keeps rarely-speaking
        characters out of the second step, where every pair is collected
        under the single ``None`` key and sorted in memory.
        """
        total = sum(values)
        if total > self.MIN_COUNT:
            yield None, (total, key)

    def reducer_sort(self, _, count_character_pairs):
        """Emit ``(character, count)`` ordered by count, largest first.

        Pairs are compared as ``(count, character)``, so ties on count fall
        back to reverse alphabetical order of the character name.
        """
        for count, character in sorted(count_character_pairs, reverse=True):
            yield character, count

    def steps(self):
        """Return the two-step pipeline: count per character, then sort."""
        return [
            MRStep(
                mapper=self.mapper,
                combiner=self.combiner_count,
                reducer=self.reducer_count,
            ),
            MRStep(reducer=self.reducer_sort),
        ]


# Launch the job only when this file is executed as a script, not when it is imported.
if __name__ == "__main__":
    MRDialogueWordCount.run()

Overwriting wordcount.py


## Протестируем локально

In [64]:
!python3 wordcount.py SW/1.txt

No configs found; falling back on auto-configuration
No configs specified for inline runner
Creating temp directory /tmp/wordcount.root.20231206.031028.577147
Running step 1 of 2...
Running step 2 of 2...
job output is in /tmp/wordcount.root.20231206.031028.577147/output
Streaming final output from /tmp/wordcount.root.20231206.031028.577147/output...
"LUKE"	254
"HAN"	153
"THREEPIO"	119
"BEN"	82
"LEIA"	57
"VADER"	41
"RED LEADER"	37
"BIGGS"	34
"TARKIN"	28
"OWEN"	25
Removing temp directory /tmp/wordcount.root.20231206.031028.577147...


## Запустим на кластере

### 1

In [65]:
!python3 wordcount.py -r hadoop hdfs://namenode:8020/tmp/1.txt

No configs found; falling back on auto-configuration
No configs specified for hadoop runner
Looking for hadoop binary in /opt/hadoop/bin...
Found hadoop binary: /opt/hadoop/bin/hadoop
Using Hadoop version 3.3.6
Looking for Hadoop streaming jar in /opt/hadoop...
Found Hadoop streaming jar: /opt/hadoop/share/hadoop/tools/lib/hadoop-streaming-3.3.6.jar
Creating temp directory /tmp/wordcount.root.20231206.031232.843045
uploading working dir files to hdfs:///user/root/tmp/mrjob/wordcount.root.20231206.031232.843045/files/wd...
Copying other local files to hdfs:///user/root/tmp/mrjob/wordcount.root.20231206.031232.843045/files/
Running step 1 of 2...
  /opt/hadoop/etc/hadoop/hadoop-env.sh: line 17: $'\r': command not found
  /opt/hadoop/etc/hadoop/hadoop-env.sh: line 19: $'\r': command not found
  /opt/hadoop/etc/hadoop/hadoop-env.sh: line 32: $'\r': command not found
  /opt/hadoop/etc/hadoop/hadoop-env.sh: line 42: $'\r': command not found
  /opt/hadoop/etc/hadoop/hadoop-env.sh: line 46: $'

### 2

In [66]:
!python3 wordcount.py -r hadoop hdfs://namenode:8020/tmp/2.txt

No configs found; falling back on auto-configuration
No configs specified for hadoop runner
Looking for hadoop binary in /opt/hadoop/bin...
Found hadoop binary: /opt/hadoop/bin/hadoop
Using Hadoop version 3.3.6
Looking for Hadoop streaming jar in /opt/hadoop...
Found Hadoop streaming jar: /opt/hadoop/share/hadoop/tools/lib/hadoop-streaming-3.3.6.jar
Creating temp directory /tmp/wordcount.root.20231206.031620.122559
uploading working dir files to hdfs:///user/root/tmp/mrjob/wordcount.root.20231206.031620.122559/files/wd...
Copying other local files to hdfs:///user/root/tmp/mrjob/wordcount.root.20231206.031620.122559/files/
Running step 1 of 2...
  /opt/hadoop/etc/hadoop/hadoop-env.sh: line 17: $'\r': command not found
  /opt/hadoop/etc/hadoop/hadoop-env.sh: line 19: $'\r': command not found
  /opt/hadoop/etc/hadoop/hadoop-env.sh: line 32: $'\r': command not found
  /opt/hadoop/etc/hadoop/hadoop-env.sh: line 42: $'\r': command not found
  /opt/hadoop/etc/hadoop/hadoop-env.sh: line 46: $'

### 3

In [68]:
!python3 wordcount.py -r hadoop hdfs://namenode:8020/tmp/3.txt

No configs found; falling back on auto-configuration
No configs specified for hadoop runner
Looking for hadoop binary in /opt/hadoop/bin...
Found hadoop binary: /opt/hadoop/bin/hadoop
Using Hadoop version 3.3.6
Looking for Hadoop streaming jar in /opt/hadoop...
Found Hadoop streaming jar: /opt/hadoop/share/hadoop/tools/lib/hadoop-streaming-3.3.6.jar
Creating temp directory /tmp/wordcount.root.20231206.031934.672117
uploading working dir files to hdfs:///user/root/tmp/mrjob/wordcount.root.20231206.031934.672117/files/wd...
Copying other local files to hdfs:///user/root/tmp/mrjob/wordcount.root.20231206.031934.672117/files/
Running step 1 of 2...
  /opt/hadoop/etc/hadoop/hadoop-env.sh: line 17: $'\r': command not found
  /opt/hadoop/etc/hadoop/hadoop-env.sh: line 19: $'\r': command not found
  /opt/hadoop/etc/hadoop/hadoop-env.sh: line 32: $'\r': command not found
  /opt/hadoop/etc/hadoop/hadoop-env.sh: line 42: $'\r': command not found
  /opt/hadoop/etc/hadoop/hadoop-env.sh: line 46: $'

### Отдельный запуск для всех файлов сразу

In [72]:
import pyarrow.hdfs

 
# Handle to HDFS, connected to host 'namenode' on port 8020.
# NOTE(review): the cell output echoes this exact line, which looks like a
# deprecation warning for the legacy pyarrow.hdfs API — confirm, and consider
# migrating to the current pyarrow filesystem interface.
fs = pyarrow.hdfs.connect(host='namenode', port=8020)

# HDFS paths of the three input files and of the merged file to be created.
file1_path = '/tmp/1.txt'
file2_path = '/tmp/2.txt'
file3_path = '/tmp/3.txt'
output_file_path = '/tmp/all.txt'

def merge_files(file1, file2, file3, output_file, filesystem=None):
    """Concatenate three files into ``output_file``, separated by newlines.

    The inputs are copied as raw bytes, so the result is byte-for-byte what a
    UTF-8 decode/encode round-trip would produce for valid UTF-8 input, but
    the merge no longer fails on bytes that are not valid UTF-8 and only one
    input is held in memory at a time.

    Args:
        file1, file2, file3: Paths of the input files, merged in this order.
        output_file: Path of the file to create (overwritten if it exists).
        filesystem: Object exposing ``open(path, mode)`` that returns a
            binary file usable as a context manager. Defaults to the
            module-level ``fs`` connection.
    """
    if filesystem is None:
        filesystem = fs  # fall back to the notebook-wide HDFS connection

    # All inputs are opened before the output is created, so a missing input
    # fails early without leaving a truncated output file behind.
    with filesystem.open(file1, 'rb') as f1, \
            filesystem.open(file2, 'rb') as f2, \
            filesystem.open(file3, 'rb') as f3:
        with filesystem.open(output_file, 'wb') as output:
            for index, source in enumerate((f1, f2, f3)):
                if index:
                    # A single newline between consecutive files.
                    output.write(b'\n')
                output.write(source.read())

# Build the merged file on HDFS from the three inputs so the job can be run over all of them at once.
merge_files(file1_path, file2_path, file3_path, output_file_path)

  fs = pyarrow.hdfs.connect(host='namenode', port=8020)


In [73]:
!python3 wordcount.py -r hadoop hdfs://namenode:8020/tmp/all.txt

No configs found; falling back on auto-configuration
No configs specified for hadoop runner
Looking for hadoop binary in /opt/hadoop/bin...
Found hadoop binary: /opt/hadoop/bin/hadoop
Using Hadoop version 3.3.6
Looking for Hadoop streaming jar in /opt/hadoop...
Found Hadoop streaming jar: /opt/hadoop/share/hadoop/tools/lib/hadoop-streaming-3.3.6.jar
Creating temp directory /tmp/wordcount.root.20231206.032720.937620
uploading working dir files to hdfs:///user/root/tmp/mrjob/wordcount.root.20231206.032720.937620/files/wd...
Copying other local files to hdfs:///user/root/tmp/mrjob/wordcount.root.20231206.032720.937620/files/
Running step 1 of 2...
  /opt/hadoop/etc/hadoop/hadoop-env.sh: line 17: $'\r': command not found
  /opt/hadoop/etc/hadoop/hadoop-env.sh: line 19: $'\r': command not found
  /opt/hadoop/etc/hadoop/hadoop-env.sh: line 32: $'\r': command not found
  /opt/hadoop/etc/hadoop/hadoop-env.sh: line 42: $'\r': command not found
  /opt/hadoop/etc/hadoop/hadoop-env.sh: line 46: $'