In [17]:
%%file task2.py

from mrjob.job import MRJob, MRStep
from collections import defaultdict

class MRLongestPhrase(MRJob):
    def mapper(self, _, line):
        if not line.startswith('"character" "dialogue"'):
            _, character, phrase = line.split(" ", 2)
            c = character.strip('"').strip("\\")
            p = phrase.strip('"').strip("\\")
            yield c, len(p)

    def reducer_aggregate(self, character, lengths):
        yield None, (character, max(lengths))

    def reducer(self, _, pairs):
        char2len = [(p[0], p[1]) for p in pairs]
        yield from sorted(char2len, key=lambda x: -x[1])

    def steps(self):
        return [
            MRStep(
                mapper=self.mapper,
                reducer=self.reducer_aggregate,
            ),
            MRStep(reducer=self.reducer)
        ]

if __name__ == "__main__":
    MRLongestPhrase.run()

Overwriting task2.py


In [40]:
!python3 task2.py ../sw-data/SW_EpisodeIV.txt > res4_local.txt && cat res4_local.txt

No configs found; falling back on auto-configuration
No configs specified for inline runner
Creating temp directory /tmp/task2.root.20231206.202820.981575
Running step 1 of 2...
Running step 2 of 2...
job output is in /tmp/task2.root.20231206.202820.981575/output
Streaming final output from /tmp/task2.root.20231206.202820.981575/output...
Removing temp directory /tmp/task2.root.20231206.202820.981575...
"LEIA"	596
"BIGGS"	356
"DODONNA"	353
"JABBA"	339
"LUKE"	318
"TARKIN"	302
"THREEPIO"	288
"BEN"	262
"VADER"	257
"HAN"	256
"MOTTI"	228
"OFFICER"	219
"GREEDO"	203
"OWEN"	191
"SECOND"	187
"TAGGE"	183
"FIXER"	177
"RED"	176
"VOICE"	148
"DEATH"	143
"COMMANDER"	112
"MASSASSI"	107
"HUMAN"	107
"REBEL"	106
"TROOPER"	101
"IMPERIAL"	99
"AUNT"	98
"WEDGE"	98
"CONTROL"	97
"ASTRO-OFFICER"	96
"GOLD"	93
"BASE"	91
"WILLARD"	90
"GANTRY"	88
"INTERCOM"	87
"CAPTAIN"	77
"MAN"	76
"BERU"	72
"FIRST"	72
"BARTENDER"	68
"CHIEF"	65
"CAMIE"	38
"TECHNICIAN"	36
"WOMAN"	32
"CREATURE"	28
"DEAK"	22
"PORKINS"	20
"WINGMAN"	9


In [41]:
!python3 task2.py ../sw-data/SW_EpisodeV.txt > res5_local.txt && cat res5_local.txt

No configs found; falling back on auto-configuration
No configs specified for inline runner
Creating temp directory /tmp/task2.root.20231206.202853.803036
Running step 1 of 2...
Running step 2 of 2...
job output is in /tmp/task2.root.20231206.202853.803036/output
Streaming final output from /tmp/task2.root.20231206.202853.803036/output...
Removing temp directory /tmp/task2.root.20231206.202853.803036...
"YODA"	385
"VADER"	269
"LEIA"	264
"THREEPIO"	238
"LANDO"	206
"VEERS"	205
"LUKE"	198
"PIETT"	176
"REBEL"	167
"NEEDA"	160
"HAN"	152
"RIEEKAN"	121
"ZEV"	118
"BEN"	117
"DECK"	115
"CREATURE"	109
"OZZEL"	88
"DERLIN"	88
"CONTROLLER"	85
"DACK"	82
"LIEUTENANT"	78
"SECOND"	77
"EMPEROR"	75
"MEDICAL"	72
"ASSISTANT"	71
"SENIOR"	69
"TRACKING"	67
"COMMUNICATIONS"	64
"INTERCOM"	64
"BOBA"	58
"IMPERIAL"	53
"ANNOUNCER"	50
"HEAD"	47
"WEDGE"	43
"WOMAN"	43
"TRENCH"	43
"HOBBIE"	38
"CAPTAIN"	33
"PILOT"	29
"JANSON"	25
"STRANGE"	23
"FIRST"	22
"OFFICER"	14
"MAN"	12
"PILOTS"	12


In [42]:
!python3 task2.py ../sw-data/SW_EpisodeVI.txt > res6_local.txt && cat res6_local.txt

No configs found; falling back on auto-configuration
No configs specified for inline runner
Creating temp directory /tmp/task2.root.20231206.202854.840283
Running step 1 of 2...
Running step 2 of 2...
job output is in /tmp/task2.root.20231206.202854.840283/output
Streaming final output from /tmp/task2.root.20231206.202854.840283/output...
Removing temp directory /tmp/task2.root.20231206.202854.840283...
"BEN"	773
"ACKBAR"	503
"LUKE"	502
"MON"	460
"YODA"	293
"EMPEROR"	227
"VADER"	207
"GENERAL"	183
"THREEPIO"	183
"COMMANDER"	165
"DEATH"	158
"HAN"	154
"LANDO"	149
"NINEDENINE"	145
"HAN/PILOT"	130
"PIETT"	126
"SHUTTLE"	124
"LEIA"	110
"CONTROLLER"	99
"JABBA"	94
"ANAKIN"	83
"WEDGE"	79
"JERJERROD"	76
"CONTROL"	63
"GUARD"	62
"OFFICER"	59
"BOUSHH"	54
"PILOT"	53
"RED"	48
"SCOUT"	48
"STORMTROOPER"	42
"BIB"	40
"BUNKER"	38
"OOLA"	37
"GREEN"	34
"REBEL"	33
"GRAY"	33
"STRANGE"	28
"SECOND"	21
"NAVIGATOR"	18
"VOICE"	18
"WALKER"	16
"Y-WING"	16
"LURE"	15
"OPERATOR"	9


In [43]:
!python3 task2.py ../sw-data/SW_full.txt > res_full_local.txt && cat res_full_local.txt

No configs found; falling back on auto-configuration
No configs specified for inline runner
Creating temp directory /tmp/task2.root.20231206.202857.094559
Running step 1 of 2...
Running step 2 of 2...
job output is in /tmp/task2.root.20231206.202857.094559/output
Streaming final output from /tmp/task2.root.20231206.202857.094559/output...
Removing temp directory /tmp/task2.root.20231206.202857.094559...
"BEN"	773
"LEIA"	596
"ACKBAR"	503
"LUKE"	502
"MON"	460
"YODA"	385
"BIGGS"	356
"DODONNA"	353
"JABBA"	339
"TARKIN"	302
"THREEPIO"	288
"VADER"	269
"HAN"	256
"MOTTI"	228
"EMPEROR"	227
"OFFICER"	219
"LANDO"	206
"VEERS"	205
"GREEDO"	203
"OWEN"	191
"SECOND"	187
"TAGGE"	183
"GENERAL"	183
"FIXER"	177
"RED"	176
"PIETT"	176
"REBEL"	167
"COMMANDER"	165
"NEEDA"	160
"DEATH"	158
"VOICE"	148
"NINEDENINE"	145
"HAN/PILOT"	130
"SHUTTLE"	124
"RIEEKAN"	121
"ZEV"	118
"DECK"	115
"CREATURE"	109
"MASSASSI"	107
"HUMAN"	107
"TROOPER"	101
"CONTROLLER"	99
"IMPERIAL"	99
"AUNT"	98
"WEDGE"	98
"CONTROL"	97
"ASTRO-OFFIC

In [28]:
!python3 task2.py -r hadoop hdfs://namenode:8020/SW_EpisodeIV.txt --output /task2_ep4

No configs found; falling back on auto-configuration
No configs specified for hadoop runner
Looking for hadoop binary in /opt/hadoop/bin...
Found hadoop binary: /opt/hadoop/bin/hadoop
Using Hadoop version 3.3.6
Looking for Hadoop streaming jar in /opt/hadoop...
Found Hadoop streaming jar: /opt/hadoop/share/hadoop/tools/lib/hadoop-streaming-3.3.6.jar
Creating temp directory /tmp/task2.root.20231206.202448.614100
uploading working dir files to hdfs:///user/root/tmp/mrjob/task2.root.20231206.202448.614100/files/wd...
Copying other local files to hdfs:///user/root/tmp/mrjob/task2.root.20231206.202448.614100/files/
Running step 1 of 2...
  Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
  packageJobJar: [/tmp/hadoop-unjar6236805987017176038/] [] /tmp/streamjob8464432376335410464.jar tmpDir=null
  Connecting to ResourceManager at resourcemanager/172.21.0.4:8032
  Connecting to ResourceManager at resourcemanager/172.21.0.4:8032
  Disabling

In [29]:
!python3 task2.py -r hadoop hdfs://namenode:8020/SW_EpisodeV.txt --output /task2_ep5

No configs found; falling back on auto-configuration
No configs specified for hadoop runner
Looking for hadoop binary in /opt/hadoop/bin...
Found hadoop binary: /opt/hadoop/bin/hadoop
Using Hadoop version 3.3.6
Looking for Hadoop streaming jar in /opt/hadoop...
Found Hadoop streaming jar: /opt/hadoop/share/hadoop/tools/lib/hadoop-streaming-3.3.6.jar
Creating temp directory /tmp/task2.root.20231206.202530.160441
uploading working dir files to hdfs:///user/root/tmp/mrjob/task2.root.20231206.202530.160441/files/wd...
Copying other local files to hdfs:///user/root/tmp/mrjob/task2.root.20231206.202530.160441/files/
Running step 1 of 2...
  Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
  packageJobJar: [/tmp/hadoop-unjar4533139531119239227/] [] /tmp/streamjob6404125329565816690.jar tmpDir=null
  Connecting to ResourceManager at resourcemanager/172.21.0.4:8032
  Connecting to ResourceManager at resourcemanager/172.21.0.4:8032
  Disabling

In [30]:
!python3 task2.py -r hadoop hdfs://namenode:8020/SW_EpisodeVI.txt --output /task2_ep6

No configs found; falling back on auto-configuration
No configs specified for hadoop runner
Looking for hadoop binary in /opt/hadoop/bin...
Found hadoop binary: /opt/hadoop/bin/hadoop
Using Hadoop version 3.3.6
Looking for Hadoop streaming jar in /opt/hadoop...
Found Hadoop streaming jar: /opt/hadoop/share/hadoop/tools/lib/hadoop-streaming-3.3.6.jar
Creating temp directory /tmp/task2.root.20231206.202610.437581
uploading working dir files to hdfs:///user/root/tmp/mrjob/task2.root.20231206.202610.437581/files/wd...
Copying other local files to hdfs:///user/root/tmp/mrjob/task2.root.20231206.202610.437581/files/
Running step 1 of 2...
  Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
  packageJobJar: [/tmp/hadoop-unjar423564120774821486/] [] /tmp/streamjob4786447461401565650.jar tmpDir=null
  Connecting to ResourceManager at resourcemanager/172.21.0.4:8032
  Connecting to ResourceManager at resourcemanager/172.21.0.4:8032
  Disabling 

In [31]:
!python3 task2.py -r hadoop hdfs://namenode:8020/SW_full.txt --output /task2_full

No configs found; falling back on auto-configuration
No configs specified for hadoop runner
Looking for hadoop binary in /opt/hadoop/bin...
Found hadoop binary: /opt/hadoop/bin/hadoop
Using Hadoop version 3.3.6
Looking for Hadoop streaming jar in /opt/hadoop...
Found Hadoop streaming jar: /opt/hadoop/share/hadoop/tools/lib/hadoop-streaming-3.3.6.jar
Creating temp directory /tmp/task2.root.20231206.202652.631164
uploading working dir files to hdfs:///user/root/tmp/mrjob/task2.root.20231206.202652.631164/files/wd...
Copying other local files to hdfs:///user/root/tmp/mrjob/task2.root.20231206.202652.631164/files/
Running step 1 of 2...
  Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
  packageJobJar: [/tmp/hadoop-unjar4042618446059329925/] [] /tmp/streamjob1666460385831025531.jar tmpDir=null
  Connecting to ResourceManager at resourcemanager/172.21.0.4:8032
  Connecting to ResourceManager at resourcemanager/172.21.0.4:8032
  Disabling

In [36]:
!hadoop fs -get /task2_ep4/part-00000 ./res_ep4_hadoop.txt && cat ./res_ep4_hadoop.txt

2023-12-06 23:27:53 WARN  NativeCodeLoader:60 - Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
"LEIA"	596
"BIGGS"	356
"DODONNA"	353
"JABBA"	339
"LUKE"	318
"TARKIN"	302
"THREEPIO"	288
"BEN"	262
"VADER"	257
"HAN"	256
"MOTTI"	228
"OFFICER"	219
"GREEDO"	203
"OWEN"	191
"SECOND"	187
"TAGGE"	183
"FIXER"	177
"RED"	176
"VOICE"	148
"DEATH"	143
"COMMANDER"	112
"MASSASSI"	107
"HUMAN"	107
"REBEL"	106
"TROOPER"	101
"IMPERIAL"	99
"WEDGE"	98
"AUNT"	98
"CONTROL"	97
"ASTRO-OFFICER"	96
"GOLD"	93
"BASE"	91
"WILLARD"	90
"GANTRY"	88
"INTERCOM"	87
"CAPTAIN"	77
"MAN"	76
"FIRST"	72
"BERU"	72
"BARTENDER"	68
"CHIEF"	65
"CAMIE"	38
"TECHNICIAN"	36
"WOMAN"	32
"CREATURE"	28
"DEAK"	22
"PORKINS"	20
"WINGMAN"	9


In [37]:
!hadoop fs -get /task2_ep5/part-00000 ./res_ep5_hadoop.txt && cat ./res_ep5_hadoop.txt

2023-12-06 23:27:54 WARN  NativeCodeLoader:60 - Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
"YODA"	385
"VADER"	269
"LEIA"	264
"THREEPIO"	238
"LANDO"	206
"VEERS"	205
"LUKE"	198
"PIETT"	176
"REBEL"	167
"NEEDA"	160
"HAN"	152
"RIEEKAN"	121
"ZEV"	118
"BEN"	117
"DECK"	115
"CREATURE"	109
"OZZEL"	88
"DERLIN"	88
"CONTROLLER"	85
"DACK"	82
"LIEUTENANT"	78
"SECOND"	77
"EMPEROR"	75
"MEDICAL"	72
"ASSISTANT"	71
"SENIOR"	69
"TRACKING"	67
"INTERCOM"	64
"COMMUNICATIONS"	64
"BOBA"	58
"IMPERIAL"	53
"ANNOUNCER"	50
"HEAD"	47
"WOMAN"	43
"WEDGE"	43
"TRENCH"	43
"HOBBIE"	38
"CAPTAIN"	33
"PILOT"	29
"JANSON"	25
"STRANGE"	23
"FIRST"	22
"OFFICER"	14
"PILOTS"	12
"MAN"	12


In [38]:
!hadoop fs -get /task2_ep6/part-00000 ./res_ep6_hadoop.txt && cat ./res_ep6_hadoop.txt

2023-12-06 23:27:56 WARN  NativeCodeLoader:60 - Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
"BEN"	773
"ACKBAR"	503
"LUKE"	502
"MON"	460
"YODA"	293
"EMPEROR"	227
"VADER"	207
"THREEPIO"	183
"GENERAL"	183
"COMMANDER"	165
"DEATH"	158
"HAN"	154
"LANDO"	149
"NINEDENINE"	145
"HAN/PILOT"	130
"PIETT"	126
"SHUTTLE"	124
"LEIA"	110
"CONTROLLER"	99
"JABBA"	94
"ANAKIN"	83
"WEDGE"	79
"JERJERROD"	76
"CONTROL"	63
"GUARD"	62
"OFFICER"	59
"BOUSHH"	54
"PILOT"	53
"SCOUT"	48
"RED"	48
"STORMTROOPER"	42
"BIB"	40
"BUNKER"	38
"OOLA"	37
"GREEN"	34
"REBEL"	33
"GRAY"	33
"STRANGE"	28
"SECOND"	21
"VOICE"	18
"NAVIGATOR"	18
"Y-WING"	16
"WALKER"	16
"LURE"	15
"OPERATOR"	9


In [39]:
!hadoop fs -get /task2_full/part-00000 ./res_full_hadoop.txt && cat ./res_full_hadoop.txt

2023-12-06 23:27:56 WARN  NativeCodeLoader:60 - Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
"BEN"	773
"LEIA"	596
"ACKBAR"	503
"LUKE"	502
"MON"	460
"YODA"	385
"BIGGS"	356
"DODONNA"	353
"JABBA"	339
"TARKIN"	302
"THREEPIO"	288
"VADER"	269
"HAN"	256
"MOTTI"	228
"EMPEROR"	227
"OFFICER"	219
"LANDO"	206
"VEERS"	205
"GREEDO"	203
"OWEN"	191
"SECOND"	187
"TAGGE"	183
"GENERAL"	183
"FIXER"	177
"RED"	176
"PIETT"	176
"REBEL"	167
"COMMANDER"	165
"NEEDA"	160
"DEATH"	158
"VOICE"	148
"NINEDENINE"	145
"HAN/PILOT"	130
"SHUTTLE"	124
"RIEEKAN"	121
"ZEV"	118
"DECK"	115
"CREATURE"	109
"MASSASSI"	107
"HUMAN"	107
"TROOPER"	101
"IMPERIAL"	99
"CONTROLLER"	99
"WEDGE"	98
"AUNT"	98
"CONTROL"	97
"ASTRO-OFFICER"	96
"GOLD"	93
"BASE"	91
"WILLARD"	90
"OZZEL"	88
"GANTRY"	88
"DERLIN"	88
"INTERCOM"	87
"ANAKIN"	83
"DACK"	82
"LIEUTENANT"	78
"CAPTAIN"	77
"MAN"	76
"JERJERROD"	76
"MEDICAL"	72
"FIRST"	72
"BERU"	72
"ASSISTANT"	71
"SENIOR"	69
"BARTENDER"	68
"TRACKING"	67
"CH