In [1]:
!pip install mrjob

[0m

In [5]:
!pip install nltk

Collecting nltk
  Downloading nltk-3.8.1-py3-none-any.whl (1.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.5/1.5 MB[0m [31m2.6 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m0m
[?25hCollecting regex>=2021.8.3
  Downloading regex-2023.10.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (773 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m773.9/773.9 KB[0m [31m2.2 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hCollecting tqdm
  Downloading tqdm-4.66.1-py3-none-any.whl (78 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m78.3/78.3 KB[0m [31m7.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting joblib
  Downloading joblib-1.3.2-py3-none-any.whl (302 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m302.2/302.2 KB[0m [31m10.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting click
  Downloading click-8.1.7-py3-none-any.whl (97 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m 

In [3]:
%%file task_3.py

import nltk

from mrjob.job import MRJob
from mrjob.step import MRStep

class frequencyAnalysis(MRJob):
    """
    Маппер:
    1. Чистим от пунктуации и приводим к нижнему регистру
    2. Разбиваем на bigram'ы
    
    Редусер:
    1. Считаем частоты
    2. Сортируем по частоте в порядке убывания
    3. Выводим первые 20
    """

    def mapper_init(self):
        nltk.download('punkt')
        nltk.download('stopwords')
        self.stop_words = set(nltk.corpus.stopwords.words('english'))

    def mapper(self, _, line):
        parts = line.split(" ", 2)
        if len(parts) == 3:
            _, characterName, phrase = parts
            phrase = phrase.lower()
            phrase = nltk.word_tokenize(phrase)
            phrase = [w for w in phrase if w not in self.stop_words and w.isalpha()]
            for bigram in nltk.bigrams(phrase):
                yield bigram, 1

    def reducer(self, bigram, count):
        yield None, (bigram, sum(count))

    def reducer_top(self, _, bigramCount):
        sortedBigrams = sorted(bigramCount, key=lambda x: x[1], reverse=True)
        for bigram, count in sortedBigrams[:20]:
            yield bigram, count

    def steps(self):
        return [
            MRStep(
                   mapper_init=self.mapper_init,
                   mapper=self.mapper,
                   reducer=self.reducer),
            MRStep(
                   reducer=self.reducer_top)
        ]

if __name__ == '__main__':
    frequencyAnalysis.run()

Overwriting task_3.py


In [6]:
!python3 task_3.py ../../ha_1/data/SW_EpisodeIV.txt

No configs found; falling back on auto-configuration
No configs specified for inline runner
Creating temp directory /tmp/task_3.root.20231206.122734.691734
Running step 1 of 2...
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package s

In [8]:
!python3 task_3.py ../../ha_1/data/SW_EpisodeV.txt

No configs found; falling back on auto-configuration
No configs specified for inline runner
Creating temp directory /tmp/task_3.root.20231206.122800.123107
Running step 1 of 2...
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Download

In [9]:
!python3 task_3.py ../../ha_1/data/SW_EpisodeVI.txt

No configs found; falling back on auto-configuration
No configs specified for inline runner
Creating temp directory /tmp/task_3.root.20231206.122811.666231
Running step 1 of 2...
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Download

In [11]:
!python3 task_3.py ../../ha_1/data/SW_EpisodeIV.txt --output out_task3/SW_EpisodeIV
!python3 task_3.py ../../ha_1/data/SW_EpisodeV.txt --output out_task3/SW_EpisodeV
!python3 task_3.py ../../ha_1/data/SW_EpisodeVI.txt --output out_task3/SW_EpisodeVI
!python3 task_3.py --output out_task3/all \
    ../../ha_1/data/SW_EpisodeIV.txt \
    ../../ha_1/data/SW_EpisodeV.txt \
    ../../ha_1/data/SW_EpisodeVI.txt 

No configs found; falling back on auto-configuration
No configs specified for inline runner
Running step 1 of 2...
Creating temp directory /tmp/task_3.root.20231206.123109.674302
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Download