# Part 1

In [388]:
%%file rc1.py
from mrjob.job import MRJob
import re

class Part1(MRJob):
    def mapper_init(self):
        self.decades = {}

    def mapper(self, _, line):
        year = re.findall(r'#t(\d+)', line)
        if year:
            year = int(year[0])
            decade = (year // 10) * 10
            if decade not in self.decades:
                self.decades[decade] = 0
            self.decades[decade] += 1

    def mapper_final(self):
        for decade, count in self.decades.items():
            yield decade, count
    def reducer(self, decade, counts):
        yield decade, sum(counts)

if __name__ == '__main__':
    Part1.run()


Writing rc1.py


In [389]:
!python rc.py citation.txt

1970	4
1980	3
1990	11
2000	82


No configs found; falling back on auto-configuration
No configs specified for inline runner
Creating temp directory C:\Users\PC\AppData\Local\Temp\rc.PC.20240212.181929.826545
Running step 1 of 1...
job output is in C:\Users\PC\AppData\Local\Temp\rc.PC.20240212.181929.826545\output
Streaming final output from C:\Users\PC\AppData\Local\Temp\rc.PC.20240212.181929.826545\output...
Removing temp directory C:\Users\PC\AppData\Local\Temp\rc.PC.20240212.181929.826545...


In [390]:
!python rc1.py citation.txt > output_rc1.txt

No configs found; falling back on auto-configuration
No configs specified for inline runner
Creating temp directory C:\Users\PC\AppData\Local\Temp\rc1.PC.20240212.181936.229546
Running step 1 of 1...
job output is in C:\Users\PC\AppData\Local\Temp\rc1.PC.20240212.181936.229546\output
Streaming final output from C:\Users\PC\AppData\Local\Temp\rc1.PC.20240212.181936.229546\output...
Removing temp directory C:\Users\PC\AppData\Local\Temp\rc1.PC.20240212.181936.229546...


# Part 2

In [210]:
%%file rc2.py
from mrjob.job import MRJob
import re
class Part2(MRJob):

    def mapper_init(self):
        self.year_titles = {}

    def mapper(self, _, line):
        
        year_match = re.search(r'#t(\d{4})', line)  
        title_match = re.search(r'(.+)(?=#@)', line) 
        if year_match and title_match:
            year = int(year_match.group(1))
            title = title_match.group(1).strip().lstrip('#*')
            if year not in self.year_titles:
                self.year_titles[year] = []
            self.year_titles[year].append(title)

    def mapper_final(self):
        for year, titles in self.year_titles.items():
            yield year, titles
    def reducer(self, year, titles_lists):
        combined= []
        for titles in titles_lists:
            for title in titles:
                combined.append(title)
        combined_titles_str = ' , '.join(combined)
        yield year, combined_titles_str
if __name__ == '__main__':
    Part2.run()

Overwriting rc2.py


In [211]:
!python rc2.py citation.txt

1973	"Notes from industry"
1975	"A control word model for detecting conflicts between microoperations"
1976	"Microprogramming for the hardware engineer"
1978	"Design team composition for high level language computer architectures"
1982	"Review of \"Bit-Slice Microprocessor Design by John Mick and James Brick\", McGraw-Hill Book Company, 1980"
1985	"Word Processing on Your MacIntosh"
1987	"Type Graphics and MacIntosh"
1991	"Tarski's World 3.0: Including the Macintosh TM Program (Center for the Study of Language and Information - Lecture Notes)"
1993	"Hyperstat: Macintosh Hypermedia for Analyzing Data and Learning Statistics"
1994	"At Ease With Performa , It's a Mad, Mad, Mad, Mad Mac\/Book and Disk , Operations Research: Macintosh Version (Business Statistics Series)"
1995	"Internet and HTML Training on CD-ROM"
1996	"Fast k-NN Classification Rule Using Metrics on Space-Filling Curves , A New Quadtree Decomposition Reconstruction Method"
1997	"Multimedia Directory 1997 , Elsevier's Dicti

No configs found; falling back on auto-configuration
No configs specified for inline runner
Creating temp directory C:\Users\PC\AppData\Local\Temp\rc2.PC.20240212.161631.517394
Running step 1 of 1...
job output is in C:\Users\PC\AppData\Local\Temp\rc2.PC.20240212.161631.517394\output
Streaming final output from C:\Users\PC\AppData\Local\Temp\rc2.PC.20240212.161631.517394\output...
Removing temp directory C:\Users\PC\AppData\Local\Temp\rc2.PC.20240212.161631.517394...


In [365]:
!python rc2.py citation.txt > output_rc2.txt

No configs found; falling back on auto-configuration
No configs specified for inline runner
Creating temp directory C:\Users\PC\AppData\Local\Temp\rc2.PC.20240212.181026.514376
Running step 1 of 1...
job output is in C:\Users\PC\AppData\Local\Temp\rc2.PC.20240212.181026.514376\output
Streaming final output from C:\Users\PC\AppData\Local\Temp\rc2.PC.20240212.181026.514376\output...
Removing temp directory C:\Users\PC\AppData\Local\Temp\rc2.PC.20240212.181026.514376...


# Part 3

In [256]:
%%file rc3.py
from mrjob.job import MRJob
import re

class Part3(MRJob):
    def mapper_init(self):
        self.co_authors= {}
    def mapper(self, _, line):
        authors_match = re.search(r'#@\s*(.*)#t', line)
        if authors_match:
            authors = authors_match.group(1).split(',')
            for author in authors:
                author = author.strip()
                if author not in self.co_authors:
                    self.co_authors[author] = set()
                self.co_authors[author].update(set(authors) - {author})   
    def mapper_final(self):
        for author, co_authors in self.co_authors.items():
            if author:
                yield author, list(co_authors)
    def reducer(self, author, co_authors_lists):
        co_authors = set()
        for co_authors_list in co_authors_lists:
            co_authors.update(co_authors_list)
        yield author, list(co_authors)

if __name__ == '__main__':
    Part3.run()



Overwriting rc3.py


In [257]:
!python rc3.py citation.txt

"A. Krzyzak"	["E. Skubalska-Rafajtowicz"]
"Ahmed Hassan"	["Parminder Flora"]
"Alexander Gelbukh"	["Carlos Alberto Reyes-Garcia "]
"Alice Redmond-neal"	["Marjorie M. K. Hlava"]
"Allan Hunkin"	[]
"Amir Ahmad"	["Lipika Dey "]
"Amitabh Chaudhary"	["Christian Scheideler ","Ankur Bhargava","Amitabha Bagchi","David Eppstein"]
"Amitabha Bagchi"	["Christian Scheideler ","Ankur Bhargava","Amitabh Chaudhary","David Eppstein"]
"Andreas N\u00fcrnberger"	["Marcin Detyniecki "]
"Anita Kesavan"	["Neil Daswani"]
"Ankur Bhargava"	["Christian Scheideler ","Amitabh Chaudhary","Amitabha Bagchi","David Eppstein"]
"Arthur Greef"	["Hans J. Skovgaard","Michael Fruergaard Pontoppidan","Palle Agermark","Lars Dragheim Olsen"]
"Axel Bucker"	[]
"Barry Smyth"	["Vincent Wade","Helen Ashman"]
"Bart Preneel"	["Bart Preneel "]
"Ben Long"	["Ben Long "]
"Brenden Munnelly"	["Paul Holden "]
"Bruce Shriver"	["Ted Lewis","Bruce Shriver "]
"Carla Rose"	["Carla Rose "]
"Carlito Vicencio"	["Darrel Creacy"]
"Carlos Alberto Reyes-

No configs found; falling back on auto-configuration
No configs specified for inline runner
Creating temp directory C:\Users\PC\AppData\Local\Temp\rc3.PC.20240212.163813.150318
Running step 1 of 1...
job output is in C:\Users\PC\AppData\Local\Temp\rc3.PC.20240212.163813.150318\output
Streaming final output from C:\Users\PC\AppData\Local\Temp\rc3.PC.20240212.163813.150318\output...
Removing temp directory C:\Users\PC\AppData\Local\Temp\rc3.PC.20240212.163813.150318...


In [366]:
!python rc3.py citation.txt > output_rc3.txt

No configs found; falling back on auto-configuration
No configs specified for inline runner
Creating temp directory C:\Users\PC\AppData\Local\Temp\rc3.PC.20240212.181109.229094
Running step 1 of 1...
job output is in C:\Users\PC\AppData\Local\Temp\rc3.PC.20240212.181109.229094\output
Streaming final output from C:\Users\PC\AppData\Local\Temp\rc3.PC.20240212.181109.229094\output...
Removing temp directory C:\Users\PC\AppData\Local\Temp\rc3.PC.20240212.181109.229094...


# Part 4

In [264]:
%%file rc4.py
from mrjob.job import MRJob
import re

class PaperCount(MRJob):

    def mapper_init(self):
        self.publish_counts = {}
    def mapper(self, _, line):
        year_match = re.search(r'#t(\d+)', line)
        if year_match:
            year = int(year_match.group(1))
            if year not in self.publish_counts:
                self.publish_counts[year] = 0
            self.publish_counts[year] += 1

    def mapper_final(self):
        for year, count in self.publish_counts.items():
            yield year, (count,1)

    def reducer(self, year, counts):
        total_papers = 0
        total_years = 0
        for count,i in counts:
            total_years += 1
            total_papers += count
        avg = total_papers / total_years
        yield year, avg

if __name__ == '__main__':
    PaperCount.run()


Overwriting rc4.py


In [265]:
!python rc4.py citation.txt

1973	1.0
1975	1.0
1976	1.0
1978	1.0
1982	1.0
1985	1.0
1987	1.0
1991	1.0
1993	1.0
1994	1.5
1995	1.0
1996	1.0
1997	1.0
1999	1.0
2000	1.0
2001	1.0
2002	1.0
2003	1.3333333333333333
2004	1.8
2005	2.5714285714285716
2006	4.5
2007	1.375


No configs found; falling back on auto-configuration
No configs specified for inline runner
Creating temp directory C:\Users\PC\AppData\Local\Temp\rc4.PC.20240212.170319.811171
Running step 1 of 1...
job output is in C:\Users\PC\AppData\Local\Temp\rc4.PC.20240212.170319.811171\output
Streaming final output from C:\Users\PC\AppData\Local\Temp\rc4.PC.20240212.170319.811171\output...
Removing temp directory C:\Users\PC\AppData\Local\Temp\rc4.PC.20240212.170319.811171...


In [367]:
!python rc4.py citation.txt > output_rc4.txt

No configs found; falling back on auto-configuration
No configs specified for inline runner
Creating temp directory C:\Users\PC\AppData\Local\Temp\rc4.PC.20240212.181130.248212
Running step 1 of 1...
job output is in C:\Users\PC\AppData\Local\Temp\rc4.PC.20240212.181130.248212\output
Streaming final output from C:\Users\PC\AppData\Local\Temp\rc4.PC.20240212.181130.248212\output...
Removing temp directory C:\Users\PC\AppData\Local\Temp\rc4.PC.20240212.181130.248212...


# Part 5

In [323]:
%%file rc5.py
from mrjob.job import MRJob
import re

class Part5(MRJob):

    def mapper_init(self):
        self.author_counts = {}

    def mapper(self, _, line):
        author_match = re.search(r'#@\s*(.*)#t', line)
        if author_match:
            authors = author_match.group(1).split(',')
            for author in authors:
                author = author.strip()
                if author not in self.author_counts:
                    self.author_counts[author] = 0
                self.author_counts[author] += 1

    def mapper_final(self):
        for author, count in self.author_counts.items():
            yield None, (author, count)

    def reducer(self, _, author_counts):
        author_counts = list(author_counts)

        max_papers = 0
        max_authors = []

        for author, count in author_counts:
            if author:  
                if count > max_papers:
                    max_papers = count

        for author, count in author_counts:
            if count == max_papers and author:  
                max_authors.append(author)

        if max_authors:
            for author in max_authors:
                yield "Author(s) with the maximum number of papers:", author
        else:
            yield "No authors found with the maximum number of papers:", ""

if __name__ == '__main__':
    Part5.run()

   



Overwriting rc5.py


In [368]:
!python rc5.py citation.txt

"Author(s) with the maximum number of papers:"	"Hoon Hong"
"Author(s) with the maximum number of papers:"	"Dongming Wang"
"Author(s) with the maximum number of papers:"	"Charles J. Brooks"
"Author(s) with the maximum number of papers:"	"Ahmed Hassan"
"Author(s) with the maximum number of papers:"	"Parminder Flora"
"Author(s) with the maximum number of papers:"	"Darrel Creacy"
"Author(s) with the maximum number of papers:"	"Carlito Vicencio"
"Author(s) with the maximum number of papers:"	"Neil Daswani"
"Author(s) with the maximum number of papers:"	"Anita Kesavan"
"Author(s) with the maximum number of papers:"	"Shinto Eguchi"
"Author(s) with the maximum number of papers:"	"John Copas"
"Author(s) with the maximum number of papers:"	"Lee Humphreys"
"Author(s) with the maximum number of papers:"	"Paul Messaris"
"Author(s) with the maximum number of papers:"	"Oliver Kohlbacher"
"Author(s) with the maximum number of papers:"	"Knut Reinert"
"Author(s) with the maximum number of papers:"	"Clem

No configs found; falling back on auto-configuration
No configs specified for inline runner
Creating temp directory C:\Users\PC\AppData\Local\Temp\rc5.PC.20240212.181146.194359
Running step 1 of 1...
job output is in C:\Users\PC\AppData\Local\Temp\rc5.PC.20240212.181146.194359\output
Streaming final output from C:\Users\PC\AppData\Local\Temp\rc5.PC.20240212.181146.194359\output...
Removing temp directory C:\Users\PC\AppData\Local\Temp\rc5.PC.20240212.181146.194359...


In [370]:
!python rc5.py citation.txt > output_rc5.txt

No configs found; falling back on auto-configuration
No configs specified for inline runner
Creating temp directory C:\Users\PC\AppData\Local\Temp\rc5.PC.20240212.181206.818999
Running step 1 of 1...
job output is in C:\Users\PC\AppData\Local\Temp\rc5.PC.20240212.181206.818999\output
Streaming final output from C:\Users\PC\AppData\Local\Temp\rc5.PC.20240212.181206.818999\output...
Removing temp directory C:\Users\PC\AppData\Local\Temp\rc5.PC.20240212.181206.818999...


# Part 6

In [344]:
%%file rc6.py

from mrjob.job import MRJob
import re

class AuthorsWithOnePaperPerYear(MRJob):

    def mapper_init(self):
        self.author_paper_count = {}

    def mapper(self, _, line):
        author_match = re.search(r'#@\s*(.*)#t(\d+)', line)
        if author_match:
            authors = author_match.group(1).split(',')
            year = int(author_match.group(2))
            if year not in self.author_paper_count:
                self.author_paper_count[year] = {}
            for author in authors:
                author = author.strip()
                self.author_paper_count[year][author] = self.author_paper_count[year].get(author, 0) + 1

    def mapper_final(self):
        for year, author_count_dict in self.author_paper_count.items():
            for author, count in author_count_dict.items():
                yield (year, author), count

    def reducer(self, author_year, counts):
        total_papers = sum(counts)
        if total_papers <= 1:
            yield author_year[0], author_year[1]

if __name__ == '__main__':
    AuthorsWithOnePaperPerYear.run()
       


Overwriting rc6.py


In [345]:
!python rc6.py citation.txt

1973	"Stanley Habib"
1975	"Bruce Shriver"
1975	"Ted Lewis"
1976	"John R. Mick"
1978	"Charles S. Wetherell"
1978	"James R. McGraw"
1978	"Jr."
1978	"Lyle A. Cox"
1982	"William J. Tracz"
1985	"Rudolph Langer"
1987	"John Blaint"
1991	"John Etchemendy"
1991	"Jon Barwise"
1993	"David M. Lane"
1994	"Carla Rose"
1994	"Gene Orwell"
1994	"Wayne L. Winston"
1995	""
1996	"A. Krzyzak"
1996	"E. Skubalska-Rafajtowicz"
1996	"J. Knipe"
1996	"X. Li"
1997	""
1997	"W. E. Clason"
1999	"Donald Christiansen"
2000	""
2000	"Ken Abernethy"
2001	""
2002	"Brenden Munnelly"
2002	"Paul Holden"
2003	"Charles J. Brooks"
2003	"John Odam"
2003	"Jose Pedro Llamazares"
2003	"Michael Cloran"
2004	"Andreas N\u00fcrnberger"
2004	"Deborah Timmons"
2004	"Denise Seguin"
2004	"Derrick Story"
2004	"John Preston"
2004	"Marcin Detyniecki"
2004	"Michael T. Goodrich"
2004	"Nita Hewitt Rutkosky"
2004	"Sally Preston"
2004	"Shelley Gaskin"
2004	"Tom Collins"
2005	""
2005	"Alice Redmond-neal"
2005	"Axel Bucker"
2005	"Carlito Vicencio"
2

No configs found; falling back on auto-configuration
No configs specified for inline runner
Creating temp directory C:\Users\PC\AppData\Local\Temp\rc6.PC.20240212.175824.833528
Running step 1 of 1...
job output is in C:\Users\PC\AppData\Local\Temp\rc6.PC.20240212.175824.833528\output
Streaming final output from C:\Users\PC\AppData\Local\Temp\rc6.PC.20240212.175824.833528\output...
Removing temp directory C:\Users\PC\AppData\Local\Temp\rc6.PC.20240212.175824.833528...


In [371]:
!python rc6.py citation.txt > output_rc6.txt

No configs found; falling back on auto-configuration
No configs specified for inline runner
Creating temp directory C:\Users\PC\AppData\Local\Temp\rc6.PC.20240212.181240.809053
Running step 1 of 1...
job output is in C:\Users\PC\AppData\Local\Temp\rc6.PC.20240212.181240.809053\output
Streaming final output from C:\Users\PC\AppData\Local\Temp\rc6.PC.20240212.181240.809053\output...
Removing temp directory C:\Users\PC\AppData\Local\Temp\rc6.PC.20240212.181240.809053...


In [361]:
%%file rc7.py
from mrjob.job import MRJob
import re

class Part7(MRJob):

    def mapper_init(self):
        self.titles = []

    def mapper(self, _, line):
        title_match = re.search(r'#\*\s*(.*)\s*#@', line)
        venue_match = re.search(r'#c\s*(.*)', line)
        if title_match and not venue_match:
            title = title_match.group(1)
            self.titles.append(title)

    def mapper_final(self):
        for title in self.titles:
            yield None, title

    def reducer_init(self):
        self.titles = []

    def reducer(self, _, titles):
        for title in titles:
            self.titles.append(title)

    def reducer_final(self):
        for title in self.titles:
            yield title,None

if __name__ == '__main__':
    Part7.run()



Overwriting rc7.py


In [363]:
!python rc7.py citation.txt > output_rc7.txt

No configs found; falling back on auto-configuration
No configs specified for inline runner
Creating temp directory C:\Users\PC\AppData\Local\Temp\rc7.PC.20240212.180848.214888
Running step 1 of 1...
job output is in C:\Users\PC\AppData\Local\Temp\rc7.PC.20240212.180848.214888\output
Streaming final output from C:\Users\PC\AppData\Local\Temp\rc7.PC.20240212.180848.214888\output...
Removing temp directory C:\Users\PC\AppData\Local\Temp\rc7.PC.20240212.180848.214888...


In [360]:
output_rc7 = !python rc7.py citation.txt

# Combining all output files into one

In [400]:
file_contents = []
for i in range(1, 8):
    with open(f'output_rc{i}.txt', 'r') as file:
        file_contents.append(f'Part{i} output is :')
        file_contents.append(file.read())
with open('outputff.txt', 'w') as combined_file:
    for content in file_contents:
        combined_file.write(content + '\n')