# ~ L200921 (Aisha Muhammad Nawaz) ~
## ~ CS4080 Mining Massive Datasets (BSCS 8A Spring 2024) ~
## ~ Assignment 1 - Map Reduce Basics (Due Date: 12 Feb 2024) ~
SUBMISSION: Upload the Source code and the output file on Google Classroom in a zip file with your roll number.

INPUT FILE: You are given an input text file named citation.txt. It contains information regarding the research papers published in various journals. The complete file Citation-network V1 can be found at https://cn.aminer.org/citation. The format of the file is as follows:

#* --- paperTitle
#@ --- Authors
#t ---- Year
#c --- publication venue
#index 00---- index id of this paper

QUESTION: Write an efficient MapReduce program for the following problems. To make your algorithm efficient, you should use combiners or in-mapper aggregation techniques that use arrays.
## SOLUTION
#### Note: GC File is the file uploaded on Google Classroom assignment post
### 1. Process the citation.txt input file and output the number of papers published in each decade: 1970s, 1980s, 1990s, 2000s, 2010s, and 2020s.

In [1]:
%%file q1.py
#*NOTE: I am assuming the decades mentioned in the question are the only ones to output the count of.
from mrjob.job import MRJob
from mrjob.step import MRStep
import re

class PapersPublishedEachDecade(MRJob):
    def configure_args(self):
        super(PapersPublishedEachDecade, self).configure_args()
        self.add_file_arg('--filename', help='Path to the input file') #TO make sure file opens only once

    def init_read_file(self):
        self.patternYear=re.compile(r"(#t[^\#]*)")
        self.papersPerDecade={
            '1970s':0,
            '1980s':0,
            '1990s':0,
            '2000s':0,
            '2010s':0,
            '2020s':0,
            'OTHERS':0
        }
    def get_papers_count(self,_,line):
        file=[file for file in line.split('#*') if len(file)>0]
        if file:
            year=''.join(self.patternYear.findall(file[0])).replace('#t','').replace('\n','').replace(' ','')
            if(year):
                year=int(year)
                if((year>=1970) and (year<1980)):
                    year='1970s'
                elif((year>=1980) and (year<1990)):
                    year='1980s'
                elif((year>=1990) and (year<2000)):
                    year='1990s'
                elif((year>=2000) and (year<2010)):
                    year='2000s'
                elif((year>=2010) and (year<2020)):
                    year='2010s'
                elif((year>=2020) and (year<2030)):
                    year='2020s'
                else :
                    year='OTHERS' 
                self.papersPerDecade[year]=self.papersPerDecade[year]+1
            
    def final_get_papers_count(self):
        for decade,val in self.papersPerDecade.items():
            yield decade,val
            
    def sum_decades(self,decade,counts):
        yield decade,sum(counts)

    def steps(self):
        return [
            MRStep(mapper_init=self.init_read_file,
                  mapper=self.get_papers_count,
                  mapper_final=self.final_get_papers_count,
                  combiner=self.sum_decades,
                  reducer=self.sum_decades)
        ]
        
if __name__=='__main__':
    PapersPublishedEachDecade.run()

Overwriting q1.py


### Q1 GC FILE OUTPUT

In [2]:
!python q1.py citation.txt 

"1970s"	4
"1980s"	3
"1990s"	11
"2000s"	82
"2010s"	0
"2020s"	0
"OTHERS"	0


No configs found; falling back on auto-configuration
No configs specified for inline runner
Creating temp directory C:\Users\DELL\AppData\Local\Temp\q1.DELL.20240212.172204.817701
Running step 1 of 1...
job output is in C:\Users\DELL\AppData\Local\Temp\q1.DELL.20240212.172204.817701\output
Streaming final output from C:\Users\DELL\AppData\Local\Temp\q1.DELL.20240212.172204.817701\output...
Removing temp directory C:\Users\DELL\AppData\Local\Temp\q1.DELL.20240212.172204.817701...


### 2. Create an inverted index of the citation file. Your inverted index will output the year followed by the comma-separated list of the titles of the papers published in that year.
Sample Output format :
Year1 -> PaperTitle, Paper Title
Year2 -> Paper Title

In [3]:
%%file q2.py
from mrjob.job import MRJob
from mrjob.step import MRStep
from itertools import chain # To flatten the list before the final merge
import re

class InvertedIndexCitations(MRJob):
    def configure_args(self):
        super(InvertedIndexCitations, self).configure_args()
        self.add_file_arg('--filename', help='Path to the input file') #TO make sure file opens only once
    def init_read_file(self):
        self.patternYear=re.compile(r"(#t[^\#]*)")
        self.papersByYear={}
    def get_papers_by_year(self,_,line):
        file=[file for file in line.split('#*') if len(file)>0]
        if (file):
            if("#@" in file[0]):
                paperTitle=file[0].split('#@')[0]
                year=''.join(self.patternYear.findall(file[0])).replace('#t','').replace('\n','').replace(' ','')
                if(year):
                    self.papersByYear.setdefault(year+" -> ",[]).append(paperTitle)

    def final_get_papers_by_year(self):
        for year,papers in self.papersByYear.items():
            yield year,papers
            
    def merge_papers_by_year(self,year,papers):
        yield year,list(chain.from_iterable(papers))
        
            
    def steps(self):
        return [
            MRStep(
                  mapper_init=self.init_read_file,
                  mapper=self.get_papers_by_year,
                  mapper_final=self.final_get_papers_by_year,
                  combiner=self.merge_papers_by_year,
                  reducer=self.merge_papers_by_year)
        ]
        
if __name__=='__main__':
    InvertedIndexCitations.run()

Overwriting q2.py


### Q2 GC FILE OUTPUT

In [4]:
!python q2.py citation.txt

"1973 -> "	["Notes from industry"]
"1975 -> "	["A control word model for detecting conflicts between microoperations "]
"1976 -> "	["Microprogramming for the hardware engineer"]
"1978 -> "	["Design team composition for high level language computer architectures "]
"1982 -> "	["Review of \"Bit-Slice Microprocessor Design by John Mick and James Brick\", McGraw-Hill Book Company, 1980 "]
"1985 -> "	["Word Processing on Your MacIntosh "]
"1987 -> "	["Type Graphics and MacIntosh"]
"1991 -> "	["Tarski's World 3.0: Including the Macintosh TM Program (Center for the Study of Language and Information - Lecture Notes) "]
"1993 -> "	["Hyperstat: Macintosh Hypermedia for Analyzing Data and Learning Statistics"]
"1994 -> "	["At Ease With Performa","It's a Mad, Mad, Mad, Mad Mac\/Book and Disk ","Operations Research: Macintosh Version (Business Statistics Series) "]
"1995 -> "	["Internet and HTML Training on CD-ROM "]
"1996 -> "	["Fast k-NN Classification Rule Using Metrics on Space-Filling Curves",

No configs found; falling back on auto-configuration
No configs specified for inline runner
Creating temp directory C:\Users\DELL\AppData\Local\Temp\q2.DELL.20240212.172206.992178
Running step 1 of 1...
job output is in C:\Users\DELL\AppData\Local\Temp\q2.DELL.20240212.172206.992178\output
Streaming final output from C:\Users\DELL\AppData\Local\Temp\q2.DELL.20240212.172206.992178\output...
Removing temp directory C:\Users\DELL\AppData\Local\Temp\q2.DELL.20240212.172206.992178...


### 3. Produce a list of co-authors of each author in the given input file.
Sample Output (Author -> List of Co -authors )
David Jones -> Sam Nick, Ali Javed , Daniel Brown
Sam Nick -> David Jones, Zan Jao, Ali Javed
Ali Javed -> David Jones ,Sam Nick
Zan Jao -> Sam Nick
Daniel Brown -> David Jones

Consider the following citation given in the input file
#*Automated Deduction in Geometry #@Hoon Hong,Dongming Wang#t2006#index0
In this citation 
Hoon Hong and
Dongming Wang are coauthors of each other as they have written one paper together. So in other word if A has written a book with B then A is coauthor of B and B is coauthor of A

In [5]:
%%file q3.py
from mrjob.job import MRJob
from mrjob.step import MRStep
from itertools import chain # To flatten the list before the final merge
import re

class CitationsCoauthors(MRJob):
    def configure_args(self):
        super(CitationsCoauthors, self).configure_args()
        self.add_file_arg('--filename', help='Path to the input file') #TO make sure file opens only once
    def init_read_file(self):
        self.patternAuthors=re.compile(r"\#\@[^\#\t]*")
        self.authorsCoauthors={}
    def get_authors_coauthors(self,_,line):
        file=[file for file in line.split('#*') if len(file)>0]
        if (file):
            if("#@" in file[0]):
                paperTitle=file[0].split('#@')[0]
                authors=''.join(self.patternAuthors.findall(file[0])).replace('#@','').replace('\n','')
                authors=[isAuthor for isAuthor in authors.split(',') if len(isAuthor)>1]
                for author in authors:
                    author=author.strip()
                    for coAuthor in authors:
                        coAuthor=coAuthor.strip()
                        if(not(coAuthor==author)):
                            self.authorsCoauthors.setdefault(author+" - >",[]).append(coAuthor)

    def final_get_authors_coauthors(self):
        for author,coAuthor in self.authorsCoauthors.items():
            yield author,coAuthor
            
    def merge_authors_coauthors(self,author,coAuthor):
        yield author,list(chain.from_iterable(coAuthor))
        
            
    def steps(self):
        return [
            MRStep(
                  mapper_init=self.init_read_file,
                  mapper=self.get_authors_coauthors,
                  mapper_final=self.final_get_authors_coauthors,
                  combiner=self.merge_authors_coauthors,
                  reducer=self.merge_authors_coauthors)
        ]
        
if __name__=='__main__':
    CitationsCoauthors.run()

Overwriting q3.py


### Q3 GC FILE OUTPUT

In [6]:
!python q3.py citation.txt

"A. Krzyzak - >"	["E. Skubalska-Rafajtowicz"]
"Ahmed Hassan - >"	["Parminder Flora"]
"Alex Galis - >"	["Danny Raz","Arto Tapani Juhola","Joan Serrat-Fernandez"]
"Alexander Gelbukh - >"	["Carlos Alberto Reyes-Garcia"]
"Alice Redmond-neal - >"	["Marjorie M. K. Hlava"]
"Aline Maria Santos Andrade - >"	["Carlos Alberto Maziero","Jo\u00e3o Gabriel Silva","Fl\u00e1vio Morais de Assis Silva"]
"Amir Ahmad - >"	["Lipika Dey"]
"Amitabh Chaudhary - >"	["Amitabha Bagchi","Ankur Bhargava","David Eppstein","Christian Scheideler"]
"Amitabha Bagchi - >"	["Ankur Bhargava","Amitabh Chaudhary","David Eppstein","Christian Scheideler"]
"Andreas N\u00fcrnberger - >"	["Marcin Detyniecki"]
"Anita Kesavan - >"	["Neil Daswani"]
"Ankur Bhargava - >"	["Amitabha Bagchi","Amitabh Chaudhary","David Eppstein","Christian Scheideler"]
"Arthur Greef - >"	["Michael Fruergaard Pontoppidan","Lars Dragheim Olsen","Palle Agermark","Hans J. Skovgaard"]
"Arto Tapani Juhola - >"	["Danny Raz","Joan Serrat-Fernandez","Alex Galis"

No configs found; falling back on auto-configuration
No configs specified for inline runner
Creating temp directory C:\Users\DELL\AppData\Local\Temp\q3.DELL.20240212.172209.172585
Running step 1 of 1...
job output is in C:\Users\DELL\AppData\Local\Temp\q3.DELL.20240212.172209.172585\output
Streaming final output from C:\Users\DELL\AppData\Local\Temp\q3.DELL.20240212.172209.172585\output...
Removing temp directory C:\Users\DELL\AppData\Local\Temp\q3.DELL.20240212.172209.172585...


### 4. Find the average number of papers published each year.

In [7]:
%%file q4.py
#*NOTE: I am assuming the average number of papers published each year means Total papers published / Total Distinct Years.
from mrjob.job import MRJob
from mrjob.step import MRStep
import re

class AvgPapersPublishedEachYear(MRJob):
    def configure_args(self):
        super(AvgPapersPublishedEachYear, self).configure_args()
        self.add_file_arg('--filename', help='Path to the input file') #TO make sure file opens only once
    def init_read_file(self):
        self.patternYear=re.compile(r"(#t[^\#]*)")
        self.papersPerYearSum={}
    def get_papers_count(self,_,line):
        file=[file for file in line.split('#*') if len(file)>0]
        if file:
            year=''.join(self.patternYear.findall(file[0])).replace('#t','').replace('\n','').replace(' ','')
            if(year):
                year=int(year)
                self.papersPerYearSum.setdefault(year,0)
                self.papersPerYearSum[year]=self.papersPerYearSum[year]+1
            
    def final_get_papers_count(self):
        for year,value in self.papersPerYearSum.items():
            yield year,(value,1)
            
    def sum_years_count(self,year,value):
        sumValue=0
        for val,count in value:
            sumValue=sumValue+val
        yield year,(sumValue,1)
        
    def sum_years_count_red(self,year,value):
        sumValue=0
        sumCount=1
        for val,count in value:
            sumValue=sumValue+val
            
        yield None,(sumValue,sumCount)
        
    def avg_years_count(self,year,value):
        sumValue=0
        sumCount=0
        for val,count in value:
            sumValue=sumValue+val
            sumCount=sumCount+count
        yield "Average Papers Published Each Year = ",(sumValue/sumCount)

    def steps(self):
        return [
            MRStep(mapper_init=self.init_read_file,
                  mapper=self.get_papers_count,
                  mapper_final=self.final_get_papers_count,
                  combiner=self.sum_years_count,
                  reducer=self.sum_years_count_red
                  ),
                  MRStep(reducer=self.avg_years_count)
        ]
        
if __name__=='__main__':
    AvgPapersPublishedEachYear.run()

Overwriting q4.py


### Q4 GC FILE OUTPUT

In [8]:
!python q4.py citation.txt

"Average Papers Published Each Year = "	4.545454545454546


No configs found; falling back on auto-configuration
No configs specified for inline runner
Creating temp directory C:\Users\DELL\AppData\Local\Temp\q4.DELL.20240212.172211.380475
Running step 1 of 2...
Running step 2 of 2...
job output is in C:\Users\DELL\AppData\Local\Temp\q4.DELL.20240212.172211.380475\output
Streaming final output from C:\Users\DELL\AppData\Local\Temp\q4.DELL.20240212.172211.380475\output...
Removing temp directory C:\Users\DELL\AppData\Local\Temp\q4.DELL.20240212.172211.380475...


### 5. List the names of authors who have written the maximum number of papers.

In [9]:
%%file q5.py
from mrjob.job import MRJob
from mrjob.step import MRStep
import re

class CitationsAuthorsMax(MRJob):
    def configure_args(self):
        super(CitationsAuthorsMax, self).configure_args()
        self.add_file_arg('--filename', help='Path to the input file') #TO make sure file opens only once
    def init_read_file(self):
        self.patternAuthors=re.compile(r"\#\@[^\#\t]*")
        self.authorsCount={}
    def get_authors_count(self,_,line):
        file=[file for file in line.split('#*') if len(file)>0]
        if (file):
            if("#@" in file[0]):
                authors=''.join(self.patternAuthors.findall(file[0])).replace('#@','').replace('\n','')
                authors=[isAuthor for isAuthor in authors.split(',') if len(isAuthor)>1]
                for author in authors:
                    author=author.strip()
                    self.authorsCount.setdefault(author,0)
                    self.authorsCount[author]=self.authorsCount[author]+1

    def final_get_authors_count(self):
        for author,count in self.authorsCount.items():
            yield (author,count)
            
    def sum_authors_count_combiner(self,author,count):
        yield (author,sum(count))
        
    def sum_authors_count_reducer(self,author,count):
        yield (None,(sum(count),author))
        
    def max_authors_count(self,key,values):
        max_value = float('-inf')  # Initialize to the smallest possible value
        max_authors = []

        for count, author in values:
            if count > max_value:
                max_value = count
                max_authors = [(author, count)]
            elif count == max_value:
                max_authors.append((author, count))

        for author, count in max_authors:
              yield (author, count)
            
    def steps(self):
        return [
            MRStep(
                  mapper_init=self.init_read_file,
                  mapper=self.get_authors_count,
                  mapper_final=self.final_get_authors_count,
                  combiner=self.sum_authors_count_combiner,
                  reducer=self.sum_authors_count_reducer),
            MRStep(reducer=self.max_authors_count)
        ]
        
if __name__=='__main__':
    CitationsAuthorsMax.run()

Overwriting q5.py


### Q5 GC FILE OUTPUT

In [10]:
!python q5.py citation.txt

"Cay S. Horstmann"	2
"Charles J. Brooks"	2


No configs found; falling back on auto-configuration
No configs specified for inline runner
Creating temp directory C:\Users\DELL\AppData\Local\Temp\q5.DELL.20240212.172213.630325
Running step 1 of 2...
Running step 2 of 2...
job output is in C:\Users\DELL\AppData\Local\Temp\q5.DELL.20240212.172213.630325\output
Streaming final output from C:\Users\DELL\AppData\Local\Temp\q5.DELL.20240212.172213.630325\output...
Removing temp directory C:\Users\DELL\AppData\Local\Temp\q5.DELL.20240212.172213.630325...


### 6. Find the names of authors who have written at most one paper in a year.

In [11]:
%%file q6.py
#*NOTE: I am assuming here 'at most one paper in a year' means the author has some year in which they wrote one paper (This may or may not be their max in all years)
from mrjob.job import MRJob
from mrjob.step import MRStep
import re

class CitationsAuthorsPerYearCountMaxOne(MRJob):
    def configure_args(self):
        super(CitationsAuthorsPerYearCountMaxOne, self).configure_args()
        self.add_file_arg('--filename', help='Path to the input file') #TO make sure file opens only once
    def init_read_file(self):
        self.patternAuthors=re.compile(r"\#\@[^\#\t]*")
        self.patternYear=re.compile(r"(#t[^\#]*)")
        self.authorsCount={}
    def get_authors_count(self,_,line):
        file=[file for file in line.split('#*') if len(file)>0]
        if (file):
            if("#@" in file[0]):
                year=''.join(self.patternYear.findall(file[0])).replace('#t','').replace('\n','').replace(' ','')
                if(year):
                    authors=''.join(self.patternAuthors.findall(file[0])).replace('#@','').replace('\n','')
                    authors=[isAuthor for isAuthor in authors.split(',') if len(isAuthor)>1]
                    for author in authors:
                        author=author.strip()
                        self.authorsCount.setdefault(author+'->'+year,0)
                        self.authorsCount[author+'->'+year]=self.authorsCount[author+'->'+year]+1

    def final_get_authors_count(self):
        for authorYear,count in self.authorsCount.items():
            yield authorYear,count
            
    def sum_authors_count_combiner(self,authorYear,count):
        yield authorYear,sum(count)
        
    def sum_authors_count_reducer(self,authorYear,count):
        valueSummed=sum(count)
        if(valueSummed==1):
            yield None,(valueSummed,authorYear)
        
    def maxOne_authors_count(self,key,value):
        for value,authorYear in value:
            authorYear=authorYear.split('->')
            yield 'Author: '+authorYear[0],' Year: '+authorYear[1]
            
            
    def steps(self):
        return [
            MRStep(
                  mapper_init=self.init_read_file,
                  mapper=self.get_authors_count,
                  mapper_final=self.final_get_authors_count,
                  combiner=self.sum_authors_count_combiner,
                  reducer=self.sum_authors_count_reducer),
            MRStep(reducer=self.maxOne_authors_count)
        ]
        
if __name__=='__main__':
    CitationsAuthorsPerYearCountMaxOne.run()

Overwriting q6.py


### Q6 GC FILE OUTPUT

In [12]:
!python q6.py citation.txt

"Author: A. Krzyzak"	" Year: 1996"
"Author: Ahmed Hassan"	" Year: 2007"
"Author: Alessandro Aurigi"	" Year: 2005"
"Author: Alex Galis"	" Year: 2006"
"Author: Alexander Gelbukh"	" Year: 2006"
"Author: Alice Redmond-neal"	" Year: 2005"
"Author: Aline Maria Santos Andrade"	" Year: 2005"
"Author: Allan Hunkin"	" Year: 2007"
"Author: Amir Ahmad"	" Year: 2007"
"Author: Amitabh Chaudhary"	" Year: 2006"
"Author: Amitabha Bagchi"	" Year: 2006"
"Author: Andreas N\u00fcrnberger"	" Year: 2004"
"Author: Anita Kesavan"	" Year: 2006"
"Author: Ankur Bhargava"	" Year: 2006"
"Author: Arthur Greef"	" Year: 2006"
"Author: Arto Tapani Juhola"	" Year: 2006"
"Author: Axel Bucker"	" Year: 2005"
"Author: Barry Smyth"	" Year: 2006"
"Author: Bart Preneel"	" Year: 2006"
"Author: Behrouz A. Forouzan"	" Year: 2005"
"Author: Ben Long"	" Year: 2006"
"Author: Brenden Munnelly"	" Year: 2002"
"Author: Bruce Shriver"	" Year: 1975"
"Author: Carla Rose"	" Year: 1994"
"Author: Carlito Vicencio"	" Year: 2005"
"Author: Carlos

No configs found; falling back on auto-configuration
No configs specified for inline runner
Creating temp directory C:\Users\DELL\AppData\Local\Temp\q6.DELL.20240212.172215.925612
Running step 1 of 2...
Running step 2 of 2...
job output is in C:\Users\DELL\AppData\Local\Temp\q6.DELL.20240212.172215.925612\output
Streaming final output from C:\Users\DELL\AppData\Local\Temp\q6.DELL.20240212.172215.925612\output...
Removing temp directory C:\Users\DELL\AppData\Local\Temp\q6.DELL.20240212.172215.925612...


### 7. Find the title of papers such that their venue is not mentioned in the input file.

In [13]:
%%file q7.py
from mrjob.job import MRJob
from mrjob.step import MRStep
import re
#Note: I am assuming we have to find both those entries that have #c but nothing follows that and those that dont even have #c.
class PapersWithoutVenue(MRJob):
    def configure_args(self):
        super(PapersWithoutVenue, self).configure_args()
        self.add_file_arg('--filename', help='Path to the input file') #TO make sure file opens only once
    def init_read_file(self):
        self.patternVenue=re.compile(r"(\#c[^\#]*)")
        self.papersWithVenueMissing=[]
    def get_papers_without_venue(self,_,line):
        file=[file for file in line.split('#*') if len(file)>0]
        if (file):
            if("#@" in file[0]):
                paperTitle=file[0].split('#@')[0].strip()
                venue=''.join(self.patternVenue.findall(file[0])).replace('#c','').replace('\n','').strip()
                if(len(venue)<=1):
                    self.papersWithVenueMissing.append(paperTitle)

    def final_get_papers_without_venue(self):
        for paper in self.papersWithVenueMissing:
            yield "-> Paper : ",paper
            
    def steps(self):
        return [
            MRStep(
                  mapper_init=self.init_read_file,
                  mapper=self.get_papers_without_venue,
                  mapper_final=self.final_get_papers_without_venue)
        ]
        
if __name__=='__main__':
    PapersWithoutVenue.run()

Overwriting q7.py


### Q7 GC FILE OUTPUT

In [14]:
!python q7.py citation.txt

"-> Paper : "	"Automated Deduction in Geometry"
"-> Paper : "	"A+ Certification Core Hardware (Text & Lab Manual)"
"-> Paper : "	"Performance engineering in industry: current practices and adoption challenges"
"-> Paper : "	"Dude, You Can Do It! How to Build a Sweeet PC"
"-> Paper : "	"What Every Programmer Needs to Know about Security (Advances in information Security)"
"-> Paper : "	"Interpreting Kullback-Leibler divergence with the Neyman-earson lemma"
"-> Paper : "	"Digital Media: Transformations in Human Communication"
"-> Paper : "	"TOPP---the OpenMS proteomics pipeline"
"-> Paper : "	"Type Graphics and MacIntosh"
"-> Paper : "	"Adaptive Hypermedia and Adaptive Web-Based Systems"
"-> Paper : "	"Dependable Computing"
"-> Paper : "	"Calculus Early Transcendentals Single Variable"
"-> Paper : "	"Webbots, Spiders, and Screen Scrapers"
"-> Paper : "	"Making the Digital City: The Early Shaping of Urban Internet Space (Design & the Built Environment S.)"
"-> Paper : "	"Linspire 5.0: The

No configs found; falling back on auto-configuration
No configs specified for inline runner
Creating temp directory C:\Users\DELL\AppData\Local\Temp\q7.DELL.20240212.172218.190333
Running step 1 of 1...
job output is in C:\Users\DELL\AppData\Local\Temp\q7.DELL.20240212.172218.190333\output
Streaming final output from C:\Users\DELL\AppData\Local\Temp\q7.DELL.20240212.172218.190333\output...
Removing temp directory C:\Users\DELL\AppData\Local\Temp\q7.DELL.20240212.172218.190333...


### (BONUS!)  8. Find the title of papers such that their venue IS mentioned in the input file.

In [15]:
%%file q8.py
from mrjob.job import MRJob
from mrjob.step import MRStep
import re

class PapersWithVenue(MRJob):
    def configure_args(self):
        super(PapersWithVenue, self).configure_args()
        self.add_file_arg('--filename', help='Path to the input file') #TO make sure file opens only once
    def init_read_file(self):
        self.patternVenue=re.compile(r"(\#c[^\#]*)")
        self.papersWithVenue={}
    def get_papers_with_venue(self,_,line):
        file=[file for file in line.split('#*') if len(file)>0]
        if (file):
            if("#@" in file[0]):
                paperTitle=file[0].split('#@')[0].strip()
                venue=''.join(self.patternVenue.findall(file[0])).replace('#c','').replace('\n','').strip()
                if(len(venue)>1):
                    self.papersWithVenue[paperTitle]=venue

    def final_get_papers_with_venue(self):
        for paper,venue in self.papersWithVenue.items():
            yield "-> Paper: "+paper,"-> Venue: "+venue
            
    def steps(self):
        return [
            MRStep(
                  mapper_init=self.init_read_file,
                  mapper=self.get_papers_with_venue,
                  mapper_final=self.final_get_papers_with_venue)
        ]
        
if __name__=='__main__':
    PapersWithVenue.run()

Overwriting q8.py


### Q8 GC FILE OUTPUT

In [16]:
!python q8.py citation.txt

No configs found; falling back on auto-configuration
No configs specified for inline runner
Creating temp directory C:\Users\DELL\AppData\Local\Temp\q8.DELL.20240212.172220.119516
Running step 1 of 1...
job output is in C:\Users\DELL\AppData\Local\Temp\q8.DELL.20240212.172220.119516\output
Streaming final output from C:\Users\DELL\AppData\Local\Temp\q8.DELL.20240212.172220.119516\output...
Removing temp directory C:\Users\DELL\AppData\Local\Temp\q8.DELL.20240212.172220.119516...


"-> Paper: Fast k-NN Classification Rule Using Metrics on Space-Filling Curves"	"-> Venue: Proceedings of the 13th International Conference on Pattern Recognition - Volume 2"
"-> Paper: Approximating fluid schedules in crossbar packet-switches and Banyan networks"	"-> Venue: IEEE\/ACM Transactions on Networking (TON)"
"-> Paper: Modeling methodology b: distributed simulation and the high level architecture"	"-> Venue: Proceedings of the 38th conference on Winter simulation"
"-> Paper: An Integrative Modelling Approach for Simulation and Analysis of Adaptive Agents"	"-> Venue: Proceedings of the 39th annual Symposium on Simulation"
"-> Paper: Notes from industry"	"-> Venue: ACM SIGMICRONewsletter"
"-> Paper: A New Quadtree Decomposition Reconstruction Method"	"-> Venue: Proceedings of the 13th International Conference on Pattern Recognition - Volume 2"
"-> Paper: Microprogramming for the hardware engineer"	"-> Venue: ACM SIGMICRO Newsletter"
"-> Paper: A control word model for detecting

### -> Rough Work (The csv file generated was used to check accuracy of outputs above) 

### FOR GC FILE

In [17]:
import re
import pandas as pd

patterns={
'Paper Title':'#@',
'Authors':re.compile(r"\#\@[^\#\t]*"),
'Year':re.compile(r"(#t[^\#]*)"),
'Publication Venue':re.compile(r"(\#c[^\#]*)"),
'Index ID':re.compile(r"(index.)[^\n]*")
}
citations={
'Paper Title':[],
'Authors':[],
'Year':[],
'Publication Venue':[],
'Index ID':[]
}
fileOpened=open('citation.txt','r')
file=[file for file in fileOpened.read().split('#*') if len(file)>0]
for word in file:
    citations['Paper Title'].append(''.join(word.split(patterns['Paper Title'])[0]).strip())
    citations['Authors'].append(''.join(patterns['Authors'].findall(word)).replace('#@','').replace('\n','').strip())
    citations['Year'].append(int(''.join(patterns['Year'].findall(word)).replace('#t','').replace('\n','')))
    citations['Publication Venue'].append(''.join(patterns['Publication Venue'].findall(word)).replace('#c','').replace('\n','').strip())
    citations['Index ID'].append(int(''.join(patterns['Index ID'].findall(word)).replace('index','')))


citations=pd.DataFrame(citations)
citations.to_csv('Citations.csv')
citations.sample(5)

Unnamed: 0,Paper Title,Authors,Year,Publication Venue,Index ID
79,Keno Winner: A Guide To Winning At Video Keno,Tom Collins,2004,,8
82,The Internet: A Critical Introduction,Korinna Patelis,2007,,8
4,What Every Programmer Needs to Know about Secu...,"Neil Daswani,Anita Kesavan",2006,,4
23,"ASIS&T Thesaurus of Information Science, Techn...","Alice Redmond-neal,Marjorie M. K. Hlava",2005,,2
32,Introduction to Information Systems,R. Kelly Rainer,2007,,3
