In [20]:
#!/usr/bin/env python

import sys
import os
import string
import nltk

SEP = '\t'
vowels = ['a', 'e', 'i', 'o', 'u']

INPUT = ['It was the best of times, it was the worst of times,',
         'it was the age of ooowisdom. it was the age of foolishness,',
         'it was the epochssss of belief, it was the epoch of incredulity,'] 

class Mapper(object):
    
    def __init__(self, stream=INPUT, sep = SEP):
        self.stream = stream
        self.sep = SEP
        self.stop_words = nltk.corpus.stopwords.words('english')
        self.punctuation = string.punctuation 
               
    def __iter__(self):
        for row in self.stream:
            yield row.strip()
            
    def normalize(self, token):
        return token.lower()
    
    def exclude(self, token):
        return (token in self.stop_words or token in self.punctuation)
    
    def word_check(self, token):
        return (token[:1] in vowels and len(token) > 6)
    
    def tokenize(self, line):
        for token in nltk.wordpunct_tokenize(line):
            token = self.normalize(token)
            if not self.exclude(token) and self.word_check(token):
                yield token
    
    def emit(self, key, value):
        #sys.stdout.write('{0}{1}{2}\n'.format(key,self.sep,value))
        print('{0}{1}{2}\n'.format(key,self.sep,value))   # for testing in jupyter only
        
    def mapping(self):
        for line in self:
            for key in self.tokenize(line):
                #file_path = os.getenv('map_input_file')
                #file_name = file_path.strip().split('/')[-1]
                file_name = 'comedies'                    # for testing in jupyter only 
                self.emit(key, file_name)         


if __name__ == '__main__':
    mapper = Mapper()
    mapper.mapping()


ooowisdom	comedies

epochssss	comedies

incredulity	comedies



In [22]:
file_path = os.getenv('map_input_file')
file_name = file_path.split('/')[-1]

AttributeError: 'NoneType' object has no attribute 'split'

In [17]:
temp = 'how are you doing'
last = temp.split()[-1]
last

'doing'

In [15]:
#!/usr/bin/env python

import sys
from itertools import groupby
from operator import itemgetter

SEP = '\t'

INPUT = ['word1\tlove','word1\tcomedy','word1\thorror',
         'word2\tromance','word2\tcomedy','word2\thorror','word2\tgeneric','word2\taction','word2\tcomedy',
         'word4\tlove','word4\tcomedy','word4\thorror'] 

class Reducer(object):
    
    def __init__(self, stream=INPUT, sep = SEP):
        self.stream = stream
        self.sep = SEP 
               
    def __iter__(self):
        generator = (line.strip().split(self.sep, 1) for line in self.stream)
        for item in groupby(generator, itemgetter(0)):
            yield item
        
    def emit(self, key, value):
        #sys.stdout.write("'{0}'{1}{2}\n".format(key,self.sep,value))
        print("'{0}'{1}{2}\n".format(key, self.sep, value))
        
    def reduce(self):
        for key, group in self:
            values = set()
            for item in group: 
                values.add(item[1].strip())
                if len(values) == 5:
                    break             
            self.emit(key, list(values))        
            
if __name__ == '__main__':
    reducer = Reducer()
    reducer.reduce()


'word1'	['comedy', 'love', 'horror']

'word2'	['generic', 'romance', 'horror', 'action', 'comedy']

'word4'	['comedy', 'love', 'horror']



In [25]:
def check(l):
    parts = l.split()
    return parts[0], parts[1]  ## it will be returned as a tuple if multiple items are returned

test = check('test string')
print(test, type(test))

('test', 'string') <class 'tuple'>


In [49]:
import itertools 

L = [("a", 1), ("a", 2), ("b", 3), ("b", 4)] 

# Key function 
key_func = lambda x: x[0] 

for group in itertools.groupby(L, key_func): 
    print(group)
    print(group[0], list(group[1]))

('a', <itertools._grouper object at 0x0000019809BB49C8>)
a [('a', 1), ('a', 2)]
('b', <itertools._grouper object at 0x0000019809BEEF48>)
b [('b', 3), ('b', 4)]


In [100]:
input = [('a', (('a', 'comedy'), ('a', 'joke'),('a', 'love'),('a', 'history'),('a', 'horror'),('a', 'comedy'))),
         ('b', (('b', 'comedy'), ('b', 'joke'),('b', 'love'),('b', 'history')))]

for key, group in input:
    values = set()
    for item in group: 
        values.add(item[1])
        if len(values) == 5:
            print(key, item, len(values))
            break
    print("'{}'\t{}".format(key, list(values))) 

a ('a', 'horror') 5
'a'	['horror', 'history', 'comedy', 'love', 'joke']
'b'	['love', 'history', 'comedy', 'joke']


In [89]:
input = [('a', (('a', 'comedy'), ('a', 'comedy'),('a', 'history'),('a', 'history'))),
         ('b', (('b', 'comedy'), ('b', 'history'),('b', 'comedy'),('b', 'history')))]

values = [list(set([item[1] for item in group])) for key, group in input]
print(values)

[['history', 'comedy'], ['history', 'comedy']]


In [14]:
#!/usr/bin/env python

import sys
import os
import string

SEP = '\t'
vowels = ['a', 'e', 'i', 'o', 'u']
STOP_WORDS = ['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", 
              "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 
              'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 
              'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 
              'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 
              'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 
              'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 
              'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 
              'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 
              'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 
              'not', 'only', 'own', 'same', 'so', 'than', 'too', 'very', 's', 't', 'can', 'will', 'just', 'don', 
              "don't", 'should', "should've", 'now', 'd', 'll', 'm', 'o', 're', 've', 'y', 'ain', 'aren', "aren't", 
              'couldn', "couldn't", 'didn', "didn't", 'doesn', "doesn't", 'hadn', "hadn't", 'hasn', "hasn't", 'haven', 
              "haven't", 'isn', "isn't", 'ma', 'mightn', "mightn't", 'mustn', "mustn't", 'needn', "needn't", 'shan', 
              "shan't", 'shouldn', "shouldn't", 'wasn', "wasn't", 'weren', "weren't", 'won', "won't", 'wouldn', "wouldn't"]

INPUT = ['It was the best of times, it was the worst of times,',
         'it was the age of ooowisdom. it was the age of foolishness,',
         'it was the epochssss of belief, it was the epoch of incredulity,'] 

class Mapper(object):
    
    def __init__(self, stream = INPUT, sep = SEP):
        self.stream = stream
        self.sep = SEP
        self.stop_words = STOP_WORDS 
        self.exclude_chars = string.digits + string.punctuation
               
    def __iter__(self):
        for row in self.stream:
            yield row.strip()
    
    def remove_punct_dig(self, line):
        table = str.maketrans(self.exclude_chars, len(self.exclude_chars) * ' ') # in python 2.x its string. instead of str
        return line.translate(table)
    
    def normalize(self, token):
        return token.lower()
    
    def exclude(self, token): 
        return (token in self.stop_words)
    
    def word_check(self, token):
        return (token[:1] in vowels and len(token) > 6)
    
    def preprocess(self, line):
        line = self.remove_punct_dig(line)
        for token in line.split():
            token = self.normalize(token)
            if not self.exclude(token) and self.word_check(token):
                yield token
    
    def emit(self, key, value):
        #sys.stdout.write('{0}{1}{2}\n'.format(key,self.sep,value))
        print('{0}{1}{2}\n'.format(key,self.sep,value))   # for testing in jupyter only
        
    def mapping(self):
        for line in self:
            for key in self.preprocess(line):
                #file_path = os.getenv('map_input_file')
                #file_name = file_path.strip().split('/')[-1]
                file_name = 'comedies'                    # for testing in jupyter only 
                self.emit(key, file_name)         

if __name__ == '__main__':
    mapper = Mapper()
    mapper.mapping()

ooowisdom	comedies

epochssss	comedies

incredulity	comedies



In [3]:
#!/usr/bin/env python

import sys
from itertools import groupby
from operator import itemgetter
from collections import Counter

SEP = '\t'

INPUT = ['word1\tlove','word1\tcomedy','word1\thorror',
         'word2\tromance','word2\tcomedy','word2\thorror','word2\tgeneric','word2\taction','word2\tcomedy',
         'word4\tlove','word4\tcomedy','word4\thorror'] 

class Reducer2(object):
    
    def __init__(self, stream = INPUT, sep = SEP):
        self.stream = stream
        self.sep = SEP 
               
    def __iter__(self):
        generator = (line.strip().split(self.sep, 1) for line in self.stream)
        for item in groupby(generator, itemgetter(0)):
            yield item
        
    def emit(self, key, value):
        #sys.stdout.write("'{0}'{1}{2}\n".format(key,self.sep,value))
        print("'{0}'{1}{2}\n".format(key, self.sep, value))
        
    def reduce(self):
        for key, group in self:
            values = []
            for item in group: 
                values.append(item[1])
            counts = Counter(values)
            self.emit(key, dict(counts))        
            
if __name__ == '__main__':
    reducer = Reducer2()
    reducer.reduce()

'word1'	{'love': 1, 'comedy': 1, 'horror': 1}

'word2'	{'romance': 1, 'comedy': 2, 'horror': 1, 'generic': 1, 'action': 1}

'word4'	{'love': 1, 'comedy': 1, 'horror': 1}



In [14]:
#!/usr/bin/env python

import sys
from itertools import groupby
from operator import itemgetter

SEP = '\t'

INPUT = ['word1\tlove','word1\tcomedy','word1\thorror',
         'word2\tromance','word2\tcomedy','word2\thorror','word2\tgeneric','word2\taction','word2\tcomedy',
         'word4\tlove','word4\tcomedy','word4\thorror'] 

class Reducer2(object):
    
    def __init__(self, stream = INPUT, sep = SEP):
        self.stream = stream
        self.sep = SEP 
               
    def __iter__(self):
        generator = (line.strip().split(self.sep, 1) for line in self.stream)
        for item in groupby(generator, itemgetter(0)):
            yield item
        
    def emit(self, key, value):
        #sys.stdout.write("'{0}'{1}{2}\n".format(key,self.sep,value))
        print("'{0}'{1}{2}\n".format(key, self.sep, value))
        
    def reduce(self):
        for key, group in self:
            files = [item[1] for item in group]
            file_counts = ((file, files.count(file)) for file in set(files))
            self.emit(key, dict(file_counts))        
            
if __name__ == '__main__':
    reducer = Reducer2()
    reducer.reduce()

'word1'	{'comedy': 1, 'horror': 1, 'love': 1}

'word2'	{'generic': 1, 'comedy': 2, 'action': 1, 'horror': 1, 'romance': 1}

'word4'	{'comedy': 1, 'horror': 1, 'love': 1}



In [8]:
a = ['romance', 'comedy', 'horror', 'generic', 'action', 'comedy']

file_wordcount = {}
for item in a:
    if item not in file_wordcount:
        file_wordcount[item] = 1
    else: 
        file_wordcount[item] += 1

file_wordcount

{'romance': 1, 'comedy': 2, 'horror': 1, 'generic': 1, 'action': 1}

In [10]:
counts = dict((item, a.count(item)) for item in set(a))
counts

{'generic': 1, 'comedy': 2, 'action': 1, 'horror': 1, 'romance': 1}

In [12]:
counts = ((item, a.count(item)) for item in set(a))
counts

<generator object <genexpr> at 0x000002D924FDCA48>

In [13]:
dict(counts)

{'generic': 1, 'comedy': 2, 'action': 1, 'horror': 1, 'romance': 1}