Chapter 3. Function pipelines for mapping complex transformations
====
### Mastering Large Datasets with Python by JT Wolohan 



### Hacker translation

In [None]:
import os
import re

from toolz.functoolz import pipe, compose

# Scrambled input for the translation demo: digits stand in for letters
# and Chinese characters sit where the spaces should be.
sample_messages = [
    "7his所is家4没s4mpl3动m3ss463",
    "don7家73ll经4nyon3法7his现m3ss463",
    "w3现4r3当b3in6进so好s3cr3t",
    "733小h33成h33去nobody看is天on分7o理us",
    "w3么will面n3v3r分637理c4u6ht",
    "w3事4r3经such没sn34ky天h4ckers",
]

In [None]:
def replace_7t(s):
    """Return a copy of ``s`` in which every '7' has become 't'."""
    digit, letter = '7', 't'
    return s.replace(digit, letter)

In [None]:
def replace_3e(s):
    """Return a copy of ``s`` in which every '3' has become 'e'."""
    digit, letter = '3', 'e'
    return s.replace(digit, letter)

In [None]:
def replace_6g(s):
    """Return a copy of ``s`` in which every '6' has become 'g'."""
    digit, letter = '6', 'g'
    return s.replace(digit, letter)

In [None]:
def replace_4a(s):
    """Return a copy of ``s`` in which every '4' has become 'a'."""
    digit, letter = '4', 'a'
    return s.replace(digit, letter)

In [None]:
# Alternative approach
# This function makes functions!
def make_letter_replacer(letter_1, letter_2):
    """Build a one-argument function that swaps ``letter_1`` for ``letter_2``.

    The returned closure keeps both arguments, so a single factory call
    stands in for writing a dedicated ``replace_*`` function by hand.
    """
    def replacer(s):
        # str.replace substitutes every occurrence, not just the first.
        swapped = s.replace(letter_1, letter_2)
        return swapped

    return replacer

# One replacer per leetspeak digit. Each result gets its own name so that
# later assignments do not overwrite earlier ones.
alt_replace_7t = make_letter_replacer('7', 't')
alt_replace_3e = make_letter_replacer('3', 'e')
alt_replace_6g = make_letter_replacer('6', 'g')
alt_replace_4a = make_letter_replacer('4', 'a')

In [None]:
class chinese_matcher:
    """Replace runs of Chinese characters in a string with a single space."""

    def __init__(self):
        # Compiled once per instance and reused by every sub_chinese call.
        # The pattern matches one or more consecutive characters in the
        # range U+4E00 through U+9FFF.
        self.r = re.compile(r'[\u4e00-\u9fff]+')

    def sub_chinese(self, s):
        """Return ``s`` with each matched run collapsed to one space."""
        cleaned = self.r.sub(" ", s)
        return cleaned

In [None]:
C = chinese_matcher()

# Not chained
# Each map wraps the one below it, so the innermost step (replace_7t)
# is applied first and C.sub_chinese is applied last.
print(
    list(
        map(C.sub_chinese,
            map(replace_4a,
                map(replace_6g,
                    map(replace_3e,
                        map(replace_7t, sample_messages)))))),
    end="\n\n",
)

In [None]:
# Option 1
# compose applies its arguments right-to-left, so replace_7t runs first
# and C.sub_chinese runs last.
hacker_translate = compose(
    C.sub_chinese,
    replace_4a,
    replace_6g,
    replace_3e,
    replace_7t,
)

print(list(map(hacker_translate, sample_messages)), end="\n\n")

In [None]:
# Option 2
def hacker_translate(s):
    """Translate one message by piping it through each step, left to right."""
    steps = (replace_7t, replace_3e, replace_6g, replace_4a, C.sub_chinese)
    return pipe(s, *steps)

print(list(map(hacker_translate, sample_messages)), end="\n\n")

### Twitter scraping and gender prediction

In [None]:
from multiprocessing import Pool
from toolz import compose, pipe
import twitter

# Credentials are read from environment variables so that secrets never
# have to be typed into (or saved with) the notebook. Set the four
# TWITTER_* variables before starting the kernel. Any variable that is
# unset falls back to "", the same blank placeholder used previously.
Twitter = twitter.Api(
    consumer_key=os.environ.get("TWITTER_CONSUMER_KEY", ""),
    consumer_secret=os.environ.get("TWITTER_CONSUMER_SECRET", ""),
    access_token_key=os.environ.get("TWITTER_ACCESS_TOKEN_KEY", ""),
    access_token_secret=os.environ.get("TWITTER_ACCESS_TOKEN_SECRET", ""),
)

In [None]:
def get_tweet_from_id(tweet_id, api=Twitter):
    """Fetch the status for ``tweet_id`` through ``api``.

    ``api`` defaults to the module-level ``Twitter`` client; the default
    is bound when this function is defined. The lookup is made with
    ``trim_user=True``.
    """
    status = api.GetStatus(tweet_id, trim_user=True)
    return status


def tweet_to_text(tweet):
    """Return the ``text`` attribute of ``tweet``."""
    body = tweet.text
    return body


def tokenize_text(text):
    """Split ``text`` on runs of whitespace and return the list of tokens."""
    tokens = text.split()
    return tokens

In [None]:
def score_text(tokens):
    """Sum the per-word scores of ``tokens``.

    Args:
        tokens: iterable of word strings, e.g. the output of
            ``tokenize_text``.

    Returns:
        int: +1 for every token found among the "men" words, -1 for every
        token found among the "women" words, 0 for anything else, all
        added together. An empty iterable scores 0.
    """
    words = {"the":1, "to":1, "and":1, #Words with 1 indicate men
             "in":1, "have":1, "it":1,
             "be":-1, "of":-1, "a":-1, # Words with -1 indicate women
             "that":-1, "i":-1, "for":-1}
    # The keys are all lower-case, so normalise each token before the
    # lookup; otherwise "I", "The", "To", ... would never be counted.
    return sum(words.get(token.lower(), 0) for token in tokens)


def score_tweet(tweet_id):
    """Score one tweet: fetch it by id, take its text, tokenize, then score."""
    stages = (get_tweet_from_id, tweet_to_text, tokenize_text, score_text)
    return pipe(tweet_id, *stages)


def score_user(tweets):
    """Return the mean ``score_tweet`` value over a user's tweet ids.

    Args:
        tweets: sized collection of tweet ids belonging to one user.

    Returns:
        float: total score divided by the number of tweets, or 0.0 when
        ``tweets`` is empty.
    """
    N = len(tweets)
    if N == 0:
        # No tweets means no evidence either way. Return a neutral score
        # rather than raising ZeroDivisionError, which would abort an
        # entire batch run over many users.
        return 0.0
    total = sum(map(score_tweet, tweets))
    return total / N


In [None]:
def categorize_user(user_score):
    """Label a user score: strictly positive is "Male", otherwise "Female".

    Returns a dict holding the original score under "score" and the label
    under "gender".
    """
    label = "Male" if user_score > 0 else "Female"
    return {"score": user_score, "gender": label}

In [None]:
# One inner list of tweet ids per user.
users_tweets = [
    [1056365937547534341, 1056310126255034368, 1055985345341251584,
     1056585873989394432, 1056585871623966720],
    [1055986452612419584, 1056318330037002240, 1055957256162942977,
     1056585921154420736, 1056585896898805766],
    [1056240773572771841, 1056184836900175874, 1056367465477951490,
     1056585972765224960, 1056585968155684864],
    [1056452187897786368, 1056314736546115584, 1055172336062816258,
     1056585983175602176, 1056585980881207297],
]
# compose applies right-to-left: score_user runs first on a user's list
# of tweet ids, then categorize_user labels the resulting score.
gender_prediction_pipeline = compose(categorize_user, score_user)
# Run the pipeline over the users in a pool of worker processes and
# print the list of results. Each score_user call goes through
# score_tweet -> get_tweet_from_id, i.e. one api.GetStatus call per tweet.
with Pool() as P:
    print(P.map(gender_prediction_pipeline, users_tweets))


[Ready for more? Go to chapter 4!](./Ch04_notebook.ipynb)