## 0.0 Import modules

In [12]:
## NLP modules
import gensim
from gensim.models.doc2vec import Doc2Vec
import nltk
import textblob
from textblob import TextBlob
from sklearn.feature_extraction.stop_words import ENGLISH_STOP_WORDS
from nltk.corpus import stopwords ##Note you'll need to download NLTK and corpuses
from spacy.en import English ##Note you'll need to install Spacy and download its dependencies
parser = English()

## Other Python modules
import itertools
from operator import itemgetter
import re
import string
import numpy as np
import pandas as pd
import matplotlib
from IPython.display import IFrame

# Graph module
import networkx as nx

## Machine learning & text vectorizer modules
from sklearn.decomposition import TruncatedSVD
from sklearn.cluster import KMeans
from sklearn.metrics import pairwise_distances
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

## 1.0 Functions

### 1.1 Text pre-processing functions

In [13]:
# A custom function to clean the text before sending it into the vectorizer
def cleanText(text):
    # get rid of newlines
    text = text.strip().replace("\n", " ").replace("\r", " ")
    
    # replace twitter @mentions
    mentionFinder = re.compile(r"@[a-z0-9_]{1,15}", re.IGNORECASE)
    text = mentionFinder.sub("@MENTION", text)
    text = re.sub('[^a-zA-Z ]','',text)
    # replace HTML symbols
    text = text.replace("&amp;", "and").replace("&gt;", ">").replace("&lt;", "<")
    
    # lowercase
    text = text.lower()
#     text = str(TextBlob(text).correct())
    return text

# A custom function to tokenize the text using spaCy
# and convert to lemmas
def tokenizeText(sample):
    # get the tokens using spaCy
    tokens = parser(sample)

    # lemmatize
    lemmas = []
    for tok in tokens:
        lemmas.append(tok.lemma_.lower().strip() if tok.lemma_ != "-PRON-" else tok.lower_)
    tokens = lemmas

    # stoplist the tokens
    tokens = [tok for tok in tokens if tok not in STOPLIST]

    # stoplist symbols
    tokens = [tok for tok in tokens if tok not in SYMBOLS]

    # remove large strings of whitespace
    while "" in tokens:
        tokens.remove("")
    while " " in tokens:
        tokens.remove(" ")
    while "\n" in tokens:
        tokens.remove("\n")
    while "\n\n" in tokens:
        tokens.remove("\n\n")

    return tokens

# A custom stoplist
STOPLIST = set(stopwords.words('english') + ["n't", "'s", "'m", "ca"] + list(ENGLISH_STOP_WORDS))
# List of symbols we don't care about
SYMBOLS = " ".join(string.punctuation).split(" ") + ["-----", "---", "...", "“", "”", "'ve"]

## Tokenizer specific for Doc2Vec where each word is important, so stop words are not removed.
def doc2vec_tokenizeText(corpus):
    punctuation = """.,?!:;(){}[]"""
    corpus = [z.lower().replace('\n','') for z in corpus]
    corpus = [z.replace('<br />', ' ') for z in corpus]

    #treat punctuation as individual words
    for c in punctuation:
        corpus = [z.replace(c, ' %s '%c) for z in corpus]
    corpus = [z.split() for z in corpus]
    return corpus

### 1.2 Similarity measure functions

In [14]:
def cos_sim(text1,text2):
    tfvectorizer = TfidfVectorizer(tokenizer=tokenizeText)
    arrays = tfvectorizer.fit_transform([text1,text2]).A
    num = (arrays[0]*arrays[1]).sum()
    denom1 = np.sqrt((arrays[0]**2).sum())
    denom2 = np.sqrt((arrays[1]**2).sum())
    return num/(denom1*denom2)

def similarity(string1,string2):
    w1 = tokenizeText(cleanText(string1))
    w2 = tokenizeText(cleanText(string2))
    score = 0
    for w in w1:
        if w in w2:
            score += 1
        else:
            continue
    for w in w2:
        if w in w1:
            score += 1
        else:
            continue
    return score/(len(w1)+len(w2))

def lDistance(firstString, secondString):
    "Function to find the Levenshtein distance between two words/sentences - gotten from http://rosettacode.org/wiki/Levenshtein_distance#Python"
    if len(firstString) > len(secondString):
        firstString, secondString = secondString, firstString
    distances = range(len(firstString) + 1)
    for index2, char2 in enumerate(secondString):
        newDistances = [index2 + 1]
        for index1, char1 in enumerate(firstString):
            if char1 == char2:
                newDistances.append(distances[index1])
            else:
                newDistances.append(1 + min((distances[index1], distances[index1+1], newDistances[-1])))
        distances = newDistances
    return 1/(distances[-1]+1)


### 1.3 Graph building functions

In [32]:
def buildGraph(nodes,weight):
    "nodes - list of hashables that represents the nodes of the graph"
    gr = nx.Graph() #initialize an undirected graph
    gr.add_nodes_from(nodes)
    nodePairs = list(itertools.combinations(nodes, 2))

    #add edges to the graph (weighted by Levenshtein distance)
    for pair in nodePairs:
        firstString = pair[0]
        secondString = pair[1]
        if weight == 'cosine':
            try:
                edge_weight = cos_sim(firstString, secondString)
            except:
                edge_weight = 0
        if weight == 'similarity':
            try:
                edge_weight = similarity(firstString, secondString)
            except:
                edge_weight = 0
        if weight == 'ldistance':
            try:
                edge_weight = lDistance(firstString, secondString)
            except:
                edge_weight = 0
            gr.add_edge(firstString, secondString, weight=edge_weight)
    
    return gr


### 1.4 Summarizer functions

In [16]:

def draw_graph(text,weight='cosine'):
    sent_detector = nltk.data.load('tokenizers/punkt/english.pickle')
    sentenceTokens = sent_detector.tokenize(text.strip())
    graph = buildGraph(sentenceTokens,weight=weight)
    nx.draw_networkx(graph)

def textrank_summarizer(text,weight='cosine',num_sen = 5, pos_score = False):
    sent_detector = nltk.data.load('tokenizers/punkt/english.pickle')
    sentenceTokens = sent_detector.tokenize(text.strip())
    graph = buildGraph(sentenceTokens,weight=weight)

    calculated_page_rank = nx.pagerank(graph, weight='weight')
    
    #Create position scorelist
    #Scores are caluclated such that page rank score is increased by 10% if its the first or last sentence
    #Sentences in the middle of the document are not given increased scores
    pos = np.array(list(range(len(sentenceTokens))))
#     score_array = np.array(1+(abs((pos+0.5) - len(pos)/2)/max((pos+0.5) - len(pos)/2)/10))
#     score_dict = {}
#     for i in range(len(sentenceTokens)):
#         score_dict[sentenceTokens[i]] = score_array[i] 

#     #Adjusts page rank score for position
#     score_adj_page_rank = {k : v * score_dict[k] for k, v in calculated_page_rank.items() if k in score_dict}
    
    #most important sentences in ascending order of importance
#     if pos_score == True:
#         sentences = sorted(score_adj_page_rank, key=score_adj_page_rank.get,reverse=True)
#     else:
#         sentences = sorted(calculated_page_rank, key=calculated_page_rank.get,reverse=True)
  
    #return a word summary
    pos_dict = {}
    for i in range(len(sentenceTokens)):
        pos_dict[sentenceTokens[i]] = pos[i] 

    combined = {k : [v, pos_dict[k]] for k, v in calculated_page_rank.items() if k in pos_dict}

    listlist = []
    for k, v in combined.items():
        listlist.append((k,v[0],v[1]))

    listlist.sort(key=lambda x: x[1],reverse=True)

    summarysentences = listlist[0:num_sen]

    summarysentences.sort(key=lambda x: x[2],reverse=False)

    summary = ""
    for n in range(num_sen):
        summary += ' ' + summarysentences[n][0]
        summary = " ".join(summary.replace(u"\xa0", u" ").strip().split())

    return summary



def doc2vec_summarizer(text,num_sen=5):

    sent_detector = nltk.data.load('tokenizers/punkt/english.pickle')
    sentenceTokens = sent_detector.tokenize(text.strip())

    LabeledSentence = gensim.models.doc2vec.LabeledSentence

    sentences = doc2vec_tokenizeText(sentenceTokens)

    def labelizeReviews(reviews, label_type):
        labelized = []
        for i,v in enumerate(reviews):
            label = '%s_%s'%(label_type,i)
            labelized.append(LabeledSentence(v, [label]))
        return labelized

    sentences = labelizeReviews(sentences,'train')

    model = Doc2Vec(min_count=1, window=10, size=500, sample=1e-4, workers=8)
    model.build_vocab(sentences)
    for epoch in range(1000):
        model.train(sentences)


    docvec = []
    for i in range(len(model.docvecs)):
        docvec.append(model.docvecs[i])

    kmeans = KMeans(n_clusters=1)

    kmeans.fit(docvec)

    distance = pairwise_distances(kmeans.cluster_centers_, docvec)

    pos = np.array(list(range(len(sentenceTokens))))

    listlist = [list(x) for x in zip(sentenceTokens,distance.tolist()[0],pos)]

    listlist.sort(key=lambda x: x[1],reverse=False)

    ## Sort by sentence order
    summarysentences = listlist[0:num_sen]

    summarysentences.sort(key=lambda x: x[2],reverse=False)

    summary = ""
    for n in range(num_sen):
        summary += ' ' + summarysentences[n][0]
        summary = " ".join(summary.replace(u"\xa0", u" ").strip().split())

    return summary
        
def lsa_summarizer(text,num_sen=5):
    sent_detector = nltk.data.load('tokenizers/punkt/english.pickle')
    sentenceTokens = sent_detector.tokenize(text.strip())

    tfvectorizer = TfidfVectorizer(tokenizer=tokenizeText)
    sparse = tfvectorizer.fit_transform(sentenceTokens).A
    lsa = TruncatedSVD(n_components=1)
    concept = lsa.fit_transform(sparse)

    pos = np.array(list(range(len(sentenceTokens))))    
    
    listlist = [list(x) for x in zip(sentenceTokens,concept,pos)]

    listlist.sort(key=lambda x: x[1],reverse=True)

    summarysentences = listlist[0:num_sen]

    summarysentences.sort(key=lambda x: x[2],reverse=False)

    summary = ""
    for n in range(num_sen):
        summary += ' ' + summarysentences[n][0]
        summary = " ".join(summary.replace(u"\xa0", u" ").strip().split())

    return summary

## 2.0 Sample runs

Sample Text

In [37]:
text3 = 'A reader who is not familiar with the subject might reckon that it is the study of the economy. After all, chemistry is the study of chemicals, biology is the study of living things, and sociology is the study of society, so economics must be the study of the economy. But according to some of the most popular economics books of our time, economics is much more than that. According to them, economics is about the Ultimate Question – of ‘Life, the Universe and Everything’ – as in The Hitchhiker’s Guide to the Galaxy, the cult comedy science fiction by Douglas Adams, which was made into a movie in 2005, with Martin ‘The Hobbit’ Freeman in the leading role. According to Tim Harford, the Financial Times journalist and the author of the successful book The Undercover Economist, economics is about Life – he has named his second book The Logic of Life. No economist has yet claimed that economics can explain the Universe. The Universe remains, for now, the turf of physicists, whom most economists have for centuries been looking up to as their role models, in their desire to make their subject a true science.*1 But some economists have come close – they have claimed that economics is about ‘the world’. For example, the subtitle of the second volume in Robert Frank’s popular Economic Naturalist series is How Economics Helps You Make Sense of Your World. Then there is the Everything bit. The subtitle of Logic of Life is Uncovering the New Economics of Everything. According to its subtitle, Freakonomics by Steven Levitt and Stephen Dubner – probably the best-known economics book of our time – is an exploration of the Hidden Side of Everything. Robert Frank agrees, even though he is far more modest in his claim. In the subtitle of his first Economic Naturalist book, he only said Why Economics Explains Almost Everything (emphasis added). So, there we go. Economics is (almost) about Life, the Universe and Everything.*2 When you think about it, this is some claim coming from a subject that has spectacularly failed in what most non-economists think is its main job – that is, explaining the economy. In the run-up to the 2008 financial crisis, the majority of the economics profession was preaching to the world that markets are rarely wrong and that modern economics has found ways to iron out those few wrinkles that markets may have; Robert Lucas, the 1995 winner of the Nobel Prize in Economics,*3 had declared in 2003 that the ‘problem of depression prevention has been solved’.1 So most economists were caught completely by surprise by the 2008 global financial crisis.*4 Not only that, they have not been able to come up with decent solutions to the ongoing aftermaths of that crisis. Given all this, economics seems to suffer from a serious case of megalomania – how can a subject that cannot even manage to explain its own area very well claim to explain (almost) everything? These titles are hyped up. But the point is that they are hyped up in a particular way. The hypes could have been something along the line of ‘how economics explains everything about the economy’, but they are instead along the lines of ‘how economics can explain not just the economy but everything else as well’. The hypes are of this particular variety because of the way in which the currently dominant school of economics, that is, the so-called Neoclassical school, defines economics. The standard Neoclassical definition of economics, the variants of which are still used, is given in the 1932 book by Lionel Robbins, An Essay on the Nature and Significance of Economic Science. In the book, Robbins defined economics as ‘the science which studies human behaviour as a relationship between ends and scarce means which have alternative uses’. In this view, economics is defined by its theoretical approach, rather than its subject matter. Economics is a study of rational choice, that is, choice made on the basis of deliberate, systematic calculation of the maximum extent to which the ends can be met by using the inevitably scarce means. The subject matter of the calculation can be anything – marriage, having children, crime or drug addiction, as Gary Becker, the famous Chicago economist and the winner of 1992 Nobel Prize in Economics, has written about – and not just ‘economic’ issues, as non-economists would define them, such as jobs, money or international trade. When Becker titled his 1976 book The Economic Approach to Human Behaviour, he was really declaring without the hype that economics is about everything. This trend of applying the so-called economic approach to everything, called by its critics ‘economics imperialism’, has reached its apex recently in books like Freakonomics. Little of Freakonomics is actually about economic issues as most people would define them. It talks about Japanese sumo wrestlers, American schoolteachers, Chicago drug gangs, participants in the TV quiz show The Weakest Link, real estate agents and the Ku Klux Klan. Most people would think (and the authors also admit) that none of these people, except real estate agents and drug gangs, have anything to do with economics. But, from the point of view of most economists today, how Japanese sumo wrestlers collude to help each other out or how American schoolteachers fabricate their pupils’ marks to get better job assessments are as legitimate subjects of economics as whether Greece should stay in the Eurozone, how Samsung and Apple fight it out in the smartphone market or how we can reduce youth unemployment in Spain (which is over 55 per cent at the time of writing). To those economists, those ‘economic’ issues do not have privileged status in economics, they are just some of many things (oh, I forgot, some of everything) that economics can explain, because they define their subject in terms of its theoretical approach, rather than its subject matter. The most intuitive answer to most readers may be that the economy is anything to do with money – not having it, earning it, spending it, running out of it, saving it, borrowing it and repaying it. This is not quite right, but it is a good starting point for thinking about the economy – and economics. Now, when we talk of the economy being about money, we are not really talking about physical money. Physical money – be it a banknote, a gold coin or the huge, virtually immovable stones that were used as money in some Pacific islands – is only a symbol. Money is a symbol of what others in your society owe you, or your claim on particular amounts of the society’s resources.2 How money and other financial claims – such as company shares, derivatives and many complex financial products, which I will explain in later chapters – are created, sold and bought is one huge area of economics, called financial economics. These days, given the dominance of the financial industry in many countries, a lot of people equate economics with financial economics, but it is actually only a small part of economics. Your money – or the claims you have over resources – may be generated in a number of different ways. And a lot of economics is (or should be) about those. The most common way to get money – unless you have been born into it – is to have a job (including being your own boss) and earn money from it. So, a lot of economics is about jobs. We can reflect on jobs from different perspectives. Jobs can be understood from the point of view of the individual worker. Whether you get a job and how much you are paid for it depends on the skills you have and how many demands there are for them. You may get very high wages because you have very rare skills, like Cristiano Ronaldo, the football player. You may lose your job (or become unemployed) because someone invents a machine that can do what you do 100 times faster – as happened to Mr Bucket, Charlie’s father, a toothpaste cap-screwer, in the 2005 movie version of Roald Dahl’s Charlie and the Chocolate Factory.*5 Or you have to accept lower wages or worse working conditions because your company is losing money thanks to cheaper imports from, say, China. And so on. So, in order to understand jobs even at the individual level, we need to know about skills, technological innovation and international trade. Wages and working conditions are also deeply affected by ‘political’ decisions to change the very scope and the characteristics of the labour market (I have put ‘political’ in quotation marks, as in the end the boundary between economics and politics is blurry, but that is a topic for later – see Chapter 11). The accession of the Eastern European countries to the European Union has had huge impacts on the wages and behaviours of Western European workers, by suddenly expanding the supply of workers in their labour markets. The restriction on child labour in the late nineteenth century and early twentieth centuries had the opposite effect of shrinking the boundary of the labour market – suddenly a large proportion of the potential employees were shut out of the labour market. Regulations on working hours, working conditions and minimum wages are examples of less dramatic ‘political’ decisions that affect our jobs. In addition to holding down a job, you can get money through transfers – that is, by simply being given it. This can be either in the form of cash or ‘in kind’, that is, direct provision of particular goods (e.g., food) or services (e.g., primary education). Whether in cash or in kind, these transfers can be made in a number of different ways. There are transfers made by ‘people you know’. Examples include parental support for children, people taking care of elderly family members, gifts from local community members, say, for your daughter’s wedding. Then there is charitable giving, that is, transfer voluntarily made to strangers. People – sometimes individually sometimes collectively (e.g., through corporations or voluntary associations) – give to charities that help others. In terms of its quantity, charitable giving is overshadowed in many multiples by transfers made through governments, which tax some people to subsidize others. So a lot of economics is naturally about these things – or the areas of economics known as public economics. Even in very poor countries, there are some government schemes to give cash or goods in kind (e.g., free grains) to those who are in the worst positions (e.g., the aged, the disabled, the starving). But the richer societies, especially those in Europe, have transfer schemes that are much more comprehensive in scope and generous in amounts. This is known as the welfare state and is based on progressive taxation (those who earn more paying proportionally larger shares of their incomes in taxes) and universal benefits (where everyone, not just the poorest or the disabled, is entitled to a minimum income and to basic services, such as health care and education). Once you gain access to resources, whether through jobs or transfers, you consume them. As physical beings, we need to consume some minimum amount of food, clothes, energy, housing, and other goods to fulfil our basic needs. And then we consume other goods for ‘higher’ mental wants – books, musical instruments, exercise equipment, TV, computers and so on. We also buy and consume services – a bus ride, a haircut, a dinner at a restaurant or even a holiday abroad.3 So a lot of economics is devoted to the study of consumption – how people allocate money between different types of goods and services, how they make choices between competing varieties of the same product, how they are manipulated and/or informed by advertisements, how companies spend money to build their ‘brand images’ and so on. In order to be consumed, these goods and services have to be produced in the first place – goods in farms and factories and services in offices and shops. This is the realm of production – an area of economics that has been rather neglected since the Neoclassical school, which puts emphasis on exchange and consumption, became dominant in the 1960s. In standard economics textbooks, production appears as a ‘black box’, in which somehow quantities of labour (work by humans) and capital (machines and tools) are combined to produce the goods and services. There is little recognition that production is a lot more than combining some abstract quanta called labour and capital and involves getting many ‘nitty-gritty’ things right. And these are things that most readers may not normally have associated with economics, despite their crucial importance for the economy: how the factory is physically organized, how to control the workers or deal with trade unions, how to systematically improve the technologies used through research. Most economists are very happy to leave the study of these things to ‘other people’ – engineers and business managers. But, when you think about it, production is the ultimate foundation of any economy. Indeed, the changes in the sphere of production usually have been the most powerful sources of social change. Our modern world has been made by the series of changes in technologies and institutions relating to the sphere of production that have been made since the Industrial Revolution. The economics profession, and the rest of us whose views of the economy are informed by it, need to pay far more attention to production than currently. Defining economics in this way makes this book unlike most other economics books in one fundamental way. As they define economics in terms of its methodology, most economics books assume that there is only one right way of ‘doing economics’ – that is, the Neoclassical approach. The worst examples won’t even tell you that there are other schools of economics than the Neoclassical one. By defining economics in terms of the subject matter, this book highlights the fact that there are many different ways of doing economics, each with its emphases, blind spots, strengths and weaknesses. After all, what we want from economics is the best possible explanation of various economic phenomena rather than a constant ‘proof’ that a particular economic theory can explain not just the economy but everything.'

In [8]:
text2 = '\n\nBRUSSELS – The “Brexiteers” – those who want Britain to leave the European Union – argue that their goal would be virtually cost-free and have no effect on the United Kingdom’s global trade. They are wrong. On June 23, when voters in Britain cast their ballots in the referendum on the question, they need to consider what is actually involved in leaving the EU – and how the free-trade benefits they now enjoy (and take for granted) could be maintained after Brexit. Start with the basics. Leaving the EU means that the UK would exit the EU’s Customs Union, which is the basis for cross-border free trade among the EU’s 28 members (and establishes a common external tariff vis-à-vis third countries). It also means exit from the Single Market – the basis for the free movement of goods and services among EU members. By definition, non-members of the EU cannot belong to the Single Market.\n\nWill Brexit Destroy Britain and Europe?\n\nPhilippe Legrain weighs the views of Joschka Fischer, Richard Haass, Joseph Nye, and others on what Britain’s withdrawal from the EU would mean for both sides.\n\nSo what would happen next? During the two-year period before Britain’s withdrawal takes final effect, there would be UK-EU negotiations on many points – sovereignty, the legal order, immigration, finances, and economic matters. The assumption is that a crucial goal for Britain would be to negotiate a trading relationship as close as possible to the free-trade relationships that exist today.That is easier said than done. The best result would be if all players agreed to maintain the free trade already achieved, with the UK setting a new external tariff on a duty-free basis, applicable to all comers. This is what happened in the 1970s after Britain and Denmark left the European Free Trade Association: Free-trade agreements were negotiated among EFTA members and between them and the EU (or the EEC as it was then known). But Brexiteers should realize that there is no guarantee that this would happen again – and, in any case, there would be complications. While this solution would be good for the 45% of British exports that are sold in EU markets, it would reduce protection for British industries to zero. Under the rules of the World Trade Organization, the same import duties must be applied to all WTO participants – which means that if Britain’s imports from the EU are duty-free, its imports from the rest of the world must be, too. The alternative would be for UK exporters to accept the EU’s common external tariff, and for the UK to create its own import tariff, applied to all imports, including from the EU. Because the common tariff is at a relatively low level on industrial and fishery products, this might not be an insuperable barrier for UK exports, and it would allow some flexibility in protecting UK companies from imports. The potential pitfall is that any British tariff increase above the EU level would expose the UK to claims for compensation from third countries in the WTO. The bigger question that the Brexiteers need to answer is how to secure a high level of access to the EU’s internal market. This is vitally important for Britain’s service industries, particularly for the City of London’s exports of financial services. There is only one precedent for non-EU members being able to negotiate access to the internal market equivalent to that enjoyed by EU members. This is the European Economic Area agreement concluded with the EU by Norway, Iceland, and Liechtenstein in 1992.In the view of many observers, including me, access to the Single Market through the EEA is no longer available. But what if we’re wrong? The point is that such a deal would go against all the instincts (and rhetoric) of the Brexiteers, because it would mean accepting the EU’s “four freedoms”: not just the free movement of goods, services, and capital – but of people, too. That would be hard to square with the Brexit objective of “controlling our borders.” The Brexiteers would also blanch at Britain’s obligation under an EEA-type agreement to continue contributing to the EU budget.Of course, there could be specific arrangements for particular sectors. But it seems unlikely that such arrangements would be possible in financial services and the major professional services (including doctors, architects, and lawyers), which are important for Britain’s competitors in Europe. Indeed, it is possible that the EU would adopt a tough mercantilist stance: If you want privileged access, you should stay in the club. The final consequence of Brexit is that the UK would lose its free-trade arrangements with third countries under the many trade agreements that the EU has signed since 2000. Replacing these agreements with bilateral deals would take time. There is no guarantee that the EU would agree to an interim continuation of free trade, and it seems certain that UK exports would face higher tariffs than its former EU partners in those third countries (placing British exporters at a competitive disadvantage).\n\nIt’s a big stretch for the Brexiteers to ask us to believe that the world’s most prominent leaders, European or not, have all got the impact of Brexit wrong. In the trade debate, we have had the International Monetary Fund, US President Barack Obama, and the OECD – quite apart from the UK Treasury – telling Britons that a vote to leave would be bad for the economy. It is not much of a reply to argue that the OECD is “in the pay of the EU” or that Obama is anti-British because he had a Kenyan father in British colonial days.Economic forecasting is an uncertain science. But when almost all projections point in the same direction – that Brexit would be hugely damaging to the UK – it is time to decide what is credible and what is not.\n\n'

In [43]:
textrank_summarizer(text3,weight='cosine',num_sen = 20)

'After all, chemistry is the study of chemicals, biology is the study of living things, and sociology is the study of society, so economics must be the study of the economy. Robert Frank agrees, even though he is far more modest in his claim. In the subtitle of his first Economic Naturalist book, he only said Why Economics Explains Almost Everything (emphasis added). So, there we go. The hypes could have been something along the line of ‘how economics explains everything about the economy’, but they are instead along the lines of ‘how economics can explain not just the economy but everything else as well’. In this view, economics is defined by its theoretical approach, rather than its subject matter. But, from the point of view of most economists today, how Japanese sumo wrestlers collude to help each other out or how American schoolteachers fabricate their pupils’ marks to get better job assessments are as legitimate subjects of economics as whether Greece should stay in the Eurozone,

In [44]:
lsa_summarizer(text3,num_sen=25)

'But according to some of the most popular economics books of our time, economics is much more than that. According to Tim Harford, the Financial Times journalist and the author of the successful book The Undercover Economist, economics is about Life – he has named his second book The Logic of Life. No economist has yet claimed that economics can explain the Universe. *1 But some economists have come close – they have claimed that economics is about ‘the world’. According to its subtitle, Freakonomics by Steven Levitt and Stephen Dubner – probably the best-known economics book of our time – is an exploration of the Hidden Side of Everything. In the subtitle of his first Economic Naturalist book, he only said Why Economics Explains Almost Everything (emphasis added). Economics is (almost) about Life, the Universe and Everything. *2 When you think about it, this is some claim coming from a subject that has spectacularly failed in what most non-economists think is its main job – that is, 

In [49]:
doc2vec_summarizer(text3,num_sen=15)

'According to them, economics is about the Ultimate Question – of ‘Life, the Universe and Everything’ – as in The Hitchhiker’s Guide to the Galaxy, the cult comedy science fiction by Douglas Adams, which was made into a movie in 2005, with Martin ‘The Hobbit’ Freeman in the leading role. In the run-up to the 2008 financial crisis, the majority of the economics profession was preaching to the world that markets are rarely wrong and that modern economics has found ways to iron out those few wrinkles that markets may have; Robert Lucas, the 1995 winner of the Nobel Prize in Economics,*3 had declared in 2003 that the ‘problem of depression prevention has been solved’.1 So most economists were caught completely by surprise by the 2008 global financial crisis. The subject matter of the calculation can be anything – marriage, having children, crime or drug addiction, as Gary Becker, the famous Chicago economist and the winner of 1992 Nobel Prize in Economics, has written about – and not just 

In [25]:
from gensim import summarization

In [48]:
summarization.summarize(text3,ratio=0.1).replace('\n',' ')

'Economics is (almost) about Life, the Universe and Everything.*2 When you think about it, this is some claim coming from a subject that has spectacularly failed in what most non-economists think is its main job – that is, explaining the economy. In the book, Robbins defined economics as ‘the science which studies human behaviour as a relationship between ends and scarce means which have alternative uses’. The subject matter of the calculation can be anything – marriage, having children, crime or drug addiction, as Gary Becker, the famous Chicago economist and the winner of 1992 Nobel Prize in Economics, has written about – and not just ‘economic’ issues, as non-economists would define them, such as jobs, money or international trade. But, from the point of view of most economists today, how Japanese sumo wrestlers collude to help each other out or how American schoolteachers fabricate their pupils’ marks to get better job assessments are as legitimate subjects of economics as whether 

In [19]:
## Still needs help in addressing unique characters. Latex has trouble interpreting certain special characters

In [24]:
from pylatex import Document, Section, Subsection, Tabular, Math, TikZ, Axis, Figure, Package, Command
from pylatex.utils import italic, NoEscape
import os

def print_article(article_num):
    doc = Document()
    doc.generate_tex()
    summary = lsa_summarizer(ps_df.loc[article_num,'articles'])
    
    doc.preamble.append(Command('title',ps_df.loc[article_num,'titles']))
    doc.preamble.append(Command('author', ps_df.loc[article_num,'author']))
    doc.preamble.append(Command('date',ps_df.loc[article_num,'date_published'].title()))
    doc.append(NoEscape(r'\maketitle'))

    with doc.create(Section('Summary')):
        doc.append(summary)

    with doc.create(Section('Article')):
        doc.append(ps_df.loc[article_num,'articles'].replace('\n','\n\n'))

    doc.generate_pdf(ps_df.loc[article_num,'titles'],compiler='pdflatex')

## 3.0 Generate PDFs of Project Syndicate Articles (or whatever articles)

In [25]:
ps_df = pd.read_pickle('ps_df.pkl')

In [26]:
ps_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 315 entries, 0 to 314
Data columns (total 5 columns):
titles            315 non-null object
author            315 non-null object
date_published    315 non-null object
articles          315 non-null object
topic             315 non-null object
dtypes: object(5)
memory usage: 12.4+ KB


In [27]:
ps_df.sample()

Unnamed: 0,titles,author,date_published,articles,topic
162,Republicans Ride the Trump Tiger,Theda Skocpol,"MAY 30, 2016","CAMBRIDGE – During a typical week in late May,...",Politics


In [28]:
article_num = 300
print_article(article_num)
from IPython.display import IFrame
IFrame(ps_df.loc[article_num,'titles']+'.pdf', width=1000, height=500)

In [50]:
pweh = 'According to them, economics is about the Ultimate Question – of ‘Life, the Universe and Everything’ – as in The Hitchhiker’s Guide to the Galaxy, the cult comedy science fiction by Douglas Adams, which was made into a movie in 2005, with Martin ‘The Hobbit’ Freeman in the leading role. *2 When you think about it, this is some claim coming from a subject that has spectacularly failed in what most non-economists think is its main job – that is, explaining the economy. In the run-up to the 2008 financial crisis, the majority of the economics profession was preaching to the world that markets are rarely wrong and that modern economics has found ways to iron out those few wrinkles that markets may have; Robert Lucas, the 1995 winner of the Nobel Prize in Economics,*3 had declared in 2003 that the ‘problem of depression prevention has been solved’.1 So most economists were caught completely by surprise by the 2008 global financial crisis. The subject matter of the calculation can be anything – marriage, having children, crime or drug addiction, as Gary Becker, the famous Chicago economist and the winner of 1992 Nobel Prize in Economics, has written about – and not just ‘economic’ issues, as non-economists would define them, such as jobs, money or international trade. But, from the point of view of most economists today, how Japanese sumo wrestlers collude to help each other out or how American schoolteachers fabricate their pupils’ marks to get better job assessments are as legitimate subjects of economics as whether Greece should stay in the Eurozone, how Samsung and Apple fight it out in the smartphone market or how we can reduce youth unemployment in Spain (which is over 55 per cent at the time of writing). Money is a symbol of what others in your society owe you, or your claim on particular amounts of the society’s resources.2 How money and other financial claims – such as company shares, derivatives and many complex financial products, which I will explain in later chapters – are created, sold and bought is one huge area of economics, called financial economics. You may get very high wages because you have very rare skills, like Cristiano Ronaldo, the football player. You may lose your job (or become unemployed) because someone invents a machine that can do what you do 100 times faster – as happened to Mr Bucket, Charlie’s father, a toothpaste cap-screwer, in the 2005 movie version of Roald Dahl’s Charlie and the Chocolate Factory. *5 Or you have to accept lower wages or worse working conditions because your company is losing money thanks to cheaper imports from, say, China. Wages and working conditions are also deeply affected by ‘political’ decisions to change the very scope and the characteristics of the labour market (I have put ‘political’ in quotation marks, as in the end the boundary between economics and politics is blurry, but that is a topic for later – see Chapter 11). The restriction on child labour in the late nineteenth century and early twentieth centuries had the opposite effect of shrinking the boundary of the labour market – suddenly a large proportion of the potential employees were shut out of the labour market. This can be either in the form of cash or ‘in kind’, that is, direct provision of particular goods (e.g., food) or services (e.g., primary education). This is known as the welfare state and is based on progressive taxation (those who earn more paying proportionally larger shares of their incomes in taxes) and universal benefits (where everyone, not just the poorest or the disabled, is entitled to a minimum income and to basic services, such as health care and education). We also buy and consume services – a bus ride, a haircut, a dinner at a restaurant or even a holiday abroad.3 So a lot of economics is devoted to the study of consumption – how people allocate money between different types of goods and services, how they make choices between competing varieties of the same product, how they are manipulated and/or informed by advertisements, how companies spend money to build their ‘brand images’ and so on. And these are things that most readers may not normally have associated with economics, despite their crucial importance for the economy: how the factory is physically organized, how to control the workers or deal with trade unions, how to systematically improve the technologies used through research.'

In [55]:
pweh.split('.')

['According to them, economics is about the Ultimate Question – of ‘Life, the Universe and Everything’ – as in The Hitchhiker’s Guide to the Galaxy, the cult comedy science fiction by Douglas Adams, which was made into a movie in 2005, with Martin ‘The Hobbit’ Freeman in the leading role',
 ' *2 When you think about it, this is some claim coming from a subject that has spectacularly failed in what most non-economists think is its main job – that is, explaining the economy',
 ' In the run-up to the 2008 financial crisis, the majority of the economics profession was preaching to the world that markets are rarely wrong and that modern economics has found ways to iron out those few wrinkles that markets may have; Robert Lucas, the 1995 winner of the Nobel Prize in Economics,*3 had declared in 2003 that the ‘problem of depression prevention has been solved’',
 '1 So most economists were caught completely by surprise by the 2008 global financial crisis',
 ' The subject matter of the calculat