## IRENE UMOH 17100310866

## Website 1 Scraping and Summarization

In [3]:
import requests
from bs4 import BeautifulSoup
from gensim.summarization import summarize

In [4]:
url = 'https://theconversation.com/waffles-and-mochi-why-childrens-food-shows-need-to-focus-on-healthy-eating-157360'

In [5]:
page = requests.get(url).text

In [6]:
soup = BeautifulSoup(page)

In [9]:
head = soup.find('h1').get_text()

In [11]:
p_tags = soup.find_all('p')

In [19]:
texts = [tags.get_text().strip() for tags in p_tags]

In [20]:
print(texts)

['Programme Lead Nutrition and Health, Edge Hill University', 'Hazel Flight does not work for, consult, own shares in or receive funding from any company or organisation that would benefit from this article, and has disclosed no relevant affiliations beyond their academic appointment.', '', 'Edge Hill University provides funding as a member of The Conversation UK.', 'The Conversation is funded by the National Research Foundation, eight universities, including the Cape Peninsula University of Technology, Rhodes University, Stellenbosch University and the Universities of Cape Town, Johannesburg, Kwa-Zulu Natal, Pretoria, and South Africa. It is hosted by the Universities of the Witwatersrand and Western Cape, the African Population and Health Research Centre and the Nigerian Academy of Science. The Bill & Melinda Gates Foundation is a Strategic Partner. more', 'Michelle Obama’s new TV show, Waffles and Mochi, aimed at young children, is a fabulous idea. The idea is to show children where

In [21]:
sentence_list = [sentence for sentence in texts if not '\n' in sentence]

In [22]:
#for sentence in the original document, if it does not contain a new line, return the sentence

In [23]:
sentence_list = [sentence for sentence in sentence_list if '.' in sentence]

In [24]:
sentence_list

['Hazel Flight does not work for, consult, own shares in or receive funding from any company or organisation that would benefit from this article, and has disclosed no relevant affiliations beyond their academic appointment.',
 'Edge Hill University provides funding as a member of The Conversation UK.',
 'The Conversation is funded by the National Research Foundation, eight universities, including the Cape Peninsula University of Technology, Rhodes University, Stellenbosch University and the Universities of Cape Town, Johannesburg, Kwa-Zulu Natal, Pretoria, and South Africa. It is hosted by the Universities of the Witwatersrand and Western Cape, the African Population and Health Research Centre and the Nigerian Academy of Science. The Bill & Melinda Gates Foundation is a Strategic Partner. more',
 'Michelle Obama’s new TV show, Waffles and Mochi, aimed at young children, is a fabulous idea. The idea is to show children where food comes from and some ways of cooking it from scratch – in

In [29]:
article = ''.join(sentence_list)

In [30]:
summary = summarize(article, ratio=0.15)

print('Length of original article: {}'.format(len(article)))
print('Length of summary: {}\n'.format(len(summary)))  
print('Headline: {} \n'.format(head))
print('Article Summary: \n{}'.format(summary))

Length of original article: 5725
Length of summary: 969

Headline: 

            Waffles and Mochi: why children’s food shows need to focus on healthy eating
          
 

Article Summary: 
The idea is to show children where food comes from and some ways of cooking it from scratch – instead of just buying pre-made meals from the supermarket.How a child eats can have a serious impact on their overall health and wellbeing throughout life.
Leading by example, like Mochi and Waffles, is a great way to do this.Teaching children about where their food is from is also brilliant.
A study carried out in the Netherlands found that children who watched television programmes with healthy foods were more likely to choose healthy options for snacks.Waffles and Mochi is an excellent platform, widely available across the world, which could educate children about what foods are good and what their health benefits are, alongside where they come from.
Children are still eating too many calories, includin

In [31]:
import spacy
nlp = spacy.load('en_core_web_sm')

In [32]:
doc = nlp(summary)

## Tokenization

In [33]:
for token in doc:
    print(token.text, '--->', spacy.explain(token.text))

The ---> None
idea ---> None
is ---> None
to ---> None
show ---> None
children ---> None
where ---> None
food ---> None
comes ---> None
from ---> None
and ---> None
some ---> None
ways ---> None
of ---> None
cooking ---> None
it ---> None
from ---> None
scratch ---> None
– ---> None
instead ---> None
of ---> None
just ---> None
buying ---> None
pre ---> None
- ---> None
made ---> None
meals ---> None
from ---> None
the ---> None
supermarket ---> None
. ---> punctuation mark, sentence closer
How ---> None
a ---> None
child ---> None
eats ---> None
can ---> None
have ---> None
a ---> None
serious ---> None
impact ---> None
on ---> None
their ---> None
overall ---> None
health ---> None
and ---> None
wellbeing ---> None
throughout ---> None
life ---> None
. ---> punctuation mark, sentence closer

 ---> None
Leading ---> None
by ---> None
example ---> None
, ---> punctuation mark, comma
like ---> None
Mochi ---> None
and ---> None
Waffles ---> None
, ---> punctuation mark, comma
is ---> No

## Dependency Parsing

In [35]:
for token in doc:
    print(token.text, '--->', token.dep_, '--->', spacy.explain(token.dep))

The ---> det ---> None
idea ---> nsubj ---> None
is ---> ROOT ---> None
to ---> aux ---> None
show ---> xcomp ---> None
children ---> dobj ---> None
where ---> advmod ---> None
food ---> nsubj ---> None
comes ---> relcl ---> None
from ---> prep ---> None
and ---> cc ---> None
some ---> det ---> None
ways ---> conj ---> None
of ---> prep ---> None
cooking ---> pcomp ---> None
it ---> dobj ---> None
from ---> prep ---> None
scratch ---> pobj ---> None
– ---> punct ---> None
instead ---> advmod ---> None
of ---> prep ---> None
just ---> advmod ---> None
buying ---> pcomp ---> None
pre ---> dobj ---> None
- ---> amod ---> None
made ---> amod ---> None
meals ---> dobj ---> None
from ---> prep ---> None
the ---> det ---> None
supermarket ---> pobj ---> None
. ---> punct ---> None
How ---> advmod ---> None
a ---> det ---> None
child ---> nsubj ---> None
eats ---> csubj ---> None
can ---> aux ---> None
have ---> ROOT ---> None
a ---> det ---> None
serious ---> amod ---> None
impact ---> dobj -

## Lemmatization

In [36]:
for token in doc:
    print(token.text, '--->', token.lemma_, '--->', spacy.explain(token.lemma))

The ---> the ---> None
idea ---> idea ---> None
is ---> be ---> None
to ---> to ---> None
show ---> show ---> None
children ---> child ---> None
where ---> where ---> None
food ---> food ---> None
comes ---> come ---> None
from ---> from ---> None
and ---> and ---> None
some ---> some ---> None
ways ---> way ---> None
of ---> of ---> None
cooking ---> cook ---> None
it ---> it ---> None
from ---> from ---> None
scratch ---> scratch ---> None
– ---> – ---> None
instead ---> instead ---> None
of ---> of ---> None
just ---> just ---> None
buying ---> buy ---> None
pre ---> pre ---> None
- ---> - ---> None
made ---> make ---> None
meals ---> meal ---> None
from ---> from ---> None
the ---> the ---> None
supermarket ---> supermarket ---> None
. ---> . ---> None
How ---> how ---> None
a ---> a ---> None
child ---> child ---> None
eats ---> eat ---> None
can ---> can ---> None
have ---> have ---> None
a ---> a ---> None
serious ---> serious ---> None
impact ---> impact ---> None
on ---> on --

## Sentence Boundary Detection

In [37]:
list(doc.sents)

[The idea is to show children where food comes from and some ways of cooking it from scratch – instead of just buying pre-made meals from the supermarket.,
 How a child eats can have a serious impact on their overall health and wellbeing throughout life.,
 Leading by example, like Mochi and Waffles, is a great way to do this.,
 Teaching children about where their food is from is also brilliant.,
 ,
 A study carried out in the Netherlands found that children who watched television programmes with healthy foods were more likely to choose healthy options for snacks.,
 Waffles and Mochi is an excellent platform, widely available across the world, which could educate children about what foods are good and what their health benefits are, alongside where they come from.,
 ,
 Children are still eating too many calories, including sugar, and often the wrong foods and so parents need to be provided with the correct knowledge and information to support their child’s health and wellbeing.]

In [38]:
sentences = list(doc.sents)

In [39]:
for sentence in sentences:
    print(sentence)

The idea is to show children where food comes from and some ways of cooking it from scratch – instead of just buying pre-made meals from the supermarket.
How a child eats can have a serious impact on their overall health and wellbeing throughout life.

Leading by example, like Mochi and Waffles, is a great way to do this.
Teaching children about where their food is from is also brilliant.


A study carried out in the Netherlands found that children who watched television programmes with healthy foods were more likely to choose healthy options for snacks.
Waffles and Mochi is an excellent platform, widely available across the world, which could educate children about what foods are good and what their health benefits are, alongside where they come from.


Children are still eating too many calories, including sugar, and often the wrong foods and so parents need to be provided with the correct knowledge and information to support their child’s health and wellbeing.


## Named Entity

In [40]:
for ent in doc.ents:
    print(ent.text, '---->', ent.label, '---->', spacy.explain(ent.label))

Mochi ----> 384 ----> None
Netherlands ----> 384 ----> None
Mochi ----> 383 ----> None


## Visualization 

In [41]:
from spacy import displacy

In [42]:
displacy.render(doc, style='ent', jupyter=True)

In [43]:
displacy.render(doc, style='dep', jupyter=True)

## Putting it all together

In [44]:
lis = []

In [45]:
for token in doc:
    dic = {}
    dic['Token'] = token.text
    dic['POS'] = token.pos_
    dic['Tags'] = token.tag_
    dic['Dep'] = token.dep_
    dic['Explanation'] = spacy.explain(token.tag_)
    lis.append(dic)

In [46]:
import pandas as pd

In [47]:
data = pd.DataFrame(lis)

In [48]:
data

Unnamed: 0,Token,POS,Tags,Dep,Explanation
0,The,DET,DT,det,determiner
1,idea,NOUN,NN,nsubj,"noun, singular or mass"
2,is,AUX,VBZ,ROOT,"verb, 3rd person singular present"
3,to,PART,TO,aux,"infinitival ""to"""
4,show,VERB,VB,xcomp,"verb, base form"
...,...,...,...,...,...
177,’s,PART,POS,case,possessive ending
178,health,NOUN,NN,dobj,"noun, singular or mass"
179,and,CCONJ,CC,cc,"conjunction, coordinating"
180,wellbeing,NOUN,NN,conj,"noun, singular or mass"


#### You can reindex if you do not like the arrangement

In [49]:
l = []
for ent in doc.ents:
    d = {}
    d['entities'] = ent.text
    d['labels'] = ent.label_
    d['Explanation'] = spacy.explain(ent.label_)
    l.append(d)

In [50]:
entities = pd.DataFrame(l)

In [51]:
entities

Unnamed: 0,entities,labels,Explanation
0,Mochi,GPE,"Countries, cities, states"
1,Netherlands,GPE,"Countries, cities, states"
2,Mochi,ORG,"Companies, agencies, institutions, etc."


In [52]:
entities.to_csv('NLPTasks', index = False)

# Website 2 Scraping and Summarization

In [54]:
import requests
from bs4 import BeautifulSoup
from gensim.summarization import summarize

In [62]:
url2 = 'https://theconversation.com/selfish-or-selfless-human-nature-means-youre-both-155528'

In [63]:
page2 = requests.get(url2).text

In [64]:
soup2 = BeautifulSoup(page2)

In [65]:
head2 = soup2.find('h1').get_text()

In [66]:
p_tags2 = soup2.find_all('p')

In [67]:
texts2 = [tags.get_text().strip() for tags in p_tags2]

In [68]:
print(texts2)

['Postdoctoral Scholar in Social Cognitive Neuroscience, University of Chicago', 'Professor of Psychology, and Psychiatry and Behavioral Neuroscience, University of Chicago', 'The authors do not work for, consult, own shares in or receive funding from any company or organisation that would benefit from this article, and have disclosed no relevant affiliations beyond their academic appointment.', 'The Conversation is funded by the National Research Foundation, eight universities, including the Cape Peninsula University of Technology, Rhodes University, Stellenbosch University and the Universities of Cape Town, Johannesburg, Kwa-Zulu Natal, Pretoria, and South Africa. It is hosted by the Universities of the Witwatersrand and Western Cape, the African Population and Health Research Centre and the Nigerian Academy of Science. The Bill & Melinda Gates Foundation is a Strategic Partner. more', 'Looking out for number one has been important for survival for as long as there have been human be

In [69]:
sentence_list2 = [sentence for sentence in texts2 if not '\n' in sentence]

In [70]:
#for sentence in the original document, if it does not contain a new line, return the sentence

In [71]:
sentence_list2 = [sentence for sentence in sentence_list2 if '.' in sentence]

In [72]:
sentence_list2

['The authors do not work for, consult, own shares in or receive funding from any company or organisation that would benefit from this article, and have disclosed no relevant affiliations beyond their academic appointment.',
 'The Conversation is funded by the National Research Foundation, eight universities, including the Cape Peninsula University of Technology, Rhodes University, Stellenbosch University and the Universities of Cape Town, Johannesburg, Kwa-Zulu Natal, Pretoria, and South Africa. It is hosted by the Universities of the Witwatersrand and Western Cape, the African Population and Health Research Centre and the Nigerian Academy of Science. The Bill & Melinda Gates Foundation is a Strategic Partner. more',
 'Looking out for number one has been important for survival for as long as there have been human beings.',
 'But self-interest isn’t the only trait that helped people win at evolution. Groups of individuals who were predisposed to cooperate, care for each other and uphol

In [73]:
article2 = ''.join(sentence_list2)

In [74]:
summary2 = summarize(article2, ratio=0.15)

print('Length of original article: {}'.format(len(article2)))
print('Length of summary: {}\n'.format(len(summary2)))  
print('Headline: {} \n'.format(head2))
print('Article Summary: \n{}'.format(summary2))

Length of original article: 7882
Length of summary: 2269

Headline: 

            Selfish or selfless? Human nature means you’re both
          
 

Article Summary: 
Together they facilitate cooperation among unrelated individuals, something ubiquitous among people but uncommon in nature.A critical question is how people balance these two motivations when making decisions.We investigate this question in our work at the Social Cognitive Neuroscience Laboratory at the University of Chicago, combining behavioral economics tasks with neuroimaging methods that let us watch what’s happening in the brains of adults and children.
We’ve found evidence that people care about both themselves and others – but it’s the self that takes precedence.Children are sensitive to fairness from a very early age.For instance, if you give two siblings different numbers of cookies, the one who receives fewer will likely throw a fit.
This shift toward equity appears to be universal in humans and follows similar 

In [75]:
import spacy
nlp = spacy.load('en_core_web_sm')

In [76]:
doc = nlp(summary2)

## Tokenization

In [77]:
for token in doc:
    print(token.text, '--->', spacy.explain(token.text))

Together ---> None
they ---> None
facilitate ---> None
cooperation ---> None
among ---> None
unrelated ---> None
individuals ---> None
, ---> punctuation mark, comma
something ---> None
ubiquitous ---> None
among ---> None
people ---> None
but ---> None
uncommon ---> None
in ---> None
nature ---> None
. ---> punctuation mark, sentence closer
A ---> None
critical ---> None
question ---> None
is ---> None
how ---> None
people ---> None
balance ---> None
these ---> None
two ---> None
motivations ---> None
when ---> None
making ---> None
decisions ---> None
. ---> punctuation mark, sentence closer
We ---> None
investigate ---> None
this ---> None
question ---> None
in ---> None
our ---> None
work ---> None
at ---> None
the ---> None
Social ---> None
Cognitive ---> None
Neuroscience ---> None
Laboratory ---> None
at ---> None
the ---> None
University ---> None
of ---> None
Chicago ---> None
, ---> punctuation mark, comma
combining ---> None
behavioral ---> None
economics ---> None
tasks ---

## Dependency Parsing

In [78]:
for token in doc:
    print(token.text, '--->', token.dep_, '--->', spacy.explain(token.dep))

Together ---> advmod ---> None
they ---> nsubj ---> None
facilitate ---> ROOT ---> None
cooperation ---> dobj ---> None
among ---> prep ---> None
unrelated ---> amod ---> None
individuals ---> pobj ---> None
, ---> punct ---> None
something ---> dobj ---> None
ubiquitous ---> amod ---> None
among ---> prep ---> None
people ---> pobj ---> None
but ---> cc ---> None
uncommon ---> conj ---> None
in ---> prep ---> None
nature ---> pobj ---> None
. ---> punct ---> None
A ---> det ---> None
critical ---> amod ---> None
question ---> nsubj ---> None
is ---> ROOT ---> None
how ---> advmod ---> None
people ---> nsubj ---> None
balance ---> ccomp ---> None
these ---> det ---> None
two ---> nummod ---> None
motivations ---> dobj ---> None
when ---> advmod ---> None
making ---> advcl ---> None
decisions ---> dobj ---> None
. ---> punct ---> None
We ---> nsubj ---> None
investigate ---> ROOT ---> None
this ---> det ---> None
question ---> dobj ---> None
in ---> prep ---> None
our ---> poss ---> Non

## Lemmatization

In [79]:
for token in doc:
    print(token.text, '--->', token.lemma_, '--->', spacy.explain(token.lemma))

Together ---> together ---> None
they ---> they ---> None
facilitate ---> facilitate ---> None
cooperation ---> cooperation ---> None
among ---> among ---> None
unrelated ---> unrelated ---> None
individuals ---> individual ---> None
, ---> , ---> None
something ---> something ---> None
ubiquitous ---> ubiquitous ---> None
among ---> among ---> None
people ---> people ---> None
but ---> but ---> None
uncommon ---> uncommon ---> None
in ---> in ---> None
nature ---> nature ---> None
. ---> . ---> None
A ---> a ---> None
critical ---> critical ---> None
question ---> question ---> None
is ---> be ---> None
how ---> how ---> None
people ---> people ---> None
balance ---> balance ---> None
these ---> these ---> None
two ---> two ---> None
motivations ---> motivation ---> None
when ---> when ---> None
making ---> make ---> None
decisions ---> decision ---> None
. ---> . ---> None
We ---> we ---> None
investigate ---> investigate ---> None
this ---> this ---> None
question ---> question --->

## Sentence Boundary Detection

In [80]:
list(doc.sents)

[Together they facilitate cooperation among unrelated individuals, something ubiquitous among people but uncommon in nature.,
 A critical question is how people balance these two motivations when making decisions.,
 We investigate this question in our work at the Social Cognitive Neuroscience Laboratory at the University of Chicago, combining behavioral economics tasks with neuroimaging methods that let us watch what’s happening in the brains of adults and children.,
 ,
 We’ve found evidence that people care about both themselves and others – but it’s the self that takes precedence.,
 Children are sensitive to fairness from a very early age.,
 For instance, if you give two siblings different numbers of cookies, the one who receives fewer will likely throw a fit.,
 ,
 This shift toward equity appears to be universal in humans and follows similar patterns across cultures.,
 Interestingly, it takes several years of development before children’s own behavior catches up with their understan

In [81]:
sentences = list(doc.sents)

In [82]:
for sentence in sentences:
    print(sentence)

Together they facilitate cooperation among unrelated individuals, something ubiquitous among people but uncommon in nature.
A critical question is how people balance these two motivations when making decisions.
We investigate this question in our work at the Social Cognitive Neuroscience Laboratory at the University of Chicago, combining behavioral economics tasks with neuroimaging methods that let us watch what’s happening in the brains of adults and children.


We’ve found evidence that people care about both themselves and others – but it’s the self that takes precedence.
Children are sensitive to fairness from a very early age.
For instance, if you give two siblings different numbers of cookies, the one who receives fewer will likely throw a fit.


This shift toward equity appears to be universal in humans and follows similar patterns across cultures.
Interestingly, it takes several years of development before children’s own behavior catches up with their understanding of fairness 

## Named Entity

In [83]:
for ent in doc.ents:
    print(ent.text, '---->', ent.label, '---->', spacy.explain(ent.label))

two ----> 397 ----> None
the Social Cognitive Neuroscience Laboratory ----> 383 ----> None
the University of Chicago ----> 383 ----> None
two ----> 397 ----> None
several years ----> 391 ----> None
age 4 to 8 ----> 391 ----> None
10 ----> 397 ----> None
two ----> 397 ----> None
400 milliseconds ----> 395 ----> None
7:3 ----> 397 ----> None
5:5 ----> 397 ----> None
EEG ----> 383 ----> None
4-year-old ----> 391 ----> None


## Visualization 

In [84]:
from spacy import displacy

In [85]:
displacy.render(doc, style='ent', jupyter=True)

In [86]:
displacy.render(doc, style='dep', jupyter=True)

## Putting it all together

In [87]:
lis = []

In [88]:
for token in doc:
    dic = {}
    dic['Token'] = token.text
    dic['POS'] = token.pos_
    dic['Tags'] = token.tag_
    dic['Dep'] = token.dep_
    dic['Explanation'] = spacy.explain(token.tag_)
    lis.append(dic)

In [89]:
import pandas as pd

In [90]:
data = pd.DataFrame(lis)

In [91]:
data

Unnamed: 0,Token,POS,Tags,Dep,Explanation
0,Together,ADV,RB,advmod,adverb
1,they,PRON,PRP,nsubj,"pronoun, personal"
2,facilitate,VERB,VBP,ROOT,"verb, non-3rd person singular present"
3,cooperation,NOUN,NN,dobj,"noun, singular or mass"
4,among,ADP,IN,prep,"conjunction, subordinating or preposition"
...,...,...,...,...,...
394,natural,ADJ,JJ,amod,adjective
395,preference,NOUN,NN,pobj,"noun, singular or mass"
396,for,ADP,IN,prep,"conjunction, subordinating or preposition"
397,equality,NOUN,NN,pobj,"noun, singular or mass"


#### You can reindex if you do not like the arrangement

In [92]:
l = []
for ent in doc.ents:
    d = {}
    d['entities'] = ent.text
    d['labels'] = ent.label_
    d['Explanation'] = spacy.explain(ent.label_)
    l.append(d)

In [93]:
entities = pd.DataFrame(l)

In [94]:
entities

Unnamed: 0,entities,labels,Explanation
0,two,CARDINAL,Numerals that do not fall under another type
1,the Social Cognitive Neuroscience Laboratory,ORG,"Companies, agencies, institutions, etc."
2,the University of Chicago,ORG,"Companies, agencies, institutions, etc."
3,two,CARDINAL,Numerals that do not fall under another type
4,several years,DATE,Absolute or relative dates or periods
5,age 4 to 8,DATE,Absolute or relative dates or periods
6,10,CARDINAL,Numerals that do not fall under another type
7,two,CARDINAL,Numerals that do not fall under another type
8,400 milliseconds,QUANTITY,"Measurements, as of weight or distance"
9,7:3,CARDINAL,Numerals that do not fall under another type


In [52]:
entities.to_csv('NLPTasks', index = False)