In [1]:
import spacy
import pandas as pd

In [2]:
# Load the first 100 rows of the articles CSV.
df = pd.read_csv(
    'the_guardian_articles.csv',
    nrows=100,
)

In [3]:
# Preview the first rows of the freshly loaded frame.
df.head()

Unnamed: 0,text,author
0,Federal investigators are examining the first ...,Dan Tynan
1,Apple is reportedly considering a potential ta...,
2,From turning your photos into gifs to riding b...,Stuart Dredge
3,It’s Friday – what a week!,
4,V\nideo games are often criticised for their o...,Keith Stuart


In [4]:
# Load the spaCy pipeline registered under the 'en' shortcut name.
# NOTE(review): shortcut names such as 'en' are not accepted by newer spaCy
# releases — confirm the installed version or use the model's full package name.
nlp=spacy.load('en')

In [5]:
# Run every article text through the spaCy pipeline and keep the resulting object per row.
df['doc'] = df['text'].apply(lambda article_text: nlp(article_text))

In [6]:
# Length of each parsed document, as reported by `len()`.
length = df['doc'].map(len)

In [7]:
# Keep the per-document lengths next to the documents themselves.
df['len']=length

In [8]:
# Summary statistics of the document lengths.
df['len'].describe()

count     100.000000
mean      859.190000
std       412.826613
min         3.000000
25%       517.750000
50%       844.000000
75%      1087.500000
max      2172.000000
Name: len, dtype: float64

In [9]:
# Show the rows whose document length is below 50.
df.loc[df['len'].lt(50)]

Unnamed: 0,text,author,doc,len
3,It’s Friday – what a week!,,"(It, ’s, Friday, –, what, a, week, !)",8
61,Chatterbox: Thursday,,"(Chatterbox, :, Thursday)",3


In [15]:
# Store each document's `ents` attribute in its own column.
df['ents'] = df['doc'].apply(lambda parsed: parsed.ents)

In [16]:
# Display the per-document entity column.
df['ents']

In [17]:
# Pick the first parsed article as the working example document.
doc=df['doc'].iloc[0]

In [18]:
# Entity labels of the example document, in document order.
[entity.label_ for entity in doc.ents]

['ORDINAL',
 'PRODUCT',
 'PERSON',
 'CARDINAL',
 'GPE',
 'PRODUCT',
 'DATE',
 'ORG',
 'GPE',
 'PERSON',
 'PERSON',
 'PERSON',
 'PRODUCT',
 'ORG',
 'DATE',
 'PRODUCT',
 'ORG',
 'PRODUCT',
 'PRODUCT',
 'PERSON',
 'DATE',
 'PERSON',
 'GPE',
 'PERSON',
 'CARDINAL',
 'PRODUCT',
 'QUANTITY',
 'QUANTITY',
 'ORG',
 'PERSON',
 'DATE',
 'PRODUCT',
 'DATE',
 'PERSON',
 'ORG',
 'PERSON',
 'DATE',
 'PRODUCT']

In [25]:
# Flatten the per-document entity collections into one list covering all articles.
all_ents = [
    span
    for doc_ents in df['ents']
    for span in doc_ents
]

In [184]:
# Call `merge()` on every collected entity span (the later `df.head()` output
# shows multi-word entities such as "a week" appearing as a single token).
# NOTE(review): `Span.merge()` is unavailable in newer spaCy releases, where
# `Doc.retokenize()` replaces it — confirm the installed version before re-running.
for ent in all_ents:
    ent.merge()

In [28]:
# Distinct entity labels seen across all articles.
set(ent.label_ for ent in all_ents)

{'CARDINAL',
 'DATE',
 'EVENT',
 'FAC',
 'GPE',
 'LANGUAGE',
 'LAW',
 'LOC',
 'MONEY',
 'NORP',
 'ORDINAL',
 'ORG',
 'PERCENT',
 'PERSON',
 'PRODUCT',
 'QUANTITY',
 'TIME',
 'WORK_OF_ART'}

In [32]:
# One zero-initialised counter per distinct entity label.
labels_freq = dict.fromkeys({ent.label_ for ent in all_ents}, 0)

In [35]:
# Count how many entities carry each label.
# The counters are reset first so that re-running this cell yields the same
# totals instead of adding a second pass on top of the previous one.
labels_freq = dict.fromkeys(labels_freq, 0)
for ent in all_ents:
    labels_freq[ent.label_] += 1

In [36]:
# Show the label -> count mapping.
labels_freq

{'CARDINAL': 448,
 'DATE': 676,
 'EVENT': 14,
 'FAC': 83,
 'GPE': 750,
 'LANGUAGE': 6,
 'LAW': 10,
 'LOC': 62,
 'MONEY': 132,
 'NORP': 148,
 'ORDINAL': 138,
 'ORG': 1241,
 'PERCENT': 58,
 'PERSON': 947,
 'PRODUCT': 190,
 'QUANTITY': 52,
 'TIME': 57,
 'WORK_OF_ART': 50}

In [71]:
# Keep only the entities labelled PERSON.
all_persons = list(filter(lambda ent: ent.label_ == 'PERSON', all_ents))

In [74]:
# Sentences containing a PERSON entity whose text includes "Trump".
[
    mention.sent
    for mention in all_persons
    if 'Trump' in mention.text
]

[That complacency has been shattered by the revolt of marginalised citizens around the world, angry at “elite” failings and looking to the likes of Donald Trump.,
 When Mark Zuckerberg was asked by his own employees if they should act to prevent certain Donald Trump stories gaining traction his answer was absolutely not.]

In [94]:
# Texts of every entity labelled WORK_OF_ART.
all_art = [
    entity.text
    for entity in all_ents
    if entity.label_ == 'WORK_OF_ART'
]

In [97]:
# De-duplicate the WORK_OF_ART texts for inspection.
set(all_art)

{'Android 1.5 –',
 'Ansip and Jourova',
 'Brought',
 'Corporations Can Make Doing Good an Integral',
 'Encrypting Your Laptop',
 'Film Now',
 'Future Shock',
 'Game Boy',
 'Get Windows 10',
 'Guitar Archive',
 'Harry Potter DVD',
 'Marvel’s Iron Man',
 'Most Corrupt Candidate',
 'Musk told Fortune.\n\n',
 'Nobel',
 'Paid for by”',
 'Playboy',
 'Pokémon',
 'Pokémon Go',
 'Queers Hate Techies',
 'Radar',
 'Scaling at GitHub’ - with a large portion of the problem being the',
 'Space Pavilion',
 'Star Wars',
 'The Associated Press',
 'The Center',
 'The Dyson Demo',
 'The Infinite Monkey Cage',
 'The Managed Heart',
 'The Secret Tesla Motors Master Plan',
 'The “My Eyes Only”',
 'The “Sorry',
 'They Live',
 'Totem Adornments',
 'Wi-Fried',
 'Your Autopilot Has Arrived',
 'a Tesla Model S',
 'the Gold Coast',
 'the Guinness Book of Records',
 'the Live from Jodrell Bank',
 'the Play Store',
 '“Gotta Catch ‘Em All!',
 '“Killing it',
 '“Light Detection and Ranging”',
 '“See More Settings',
 '

In [178]:
def ent_filter(ents, kind):
    """Return, as a list and in the original order, the items of `ents` whose `label_` equals `kind`."""
    matching = []
    for candidate in ents:
        if candidate.label_ == kind:
            matching.append(candidate)
    return matching

In [179]:
# Per article, keep only the PERSON entities (extra keyword arguments are forwarded to `ent_filter`).
df['persons'] = df['ents'].apply(ent_filter, kind='PERSON')

In [185]:
# Preview the frame including the derived columns.
df.head()

Unnamed: 0,text,author,doc,len,ents,sentiment,persons
0,Federal investigators are examining the first ...,Dan Tynan,"(Federal, investigators, are, examining, the, ...",445,"((first), (Tesla), (Joshua Brown), (40-year), ...",-0.047798,"[(Joshua Brown), (Brown), (Tesla), (Brown), (E..."
1,Apple is reportedly considering a potential ta...,,"(Apple, is, reportedly, considering, a, potent...",300,"((Apple), (Jay Z’s), (Tidal), (Tidal), (Kanye ...",0.128217,"[(Jay Z’s), (Tidal), (Tidal), (Kanye West), (M..."
2,From turning your photos into gifs to riding b...,Stuart Dredge,"(From, turning, your, photos, into, gifs, to, ...",1511,"((this month), (IAP), (Android), (monthly), (t...",0.19648,"[(Toca Boca), (Netflix), (Rodeo Stampede), (Ka..."
3,It’s Friday – what a week!,,"(It, ’s, Friday, –, what, a week, !)",8,"((Friday), (a week))",0.0,[]
4,V\nideo games are often criticised for their o...,Keith Stuart,"(V, \n, ideo, games, are, often, criticised, f...",775,"((British), (this week), (last minute), (Ubiso...",0.085346,"[(Borgias), (Creed II), (Machiavelli), (Sid Me..."


In [123]:
from textblob.sentiments import PatternAnalyzer

In [124]:
# Single TextBlob pattern-based sentiment analyzer reused by the cells below.
analyzer=PatternAnalyzer()

In [135]:
# Polarity the analyzer reports for the full text of the example document.
analyzer.analyze(doc.text).polarity

-0.04779785431959346

In [168]:
# Store the analyzer's polarity on the document's `sentiment` attribute.
doc.sentiment=analyzer.analyze(doc.text).polarity

In [171]:
# Read the stored value back.
# NOTE(review): the recorded output differs from the polarity recorded for In [135];
# the cells were executed out of order, so `doc` may have referred to a different
# article at that point — confirm on a fresh top-to-bottom run.
doc.sentiment

-0.17277777194976807

In [172]:
# Attach the analyzer's polarity to every parsed article.
# The loop variable is deliberately not named `doc`: a top-level `for doc in ...`
# rebinds the notebook-level `doc` (the first article) to the last article, which
# silently changes what the later cells that inspect `doc` operate on.
for article_doc in df['doc']:
    article_doc.sentiment = analyzer.analyze(article_doc.text).polarity

In [174]:
# Copy each document's stored `sentiment` value into a plain column.
df['sentiment'] = df['doc'].map(lambda parsed: parsed.sentiment)

In [186]:
# Preview the frame with the sentiment column in place.
df.head()

Unnamed: 0,text,author,doc,len,ents,sentiment,persons
0,Federal investigators are examining the first ...,Dan Tynan,"(Federal, investigators, are, examining, the, ...",445,"((first), (Tesla), (Joshua Brown), (40-year), ...",-0.047798,"[(Joshua Brown), (Brown), (Tesla), (Brown), (E..."
1,Apple is reportedly considering a potential ta...,,"(Apple, is, reportedly, considering, a, potent...",300,"((Apple), (Jay Z’s), (Tidal), (Tidal), (Kanye ...",0.128217,"[(Jay Z’s), (Tidal), (Tidal), (Kanye West), (M..."
2,From turning your photos into gifs to riding b...,Stuart Dredge,"(From, turning, your, photos, into, gifs, to, ...",1511,"((this month), (IAP), (Android), (monthly), (t...",0.19648,"[(Toca Boca), (Netflix), (Rodeo Stampede), (Ka..."
3,It’s Friday – what a week!,,"(It, ’s, Friday, –, what, a week, !)",8,"((Friday), (a week))",0.0,[]
4,V\nideo games are often criticised for their o...,Keith Stuart,"(V, \n, ideo, games, are, often, criticised, f...",775,"((British), (this week), (last minute), (Ubiso...",0.085346,"[(Borgias), (Creed II), (Machiavelli), (Sid Me..."


In [195]:
# First PERSON entity of the article at index label 1.
jay = df.loc[1, 'persons'][0]

In [216]:
# First token of that same entity; used as the reference for the similarity ranking.
jay_token = df.loc[1, 'persons'][0][0]

In [221]:
# Pair every PERSON entity with the similarity between the reference token
# (`jay_token`) and the entity's own first token.
# Built as a comprehension: the former `enumerate` index was never used, and the
# iteration variable no longer lingers in the notebook namespace.
similarities = [
    (person, jay_token.similarity(person[0]))
    for person in all_persons
]

In [227]:
# Rank the (entity, similarity) pairs from most to least similar.
# Works poorly; we will try again with larger vectors.
sorted(similarities, key=lambda pair: pair[1], reverse=True)

[(Jay Z’s, 1.0),
 (Jay Z’s, 1.0),
 (Joshua Brown, 0.76865244),
 (Hillary Clinton, 0.75971967),
 (Mark Twain, 0.7441862),
 (Banks, 0.7419728),
 (John Lewis, 0.7366898),
 (Joshua Brown, 0.7233147),
 (Taylor Swift, 0.7210378),
 (Avid Life’s, 0.71220845),
 (Jonathan Ross’s, 0.6951377),
 (Harry Potter, 0.6825176),
 (Madonna, 0.6817763),
 (Levy, 0.67637736),
 (Joshua Brown, 0.6725146),
 (Charles, 0.6723214),
 (Alvin Toffler, 0.66949624),
 (Creed II, 0.66866565),
 (Rob Segal, 0.6670843),
 (Harmony Korine, 0.6658535),
 (Aiden Byrne, 0.65130115),
 (Alex Gibney, 0.65062433),
 (Gibney, 0.64995867),
 (Sebastien Missoffe, 0.64696395),
 (Trak, 0.64406985),
 (Tim Cook, 0.63841516),
 (Tim Cook, 0.6363678),
 (Ray Lakeland, 0.63124907),
 (Toffler, 0.62472504),
 (Arthur Conan Doyle, 0.6245247),
 (Eddie Waring, 0.6232251),
 (Ukip, 0.6231637),
 (Google Photos, 0.6219252),
 (Elon Musk, 0.6197937),
 (Gwyneth Paltrow, 0.61873734),
 (Neil Harbisson, 0.6132424),
 (this Quantified Self, 0.6043565),
 (Lovell, 0.6

In [231]:
scoring=[]
for doc in df['doc']:
    if('Jay Z' in doc.text):
        scoring.append(doc.sentiment)

In [232]:
# Show the collected sentiment scores.
scoring

[0.12821747362613678, 0.05493183061480522]

## Setting an extension for sentence-level polarity

In [159]:
from spacy.tokens import Span

In [161]:
def polarity(span):
    """Return the polarity that the notebook-level `analyzer` reports for `span.text`."""
    assessment = analyzer.analyze(span.text)
    return assessment.polarity

In [162]:
# Expose `polarity` as the computed attribute `span._.polarity` on every Span.
# Guarded so the cell can be re-run: registering a name that already exists
# raises unless an overwrite is forced. If `polarity` itself is edited, restart
# the kernel so the new getter is the one that gets registered.
if not Span.has_extension('polarity'):
    Span.set_extension('polarity', getter=polarity)

In [165]:
# Print the polarity of each sentence of `doc` via the `._.polarity` extension attribute.
for sent in doc.sents:
    print(sent._.polarity)

0.04999999999999999
0.0
0.0
-0.4
0.0
-0.3499999999999999
-0.35
-0.2375
-0.2333333333333333
-0.11851851851851852
0.25
0.0
0.0


In [102]:
from textacy.lexicon_methods import emotional_valence

In [119]:
# Fetch the DepecheMood lexicon used by `emotional_valence`.
# Only `emotional_valence` was imported from textacy, so the bare name `textacy`
# is undefined on a fresh kernel; importing the submodule here binds it.
import textacy.lexicon_methods

textacy.lexicon_methods.download_depechemood()

In [120]:
# Emotional-valence scores for `doc`, keyed by emotion name (see the output below).
emotional_valence(doc)

defaultdict(float,
            {'AFRAID': 0.13200744218749993,
             'AMUSED': 0.14212075588202255,
             'ANGRY': 0.1158086805909092,
             'ANNOYED': 0.11762475423295463,
             'DONT_CARE': 0.11652026924999999,
             'HAPPY': 0.12144892282162159,
             'INSPIRED': 0.1563725598432433,
             'SAD': 0.12797938725543484})