In [1]:
import pandas as pd
import codecs
import nltk
from nltk.tokenize import word_tokenize
import numpy as np
from collections import Counter

In [3]:
newspapers = pd.read_csv("C:/Users/jw156/Ironhack/vaccine/vaccine_sentiment/english/newspaper_df.csv", index_col=0)

In [4]:
newspapers.columns

Index(['title', 'href', 'date', 'new_title', 'id'], dtype='object')

In [6]:
newspapers.set_index("href", inplace=True)

In [7]:
newspapers = newspapers.reset_index(drop=True)

In [8]:
newspapers.title.drop_duplicates(inplace=True)

In [9]:
newspapers

Unnamed: 0,title,date,new_title,id
0,telegraph,2021-03-24,coronaviru latest news joint euuk statement in...,global-health
1,telegraph,2021-03-24,europ tighten covid restrict third wave take hold,politics
2,telegraph,2021-03-24,pub may demand covid passport pull pint,politics
3,telegraph,2021-03-24,bori johnson warn EU vaccin ban target UK backfir,news
4,telegraph,2021-03-24,later flow covid test inaccur use mass screen ...,news
...,...,...,...,...
161768,guardian,2020-04-28,"franc edg brazil world cup one thousand, nine ...",sport
161769,guardian,2020-07-11,england west indi first test day four happen,sport
161770,guardian,2020-03-09,contain viru extrem unlik work say bori johnso...,politics
161771,guardian,2020-03-12,trump say US suspend travel europ coronaviru s...,us-news


# POS Determination of words

In [10]:
newspapers = newspapers.reset_index(drop=True)

In [11]:
import nltk
from nltk.stem import WordNetLemmatizer 

In [12]:
from nltk.corpus import wordnet as wn
from nltk.stem import WordNetLemmatizer
from nltk import pos_tag, word_tokenize


In [13]:
from nltk.corpus import wordnet as wn

def pos_headline(headline):
    new_headline = headline.split(' ')
    word_list = []
    for word in new_headline:
        if word == None or word.isalpha()==False:
            continue
        else:
            text = word_tokenize(word)
            postition_tag = nltk.pos_tag(text)
            word_list.append(postition_tag)
    return word_list

def is_proper_noun(tag):
    return tag in ['NNP', 'NNPS']

def is_noun(tag):
    return tag in ['NN', 'NNS']


def is_verb(tag):
    return tag in ['VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ']


def is_adverb(tag):
    return tag in ['RB', 'RBR', 'RBS']


def is_adjective(tag):
    return tag in ['JJ', 'JJR', 'JJS']



def penn_to_wn(headlines):
    headline_list = pos_headline(headlines)
    tagged_list = []
    for tag in headline_list:
        tag = list(tag[0])
        if is_adjective(tag[1]):
            tag[1] = (wn.ADJ)
            tagged_list.append(tag)
        elif is_noun(tag[1]):
            tag[1] =(wn.NOUN)
            tagged_list.append(tag)
        elif is_adverb(tag[1]):
            tag[1] =(wn.ADV)
            tagged_list.append(tag)
        elif is_verb(tag[1]):
            tag[1] =(wn.VERB)
            tagged_list.append(tag)
        elif is_proper_noun(tag[1]):
            tag[1] =('NNP')
            tagged_list.append(tag)
    return tagged_list

def lemming_words(headline):
    if len(headline) == 0:
        return 0
    headline_list = penn_to_wn(headline)    
    new_string = []
    lemmatizer = WordNetLemmatizer()
    for word in headline_list:
        if word[1] == 'NNP':
            continue
        else:
            word[0] = (lemmatizer.lemmatize(word[0], word[1]))
            new_string.append(word)

    return new_string

In [15]:
x = newspapers.new_title[1]
lemming_words(x)


[['europ', 'n'],
 ['tighten', 'n'],
 ['covid', 'n'],
 ['restrict', 'n'],
 ['third', 'a'],
 ['wave', 'n'],
 ['take', 'v'],
 ['hold', 'n']]

In [None]:
newspapers['pos_headline'] = newspapers['new_title'].apply(pos_headline)

In [None]:
newspapers['lem_pos_headline'] = newspapers['new_title'].apply(lambda x: lemming_words(x))

In [None]:
newspapers['pos_headline'][1]

# Word Sense Disambiguation

In [15]:
from nltk.wsd import lesk

x = 'The porn threatens to cause a lockdown'

print(lesk(x.split(), 'threatens', 'v'))


Synset('threaten.v.03')


In [16]:
def word_definitions(title, lemmed_title):
    synsets = []
    original_text = title
    lemmed_pos = lemmed_title
    for i in lemmed_pos:
        synsets.append(lesk(original_text.split(), i[0], i[1]))
    return synsets

In [17]:
x = newspapers['title'][1]
y = newspapers['lem_pos_headline'][1]
import re
from nltk.corpus import sentiwordnet as swn,SentiSynset

j = word_definitions(x, y)
k = str(j[0])
print(k)
x = (re.findall(r"'([^']*)'", k))
print(x)
f = swn.senti_synset(x[0])
print(f)

Synset('agnosticism.n.02')
['agnosticism.n.02']
<agnosticism.n.02: PosScore=0.25 NegScore=0.0>


In [18]:
newspapers['word_definitions'] = newspapers.apply(lambda x: word_definitions(x['title'], x['lem_pos_headline']), axis=1)


In [19]:
x = newspapers['word_definitions'][1]

In [20]:
for i in x:
    syn_word = (re.findall(r"'([^']*)'", str(i)))
    if len(syn_word) != 0:
        input_word = (syn_word[0])
        word_senti = (swn.senti_synset(input_word))
        pos = float(word_senti.pos_score()) 
        neg = float(word_senti.neg_score())
        overall_senti = pos + neg
        print(overall_senti)

0.25
0.25
0.0
0.0
0.0


## Save dataframe

In [46]:
# newspapers.to_csv('C:/Users/jw156/Ironhack/vaccine/newspaper_pos_score.csv')
# print("Sucessfully saved")

Sucessfully saved


## Load dataframe

In [2]:
newspapers = pd.read_csv('C:/Users/jw156/Ironhack/vaccine/newspaper_pos_score.csv')
newspapers.set_index("hrefs", inplace=True)

In [3]:
newspapers

Unnamed: 0_level_0,title,new_title,id,date,newspaper_name,vaccine,pos_headline,lem_pos_headline,word_definitions
hrefs,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
https://www.theguardian.com/politics/2021/mar/19/boris-johnson-receives-first-dose-of-oxfordastrazenca-covid-vaccine,Boris Johnson receives Oxford/AstraZeneca Covi...,bor johnson receiv oxfordastrazenec covid vaccin,politics,2021-03-19,guardian,,"[[('Boris', 'NNP')], [('Johnson', 'NNP')], [('...","[['receives', 'n'], ['Covid', 'n'], ['vaccine'...","[None, None, Synset('vaccine.n.01')]"
https://www.theguardian.com/world/2021/feb/19/german-politicians-counter-astrazeneca-covid-vaccine-scepticism-with-show-of-support,Scepticism over Oxford vaccine threatens Europ...,sceiv oxford vaccin threatens europ immun push,world,2021-02-19,guardian,,"[[('Scepticism', 'NN')], [('over', 'IN')], [('...","[['Scepticism', 'n'], ['Oxford', 'n'], ['vacci...","[Synset('agnosticism.n.02'), Synset('oxford.n...."
https://www.theguardian.com/world/2021/mar/18/thursday-briefing-eus-experts-to-give-oxford-vaccine-verdict,Thursday briefing: EU's experts to give Oxford...,thursday brief eu expert giv oxford vaccin ver...,world,2021-03-18,guardian,,"[[('Thursday', 'NNP')], [('experts', 'NNS')], ...","[['expert', 'n'], ['give', 'v'], ['Oxford', 'n...","[Synset('expert.n.01'), Synset('give.v.18'), S..."
https://www.theguardian.com/world/2021/mar/16/oxford-astrazeneca-vaccine-update-pausing-jabs,Oxford/AstraZeneca vaccine: which countries ha...,oxfordastrazenec vaccin country paus jab,world,2021-03-16,guardian,,"[[('which', 'WDT')], [('countries', 'NNS')], [...","[['country', 'n'], ['have', 'v'], ['pause', 'v...","[Synset('country.n.04'), Synset('take.v.35'), ..."
https://www.theguardian.com/world/2021/mar/16/chaos-in-germany-and-italy-after-suspension-of-oxford-vaccine,Chaos in Germany and Italy after suspension of...,chao germany ita suspend oxford vaccin,world,2021-03-16,guardian,,"[[('Chaos', 'NN')], [('in', 'IN')], [('Germany...","[['Chaos', 'n'], ['suspension', 'n'], ['Oxford...","[Synset('chaos.n.02'), Synset('suspension.n.05..."
...,...,...,...,...,...,...,...,...,...
https://www.independent.co.uk/news/uk/politics/lockdown-law-extension-coronavirus-act-b1822637.html,Extreme lockdown laws extended for a further s...,extrem lockdown law extend six month despit ma...,coronavirus,2021-03-25,independent,,"[[('Extreme', 'NN')], [('lockdown', 'NN')], [(...","[['Extreme', 'n'], ['lockdown', 'n'], ['law', ...","[Synset('extreme_point.n.01'), Synset('lockdow..."
https://www.independent.co.uk/news/uk/politics/lockdown-law-extension-coronavirus-act-b1822511.html,Extreme lockdown laws extended for a further s...,extrem lockdown law extend six month despit ma...,coronavirus,2021-03-25,independent,,"[[('Extreme', 'NN')], [('lockdown', 'NN')], [(...","[['Extreme', 'n'], ['lockdown', 'n'], ['law', ...","[Synset('extreme_point.n.01'), Synset('lockdow..."
https://www.independent.co.uk/independentpremium/business/covid-vaccine-pubs-boris-johnson-b1822508.html,The problem with Johnson’s vaccine passports f...,the problem johnson vaccin passport pub,business,2021-03-25,independent,,"[[('The', 'DT')], [('problem', 'NN')], [('with...","[['problem', 'n'], ['vaccine', 'n'], ['passpor...","[Synset('problem.n.02'), Synset('vaccine.n.01'..."
https://www.independent.co.uk/voices/editorials/vaccine-wars-eu-uk-macron-merkel-astrazeneca-b1822566.html,Reality is dawning on the EU: export bans are ...,real dawn eu export ban counterproduc cas covi...,uk,2021-03-25,independent,,"[[('Reality', 'NN')], [('is', 'VBZ')], [('dawn...","[['Reality', 'n'], ['be', 'v'], ['dawn', 'v'],...","[Synset('reality.n.03'), Synset('embody.v.02')..."


# TF − IDF ∗ SentimentScore

In [4]:
newspapers

Unnamed: 0_level_0,title,new_title,id,date,newspaper_name,vaccine,pos_headline,lem_pos_headline,word_definitions
hrefs,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
https://www.theguardian.com/politics/2021/mar/19/boris-johnson-receives-first-dose-of-oxfordastrazenca-covid-vaccine,Boris Johnson receives Oxford/AstraZeneca Covi...,bor johnson receiv oxfordastrazenec covid vaccin,politics,2021-03-19,guardian,,"[[('Boris', 'NNP')], [('Johnson', 'NNP')], [('...","[['receives', 'n'], ['Covid', 'n'], ['vaccine'...","[None, None, Synset('vaccine.n.01')]"
https://www.theguardian.com/world/2021/feb/19/german-politicians-counter-astrazeneca-covid-vaccine-scepticism-with-show-of-support,Scepticism over Oxford vaccine threatens Europ...,sceiv oxford vaccin threatens europ immun push,world,2021-02-19,guardian,,"[[('Scepticism', 'NN')], [('over', 'IN')], [('...","[['Scepticism', 'n'], ['Oxford', 'n'], ['vacci...","[Synset('agnosticism.n.02'), Synset('oxford.n...."
https://www.theguardian.com/world/2021/mar/18/thursday-briefing-eus-experts-to-give-oxford-vaccine-verdict,Thursday briefing: EU's experts to give Oxford...,thursday brief eu expert giv oxford vaccin ver...,world,2021-03-18,guardian,,"[[('Thursday', 'NNP')], [('experts', 'NNS')], ...","[['expert', 'n'], ['give', 'v'], ['Oxford', 'n...","[Synset('expert.n.01'), Synset('give.v.18'), S..."
https://www.theguardian.com/world/2021/mar/16/oxford-astrazeneca-vaccine-update-pausing-jabs,Oxford/AstraZeneca vaccine: which countries ha...,oxfordastrazenec vaccin country paus jab,world,2021-03-16,guardian,,"[[('which', 'WDT')], [('countries', 'NNS')], [...","[['country', 'n'], ['have', 'v'], ['pause', 'v...","[Synset('country.n.04'), Synset('take.v.35'), ..."
https://www.theguardian.com/world/2021/mar/16/chaos-in-germany-and-italy-after-suspension-of-oxford-vaccine,Chaos in Germany and Italy after suspension of...,chao germany ita suspend oxford vaccin,world,2021-03-16,guardian,,"[[('Chaos', 'NN')], [('in', 'IN')], [('Germany...","[['Chaos', 'n'], ['suspension', 'n'], ['Oxford...","[Synset('chaos.n.02'), Synset('suspension.n.05..."
...,...,...,...,...,...,...,...,...,...
https://www.independent.co.uk/news/uk/politics/lockdown-law-extension-coronavirus-act-b1822637.html,Extreme lockdown laws extended for a further s...,extrem lockdown law extend six month despit ma...,coronavirus,2021-03-25,independent,,"[[('Extreme', 'NN')], [('lockdown', 'NN')], [(...","[['Extreme', 'n'], ['lockdown', 'n'], ['law', ...","[Synset('extreme_point.n.01'), Synset('lockdow..."
https://www.independent.co.uk/news/uk/politics/lockdown-law-extension-coronavirus-act-b1822511.html,Extreme lockdown laws extended for a further s...,extrem lockdown law extend six month despit ma...,coronavirus,2021-03-25,independent,,"[[('Extreme', 'NN')], [('lockdown', 'NN')], [(...","[['Extreme', 'n'], ['lockdown', 'n'], ['law', ...","[Synset('extreme_point.n.01'), Synset('lockdow..."
https://www.independent.co.uk/independentpremium/business/covid-vaccine-pubs-boris-johnson-b1822508.html,The problem with Johnson’s vaccine passports f...,the problem johnson vaccin passport pub,business,2021-03-25,independent,,"[[('The', 'DT')], [('problem', 'NN')], [('with...","[['problem', 'n'], ['vaccine', 'n'], ['passpor...","[Synset('problem.n.02'), Synset('vaccine.n.01'..."
https://www.independent.co.uk/voices/editorials/vaccine-wars-eu-uk-macron-merkel-astrazeneca-b1822566.html,Reality is dawning on the EU: export bans are ...,real dawn eu export ban counterproduc cas covi...,uk,2021-03-25,independent,,"[[('Reality', 'NN')], [('is', 'VBZ')], [('dawn...","[['Reality', 'n'], ['be', 'v'], ['dawn', 'v'],...","[Synset('reality.n.03'), Synset('embody.v.02')..."


In [45]:
N = len (newspapers)
processed_text = []

for i in newspapers.title[:N]:
    processed_text.append(i)

In [46]:
DF = {}

for i in range(N):
    tokens = processed_text[i]
    tokens = tokens.split(' ')
    for w in tokens:
        w = w.lower()
        try:
            DF[w].add(i)
        except:
            DF[w] = {i}
for i in DF:
    DF[i] = len(DF[i])

In [47]:
total_vocab_size = len(DF)

In [48]:
total_vocab_size

45991

In [49]:
def doc_freq(word):
    c = 0
    try:
        c = DF[word]
    except:
        pass
    return c

In [55]:
doc_freq('sceptic')

3

In [51]:
doc = 0
tf_idf = {}
for i in range(N):
    headline = processed_text[i]
    new_headline = headline.split(' ')
    counter = Counter(new_headline)
    words_count = len(new_headline)
    for token in np.unique(new_headline):
        tf = counter[token]/words_count
        df = doc_freq(token)
        idf = np.log((N+1)/(df+1))
        tf_idf[doc, token] = tf*idf
    doc += 1

In [52]:
tf_idf

{(0, 'Boris'): 1.8392154680933335,
 (0, 'Covid'): 1.8392154680933335,
 (0, 'Johnson'): 1.8392154680933335,
 (0, 'Oxford/AstraZeneca'): 1.8392154680933335,
 (0, 'receives'): 1.2615928176267124,
 (0, 'vaccine'): 0.5289276217893368,
 (1, "Europe's"): 1.3794116010700002,
 (1, 'Oxford'): 1.3794116010700002,
 (1, 'Scepticism'): 1.3794116010700002,
 (1, 'immunisation'): 1.0687982698465002,
 (1, 'over'): 0.39123139493746445,
 (1, 'push'): 0.786295085024594,
 (1, 'threatens'): 0.7367036565071677,
 (1, 'vaccine'): 0.3966957163420026,
 (2, "EU's"): 1.2261436453955556,
 (2, 'Oxford'): 1.2261436453955556,
 (2, 'Thursday'): 1.2261436453955556,
 (2, 'briefing:'): 0.5034449594470395,
 (2, 'experts'): 0.5172411412955819,
 (2, 'give'): 0.5927610261299239,
 (2, 'to'): 0.10687707992295561,
 (2, 'vaccine'): 0.35261841452622456,
 (2, 'verdict'): 0.9252491786064213,
 (3, 'Oxford/AstraZeneca'): 1.2261436453955556,
 (3, 'and'): 0.21277628436531112,
 (3, 'countries'): 0.5495831036792396,
 (3, 'have'): 0.3750198

In [12]:
def doc_freq(word):
    c = 0
    try:
        c = DF[word]
    except:
        pass
    return c

In [13]:
DF['i']

1445

In [14]:
processed_text = ['when harry met sally', 'I like big noses']
N = len(processed_text)
from collections import Counter

doc = 0
tf_idf = {}
for i in range(N):
    headline = processed_text[i]
    new_headline = headline.split(' ')
    counter = Counter(new_headline)
    words_count = len(new_headline)
    for token in np.unique(new_headline):
        tf = counter[token]/words_count
        df = doc_freq(token)
        idf = np.log((N+1)/(df+1))
        tf_idf[doc, token] = tf*idf
    doc += 1 

In [15]:
N

2

In [16]:
index_df=list(df_model2.index)


NameError: name 'df_model2' is not defined

In [17]:
N = len (df_model2)

sum_sentiment={}

for index in index_df:
    headline = processed_text[index]
    new_headline = headline.split(' ')
    #print(new_headline)
    for token in np.unique(new_headline):
        #print(index)
        #print(token)
        if index in list(sum_sentiment.keys()):
            sentiment_value=float(tf_idf[(index, token)])*float(tf_idf[(index, token)])
            sum_sentiment[index] += sentiment_value
        else:
            sum_sentiment[index]=float(tf_idf[(index, token)])*float(tf_idf[(index, token)])

NameError: name 'df_model2' is not defined

In [None]:
sum_sentiment

In [None]:
df_model2["sentiment"]=df_model2.apply(lambda row: sum_sentiment[row.name],axis=1)

In [None]:
df_model2