In [57]:
import pandas as pd
from tqdm import tqdm
import os
import nltk
from nltk.corpus import stopwords

In [54]:
news_df = pd.read_csv('../data/news_predictions/news_2015_predictions.csv')
news_df = news_df.drop(['Unnamed: 0', 'parliament', 'top1_acc', 'top1_topic', 'top2_topic', 'top2_acc', 'top3_topic', 'top3_acc'], axis=1)
news_df = news_df.reset_index(drop=True)
news_df['transcript'] = news_df['transcript'].apply(lambda x: str(x).lower())
news_df = news_df.loc[news_df.month >= 6]

In [55]:
news_df2 = pd.read_csv('../data/news_predictions/news_2016_predictions.csv')
news_df2 = news_df2.drop(['Unnamed: 0', 'parliament', 'top1_acc', 'top1_topic', 'top2_topic', 'top2_acc', 'top3_topic', 'top3_acc'], axis=1)
news_df2 = news_df2.reset_index(drop=True)
news_df2['transcript'] = news_df2['transcript'].apply(lambda x: str(x).lower())
news_df2 = news_df2.loc[news_df2.month <= 6]

In [60]:
stopW = stopwords.words('english')

def contains_european(x):
    y = x.lower()
    words = y.split()
    flag = 'e.u' in words or 'eu' in words or 'europeanunion' in words
    words = [word for word in words if word not in stopW]
    bigrams = list(nltk.bigrams(words))
    bigrams = [bigram[0] + '.' + bigram[1] for bigram in bigrams]
    
    flag = flag or 'european.union' in bigrams or 'europe.union' in bigrams
    
    return 'EU' if flag else 'non_EU'

In [61]:
news_df = pd.concat([news_df, news_df2])
news_df = news_df.loc[(news_df['year'] == 2016) & (news_df['month'] == 2)]
news_df['eu'] = news_df['transcript'].apply(lambda x: contains_european(x))
news_df = news_df.loc[news_df['eu'] == 'EU']

In [62]:
news_df

Unnamed: 0,source_id,source,day,month,year,program_name,transcript,eu
160594,163795,Belfast Telegraph,1,2,2016,JONATHAN BELL: 'Critics who say I'm out of my ...,q how do you see the dup under arlene's leader...,EU
160633,163795,Belfast Telegraph,1,2,2016,Northern Irish public will never be fooled by ...,"timely, too, when it is so wonderful to hear y...",EU
160709,400553,Belfast Telegraph,1,2,2016,"10,000 unaccompanied children unaccounted for ...",the european union police agency announcement ...,EU
160723,400553,Belfast Telegraph,1,2,2016,EU talks: New proposal to be tabled after 'goo...,the european council president said he would p...,EU
160765,400553,Belfast Telegraph,1,2,2016,Martin McGuinness warns Cameron over June date...,a poll is due on a new northern ireland assemb...,EU
...,...,...,...,...,...,...,...,...
318628,412338,Wales,29,2,2016,Five things we learned from Ukip's conference ...,there is a sense of celebration that - at last...,EU
318633,412338,Wales,29,2,2016,"Western Mail letters: Monday, February 29, 201...",keen as he might be to avoid alarming the elec...,EU
318636,412338,Wales,29,2,2016,"Western Mail letters: Saturday, February 27, 2...",cardigan bay is home to some of the uk's bottl...,EU
318638,412338,Wales,29,2,2016,The latest news from around the world The morn...,hollywood star leonardo dicaprio ended years o...,EU


In [63]:
politicians = ['carmichael.neil', 'stewart.jain', 'wallace.ben', 'burns.conor', 'parish.neil', 'newton.sarah']

In [64]:
# count when last name and first name appears together
def count3_new(text, last_name, first_name):
    return text.count(first_name + ' ' + last_name) + text.count(first_name + last_name)

In [69]:
for p in politicians:
    last_name, first_name = p.split('.')
    news_df[p] = news_df['transcript'].apply(lambda x: count3_new(x, last_name, first_name))

In [70]:
news_df

Unnamed: 0,source_id,source,day,month,year,program_name,transcript,eu,carmichael.neil,stewart.jain,wallace.ben,burns.conor,parish.neil,newton.sarah
160594,163795,Belfast Telegraph,1,2,2016,JONATHAN BELL: 'Critics who say I'm out of my ...,q how do you see the dup under arlene's leader...,EU,0,0,0,0,0,0
160633,163795,Belfast Telegraph,1,2,2016,Northern Irish public will never be fooled by ...,"timely, too, when it is so wonderful to hear y...",EU,0,0,0,0,0,0
160709,400553,Belfast Telegraph,1,2,2016,"10,000 unaccompanied children unaccounted for ...",the european union police agency announcement ...,EU,0,0,0,0,0,0
160723,400553,Belfast Telegraph,1,2,2016,EU talks: New proposal to be tabled after 'goo...,the european council president said he would p...,EU,0,0,0,0,0,0
160765,400553,Belfast Telegraph,1,2,2016,Martin McGuinness warns Cameron over June date...,a poll is due on a new northern ireland assemb...,EU,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
318628,412338,Wales,29,2,2016,Five things we learned from Ukip's conference ...,there is a sense of celebration that - at last...,EU,0,0,0,0,0,0
318633,412338,Wales,29,2,2016,"Western Mail letters: Monday, February 29, 201...",keen as he might be to avoid alarming the elec...,EU,0,0,0,0,0,0
318636,412338,Wales,29,2,2016,"Western Mail letters: Saturday, February 27, 2...",cardigan bay is home to some of the uk's bottl...,EU,0,0,0,0,0,0
318638,412338,Wales,29,2,2016,The latest news from around the world The morn...,hollywood star leonardo dicaprio ended years o...,EU,0,0,0,0,0,0


In [73]:
rows = []
for index, row in news_df.iterrows():
    flag = False
    
    for p in politicians:
        if row[p] != 0:
            flag=True
            break
    if flag:
        r = [row['source_id'], row['source'], row['day'], row['month'], row['year'], row['program_name'], row['transcript'], row['eu']]
        for p in politicians:
            r.append(row[p])
        rows.append(r)
res_df = pd.DataFrame(rows, columns=news_df.columns)

In [77]:
res_df = res_df.loc[res_df['source'] == 'Mail']

In [78]:
res_df

Unnamed: 0,source_id,source,day,month,year,program_name,transcript,eu,carmichael.neil,stewart.jain,wallace.ben,burns.conor,parish.neil,newton.sarah
5,397135,Mail,17,2,2016,"Luvvie Emma sneers at Britain: I'm European, s...",she has never been afraid of spouting her lond...,EU,0,0,0,1,0,0
13,397135,Mail,23,2,2016,BBC plans a massive EU referendum TV debate at...,the bbc has booked wembley arena for a massive...,EU,1,0,1,1,1,1
14,397135,Mail,23,2,2016,Tony Blair\nclaims the UK would break up if Br...,tony blair\nwarned today that leaving the eu w...,EU,1,0,1,1,1,1
15,397135,Mail,23,2,2016,David Cameron\nWILL have to resign as Prime Mi...,david cameron\nwould have to quit downing stre...,EU,1,0,1,1,1,1
16,397135,Mail,23,2,2016,Who needs enemies? Cameron praises his 'fantas...,david cameron\ntoday said boris johnson\nwas a...,EU,1,0,1,1,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
178,397135,Mail,29,2,2016,Boris Johnson\naccuses David Cameron\nof runni...,boris johnson\ntoday escalated the war of word...,EU,1,0,1,1,1,1
179,397135,Mail,29,2,2016,British tourists could be left STRANDED ABROAD...,british tourists could be left stranded abroad...,EU,1,0,1,1,1,1
180,397135,Mail,29,2,2016,British tourists could be left stranded abroad...,british tourists could be left stranded abroad...,EU,1,0,1,1,1,1
181,397135,Mail,29,2,2016,The Government threatens a DECADE of chaos if ...,ministers today claimed the government was bei...,EU,1,0,1,1,1,1


In [79]:
res_df.to_csv('feb-2016 397135 with_politician_count.csv')