In [29]:
import pandas as pd
from gensim import models, corpora
from gensim.parsing.preprocessing import preprocess_string, strip_tags, strip_punctuation
from gensim.parsing.preprocessing import remove_stopwords, stem_text, strip_non_alphanum, strip_multiple_whitespaces
from gensim.parsing.preprocessing import strip_short, strip_numeric
from gensim.parsing.preprocessing import STOPWORDS
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
import multiprocessing as mp
import nltk
from nltk.corpus import stopwords
import pandas as pd
import numpy as np
import tensorflow as tf
import parmap
import swifter

In [30]:
# Topic labels in class-index order: a label's position in the list is its
# integer id (0 through 20).
topics_index_to_name_map = dict(enumerate([
    'Agriculture, animals, food and rural affairs',
    'Asylum, immigration and nationality',
    'Business, industry and consumers',
    'Communities and families',
    'Crime, civil law, justice and rights',
    'Culture, media and sport',
    'Defence',
    'Economy and finance',
    'Education',
    'Employment and training',
    'Energy and environment',
    'European Union',
    'Health services and medicine',
    'Housing and planning',
    'International affairs',
    'Parliament, government and politics',
    'Science and technology',
    'Social security and pensions',
    'Social services',
    'Transport',
    'Others',
]))
# Reverse lookup: topic label -> integer id.
topics_name_to_index_map = dict(
    zip(topics_index_to_name_map.values(), topics_index_to_name_map.keys())
)

def strip_short2(text):
    """Apply gensim's ``strip_short`` to ``text`` with ``minsize=4``.

    Wrapping the call fixes the threshold so the function can be used as a
    one-argument filter in the ``FILTERS`` list of ``preprocess_text``.
    """
    min_token_length = 4
    return strip_short(text, minsize=min_token_length)


def preprocess_text(text):
    """Run ``text`` through a fixed chain of gensim string filters.

    The filters are handed to gensim's ``preprocess_string`` in this order:
    lower-casing, ``strip_multiple_whitespaces``, ``strip_tags``,
    ``strip_punctuation``, ``strip_non_alphanum``, ``strip_numeric`` and
    ``strip_short2``. Stop-word removal and stemming are not part of the chain.

    Returns:
        Whatever ``preprocess_string`` returns for ``text`` (per the gensim
        documentation, a list of tokens).
    """
    def to_lowercase(s):
        return s.lower()

    cleaning_steps = [
        to_lowercase,
        strip_multiple_whitespaces,
        strip_tags,
        strip_punctuation,
        strip_non_alphanum,
        strip_numeric,
        strip_short2,
    ]
    return preprocess_string(text, cleaning_steps)

def preprocess(topic):
    """Map a topic label string to its integer class index.

    ``topic`` may hold several labels separated by ``'|'``; only the first
    label is used. Whitespace around the whole string and around the first
    label is ignored.

    Args:
        topic: A topic label, or several labels joined with ``'|'``.

    Returns:
        The value stored in ``topics_name_to_index_map`` for the first label.

    Raises:
        KeyError: If the first label is not a key of
            ``topics_name_to_index_map``.
    """
    # split('|') yields the whole string as its only element when no '|' is
    # present, so a single code path handles single- and multi-label input.
    first_label = topic.strip().split('|')[0].strip()
    return topics_name_to_index_map[first_label]

In [31]:
stopW = stopwords.words('english')
# Set view of the stop-word list: membership tests on a list scan every entry
# for every token, a frozenset makes each test a hash lookup.
_STOPW_SET = frozenset(stopW)


def contains_european(x):
    """Label a text ``'EU'`` or ``'non_EU'`` by looking for EU mentions.

    The text is lower-cased and split on whitespace. It is labelled ``'EU'``
    when any token is exactly ``'e.u'``, ``'eu'`` or ``'europeanunion'``, or
    when, after stop words are removed, ``'european'`` or ``'europe'`` is
    immediately followed by ``'union'``.

    NOTE(review): tokens keep any attached punctuation, so spellings such as
    ``'EU,'``, ``'E.U.'`` or ``'Union.'`` do not match. Confirm whether that
    is intended before relying on the recall of this filter.

    Args:
        x: The text to inspect (must support ``.lower()``).

    Returns:
        ``'EU'`` if a mention is found, otherwise ``'non_EU'``.
    """
    words = x.lower().split()

    # Single-token spellings.
    if 'e.u' in words or 'eu' in words or 'europeanunion' in words:
        return 'EU'

    # Two-token spellings, checked on adjacent pairs of non-stop-word tokens.
    content_words = [word for word in words if word not in _STOPW_SET]
    for first, second in zip(content_words, content_words[1:]):
        if second == 'union' and first in ('european', 'europe'):
            return 'EU'

    return 'non_EU'

In [22]:
# Year of the prediction file to load; interpolated into the CSV path when
# the data is read.
year = 2016
# NOTE(review): not referenced by any cell visible in this notebook — confirm
# it is still needed.
exclude = '_no_Others'

In [23]:
# Load the prediction dump for `year`, discard the 'Unnamed: 0' column, keep
# only rows that have a transcript, and renumber the index from 0.
df = (
    pd.read_csv('../data/news_predictions/news_{}_predictions.csv'.format(year))
    .drop(columns=['Unnamed: 0'])
    .dropna(subset=['transcript'])
    .reset_index(drop=True)
)

In [24]:
# source_id values of the outlets to keep. In the displayed output, 138620
# appears with source 'Guardian' and 389195 with 'Telegraph'; the outlets
# behind 397135 and 8200 are not visible in this notebook.
ids = [389195, 138620, 397135, 8200]

In [25]:
# Keep only the articles whose source_id is one of the selected `ids`.
from_selected_sources = df['source_id'].isin(ids)
df = df.loc[from_selected_sources]

In [26]:
# Restrict to the configured year: use the `year` variable that also selects
# the input file instead of repeating the literal 2016, so the two cannot
# drift apart.
df = df.loc[df.year == year]

In [27]:
# Keep months 2 through 6, both ends included (February to June).
df = df.loc[df.month.between(2, 6)]

In [32]:
# Tag every transcript as 'EU' or 'non_EU'. assign() returns a new frame, so
# the column is not written into a frame produced by the earlier .loc filters
# (plain column assignment there can trigger pandas' SettingWithCopyWarning).
# The function is passed to apply() directly; no lambda wrapper is needed.
df = df.assign(EU=df['transcript'].apply(contains_european))

In [33]:
# Keep only the rows whose 'EU' column holds the label 'EU'.
is_eu_article = df['EU'] == 'EU'
df = df.loc[is_eu_article]

In [34]:
# Display the filtered frame (rows labelled 'EU').
df

Unnamed: 0,source_id,source,day,month,year,program_name,transcript,parliament,top1_topic,top1_acc,top2_topic,top2_acc,top3_topic,top3_acc,EU
161532,138620,Guardian,1,2,2016,Cameron told he has two weeks to persuade Pole...,The prime minister has been warned by Whitehal...,1,European Union,41.16,International affairs,22.18,"Agriculture, animals, food and rural affairs",10.46,EU
161547,138620,Guardian,1,2,2016,Europe's refugee story has hardly begun;\nWith...,Germany itself will face critical choices: if ...,1,International affairs,58.76,European Union,13.21,"Parliament, government and politics",10.25,EU
161573,138620,Guardian,1,2,2016,Ofcom chief reduced to writing in FT to air vi...,"Still, at least White is doing her best to cra...",1,"Business, industry and consumers",36.20,Science and technology,21.02,Economy and finance,15.73,EU
161577,138620,Guardian,1,2,2016,David Miliband\ncalls for 1m work permits for ...,Speaking before a London conference on Syria h...,1,International affairs,88.22,"Asylum, immigration and nationality",3.33,Others,1.21,EU
161589,138620,Guardian,1,2,2016,Donald Tusk to table EU reform proposals after...,The former prime minister of Poland said he ha...,1,European Union,48.91,International affairs,23.56,"Parliament, government and politics",6.87,EU
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
969990,389195,Telegraph,30,6,2016,Jeremy Corbyn ignores calls from David Cameron...,Angela Eagle is preparing to fight Jeremy Corb...,1,European Union,47.53,"Parliament, government and politics",21.46,International affairs,11.69,EU
969994,389195,Telegraph,30,6,2016,Angela Eagle to launch Labour leadership bid a...,Tom Watson on Wednesday night ruled himself ou...,1,European Union,47.19,"Parliament, government and politics",22.35,International affairs,10.97,EU
969995,389195,Telegraph,30,6,2016,Labour shadow cabinet and ministers resignatio...,Here are their resignation letters in full. Pa...,1,"Parliament, government and politics",72.23,"Crime, civil law, justice and rights",5.83,European Union,4.01,EU
969999,389195,Telegraph,30,6,2016,'Labour's Brexit crisis: Now it's civil war' -...,Tom Watson on Wednesday night ruled himself ou...,1,International affairs,49.03,European Union,36.75,"Parliament, government and politics",3.78,EU


In [35]:
# Keep only the article metadata columns. Selecting the columns directly is
# vectorised and replaces the row-by-row iterrows() loop; reset_index(drop=True)
# renumbers the rows 0..n-1, matching the index a frame built from a plain
# list of rows would have.
headline_columns = ['source', 'day', 'month', 'year', 'program_name']
res_df = df[headline_columns].reset_index(drop=True)
res_df

Unnamed: 0,source,day,month,year,program_name
0,Guardian,1,2,2016,Cameron told he has two weeks to persuade Pole...
1,Guardian,1,2,2016,Europe's refugee story has hardly begun;\nWith...
2,Guardian,1,2,2016,Ofcom chief reduced to writing in FT to air vi...
3,Guardian,1,2,2016,David Miliband\ncalls for 1m work permits for ...
4,Guardian,1,2,2016,Donald Tusk to table EU reform proposals after...
...,...,...,...,...,...
22220,Telegraph,30,6,2016,Jeremy Corbyn ignores calls from David Cameron...
22221,Telegraph,30,6,2016,Angela Eagle to launch Labour leadership bid a...
22222,Telegraph,30,6,2016,Labour shadow cabinet and ministers resignatio...
22223,Telegraph,30,6,2016,'Labour's Brexit crisis: Now it's civil war' -...


In [36]:
# Write the selected columns to a CSV in the working directory, without the
# index column. The file name hard-codes "Feb_June_2016", mirroring the year
# and month filters applied in the earlier cells; rename it if those change.
res_df.to_csv('news_headlines_EU_Feb_June_2016.csv', index=False)