In [59]:
import ast
import calendar
import os
import string
from datetime import date
from pathlib import Path

import nltk
import pandas as pd
from nltk.corpus import stopwords

In [60]:
# English stop words from the NLTK corpus; contains_european uses them to drop
# filler words before pairing adjacent tokens. When the corpus has not been
# downloaded on this machine, stopwords.words raises LookupError, so fetch it
# once and retry instead of failing the whole notebook run.
try:
    stopW = stopwords.words('english')
except LookupError:
    nltk.download('stopwords')
    stopW = stopwords.words('english')

# Characters trimmed from both ends of a token before matching. The escapes add
# the typographic quotes, dashes and ellipsis that string.punctuation lacks.
_EDGE_PUNCTUATION = string.punctuation + '\u201c\u201d\u2018\u2019\u2013\u2014\u2026'


def _normalize_token(token):
    """Trim surrounding punctuation and a trailing possessive from one lowercased token."""
    token = token.strip(_EDGE_PUNCTUATION)
    for suffix in ("'s", "\u2019s"):
        if token.endswith(suffix):
            # "e.u.'s" -> "e.u." -> "e.u": trim again once the possessive is gone.
            token = token[:-len(suffix)].rstrip(_EDGE_PUNCTUATION)
            break
    return token


def contains_european(x, stop_words=None):
    """Label a text 'EU' if it mentions the European Union, otherwise 'non_EU'.

    The text is lowercased and split on whitespace. Each token then has its
    surrounding punctuation and any trailing possessive removed, so that forms
    such as "EU,", "(EU)", "E.U.", "EU's" and "European Union." are recognised;
    a bare whitespace split leaves the punctuation attached and misses them.

    A text is labelled 'EU' when either
      * one token is 'e.u', 'eu' or 'europeanunion', or
      * after removing stop words, 'european' or 'europe' is immediately
        followed by 'union'.

    Parameters
    ----------
    x : str
        The text to classify.
    stop_words : container of str, optional
        Lowercase words ignored when pairing adjacent tokens. Defaults to the
        notebook-level ``stopW`` list.

    Returns
    -------
    str
        'EU' or 'non_EU'.
    """
    if stop_words is None:
        stop_words = stopW

    tokens = [_normalize_token(raw) for raw in x.lower().split()]
    # Tokens that were pure punctuation are now empty; drop them so they do not
    # sit between two words that should be paired.
    tokens = [token for token in tokens if token]

    # The abbreviation check runs on all tokens, before stop-word removal.
    if any(token in ('e.u', 'eu', 'europeanunion') for token in tokens):
        return 'EU'

    content_words = [token for token in tokens if token not in stop_words]
    for first, second in zip(content_words, content_words[1:]):
        if second == 'union' and first in ('european', 'europe'):
            return 'EU'

    return 'non_EU'

In [61]:
# Year to process; it is interpolated into the predictions CSV path and into
# the output directory and file names written by the cells below.
year = 2016

In [68]:
# Load the per-article topic predictions for the configured year.
news_df = pd.read_csv('../data/news_predictions/news_{}_predictions.csv'.format(year))

# Turn the top-1 topic label into an identifier: drop commas and replace spaces
# with underscores. Vectorised string methods avoid a Python-level call per row.
news_df['top1_topic'] = (
    news_df['top1_topic']
    .str.replace(',', '', regex=False)
    .str.replace(' ', '_', regex=False)
)

# Optional filters, currently disabled:
# news_df = news_df.loc[news_df.top1_acc >= 40]
# news_df = news_df.loc[news_df.top1_topic != 'Others']

# Assemble a display date such as "January 01, 2016" from the year/month/day
# columns in one vectorised pass.
news_df['Date'] = pd.to_datetime(news_df[['year', 'month', 'day']]).dt.strftime('%B %d, %Y')

# Keep only the columns used downstream, then rename them positionally.
# NOTE(review): the positional rename assumes the CSV column order is
# source id, source name, title, text, top1_topic - confirm against the file.
news_df = news_df.drop(
    ['Unnamed: 0', 'day', 'month', 'year', 'parliament', 'top1_acc',
     'top2_topic', 'top2_acc', 'top3_topic', 'top3_acc'],
    axis=1,
)
news_df.columns = ['source_id', 'Source', 'Program Name', 'Transcript', 'topic', 'Date']

# contains_european calls .lower() on the text, so rows without a transcript
# must be removed before labelling.
news_df = news_df.dropna(subset=['Transcript'])
news_df['eu'] = news_df['Transcript'].apply(contains_european)

In [69]:
# Display the prepared frame to check the renamed columns and the 'eu' label.
news_df

Unnamed: 0,source_id,Source,Program Name,Transcript,topic,Date,eu
0,163795,Belfast Telegraph,My pride and joy;\nTrimble delight at record 2...,Having already become Ulster's most-capped pla...,Parliament_government_and_politics,"January 01, 2016",non_EU
1,163795,Belfast Telegraph,SWEDE TALKER;\nIf you don't recognise Alicia V...,"""It's always there,'' says the rising star, of...",Others,"January 01, 2016",non_EU
2,163795,Belfast Telegraph,Ricky Warwick & Damon Johnson,"Most of Warwick's time is spent on the road, e...",Business_industry_and_consumers,"January 01, 2016",non_EU
3,163795,Belfast Telegraph,PICK OF THE WEEK,The five-piece play a multitude of instruments...,Culture_media_and_sport,"January 01, 2016",non_EU
4,163795,Belfast Telegraph,Folk royalty are on the one road to a great show,The multi-platinum selling High Kings - Finbar...,Parliament_government_and_politics,"January 01, 2016",non_EU
...,...,...,...,...,...,...,...
1863167,412338,Wales,New Year's Eve rail delays as services disrupt...,Arriva Trains Wales services are being further...,Transport,"December 31, 2016",non_EU
1863168,412338,Wales,Has 2016 been all bad? Of course not! And here...,It's the year Britain was torn in two by Brexi...,Culture_media_and_sport,"December 31, 2016",non_EU
1863169,412338,Wales,Advice for driving in fog: when to use your fo...,Fog can significantly impact driving condition...,Transport,"December 31, 2016",non_EU
1863170,412338,Wales,Driver left with 'serious' injuries following ...,"The single-vehicle road traffic collision, inv...",Others,"December 31, 2016",non_EU


In [74]:
# Spot check: count the rows for source id 138620 whose Date mentions September.
rows_for_source = news_df[news_df['source_id'] == 138620]
df_test = rows_for_source[rows_for_source['Date'].str.contains('September')]
len(df_test)

4713

In [70]:
# Source id -> display name, listed in the order the ids should be iterated.
id_to_name_map = dict([
    (400553, 'Belfast Telegraph'),
    (377101, 'The Scotsman'),
    (418973, 'Daily Record'),
    (244365, 'Wales on Sunday'),
    (8200, 'Independent'),
    (412338, 'Wales Online'),
    (138794, 'Mail on Sunday'),
    (232241, 'Sunday Express'),
    (334988, 'Sunday Telegraph'),
    (331369, 'Sunday Sun'),
    (138620, 'Guardian'),
    (419001, 'Mirror'),
    (8010, 'Guardian Weekly'),
    (142728, 'The Herald'),
    (408506, 'Express Online'),
    (143296, 'The Observer'),
    (363952, 'Daily Star Sunday'),
    (145251, 'The People'),
    (232240, 'The Express'),
    (145253, 'Daily Record and Sunday Mail'),
    (389195, 'Telegraph'),
    (145254, 'Daily and Sunday Mirror'),
    (344305, 'Scotland on Sunday'),
    (8109, 'The Daily Telegraph'),
    (397135, 'Mail'),
    (163795, 'Belfast Telegraph 2'),
    (412334, 'Daily Post'),
    (408508, 'Daily Star'),
    (411938, 'London Evening Standard'),
])

# All source ids, in the insertion order of the map above.
news_ids = [source_id for source_id in id_to_name_map]

# Topic index -> topic label; the index is the position in this list.
topics_index_to_name_map = dict(enumerate([
    'Agriculture_animals_food_and_rural_affairs',
    'Asylum_immigration_and_nationality',
    'Business_industry_and_consumers',
    'Communities_and_families',
    'Crime_civil_law_justice_and_rights',
    'Culture_media_and_sport',
    'Defence',
    'Economy_and_finance',
    'Education',
    'Employment_and_training',
    'Energy_and_environment',
    'European_Union',
    'Health_services_and_medicine',
    'Housing_and_planning',
    'International_affairs',
    'Parliament_government_and_politics',
    'Science_and_technology',
    'Social_security_and_pensions',
    'Social_services',
    'Transport',
    'Others',
]))

# Topic labels in index order.
topics = [*topics_index_to_name_map.values()]

In [None]:
# Cross-tabulate the number of rows per Source (rows) and topic label (columns).
counts = pd.crosstab(index=news_df['Source'], columns=news_df['topic'])
counts

In [None]:
# The file name promises counts without the 'Others' topic, but the 'Others'
# filter on news_df is commented out in the loading cell, so the crosstab still
# has that column. Exclude it here so the file contents match the file name.
# NOTE(review): if 'Others' is meant to be kept, rename the file instead.
counts.loc[:, counts.columns != 'Others'].to_csv('news_{}_topics_counts_with_no_Others.csv'.format(year))

# Topic Split

In [None]:
# Month names as they appear in the Date column, in calendar order.
months = [
    'January', 'February', 'March', 'April', 'May', 'June',
    'July', 'August', 'September', 'October', 'November', 'December',
]
# Full month name -> three-letter lowercase prefix used in output file names.
month_short_name = {full_name: full_name[:3].lower() for full_name in months}

# Defined here but not used by the loop below, which iterates every id in news_ids.
ids = [389195, 138620, 397135, 8200]

# Write one CSV per (topic, source, month) under ./news_topics/<id>_<topic>/<year>/.
for topic in topics:
    for news_id in news_ids:
        print(topic, news_id)

        # The directory is created for every pair, even one with no rows.
        out_dir = Path('./news_topics/{}_{}/{}'.format(news_id, topic, year))
        out_dir.mkdir(parents=True, exist_ok=True)

        has_topic = news_df['topic'] == topic
        from_source = news_df['source_id'] == news_id
        pair_rows = news_df[has_topic & from_source]

        for month in months:
            month_rows = pair_rows[pair_rows['Date'].str.contains(month)]
            # Months without any rows produce no file.
            if len(month_rows) > 0:
                file_name = '{}-{} {}_{}.csv'.format(month_short_name[month], year, news_id, topic)
                month_rows.to_csv(out_dir / file_name)

# Phrase Split

In [72]:
# Month names as they appear in the Date column, in calendar order.
months = [
    'January', 'February', 'March', 'April', 'May', 'June',
    'July', 'August', 'September', 'October', 'November', 'December',
]
# Full month name -> three-letter lowercase prefix used in output file names.
month_short_name = {full_name: full_name[:3].lower() for full_name in months}

# NOTE: this rebinds the notebook-level `topics` name to the two labels produced
# by contains_european. The sources-table cell further down iterates `topics`
# and therefore sees these two values after this cell has run.
topics = ['EU', 'non_EU']
# Only these four sources are split by EU / non_EU label.
ids = [389195, 138620, 397135, 8200]

# Write one CSV per (label, source, month) under ./news_topics/<id>_<label>/<year>/.
for topic in topics:
    for news_id in ids:
        print(topic, news_id)

        # The directory is created for every pair, even one with no rows.
        out_dir = Path('./news_topics/{}_{}/{}'.format(news_id, topic, year))
        out_dir.mkdir(parents=True, exist_ok=True)

        has_label = news_df['eu'] == topic
        from_source = news_df['source_id'] == news_id
        pair_rows = news_df[has_label & from_source]

        for month in months:
            month_rows = pair_rows[pair_rows['Date'].str.contains(month)]
            # Months without any rows produce no file.
            if len(month_rows) > 0:
                file_name = '{}-{} {}_{}.csv'.format(month_short_name[month], year, news_id, topic)
                month_rows.to_csv(out_dir / file_name)

EU 389195
EU 138620
EU 397135
EU 8200
non_EU 389195
non_EU 138620
non_EU 397135
non_EU 8200


In [58]:
# Build one sources-table row per (topic, source id) pair. Each row holds
# "<name> <topic>", "<id>_<topic>", the database name and two empty
# Start / End cells.
source_columns = ['Source Name', 'SourceId', 'Database', 'Start', 'End']
rows = []

for topic in topics:
    for news_id in news_ids:
        row = [
            '{} {}'.format(id_to_name_map[news_id], topic),
            '{}_{}'.format(news_id, topic),
            'Nexis',
            '',
            '',
        ]
        print(row)
        rows.append(row)

df = pd.DataFrame(rows, columns=source_columns)
df

['Belfast Telegraph EU', '400553_EU', 'Nexis', '', '']
['The Scotsman EU', '377101_EU', 'Nexis', '', '']
['Daily Record EU', '418973_EU', 'Nexis', '', '']
['Wales on Sunday EU', '244365_EU', 'Nexis', '', '']
['Independent EU', '8200_EU', 'Nexis', '', '']
['Wales Online EU', '412338_EU', 'Nexis', '', '']
['Mail on Sunday EU', '138794_EU', 'Nexis', '', '']
['Sunday Express EU', '232241_EU', 'Nexis', '', '']
['Sunday Telegraph EU', '334988_EU', 'Nexis', '', '']
['Sunday Sun EU', '331369_EU', 'Nexis', '', '']
['Guardian EU', '138620_EU', 'Nexis', '', '']
['Mirror EU', '419001_EU', 'Nexis', '', '']
['Guardian Weekly EU', '8010_EU', 'Nexis', '', '']
['The Herald EU', '142728_EU', 'Nexis', '', '']
['Express Online EU', '408506_EU', 'Nexis', '', '']
['The Observer EU', '143296_EU', 'Nexis', '', '']
['Daily Star Sunday EU', '363952_EU', 'Nexis', '', '']
['The People EU', '145251_EU', 'Nexis', '', '']
['The Express EU', '232240_EU', 'Nexis', '', '']
['Daily Record and Sunday Mail EU', '145253_EU

Unnamed: 0,Source Name,SourceId,Database,Start,End
0,Belfast Telegraph EU,400553_EU,Nexis,,
1,The Scotsman EU,377101_EU,Nexis,,
2,Daily Record EU,418973_EU,Nexis,,
3,Wales on Sunday EU,244365_EU,Nexis,,
4,Independent EU,8200_EU,Nexis,,
5,Wales Online EU,412338_EU,Nexis,,
6,Mail on Sunday EU,138794_EU,Nexis,,
7,Sunday Express EU,232241_EU,Nexis,,
8,Sunday Telegraph EU,334988_EU,Nexis,,
9,Sunday Sun EU,331369_EU,Nexis,,


In [126]:
# Save the sources table next to the notebook. No index argument is passed, so
# pandas' default applies and the row index is written as the first column.
df.to_csv('./sources2.csv')