In [18]:
import ast
import calendar
import os
import string
from pathlib import Path

import nltk
import pandas as pd
from nltk.corpus import stopwords

In [19]:
# English stop-word list from the NLTK corpus; contains_european() removes these
# words before it builds bigrams.
stopW = stopwords.words('english')

def contains_european(x):
    """Label a transcript by whether it mentions the European Union.

    A transcript counts as an EU mention when, after lower-casing and
    stripping punctuation from both ends of every whitespace-separated token:

    * one token is ``'eu'``, ``'e.u'`` or ``'europeanunion'``; or
    * with the stop words in ``stopW`` removed, ``'european'`` or ``'europe'``
      is immediately followed by ``'union'``.

    Parameters
    ----------
    x : str
        Transcript text. Any non-string value (for example the NaN pandas
        produces for an empty CSV cell) is treated as containing no mention.

    Returns
    -------
    str
        ``'EU'`` if a mention was found, otherwise ``'non_EU'``.
    """
    if not isinstance(x, str):
        return 'non_EU'

    # Strip surrounding punctuation so "EU.", "(EU)" and "E.U.," still match;
    # tokens that were punctuation only become empty and are dropped.
    tokens = [token.strip(string.punctuation) for token in x.lower().split()]
    tokens = [token for token in tokens if token]

    if 'e.u' in tokens or 'eu' in tokens or 'europeanunion' in tokens:
        return 'EU'

    # Set membership avoids rescanning the stop-word list for every token.
    stop_words = set(stopW)
    content = [token for token in tokens if token not in stop_words]
    bigrams = {first + '.' + second for first, second in zip(content, content[1:])}

    flag = 'european.union' in bigrams or 'europe.union' in bigrams
    return 'EU' if flag else 'non_EU'

In [20]:
# Year of the prediction files to load; also used in every output directory and file name.
year = 2014
# Numeric id used as a prefix in the output directory names and as a suffix part of the file names.
bbc_id = 54

In [21]:
# Load both half-year prediction files for `year` and stack them into one frame.
partition_file = '../data/partition_predictions/window_topic_predictions_{}_{}.csv'
df1 = pd.read_csv(partition_file.format('Jan-June', year))
df2 = pd.read_csv(partition_file.format('Jul-Dec', year))
partition_df = pd.concat([df1, df2])

# Drop the 'vector' column and the saved index column. The rename below is
# positional, so it assumes exactly four columns remain, in this order.
partition_df = partition_df.drop(columns=['vector', 'Unnamed: 0'])
partition_df.columns = ['partition_id', 'date', 'Transcript', 'topic']

# Normalise topic labels so they can be used in directory and file names:
# commas are removed and spaces become underscores ("A, b and c" -> "A_b_and_c").
partition_df['topic'] = (
    partition_df['topic']
    .str.replace(',', '', regex=False)
    .str.replace(' ', '_', regex=False)
)

partition_df['date'] = pd.to_datetime(partition_df['date'])
# Lower-case month abbreviation ('jan' ... 'dec'), used to split the exports by month.
partition_df['month'] = partition_df['date'].dt.strftime('%b').str.lower()

In [22]:
# Preview the combined frame; the rendered output elides the middle rows.
partition_df

Unnamed: 0,partition_id,date,Transcript,topic,month
0,435,2014-03-01,Lofty will stay with us for a few days injamie...,Communities_and_families,mar
1,435,2014-03-01,Today I was led to believe that a patient I ha...,Parliament_government_and_politics,mar
2,435,2014-03-01,I want you think very carefully before you say...,International_affairs,mar
3,435,2014-03-01,"Also tonight, Ed Miliband’s plans to reform La...",Parliament_government_and_politics,mar
4,435,2014-03-01,"And Newcastle’s manager, Alan Pardew, is sent ...",Parliament_government_and_politics,mar
...,...,...,...,...,...
18958,730,2014-11-25,"Well, you can buy a helmet very similar to the...",Culture_media_and_sport,nov
18959,730,2014-11-25,Manchester City have kept alive their slim cha...,Culture_media_and_sport,nov
18960,730,2014-11-25,A win over Roma in their final match could be ...,Culture_media_and_sport,nov
18961,730,2014-11-25,A coroner has ruled that a problem with the fr...,Health_services_and_medicine,nov


In [23]:
# Distinct topic labels found in the data; the per-topic export loop iterates over these.
topics = partition_df['topic'].unique()
topics

array(['Communities_and_families', 'Parliament_government_and_politics',
       'International_affairs', 'Culture_media_and_sport',
       'Health_services_and_medicine', 'Transport',
       'Business_industry_and_consumers', 'Energy_and_environment',
       'Defence', 'Crime_civil_law_justice_and_rights',
       'Agriculture_animals_food_and_rural_affairs',
       'Economy_and_finance', 'Education',
       'Asylum_immigration_and_nationality', 'Employment_and_training',
       'European_Union', 'Housing_and_planning', 'Science_and_technology',
       'Social_services'], dtype=object)

# Split by topic: one CSV per topic and month

In [24]:
# Month keys, matching the lower-case abbreviations stored in partition_df.month.
months = ['jan', 'feb', 'mar', 'apr', 'may', 'jun', 'jul', 'aug', 'sep', 'oct', 'nov', 'dec']

# Write one CSV per (topic, month) pair under "<bbc_id>_<topic>/<year>/transcripts".
for topic in topics:
    out_dir = Path('{}_{}/{}/transcripts'.format(bbc_id, topic, year))
    out_dir.mkdir(parents=True, exist_ok=True)
    topic_rows = partition_df[partition_df.topic == topic]

    for month in months:
        monthly_rows = topic_rows.loc[topic_rows.month == month]
        # Months with no rows for this topic produce no file.
        if monthly_rows.empty:
            continue
        file_name = 'BBC+News {}-{} {}_{}.csv'.format(month, year, bbc_id, topic)
        monthly_rows.to_csv(out_dir / file_name)

# Split EU vs. non-EU by predicted topic

In [20]:
# Topic labels were rewritten with underscores when partition_df was built, so the
# EU topic is 'European_Union'. Comparing against 'European Union' (with a space)
# never matches and would label every row non_EU.
partition_df['eu_topic'] = partition_df['topic'].apply(lambda x: "EU" if x == 'European_Union' else "non_EU")

for month in months:
    df_year_month = partition_df.loc[partition_df.month == month]

    # Both label files are written for every month, even when a subset is empty.
    for label in ('EU', 'non_EU'):
        directory = '{}_{}/{}/transcripts'.format(bbc_id, label, year)
        path = '{}/BBC+News {}-{} {}_{}.csv'.format(directory, month, year, bbc_id, label)

        Path(directory).mkdir(parents=True, exist_ok=True)
        subset = df_year_month.loc[df_year_month.eu_topic == label].reset_index(drop=True)
        subset.to_csv(path)

# Split EU vs. non-EU by phrase match in the transcript text

In [21]:
# Tag every transcript with the label returned by contains_european ('EU' or 'non_EU').
partition_df['eu_phrase'] = partition_df['Transcript'].apply(contains_european)

for month in months:
    month_rows = partition_df.loc[partition_df.month == month]

    # Both label files are written for every month, even when a subset is empty.
    for label in ('EU', 'non_EU'):
        directory = './phrase/{}_{}/{}/transcripts'.format(bbc_id, label, year)
        path = './phrase/{}_{}/{}/transcripts/BBC+News {}-{} {}_{}.csv'.format(bbc_id, label, year, month, year, bbc_id, label)

        if not os.path.exists(directory):
            os.makedirs(directory)

        labelled_rows = month_rows.loc[month_rows.eu_phrase == label].reset_index(drop=True)
        labelled_rows.to_csv(path)