In [433]:
import pandas as pd
from pathlib import Path
from nltk.corpus import stopwords
import nltk
import string
import ast
import os

In [434]:
# English stop-word list from NLTK; contains_european removes these words
# before it pairs up neighbouring words.
stopW = stopwords.words('english')

# Translation table for str.translate that maps every ASCII punctuation
# character to None, i.e. deletes it.
table = str.maketrans(dict.fromkeys(string.punctuation))

def contains_european(x):
    """Label a transcript 'EU' if it mentions the European Union, else 'non_EU'.

    The text is lower-cased and stripped of ASCII punctuation (via ``table``)
    before matching, so "E.U." is seen as "eu" and "European-Union" as
    "europeanunion".  A transcript counts as 'EU' when it contains

    * the word "eu" or "europeanunion", or
    * "european"/"europe" immediately followed by "union" once the stop
      words in ``stopW`` have been removed (so "European and Union" matches).

    Parameters
    ----------
    x : str
        Raw transcript text.  Any non-string value (e.g. the NaN pandas
        produces for an empty CSV cell) is labelled 'non_EU' instead of
        raising.

    Returns
    -------
    str
        'EU' or 'non_EU'.
    """
    if not isinstance(x, str):
        return 'non_EU'

    words = x.lower().translate(table).split()

    # Dots are deleted by `table`, so a literal 'e.u' token can never occur;
    # the abbreviation is fully covered by 'eu'.
    if 'eu' in words or 'europeanunion' in words:
        return 'EU'

    content_words = [word for word in words if word not in stopW]
    for first, second in zip(content_words, content_words[1:]):
        if second == 'union' and first in ('european', 'europe'):
            return 'EU'

    return 'non_EU'

def eu_breakdown(x, partition_map=None):
    """Return the fine-grained EU label for one row.

    Parameters
    ----------
    x : mapping or pandas.Series
        A row providing 'eu_phrase' and 'partition_id'.
    partition_map : dict, optional
        Mapping of partition id to label, as built by get_eu_partition_map.
        When omitted, the module-level ``eu_map_partition_ids`` is used, which
        keeps existing single-argument callers working.

    Returns
    -------
    str
        'EU' when the row itself is tagged 'EU'; otherwise the label stored
        for the row's partition.

    Raises
    ------
    KeyError
        If the row is not 'EU' and its partition id is missing from the map.
    """
    if x['eu_phrase'] == 'EU':
        return 'EU'

    if partition_map is None:
        # Fall back to the global only when it is actually needed, so rows
        # tagged 'EU' never depend on hidden notebook state.
        partition_map = eu_map_partition_ids

    return partition_map[x['partition_id']]

def get_partition_df(bbc_id, year, window=False):
    """Load one year's topic predictions for a source and add EU-phrase columns.

    Parameters
    ----------
    bbc_id, year
        Interpolated into the prediction file name under
        ``../data/partition_predictions``.
    window : bool, default False
        True reads the ``window_topic_prediction_...`` file.  False reads the
        ``topics_pred_on_bert_partitioned_...`` file, whose name also embeds
        the module-level ``excluding`` setting and whose ``topic`` cells are
        Python literals from which the first element of the first entry is
        taken.

    Returns
    -------
    pandas.DataFrame
        Columns ``partition_id, date, source, Transcript, topic, eu_phrase``
        when ``window`` is True, otherwise
        ``partition_id, date, Transcript, topic, eu_phrase, length``.

    Notes
    -----
    Column names are assigned by position, so the CSV must supply its columns
    in exactly that order after the drop; a different column count raises
    ValueError.  NOTE(review): the on-disk column names are not visible here.
    """
    def _slug(topic):
        # Same result as "_".join(topic.replace(',', '').split(" ")):
        # commas removed, every single space turned into an underscore.
        return topic.replace(',', '').replace(' ', '_')

    if window:
        csv_path = '../data/partition_predictions/window_topic_prediction_with_short_sentences_merged_{}_{}.csv'.format(bbc_id, year)
        partition_df = pd.read_csv(csv_path).drop(['Unnamed: 0', 'type'], axis=1)
        # Column-wise Series.map instead of row-wise DataFrame.apply: no row
        # Series is built per record.
        partition_df['topic'] = partition_df['topic'].map(_slug)
        partition_df['eu_phrase'] = partition_df['transcript'].map(contains_european)
        partition_df.columns = ['partition_id', 'date', 'source', 'Transcript', 'topic', 'eu_phrase']
    else:
        csv_path = '../data/partition_predictions/topics_pred_on_bert_partitioned_bbc_{}_{}_with_news_classifier_{}.csv'.format(bbc_id, year, excluding)
        partition_df = pd.read_csv(csv_path).drop(['Unnamed: 0'], axis=1)
        # Parse the literal, keep element [0][0], then normalise it.
        partition_df['topic'] = partition_df['topic'].map(
            lambda raw: _slug(ast.literal_eval(raw)[0][0])
        )
        partition_df['eu_phrase'] = partition_df['transcript'].map(contains_european)
        # Whitespace-separated token count of the transcript.
        partition_df['length'] = partition_df['transcript'].map(lambda text: len(text.split()))
        partition_df.columns = ['partition_id', 'date', 'Transcript', 'topic', 'eu_phrase', 'length']

    return partition_df

def get_eu_partition_map(partition_df):
    """Classify every partition by the 'eu_phrase' labels of its rows.

    Parameters
    ----------
    partition_df : pandas.DataFrame
        Must contain 'partition_id' and 'eu_phrase' columns.

    Returns
    -------
    dict
        Partition id -> label, in order of first appearance of each id:

        * 'non_EU2' - the partition has both 'EU' and 'non_EU' rows,
        * 'EU'      - it has only 'EU' rows,
        * 'non_EU1' - it has only 'non_EU' rows.

        Partitions containing neither label are left out.
    """
    eu_map_partition_ids = {}

    # One grouped pass instead of re-filtering the whole frame per id.
    # sort=False keeps the ids in first-appearance order.
    for _id, phrases in partition_df.groupby('partition_id', sort=False)['eu_phrase']:
        labels = set(phrases)
        has_eu = 'EU' in labels
        has_non_eu = 'non_EU' in labels

        if has_eu and has_non_eu:
            eu_map_partition_ids[_id] = 'non_EU2'
        elif has_eu:
            eu_map_partition_ids[_id] = 'EU'
        elif has_non_eu:
            eu_map_partition_ids[_id] = 'non_EU1'

    return eu_map_partition_ids

# Config

In [445]:
# Filename prefix per bbc_id: the export loop puts it first in every CSV name,
# '<prefix> <month>-<year> <bbc_id>_<label>.csv'.
# NOTE(review): 279 maps to an empty prefix, so its file names start with a
# space - confirm that downstream readers expect this.
name_map = {
    54: 'BBC+News',
    106: 'News',
    107: 'News',
    175: 'News',
    279: ''
}

In [446]:
# Source ids to process; each one is looked up in name_map by the export loop.
bbc_ids = [279]
# Years to process; get_partition_df reads one prediction CSV per (bbc_id, year).
years = [2014, 2015, 2016, 2017, 2018]
# Lower-case month abbreviations used in output file names (index 0 is January).
months = ['jan', 'feb', 'mar', 'apr', 'may', 'jun', 'jul', 'aug', 'sep', 'oct', 'nov', 'dec']
# Last component of the non-window prediction file name read by get_partition_df.
excluding = 'no_Others'

In [447]:
# Phrase-split exports as (label, column) pairs: each label gets its own
# directory and receives the rows whose `column` value equals that label.
phrase_splits = [
    ('EU', 'eu_phrase'),
    ('non_EU', 'eu_phrase'),
    ('non_EU1', 'european_union2'),
    ('non_EU2', 'european_union2'),
]

for bbc_id in bbc_ids:
    for year in years:
        print(bbc_id, year)
        partition_df = get_partition_df(bbc_id, year, window=False)
        # eu_breakdown looks this mapping up by its module-level name, so the
        # name must not change.
        eu_map_partition_ids = get_eu_partition_map(partition_df)
        partition_df['date'] = pd.to_datetime(partition_df['date'])
        partition_df = partition_df.sort_values(by=['date'])
        partition_df['european_union2'] = partition_df.apply(eu_breakdown, axis=1)
        # Month number (1-12); this is the value written to the topic-split CSVs.
        partition_df['month'] = partition_df['date'].dt.month
        topics = partition_df['topic'].unique()

        # split by topic: one CSV per (topic, month) that has at least one row
        for topic in topics:
            path = './bbc_topic_split/{}_{}/{}/transcripts'.format(bbc_id, topic, year)
            Path(path).mkdir(parents=True, exist_ok=True)
            partition_for_topic = partition_df[partition_df.topic == topic]

            for month_number, month_label in enumerate(months, start=1):
                partition_for_topic_and_month = partition_for_topic[partition_for_topic.month == month_number]
                if partition_for_topic_and_month.empty:
                    continue
                filename = '{} {}-{} {}_{}.csv'.format(name_map[bbc_id], month_label, year, bbc_id, topic)
                partition_for_topic_and_month.to_csv(os.path.join(path, filename))

        # split by eu phrase: one CSV per (label, month), written even when empty.
        # The phrase-split CSVs carry the month as its abbreviation.  It is
        # taken from `months` rather than strftime('%b'), whose output follows
        # the process locale and would not match the English list elsewhere.
        partition_df['month'] = partition_df['month'].map(lambda number: months[number - 1])
        for month in months:
            df_year_month = partition_df.loc[partition_df.month == month]

            for label, column in phrase_splits:
                directory = './phrase/{}_{}/{}/transcripts'.format(bbc_id, label, year)
                Path(directory).mkdir(parents=True, exist_ok=True)

                filename = '{} {}-{} {}_{}.csv'.format(name_map[bbc_id], month, year, bbc_id, label)
                subset = df_year_month.loc[df_year_month[column] == label].reset_index(drop=True)
                subset.to_csv('{}/{}'.format(directory, filename))

279 2014
279 2015
279 2016
279 2017
279 2018
