In [14]:
import multiprocessing as mp
from datetime import timedelta

import pandas as pd
from gensim import models, corpora
from gensim.parsing.preprocessing import preprocess_string, strip_tags, strip_punctuation
from gensim.parsing.preprocessing import remove_stopwords, stem_text, strip_non_alphanum, strip_multiple_whitespaces
from gensim.parsing.preprocessing import strip_short, strip_numeric
from gensim.parsing.preprocessing import STOPWORDS
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
import nltk
import numpy as np
import tensorflow as tf
import parmap
import swifter

In [15]:
# Topic labels in index order: the position of each label in this list is the
# integer class index it is mapped to (0 .. 20).
topics_index_to_name_map = dict(enumerate([
    'Agriculture, animals, food and rural affairs',
    'Asylum, immigration and nationality',
    'Business, industry and consumers',
    'Communities and families',
    'Crime, civil law, justice and rights',
    'Culture, media and sport',
    'Defence',
    'Economy and finance',
    'Education',
    'Employment and training',
    'Energy and environment',
    'European Union',
    'Health services and medicine',
    'Housing and planning',
    'International affairs',
    'Parliament, government and politics',
    'Science and technology',
    'Social security and pensions',
    'Social services',
    'Transport',
    'Others',
]))

# Reverse lookup: topic label -> integer index.
topics_name_to_index_map = {
    topic_name: topic_index
    for topic_index, topic_name in topics_index_to_name_map.items()
}

def strip_short2(text):
    """Apply gensim's ``strip_short`` to ``text`` with ``minsize`` fixed at 4.

    Exists so the length filter can be placed in a filter list, which calls
    each filter with the text as its only argument.
    """
    min_token_length = 4
    return strip_short(text, minsize=min_token_length)


def preprocess_text(text):
    """Run ``text`` through the notebook's cleaning filters via ``preprocess_string``.

    The filters are handed to gensim's ``preprocess_string`` in this order:
    lower-casing, whitespace collapsing, tag stripping, punctuation stripping,
    non-alphanumeric stripping, digit stripping, then ``strip_short2``.

    Returns whatever ``preprocess_string`` returns for ``text`` and that
    filter list.
    """
    def _to_lower(value):
        return value.lower()

    pipeline = (
        _to_lower,
        strip_multiple_whitespaces,
        strip_tags,
        strip_punctuation,
        strip_non_alphanum,
        strip_numeric,
        strip_short2,
    )
    return preprocess_string(text, list(pipeline))

def preprocess(topic):
    """Map a raw topic label to its integer class index.

    ``topic`` may hold several labels separated by ``'|'``; only the first
    label is used. Surrounding whitespace is ignored.

    Args:
        topic: Topic label string, optionally ``'|'``-separated.

    Returns:
        The index stored for the (first) label in ``topics_name_to_index_map``.

    Raises:
        KeyError: If the label is not a key of ``topics_name_to_index_map``.
    """
    # split() on a string without '|' yields the whole string as the single
    # element, so one expression covers both the single- and multi-label case.
    primary_topic = topic.split('|')[0].strip()
    return topics_name_to_index_map[primary_topic]

In [16]:
# Run configuration: `bbc_id` and the year(s) select the partition CSVs to read;
# `year` and `excluding` are formatted into the model file names further down.
bbc_id, year = 54, 2016
excluding = '_no_Others'
years = [2016]
# Accumulates one DataFrame per loaded year.
df_list = []

In [19]:
# Start from an empty list on every run so that re-executing this cell does not
# append the same partitions a second time.
df_list = []
# Use a dedicated loop variable: looping with `year` would overwrite the
# module-level `year` that later cells format into the model file paths.
for partition_year in years:
    print(partition_year)
    df_list.append(pd.read_csv('../data/partitions/bert_partitions_{}_{}.csv'.format(bbc_id, partition_year)))

# NOTE(review): the row index is kept as-is and later used as `partition_id`;
# with more than one year the per-file indices would overlap — confirm whether
# that is intended before adding years.
transcripts = pd.concat(df_list)
transcripts = transcripts.drop(['Unnamed: 0'], axis=1)

2016


In [18]:
# Keep only the rows whose Date string contains 'may' or 'apr'.
is_may = transcripts.Date.str.contains('may')
is_apr = transcripts.Date.str.contains('apr')
transcripts = transcripts.loc[is_may | is_apr]
# program_names = ['BBC News at One', 'BBC News at Six', 'BBC News at Ten']
# transcripts = transcripts.loc[transcripts["Program Name"].isin(program_names)]
# transcripts

In [34]:
# transcripts = pd.read_csv('../data/partitions/bert_partitions_{}_{}.csv'.format(bbc_id, year))
# transcripts = transcripts.drop(['Unnamed: 0'], axis=1)

In [35]:
# transcripts = transcripts.dropna(subset=['Transcript', 'partitioned_transcript'])
# transcripts

In [20]:
# Explode each transcript row into one record per partition. A record is
# (row index of the source transcript, its Date, partition text); the
# partitions of a transcript are separated by a dashed line on its own line.
partitions = []
for source_index, broadcast_date, joined_text in zip(
        transcripts.index, transcripts['Date'], transcripts['partitioned_transcript']):
    partitions.extend(
        (source_index, broadcast_date, segment)
        for segment in joined_text.split('\n---------------------\n')
    )
partition_df = pd.DataFrame(partitions, columns=['partition_id', 'date', 'transcript'])

In [21]:
# partition_df = pd.read_csv('./data/partitions/ibm_partition_2015_July_19.csv')

In [22]:
# Display the partition frame (last expression of the cell is rendered as its output).
partition_df

Unnamed: 0,partition_id,date,transcript
0,0,4-jun-2016,we would have seen during this weekend and the...
1,0,4-jun-2016,manage that and make sure we stay within the g...
2,0,4-jun-2016,We start at the desk where you have the three ...
3,0,4-jun-2016,It helps to correct this atmosphere which is v...
4,0,4-jun-2016,Twitter’s live streaming video service announc...
...,...,...,...
204983,1506,30-dec-2016,"The former Bishop of Liverpool, JamesJones, ch..."
204984,1506,30-dec-2016,Southern rail passengers have been warned that...
204985,1506,30-dec-2016,"So on the eve of the new year, | make this cha..."
204986,1506,30-dec-2016,"It too says it’s willing to talk but, once aga..."


In [10]:
# Persist the partitions. `to_csv` is called with its defaults, so the frame's
# index is written as an extra leading column.
partitions_csv_path = 'bbc_54_may_apr_partitions.csv'
partition_df.to_csv(partitions_csv_path)

In [44]:
# Number of distinct partition_id values among rows whose date string contains 'jan-2014'.
jan_2014_rows = partition_df.date.str.contains('jan-2014')
len(partition_df.loc[jan_2014_rows, 'partition_id'].unique())

44

In [18]:
from joblib import dump, load

# Load the fitted label encoder saved for this year / exclusion setting.
# NOTE: joblib.load unpickles the file, so only point it at trusted artefacts.
encoder_path = '../models/doc2vec/encoder_{}{}.joblib'.format(year, excluding)
enc = load(encoder_path)

def predict(X, topn=3):
    """Return the ``topn`` highest-scoring topic labels for a piece of text.

    The text is tokenised with ``preprocess_text``, embedded with
    ``doc2vec_model.infer_vector`` and scored by ``classifier.predict``.
    Each output position is turned back into a label by passing the matching
    one-hot row to ``enc.inverse_transform``.

    Args:
        X: Raw text to classify.
        topn: How many (label, score) pairs to return.

    Returns:
        A list of ``(label, score)`` tuples sorted by score, highest first,
        truncated to ``topn`` entries.
    """
    tokens = preprocess_text(X)
    embedding = doc2vec_model.infer_vector(tokens)
    # The classifier is fed a batch of one row.
    scores = classifier.predict(embedding.reshape(1, embedding.shape[0]))[0]

    # Row i of the identity matrix is the one-hot vector for class i.
    labelled_scores = [
        (enc.inverse_transform([indicator])[0][0], score)
        for indicator, score in zip(np.eye(len(scores)), scores)
    ]
    labelled_scores.sort(key=lambda pair: pair[1], reverse=True)
    return labelled_scores[:topn]

def predict_with_window(partition, window_days=2):
    """Return the ``top1_topic`` of the most similar article near the partition's date.

    Articles are taken from the module-level ``articles`` frame (columns
    ``date``, ``vector`` and ``top1_topic`` are read), restricted to those
    dated within ``window_days`` days either side of ``partition.date``.
    The article whose vector has the highest ``cosine_similarity`` with
    ``partition.vector`` is selected.

    NOTE(review): ``articles`` and ``cosine_similarity`` are not defined or
    imported in the cells visible here; they must already be in scope
    (presumably ``sklearn.metrics.pairwise.cosine_similarity`` — confirm).

    Args:
        partition: Object exposing ``date`` and ``vector`` attributes.
        window_days: Half-width of the date window, in days.

    Returns:
        The ``top1_topic`` value of the best-matching article, or ``None``
        when no article falls inside the window.
    """
    dt = partition.date
    start_dt = dt - timedelta(days=window_days)
    end_dt = dt + timedelta(days=window_days)
    in_window = (articles['date'] >= start_dt) & (articles['date'] <= end_dt)
    articles_window = articles.loc[in_window]

    # Without candidates there is nothing to compare against; argmax over an
    # empty similarity matrix would raise.
    if articles_window.empty:
        return None

    # reshape(1, -1) keeps this independent of the embedding size.
    partition_vector = np.asarray(partition.vector).reshape(1, -1)
    # Stack the per-article vectors into a 2-D (n_articles, dim) array.
    article_vectors = np.array(list(articles_window.vector.values))

    sim = cosine_similarity(partition_vector, article_vectors)
    best_match = int(np.argmax(sim))

    return articles_window.iloc[best_match]['top1_topic']

def build_network(input_dim=100):
    """Build and compile the topic classifier network.

    Architecture: an input of ``input_dim`` features, one 32-unit ReLU layer
    with L2 kernel regularisation (factor 0.1), and a softmax output with one
    unit per category in ``enc.categories_[0]``.

    Args:
        input_dim: Length of the input feature vector. Defaults to 100, the
            value previously hard-coded here.

    Returns:
        The compiled ``tf.keras`` Sequential model (Adam optimiser,
        categorical cross-entropy loss, accuracy metric).
    """
    model = tf.keras.models.Sequential([
        # `(100)` is just the int 100; the shape has to be a one-element tuple.
        tf.keras.Input(shape=(input_dim,)),
        # The factor is passed positionally because its keyword name differs
        # between TensorFlow releases (`l` in older ones, `l2` in newer ones).
        tf.keras.layers.Dense(32, activation='relu', kernel_regularizer=tf.keras.regularizers.l2(0.1)),
        tf.keras.layers.Dense(len(enc.categories_[0]), activation='softmax')
    ])
    loss_fn = tf.keras.losses.CategoricalCrossentropy(from_logits=False, label_smoothing=0, name='categorical_crossentropy')

    model.compile(optimizer=tf.keras.optimizers.Adam(),
              loss=loss_fn,
              metrics=['accuracy'])

    return model

In [19]:
# Resolve the artefact locations for the configured year / exclusion setting.
doc2vec_path = '../models/doc2vec/doc2vec_news_{}{}'.format(year, excluding)
classifier_weights_path = '../models/doc2vec/news_classifier_{}{}'.format(year, excluding)

# Restore the embedding model, then rebuild the classifier and load its weights.
doc2vec_model = Doc2Vec.load(doc2vec_path)
classifier = build_network()
classifier.load_weights(classifier_weights_path)

<tensorflow.python.training.tracking.util.CheckpointLoadStatus at 0x7f896adf1d10>

In [20]:
# Replace the textual dates with parsed datetimes (column is overwritten in place).
parsed_dates = pd.to_datetime(partition_df['date'])
partition_df['date'] = parsed_dates

In [21]:
# From here on `transcripts` is the array of partition texts, not the source frame.
transcripts = partition_df['transcript'].values
# Spot-check the classifier on a single partition.
predict(transcripts[55])

[('Health services and medicine', 0.7798618),
 ('Culture, media and sport', 0.1451124),
 ('International affairs', 0.038704604)]

In [22]:
# Show the text of the partition that was classified in the spot-check.
transcripts[55]

'The health watchdog, the Care Quality Commission, has apologised to family doctors after a BBC investigation revealed they wrongly classified the risk rating of hundreds of surgeries in England, including wrongly suggesting 60 were of the highest concern. This GP practice in Buckinghamshire was shocked to be branded high risk on a new online database, published by the watchdog the COC. Now, after a BBC investigation which prompted an embarrassing correction, it has been moved to a low risk category. Our patients, seeing that kind of information in the public Main, it would have dented their confidence in us as a practice, and their confidence in us as their GP. Designed to let a ship find out more about their GPs, more than 7000 surgeries in England were placed in six bands according to risk, raced on indicators including ease of appointments, success in diagnosing dementia and levels of heart disease. Now, the COC has admitted that 60 surgeries were wrongly given poor ratings, becaus

In [13]:
from tqdm import tqdm

# Classify every partition text, in order, with a progress bar.
predictions = [predict(transcript) for transcript in tqdm(transcripts)]

  0%|          | 0/207 [00:00<?, ?it/s]


NameError: name 'predict' is not defined

In [11]:
# Attach the per-partition predictions as a new 'topic' column.
# `predictions` must already exist in the kernel (it is built by the prediction loop).
partition_df['topic'] = predictions

NameError: name 'predictions' is not defined

In [12]:
# Display the partition frame (last expression of the cell is rendered as its output).
partition_df

Unnamed: 0,partition_id,date,transcript
0,521,2-apr-2016,between them created an artificially intellige...
1,521,2-apr-2016,The Xbox connects we are using in this space w...
2,521,2-apr-2016,Our home state of Arizona is overrun by home s...
3,521,2-apr-2016,The goal was to blur the lines between what’s ...
4,521,2-apr-2016,Don’t forget you can still immerse Don’t forge...
...,...,...,...
11860,1284,31-may-2016,What to know what a Donald Trump residency wou...
11861,1284,31-may-2016,We will be talking to one of its foreign polic...
11862,1284,31-may-2016,Tonight we reveal startling new figures showin...
11863,1284,31-may-2016,"Along with more affordable homes, Sadiq Khan h..."


In [None]:
# partition_df = partition_df.drop(['Unnamed: 0'], axis=1)

In [None]:
# Display the partition frame once more before it is written to disk.
partition_df

In [None]:
# partition_df.to_csv('bbc_predictions_{}_{}.csv'.format(bbc_id, year))
# Write the frame to CSV. `to_csv` is called with its defaults, so the frame's
# index is written as an extra leading column.
predictions_csv_path = 'bbc_predictions_News_at_One_Six_Ten.csv'
partition_df.to_csv(predictions_csv_path)