In [3]:
import pandas as pd
from pathlib import Path
import ast
import os
from nltk.corpus import stopwords
import nltk
import calendar

In [4]:
stopW = stopwords.words('english')

def contains_european(x):
    y = x.lower()
    words = y.split()
    flag = 'e.u' in words or 'eu' in words or 'europeanunion' in words
    words = [word for word in words if word not in stopW]
    bigrams = list(nltk.bigrams(words))
    bigrams = [bigram[0] + '.' + bigram[1] for bigram in bigrams]
    
    flag = flag or 'european.union' in bigrams or 'europe.union' in bigrams
    
    return 'EU' if flag else 'non_EU'

# SPLIT

In [5]:
bbc_id = 54
years = list(range(2014, 2019))

sources_to_split = ['BBC News at One', 'BBC News at Six', 'BBC News at Ten']
sources_to_exclude = ['Joins BBC News']

# 3 months grouped

In [6]:
for year in years:
    print(year)
    
    df = pd.read_csv('../data/partitions/bert_partitions_{}_{}.csv'.format(bbc_id, year))

    df = df.drop(['partitioned_transcript'], axis=1)
    df = df.loc[df['Program Name'].isin(sources_to_split)]

    months = [['jan', 'feb', 'mar'], ['apr', 'may', 'jun'], ['jul', 'aug', 'sep'], ['oct', 'nov', 'dec']]
    
    for month in range(1, 13):
        
        abbr = calendar.month_abbr[month].lower()
        group = None
        
        for m in months:
            if abbr in m:
                group = m
        
        assert(len(group) == 3)
        
        df_month1 = df.loc[df.Date.str.contains(group[0])]
        df_month2 = df.loc[df.Date.str.contains(group[1])]
        df_month3 = df.loc[df.Date.str.contains(group[2])]
        df_month = pd.concat([df_month1, df_month2, df_month3])
             
        directory = './transcripts_split/{}_{}/{}/transcripts'.format(bbc_id, 'News_at_One_Six_Ten', year)
        path = './transcripts_split/{}_{}/{}/transcripts/BBC+News {}-{} {}_{}.csv'.format(bbc_id, 'News_at_One_Six_Ten', year, calendar.month_abbr[month].lower(), year, bbc_id, 'News_at_One_Six_Ten')

        if not os.path.exists(directory):
            os.makedirs(directory)
        df_month.to_csv(path)

2014
2015
2016
2017
2018


# Monthly Split

In [10]:
counts = []
for year in years:
    print(year)
    
    df = pd.read_csv('../data/partitions/bert_partitions_{}_{}.csv'.format(bbc_id, year))
#     print(df['Program Name'].unique())

    df = df.drop(['partitioned_transcript'], axis=1)
    df = df.loc[df['Program Name'].isin(sources_to_split)]
#     display(df['Program Name'].value_counts())
    for month in range(1, 13):
        abbr = calendar.month_abbr[month].lower()
        
        count = [year, abbr]
        
        df_month = df.loc[df.Date.str.contains(abbr)]
        value_counts = df_month['Program Name'].value_counts()
        
#         ten = 0 
#         six = 0
#         one = 0
#         weekend = 0
        
#         if 'BBC News at Ten' in value_counts.keys():
#             ten = value_counts['BBC News at Ten']
        
#         if 'BBC News at One' in value_counts.keys():
#             one = value_counts['BBC News at One']
        
#         if 'BBC News at Six' in value_counts.keys():
#             six = value_counts['BBC News at Six']
            
#         if 'BBC Weekend News' in value_counts.keys():
#             weekend = value_counts['BBC Weekend News']
#         count.extend([ten, six, one, weekend])
        
        directory = './transcripts_split/{}_{}/{}/transcripts'.format(bbc_id, 'News_at_One_Six_Ten_Weekend', year)
        path = './transcripts_split/{}_{}/{}/transcripts/BBC+News {}-{} {}_{}.csv'.format(bbc_id, 'News_at_One_Six_Ten_Weekend', year, calendar.month_abbr[month].lower(), year, bbc_id, 'News_at_One_Six_Ten_Weekend')

        if not os.path.exists(directory):
            os.makedirs(directory)
#         counts.append(count)
        df_month.to_csv(path)
    
df = pd.DataFrame(counts, columns=['year', 'month', 'News at One', 'News at Six', 'News at Ten', 'Weekend News'])
df

2014
2015
2016
2017
2018


Unnamed: 0,year,month,News at One,News at Six,News at Ten,Weekend News


In [11]:
df.to_csv('news_at_one_six_ten_weekend_count.csv')

## SPLIT COUNT

In [469]:
year_count_df = []
for year in years:
    print(year)
    
    df = pd.read_csv('../data/partitions/bert_partitions_{}_{}.csv'.format(bbc_id, year))
#     print(df['Program Name'].unique())

    df = df.drop(['partitioned_transcript'], axis=1)
    df = df.loc[~df['Program Name'].isin(sources_to_exclude)]
    df['Date'] = pd.to_datetime(df['Date'])
    df['month'] = df['Date'].apply(lambda x: x.month)
    df['year'] = df['Date'].apply(lambda x: x.year)
    df['word_count'] = df['Transcript'].apply(lambda x: len(x.split()))
    year_count_df.append(df[['Program Name', 'month', 'year', 
                'word_count']].groupby(['Program Name', 'month', 'year'], as_index=False).sum())

count_df = pd.concat(year_count_df)
count_df = count_df.sort_values(by=['year', 'month', 'Program Name'])
count_df

2004


FileNotFoundError: [Errno 2] File ../data/partitions/bert_partitions_54_2004.csv does not exist: '../data/partitions/bert_partitions_54_2004.csv'

In [455]:
count_df.to_csv('BBC_News_count_without_bbc_Joins_by_Program_Name.csv')

# SPLIT BASED ON EU PHRASES

In [456]:
df = pd.read_csv('bbc_predictions_News_at_One_Six_Ten.csv')
df = df.drop(['Unnamed: 0'], axis=1)
df['topic'] = df['topic'].apply(lambda x: ast.literal_eval(x)[0][0])
df.columns = ['partition_id', 'date', 'Transcript', 'topic']
df['eu_topic'] = df['topic'].apply(lambda x: "EU" if x == 'European Union' else "non_EU")
df['eu_phrase'] = df['Transcript'].apply(lambda x: contains_european(x))
df['date'] = pd.to_datetime(df['date'])
df['month'] = df['date'].apply(lambda x: x.month)
df['year'] = df['date'].apply(lambda x: x.year)
df

Unnamed: 0,partition_id,date,Transcript,topic,eu_topic,eu_phrase,month,year
0,3,2014-12-05,Serious mistakes are made by England’s health ...,"Crime, civil law, justice and rights",non_EU,non_EU,12,2014
1,3,2014-12-05,The drink-drive limit in Scotland is from toda...,"Parliament, government and politics",non_EU,non_EU,12,2014
2,3,2014-12-05,One of our Marbles is missing- a statue that’s...,International affairs,non_EU,non_EU,12,2014
3,3,2014-12-05,And fly me to the moon and Mars - the unmanned...,"Parliament, government and politics",non_EU,non_EU,12,2014
4,3,2014-12-05,And three leaders at a flagship Academy chain ...,"Parliament, government and politics",non_EU,non_EU,12,2014
...,...,...,...,...,...,...,...,...
255029,2123,2018-01-31,"For the second time in a matter of days, resid...","Parliament, government and politics",non_EU,non_EU,1,2018
255030,2123,2018-01-31,"This time Goldhawk Road, Shepherd’s Bush, whic...","Parliament, government and politics",non_EU,non_EU,1,2018
255031,2123,2018-01-31,"I mean, the water is still sort of flowing in,...","Parliament, government and politics",non_EU,non_EU,1,2018
255032,2123,2018-01-31,We had to close down the studio and then we ha...,"Culture, media and sport",non_EU,non_EU,1,2018


In [461]:
res_df = df[['partition_id', 'month', 'year']].groupby(['month', 'year']).partition_id.nunique()

In [467]:
res_df.to_frame().to_csv('News_at_1_6_10_average_partition.csv')

In [430]:
pd.crosstab(df['eu_topic'], df['eu_phrase'])

eu_phrase,EU,non_EU
eu_topic,Unnamed: 1_level_1,Unnamed: 2_level_1
EU,3475,2103
non_EU,7175,242281


In [397]:
# bbc_id = 54
# year = 2014

# sources_to_split = ['BBC News at One', 'BBC News at Six', 'BBC News at Ten']

In [398]:
# data_path = '../data/bbc/{}/{}/transcripts'.format(bbc_id, year)
# output_path = '../data/transcripts_split'

In [399]:
# csv_files = os.listdir(data_path)
# csv_files = [csv_file for csv_file in csv_files if csv_file.endswith('.csv')]
# csv_files

In [400]:
# for csv_file in csv_files:
#     csv_path = os.path.join(data_path, csv_file)
#     print(csv_file.split())
#     df = pd.read_csv(csv_path)
#     month = csv_file.split()[1]
#     df_at_one_six_ten = df.loc[df['Program Name'].isin(sources_to_split)]
    
#     df_at_one_six_ten['eu'] = df_at_one_six_ten['Transcript'].apply(lambda x: contains_european(x))
    
#     df_at_one_six_ten_eu = df_at_one_six_ten.loc[df_at_one_six_ten.eu == 'EU']
#     df_at_one_six_ten_non_eu = df_at_one_six_ten.loc[df_at_one_six_ten.eu == 'non_EU']
    
#     directory = './transcripts_split_with_phrases/{}_{}/{}/transcripts'.format(bbc_id, 'News_at_One_Six_Ten_EU', year)
#     path = './transcripts_split_with_phrases/{}_{}/{}/transcripts/BBC+News {} {}_{}.csv'.format(bbc_id, 'News_at_One_Six_Ten_EU', year, month, bbc_id, 'News_at_One_Six_Ten_EU')
    
#     if not os.path.exists(directory):
#         os.makedirs(directory)
        
#     df_at_one_six_ten_eu.to_csv(path)
    
#     directory = './transcripts_split_with_phrases/{}_{}/{}/transcripts'.format(bbc_id, 'News_at_One_Six_Ten_non_EU', year)
#     path = './transcripts_split_with_phrases/{}_{}/{}/transcripts/BBC+News {} {}_{}.csv'.format(bbc_id, 'News_at_One_Six_Ten_non_EU', year, month, bbc_id, 'News_at_One_Six_Ten_non_EU')
    
#     if not os.path.exists(directory):
#         os.makedirs(directory)
        
#     df_at_one_six_ten_non_eu.to_csv(path)
    
#     directory = './transcripts_split/{}_{}/{}/transcripts'.format(bbc_id, 'News_at_One_Six_Ten', year)
#     path = './transcripts_split/{}_{}/{}/transcripts/BBC+News {} {}_{}.csv'.format(bbc_id, 'News_at_One_Six_Ten', year, month, bbc_id, 'News_at_One_Six_Ten')
    
#     if not os.path.exists(directory):
#         os.makedirs(directory)
        
#     df_at_one_six_ten.to_csv(path)

In [428]:
df[['partition_id', 'month', 'year']].groupby(['month', 'year'], as_index=False).agg({'partition_id': 'nunique'})

Unnamed: 0,month,year,partition_id
0,1,2014,44
1,1,2015,61
2,1,2016,56
3,1,2017,66
4,1,2018,65
5,2,2014,53
6,2,2015,57
7,2,2016,56
8,2,2017,60
9,2,2018,59


In [429]:
for year in range(2014, 2019):
    print(year)
    for month in range(1, 13):
        
        df_year_month = df.loc[(df.year == year) & (df.month == month)]
        df_year_month_eu = df_year_month.loc[df_year_month.eu_phrase == "EU"].reset_index(drop=True)
        df_year_month_non_eu = df_year_month.loc[df_year_month.eu_phrase == "non_EU"].reset_index(drop=True)
        
        uniques_ids = list(set(df_year_month_eu['partition_id'].unique()) | set(df_year_month_non_eu['partition_id'].unique())) 
        print(year, month, len(df_year_month['partition_id'].unique()), len(uniques_ids))
        directory = './transcripts_split_with_phrase/{}_{}/{}/transcripts'.format(bbc_id, 'News_at_One_Six_Ten_EU', year)
        path = './transcripts_split_with_phrase/{}_{}/{}/transcripts/BBC+News {}-{} {}_{}.csv'.format(bbc_id, 'News_at_One_Six_Ten_EU', year, calendar.month_abbr[month].lower(), year, bbc_id, 'News_at_One_Six_Ten_EU')

        if not os.path.exists(directory):
            os.makedirs(directory)
        df_year_month_eu.to_csv(path)
        
        directory = './transcripts_split_with_phrase/{}_{}/{}/transcripts'.format(bbc_id, 'News_at_One_Six_Ten_non_EU', year)
        path = './transcripts_split_with_phrase/{}_{}/{}/transcripts/BBC+News {}-{} {}_{}.csv'.format(bbc_id, 'News_at_One_Six_Ten_non_EU', year, calendar.month_abbr[month].lower(), year, bbc_id, 'News_at_One_Six_Ten_non_EU')

        if not os.path.exists(directory):
            os.makedirs(directory)
        df_year_month_non_eu.to_csv(path)

2014
2014 1 44 44
2014 2 53 53
2014 3 44 44
2014 4 43 43
2014 5 64 64
2014 6 63 63
2014 7 67 67
2014 8 61 61
2014 9 61 61
2014 10 56 56
2014 11 53 53
2014 12 52 52
2015
2015 1 61 61
2015 2 57 57
2015 3 65 65
2015 4 65 65
2015 5 61 61
2015 6 58 58
2015 7 61 61
2015 8 57 57
2015 9 64 64
2015 10 66 66
2015 11 62 62
2015 12 50 50
2016
2016 1 56 56
2016 2 56 56
2016 3 31 31
2016 4 11 11
2016 5 41 41
2016 6 55 55
2016 7 53 53
2016 8 67 67
2016 9 66 66
2016 10 62 62
2016 11 66 66
2016 12 51 51
2017
2017 1 66 66
2017 2 60 60
2017 3 68 68
2017 4 60 60
2017 5 65 65
2017 6 64 64
2017 7 62 62
2017 8 69 69
2017 9 63 63
2017 10 66 66
2017 11 66 66
2017 12 48 48
2018
2018 1 65 65
2018 2 59 59
2018 3 65 65
2018 4 59 59
2018 5 66 66
2018 6 54 54
2018 7 65 65
2018 8 60 60
2018 9 53 53
2018 10 59 59
2018 11 59 59
2018 12 45 45


# SPLIT BASED ON TOPIC PREDICTION

In [374]:
# df = pd.read_csv('bbc_predictions_News_at_One_Six_Ten.csv')
# df = df.drop(['Unnamed: 0'], axis=1)
# df['topic'] = df['topic'].apply(lambda x: ast.literal_eval(x)[0][0])
# df

In [375]:
# df['date'] = pd.to_datetime(df['date'])
# df['month'] = df['date'].apply(lambda x: x.month)
# df['year'] = df['date'].apply(lambda x: x.year)

In [376]:
# df['eu'] = df['topic'].apply(lambda x: 'EU' if x == 'European Union' else 'non_EU')
# df.columns = ['partition_id', 'date', 'Transcript', 'topic', 'month', 'year', 'eu']
# df

In [377]:
for year in range(2014, 2019):
    print(year)
    for month in range(1, 13):
        
        df_year_month_eu = df.loc[(df.year == year) & (df.month == month) & (df.eu_topic == 'EU')].reset_index(drop=True)
        df_year_month_non_eu = df.loc[(df.year == year) & (df.month == month) & (df.eu_topic == 'non_EU')].reset_index(drop=True)
        
        directory = './transcripts_split_with_classification/{}_{}/{}/transcripts'.format(bbc_id, 'News_at_One_Six_Ten_EU', year)
        path = './transcripts_split_with_classification/{}_{}/{}/transcripts/BBC+News {}-{} {}_{}.csv'.format(bbc_id, 'News_at_One_Six_Ten_EU', year, calendar.month_abbr[month].lower(), year, bbc_id, 'News_at_One_Six_Ten_EU')

        if not os.path.exists(directory):
            os.makedirs(directory)
        df_year_month_eu.to_csv(path)
        
        directory = './transcripts_split_with_classification/{}_{}/{}/transcripts'.format(bbc_id, 'News_at_One_Six_Ten_non_EU', year)
        path = './transcripts_split_with_classification/{}_{}/{}/transcripts/BBC+News {}-{} {}_{}.csv'.format(bbc_id, 'News_at_One_Six_Ten_non_EU', year, calendar.month_abbr[month].lower(), year, bbc_id, 'News_at_One_Six_Ten_non_EU')

        if not os.path.exists(directory):
            os.makedirs(directory)
        df_year_month_non_eu.to_csv(path)

2014
2015
2016
2017
2018


# Counts EU-NON EU NEWS AT 1 6 10

In [359]:
split_type = 'classification'

In [360]:
non_eu_output_path = './transcripts_split_with_{}/54_News_at_One_Six_Ten_non_EU/{}/transcripts'
eu_output_path = './transcripts_split_with_{}/54_News_at_One_Six_Ten_EU/{}/transcripts'
years = [2014, 2015, 2016, 2017, 2018]

In [361]:
eu_df_list = []
non_eu_df_list = []

def get_df_from_list(df_list):
    df = pd.concat(df_list)
    df = df.drop(['Unnamed: 0'], axis=1)
    display(df)
    df['date'] = pd.to_datetime(df['date'])
    df['month'] = df['date'].apply(lambda x: x.month)
    df['year'] = df['date'].apply(lambda x: x.year)
    
    return df
    
for year in years:
    print(year)
    eu_path = eu_output_path.format(split_type, year)
    non_eu_path = non_eu_output_path.format(split_type, year)
    eu_csv_files = os.listdir(eu_path)
    non_eu_csv_files = os.listdir(non_eu_path)
    for csv_file in eu_csv_files:
        eu_df_list.append(pd.read_csv(os.path.join(eu_path, csv_file)))
    for csv_file in non_eu_csv_files:
        non_eu_df_list.append(pd.read_csv(os.path.join(non_eu_path, csv_file)))
        
eu_df = get_df_from_list(eu_df_list)
non_eu_df = get_df_from_list(non_eu_df_list)

2014
2015
2016
2017
2018


Unnamed: 0,partition_id,date,transcript,topic,eu_topic,eu_phrase,month,year
0,447,2014-03-04,An actor voices his words because he is too fr...,European Union,EU,non_EU,3,2014
1,449,2014-03-05,Economists have argued for years over the impa...,European Union,EU,non_EU,3,2014
2,463,2014-03-11,Details are emerging tonight of labour’s attit...,European Union,EU,non_EU,3,2014
3,466,2014-03-12,We report on the 3D printer that’s revolutioni...,European Union,EU,EU,3,2014
4,466,2014-03-12,His new policy took seven paragraphs and more ...,European Union,EU,EU,3,2014
...,...,...,...,...,...,...,...,...
230,1231,2018-11-15,This Bradford firm employs 1000 This Bradford ...,European Union,EU,EU,11,2018
231,1233,2018-11-15,Two Cabinet ministers are among those who’ve r...,European Union,EU,non_EU,11,2018
232,1233,2018-11-15,Overly concerned EU leaders do not appear conc...,European Union,EU,EU,11,2018
233,1233,2018-11-15,But Theresa May has been trying to present the...,European Union,EU,EU,11,2018


Unnamed: 0,partition_id,date,transcript,topic,eu_topic,eu_phrase,month,year
0,198,2014-08-05,Subtitles by Red Bee Media Ltd I’m just most e...,International affairs,non_EU,non_EU,8,2014
1,198,2014-08-05,Fighterjets escort a Qatar Airways plane in to...,"Parliament, government and politics",non_EU,non_EU,8,2014
2,198,2014-08-05,The mother from east London forced to leave he...,International affairs,non_EU,non_EU,8,2014
3,198,2014-08-05,But will a threat of a £20 fine get you to tur...,"Parliament, government and politics",non_EU,non_EU,8,2014
4,198,2014-08-05,Good evening and welcome to the BBC News at Si...,"Parliament, government and politics",non_EU,non_EU,8,2014
...,...,...,...,...,...,...,...,...
4984,902,2018-03-06,And Serena Williams tells us she’s ready to re...,"Culture, media and sport",non_EU,non_EU,3,2018
4985,902,2018-03-06,We’ll start with tonight’s Champions League ac...,"Parliament, government and politics",non_EU,non_EU,3,2018
4986,902,2018-03-06,Sadio Mane fame closest to adding air to chang...,"Culture, media and sport",non_EU,non_EU,3,2018
4987,902,2018-03-06,"For higher stakes in Paris, 3 giant For higher...","Parliament, government and politics",non_EU,non_EU,3,2018


In [362]:
eu_df

Unnamed: 0,partition_id,date,transcript,topic,eu_topic,eu_phrase,month,year
0,447,2014-03-04,An actor voices his words because he is too fr...,European Union,EU,non_EU,3,2014
1,449,2014-03-05,Economists have argued for years over the impa...,European Union,EU,non_EU,3,2014
2,463,2014-03-11,Details are emerging tonight of labour’s attit...,European Union,EU,non_EU,3,2014
3,466,2014-03-12,We report on the 3D printer that’s revolutioni...,European Union,EU,EU,3,2014
4,466,2014-03-12,His new policy took seven paragraphs and more ...,European Union,EU,EU,3,2014
...,...,...,...,...,...,...,...,...
230,1231,2018-11-15,This Bradford firm employs 1000 This Bradford ...,European Union,EU,EU,11,2018
231,1233,2018-11-15,Two Cabinet ministers are among those who’ve r...,European Union,EU,non_EU,11,2018
232,1233,2018-11-15,Overly concerned EU leaders do not appear conc...,European Union,EU,EU,11,2018
233,1233,2018-11-15,But Theresa May has been trying to present the...,European Union,EU,EU,11,2018


In [363]:
def get_count(df):
    df = df[['month', 'year']]
    df['count'] = 1
    res_df = df.groupby(['year', 'month'], as_index=False).sum()
    return res_df

eu_count = get_count(eu_df)
non_eu_count = get_count(non_eu_df)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [364]:
eu_count
non_eu_count

Unnamed: 0,year,month,count
0,2014,1,2965
1,2014,2,4029
2,2014,3,3233
3,2014,4,2953
4,2014,5,4622
5,2014,6,4442
6,2014,7,4821
7,2014,8,4001
8,2014,9,4208
9,2014,10,3986


In [365]:
eu_count.to_csv('News_at_1_6_10_{}_EU_count.csv'.format(split_type))
non_eu_count.to_csv('News_at_1_6_10_{}_non_EU_count.csv'.format(split_type))

# COUNTS NEWS AT 1 6 10

In [63]:
df.to_csv('./News_at_One_Six_Ten.csv')

In [43]:
df = df[['Source', 'month', 'year']]

In [45]:
df['count'] = 1

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [46]:
df

Unnamed: 0,Source,month,year,count
0,BBC1 London,7,2015,1
1,BBC1 London,7,2015,1
2,BBC1 London,7,2015,1
3,BBC News 24,7,2015,1
4,BBC1 London,7,2015,1
...,...,...,...,...
61,BBC1 London,5,2018,1
62,BBC News 24,5,2018,1
63,BBC1 London,5,2018,1
64,BBC1 London,5,2018,1


In [48]:
res_df = df.groupby(['Source', 'month', 'year'], as_index=False).sum()

In [50]:
res_df = res_df.sort_values(by=['year', 'month'])

In [51]:
res_df.to_csv('counts_News_at_1_6_10.csv')

# COUNTS REPEATED TRANSCRIPTS

In [128]:
times = df.Time.unique()
for t in times:
    df_at_time = df.loc[df.Time == t]
    df_at_time = df_at_time.reset_index(drop=True)
    #display(df_at_time)
    #print(df_at_time.index)
    unique_dates = df_at_time.Date.unique()
    for unique_date in unique_dates:
        df_at_time_date = df_at_time.loc[df_at_time.Date == unique_date]
        if len(df_at_time_date) > 1:
            display(df_at_time_date)

In [138]:
unique_dates = df.Date.unique()
rows = []
for unique_date in unique_dates:
    df_date = df.loc[df.Date == unique_date]
    df_date = df_date.reset_index(drop=True)
    df_bbc_london = df_date.loc[df_date.Source == 'BBC1 London']
    df_bbc_news = df_date.loc[df_date.Source == 'BBC News 24']
    
    if(len(df_bbc_news) == 0 or len(df_bbc_london) == 0):
        continue
        
    for index, row in df_bbc_london.iterrows():
        for index2, row2 in df_bbc_news.iterrows():
            words1 = list(set(row['Transcript'].split()))
            words2 = list(set(row2['Transcript'].split()))
            words1 = [word for word in words1 if word not in stopwords.words('english')]
            words2 = [word for word in words2 if word not in stopwords.words('english')]
            common_words = list(set(words1).intersection(words2))
            print(len(words1), len(words2), len(common_words))
            entry = [row['Source'], row2['Source'], row['Date'], row['Program Name'], row2['Program Name'], row['Transcript'], row2['Transcript'], len(words1), len(words2), len(common_words)]
            rows.append(entry)
columns=['source1', 'source2', 'date', 'program name1', 'program name2', 'transcript1', 'transcript2', 'words1', 'words2', 'common words']
res_df = pd.DataFrame(rows, columns=columns)
res_df

1802 1610 519
1396 1610 793
1594 1707 565
1594 1736 588
1491 1581 840
1660 1854 646
1660 2002 636
1667 1672 685
1667 1841 602
1632 1710 665
1632 1782 650
1666 1672 663
1636 1672 1083
1663 1645 513
1671 1745 663
1671 1898 636
1704 1612 655
1653 1612 777
1696 1725 633
1696 1518 499
1570 1789 608
1527 1789 927
1588 1755 652
1837 1755 859
1655 1754 610
1413 1754 721
1881 1794 752
1630 1794 838
1718 1761 712
1581 1761 952
1823 1716 689
1823 1725 534
1756 1782 744
1552 1782 716
1162 1097 550
1371 1097 719
1411 934 383
1401 934 575
2185 1752 706
2185 1542 592
2087 1507 592
1622 1507 529
2220 1604 698
2220 1770 654
2191 1748 749
2191 1672 553
1984 1804 694
1984 1630 558
2351 1784 829
1633 1784 1065
2189 1619 685
2189 1507 649
2124 1620 658
2124 1619 642
2043 1701 708
2043 1638 673
2187 1769 753
1694 1769 745
2212 1577 602
1550 1577 943
2135 1583 688
2135 1624 611
2159 1749 768
2159 1706 584
2210 1818 786
2210 1877 764
2076 1684 703
1597 1684 859
1825 1670 610
1825 1617 546
2029 1533 600
2029 1

1731 1766 519
1803 1810 629
1842 1711 575
1618 1756 552
1671 1822 629
1671 1789 537
1690 1682 601
1690 1737 550
1630 1647 585
1630 1791 538
1576 1788 550
1576 1736 507
1654 1960 636
1654 1792 510
1704 1583 503
1629 1583 805
1760 1659 599
1748 1659 929
1790 1706 604
1789 1706 919
1773 1728 562
1805 1728 978
1855 1672 596
1855 1697 485
1814 1697 624
1725 1697 892
1849 1672 627
1839 1672 936
1713 1734 710
1758 1734 872
1670 1759 589
1544 1759 930
1842 1645 619
1774 1645 878
1842 1673 614
1842 1742 582
1740 1697 553
1793 1697 925
1614 1711 605
1614 1799 589
1639 1578 588
1639 1714 581
1697 1921 620
1842 1591 582
1842 1677 605
1654 1723 540
1632 1738 600
1761 1770 650
1679 1730 622
1679 1741 516
1704 1777 614
1418 1777 582
1665 1715 514
1668 1681 555
1668 1931 576
1634 1876 602
1634 1867 619
1696 1559 628
1696 1791 641
1681 1748 735
1681 1714 576
1711 1618 623
1711 1315 466
1542 1521 553
1542 1667 519
1602 1680 595
1602 1761 563
1592 1741 578
1592 1779 538
1686 1768 716
1686 1715 635
1744 1

Unnamed: 0,source1,source2,date,program name1,program name2,transcript1,transcript2,words1,words2,common words
0,BBC1 London,BBC News 24,2015-07-03,BBC News at One,BBC News at Six,"So, minus £28. You’ve been a great team. Great...",The Queen and the Duke of Edinburgh lead tribu...,1802,1610,519
1,BBC1 London,BBC News 24,2015-07-03,BBC News at Ten,BBC News at Six,The Queen and the Duke of Edinburgh lead tribu...,The Queen and the Duke of Edinburgh lead tribu...,1396,1610,793
2,BBC1 London,BBC News 24,2015-07-01,BBC News at One,BBC News at Six,Heathrow should get a third runway - says a lo...,"# Picture this. # Get a point of view, and tri...",1594,1707,565
3,BBC1 London,BBC News 24,2015-07-01,BBC News at One,BBC News at Ten,Heathrow should get a third runway - says a lo...,"# Oh, no # Well, my left hand’s free Tonight a...",1594,1736,588
4,BBC1 London,BBC News 24,2015-07-02,BBC News at Ten,BBC News at Six,"Luckily, there’s nothing we like more than cra...",The government sets out the case for airstrike...,1491,1581,840
...,...,...,...,...,...,...,...,...,...,...
1120,BBC1 London,BBC News 24,2018-05-24,BBC News at Ten,BBC News at Six,Hey! ‘Every year comes in weekly instalments. ...,"fresh in the north. If you have plans, don’t m...",1922,1841,1012
1121,BBC1 London,BBC News 24,2018-05-29,BBC News at One,BBC News at Ten,"Yes, Dad! There goes the postman... ..and the ...",Tonight at Ten... Italy on the brink of a majo...,1805,1891,546
1122,BBC1 London,BBC News 24,2018-05-29,BBC News at Six,BBC News at Ten,discovers the beauty of a country he first saw...,Tonight at Ten... Italy on the brink of a majo...,1883,1891,948
1123,BBC1 London,BBC News 24,2018-05-31,BBC News at One,BBC News at Ten,Women prepared to burn and bomb. They are goin...,"Tonight at Ten — fears of a trade war, as the ...",1916,1895,639


In [139]:
res_df.to_csv('News_at_One_Six_Ten_repeated_transcripts.csv')