In [2]:
import pandas as pd

df_BBC = pd.read_csv('BBC_raw.csv', sep=';')
len(df_BBC)

3803

In [3]:
df_BBC = df_BBC.drop_duplicates(subset = ['title'])
df_BBC.dropna(subset = ["synops_long"], inplace = True)
df_BBC = df_BBC.reset_index()
len(df_BBC)

1895

In [17]:
df_BBC.head()

Unnamed: 0,index,id,title,description,image,keywords,synopses_small,synops_med,synops_long,category,channel,language,release_date,duration_sec,topic
0,0,0,BBC Proms,Australian tenor Stuart Skelton is joined by L...,https://ichef.bbci.co.uk/images/ic/{recipe}/p0...,"BBC, iPlayer, TV, BBC Proms, 2021: Last Night ...",Australian tenor Stuart Skelton is joined by L...,Australian tenor Stuart Skelton is joined by L...,Katie Derham hosts continued live coverage fro...,Music,bbc_radio_three,False,9pm 11 Sep 2021,5247,music
1,1,1,Port,"Previously unseen music from the series Port, ...",https://ichef.bbci.co.uk/images/ic/{recipe}/p0...,"BBC, iPlayer, TV, Port, Series 5: Episode 4","Previously unseen music from the series Port, ...",Julie Fowlis a’ lìbhrigeadh ceòl bhon t-sreath...,Bidh Druthag Bheag Eile a’ tarraing ri chèile ...,Music,bbc_alba,False,9 Jul 2020,1037,music
2,3,3,Ceiliúradh na Féile Pádraig,John Toal and Pauline Scanlon introduce an eve...,https://ichef.bbci.co.uk/images/ic/{recipe}/p0...,"BBC, iPlayer, TV, Ceiliúradh na Féile Pádraig",John Toal and Pauline Scanlon introduce an eve...,John Toal and Pauline Scanlon introduce an eve...,John Toal and Pauline Scanlon introduce an eve...,Music,bbc_two_northern_ireland_digital,False,17 Mar 2017,3538,music
3,4,4,Songs of Praise,Claire McCollum explores Edinburgh and shares ...,https://ichef.bbci.co.uk/images/ic/{recipe}/p0...,"BBC, iPlayer, TV, Songs of Praise, Edinburgh",Claire McCollum explores Edinburgh and shares ...,Claire McCollum goes on a Christian heritage t...,Claire McCollum explores Edinburgh and discove...,Music,bbc_one,False,1:15pm 20 Feb 2022,2044,music
4,5,5,Radio 1's Out Out! Live,Music’s biggest names celebrate the return of ...,https://ichef.bbci.co.uk/images/ic/{recipe}/p0...,"BBC, iPlayer, TV, Radio 1s Out Out! Live, Best...",Music’s biggest names celebrate the return of ...,BBC Radio 1 host a star-studded party from Wem...,BBC Radio 1 host a star-studded party from Wem...,Music,bbc_radio_one,False,11:30pm 17 Oct 2021,3423,music


In [18]:
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel

# Initialize an instance of tf-idf Vectorizer
tfidf_vectorizer = TfidfVectorizer()
corpus = df_BBC.synops_long

# Generate the tf-idf vectors for the corpus
tfidf_matrix = tfidf_vectorizer.fit_transform(corpus)

# compute and print the cosine similarity matrix
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix).round(2)
print(cosine_sim)

[[1.   0.02 0.03 ... 0.08 0.01 0.06]
 [0.02 1.   0.08 ... 0.02 0.08 0.03]
 [0.03 0.08 1.   ... 0.03 0.14 0.02]
 ...
 [0.08 0.02 0.03 ... 1.   0.01 0.08]
 [0.01 0.08 0.14 ... 0.01 1.   0.02]
 [0.06 0.03 0.02 ... 0.08 0.02 1.  ]]


In [19]:
cosine_sim.shape

(1895, 1895)

In [20]:
import numpy as np
np.min(cosine_sim)
np.max(cosine_sim)
#np.max(cosine_sim, axis = 0)

1.0

In [48]:
indices = pd.Series(df_BBC.index, index = df_BBC['title']).drop_duplicates()

def get_recommendations(title, cosine, indices):
    # Get the index of the movie that matches the title
    idx = indices[title]
    # Get the pairwsie similarity scores
    sim_scores = list(enumerate(cosine[idx]))
    # Sort the movies based on the similarity scores
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse = True)
    # Get the scores for 10 most similar movies
    sim_scores = sim_scores[0:9]
    # Get the movie indices
    movie_indices = [i[0] for i in sim_scores]
    # Return the top 10 most similar movies
    return df_BBC['title'].iloc[movie_indices]

In [49]:
tfidf = TfidfVectorizer(stop_words = 'english')
# Construct the TF-IDF matrix
tfidf_matrix = tfidf_vectorizer.fit_transform(corpus)
# Generate the cosine similarity matrix
cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)
# Generate recommendations
print(get_recommendations("Planet Earth II", cosine_sim, indices))

312                     Planet Earth II
338                     The Mating Game
382             Wales: Land of the Wild
360            Earth’s Tropical Islands
311    Chris Packham's Animal Einsteins
358            Seven Worlds, One Planet
410                Animals with Cameras
294                    Earth from Space
418                           Zoo Quest
Name: title, dtype: object


In [50]:
alpha = 0.2
selection = get_recommendations("Planet Earth II", cosine_sim, indices).index.values
RelDiversity = (1-cosine_sim[list(selection)]).mean(axis = 0).round(2)
quality = (1 - alpha) * cosine_sim + alpha * RelDiversity
print(get_recommendations("Planet Earth II", quality, indices))

312                 Planet Earth II
382         Wales: Land of the Wild
963                 Worzel Gummidge
1760         World's Busiest Cities
429                 Born to Be Wild
338                 The Mating Game
99      Planet Earth: A Celebration
344                   Animal Babies
432               The Wild Gardener
Name: title, dtype: object


In [51]:
print(df_BBC['image'].iloc[0].format(recipe = "288x162"))

https://ichef.bbci.co.uk/images/ic/288x162/p09w0p59.jpg


In [54]:
list(get_recommendations("Planet Earth II", cosine_sim, indices).index)

[312, 338, 382, 360, 311, 358, 410, 294, 418]

In [164]:
ind = list(get_recommendations("Planet Earth II", cosine_sim, indices).index)
df_BBC.filter(items = ind, axis=0)

Unnamed: 0,index,id,title,description,image,keywords,synopses_small,synops_med,synops_long,category,channel,language,release_date,duration_sec,topic
312,365,365,Planet Earth II,Cities can be worlds of surprising opportunity...,https://ichef.bbci.co.uk/images/ic/{recipe}/p0...,"BBC, iPlayer, TV, Planet Earth II, 6. Cities",Cities can be worlds of surprising opportunity...,Cities can be worlds of surprising opportunity...,Cities are growing at a faster rate than any o...,Nature,bbc_one,False,11 Dec 2016,3515,science-and-nature
338,391,391,The Mating Game,Animals facing the greatest odds have the most...,https://ichef.bbci.co.uk/images/ic/{recipe}/p0...,"BBC, iPlayer, TV, The Mating Game, Series 1: 5...",Animals facing the greatest odds have the most...,All life is driven by the need to breed. Yet f...,One basic need connects all life on earth - th...,Nature,bbc_one,False,8pm 31 Oct 2021,3479,science-and-nature
360,414,414,Earth’s Tropical Islands,"Exploring Hawaii, a tropical sanctuary for any...",https://ichef.bbci.co.uk/images/ic/{recipe}/p0...,"BBC, iPlayer, TV, Earth’s Tropical Islands, Se...","Exploring Hawaii, a tropical sanctuary for any...","Exploring Hawaii, a tropical sanctuary for any...",This is a journey across Hawaii’s varied islan...,Nature,bbc_two,False,3 Jan 2020,3551,science-and-nature
382,437,437,Wales: Land of the Wild,"An epic journey following Wales’s rivers, disc...",https://ichef.bbci.co.uk/images/ic/{recipe}/p0...,"BBC, iPlayer, TV, Wales: Land of the Wild, Sec...","An epic journey following Wales’s rivers, disc...",An epic journey following Wales’s rivers from ...,An epic journey following Wales’s rivers from ...,Documentary,bbc_one_wales,False,25 Mar 2021,3532,science-and-nature
311,364,364,Chris Packham's Animal Einsteins,Chris discovers the cleverest animal traveller...,https://ichef.bbci.co.uk/images/ic/{recipe}/p0...,"BBC, iPlayer, TV, Chris Packhams Animal Einste...",Chris discovers the cleverest animal traveller...,Chris discovers the cleverest animal traveller...,Chris Packham discovers the clever methods tha...,Nature,bbc_two,False,28 Mar 2021,3540,science-and-nature
294,346,346,Earth from Space,Cameras in space show Earth changing rapidly. ...,https://ichef.bbci.co.uk/images/ic/{recipe}/p0...,"BBC, iPlayer, TV, Earth from Space, Series 1: ...",Cameras in space show Earth changing rapidly. ...,Cameras in space show that Earth is changing r...,Cameras in space tell stories of life on our p...,Science & Nature,bbc_one,False,8 May 2019,3461,science-and-nature
358,412,412,"Seven Worlds, One Planet",The most spectacular moments from the series.,https://ichef.bbci.co.uk/images/ic/{recipe}/p0...,"BBC, iPlayer, TV, Seven Worlds, One Planet, Co...",The most spectacular moments from the series.,"The most spectacular moments from the series, ...",The most spectacular moments from the Seven Wo...,Science & Nature,bbc_one,False,1:05pm 1 Jan 2020,3475,science-and-nature
410,469,469,Animals with Cameras,State-of-the-art cameras offer a fresh look on...,https://ichef.bbci.co.uk/images/ic/{recipe}/p0...,"BBC, iPlayer, TV, Animals with Cameras, Series...",State-of-the-art cameras offer a fresh look on...,Wildlife cameraman Gordon Buchanan leads a tea...,Wildlife cameraman Gordon Buchanan fronts the ...,Nature,bbc_two,False,8pm 15 Sep 2021,3540,science-and-nature
99,117,117,Planet Earth: A Celebration,Escape with Sir David Attenborough to the wild...,https://ichef.bbci.co.uk/images/ic/{recipe}/p0...,"BBC, iPlayer, TV, Planet Earth: A Celebration",Escape with Sir David Attenborough to the wild...,Escape with Sir David Attenborough to the worl...,"While the pandemic restricts our movements, wi...",Nature,bbc_one,False,8pm 31 Aug 2020,3483,music


In [None]:
with open('similarity.npy', 'wb') as f:
    np.save(f, cosine_sim)

In [None]:
with open('similarity.npy', 'rb') as f:
    simsim = np.load(f)

In [89]:
set(df_BBC.category)
df_BBC[df_BBC.category.isin(["Arts", "Beauty"])]

Unnamed: 0,index,id,title,description,image,keywords,synopses_small,synops_med,synops_long,category,channel,language,release_date,duration_sec,topic
75,91,91,Anything Goes: The Musical,A major new production of the classic musical ...,https://ichef.bbci.co.uk/images/ic/{recipe}/p0...,"BBC, iPlayer, TV, Anything Goes: The Musical",A major new production of the classic musical ...,"Filmed live at the Barbican in London, this ma...","Filmed live at the Barbican in London, this ma...",Arts,bbc_two,False,6:40pm 26 Dec 2021,8241,music
177,204,204,Dance: BBC Introducing Arts,Brenda Emmanus presents short films that combi...,https://ichef.bbci.co.uk/images/ic/{recipe}/p0...,"BBC, iPlayer, TV, Dance: BBC Introducing Arts",Brenda Emmanus presents short films that combi...,Brenda Emmanus presents short films from emerg...,Brenda Emmanus presents an innovative collecti...,Arts,bbc_four,Contains upsetting scenes.,10pm 20 Feb 2022,3540,arts
178,205,205,Truman and Tennessee: An Intimate Conversation,Docudrama that creates a fascinating compariso...,https://ichef.bbci.co.uk/images/ic/{recipe}/p0...,"BBC, iPlayer, TV, Truman and Tennessee: An Int...",Docudrama that creates a fascinating compariso...,Docudrama that builds a fascinating picture of...,Truman Capote and Tennessee Williams – writers...,Arts,bbc_four,False,9pm 7 Nov 2021,4803,arts
183,211,211,Ex-S,Jasper Conran designs the costumes for Swan La...,https://ichef.bbci.co.uk/images/ic/{recipe}/p0...,"BBC, iPlayer, TV, Ex-S, Styling the Swan",Jasper Conran designs the costumes for Swan La...,"First transmitted in 1995, Jasper Conran desig...","First transmitted in 1995, Scottish Ballet com...",Arts,bbc_webonly,False,28 Mar 1995,1744,arts
187,215,215,Big Book Weekend,Alex Clark talks to the queen of crime Val McD...,https://ichef.bbci.co.uk/images/ic/{recipe}/p0...,"BBC, iPlayer, TV, Big Book Weekend, A Big Book...",Alex Clark talks to the queen of crime Val McD...,Alex Clark talks to Val McDermid about the end...,Alex Clark talks to the queen of crime Val McD...,Arts,bbc_arts,False,31 Mar 2021,2790,arts
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
292,344,344,Dancing Nation,Highlights of world-class performances by the ...,https://ichef.bbci.co.uk/images/ic/{recipe}/p0...,"BBC, iPlayer, TV, Dancing Nation, 2. Highlights",Highlights of world-class performances by the ...,"Highlights of Dancing Nation, a celebration of...","Highlights of Dancing Nation, a celebration of...",Arts,bbc_arts,Contains flashing images.,10:15pm 10 May 2021,3557,arts
790,960,960,Glow Up: Britain's Next Make-Up Star,The Glow Up final. An industry masterclass and...,https://ichef.bbci.co.uk/images/ic/{recipe}/p0...,"BBC, iPlayer, TV, Glow Up: Britains Next Make-...",The Glow Up final. An industry masterclass and...,"It’s the Glow Up final, and the MUAs must pres...","It’s the final of Glow Up, and three MUAs stil...",Beauty,bbc_three,False,10:45pm 8 Jun 2021,3520,lifestyle
1457,2317,2317,Tonight,The writer Dame Rebecca West in conversation w...,https://ichef.bbci.co.uk/images/ic/{recipe}/p0...,"BBC, iPlayer, TV, Tonight, Dame Rebecca West",The writer Dame Rebecca West in conversation w...,"First transmitted in 1976, the writer Dame Reb...","First transmitted in 1976, Dame Rebecca West r...",Arts,bbc_webonly,False,20 Sep 1976,1182,news
1517,2480,2480,John Buchan: Master of Suspense,Docudrama about the life of author John Buchan...,https://ichef.bbci.co.uk/images/ic/{recipe}/p0...,"BBC, iPlayer, TV, John Buchan: Master of Suspense",Docudrama about the life of author John Buchan...,Unfairly known as a one-hit wonder for his nov...,Unfairly known as a one-hit wonder for his noi...,Arts,bbc_four,False,14 Jan 2007,3524,from-the-archives


In [136]:
from datetime import datetime
format = "%Y"
dates = [datetime.strptime(dates[-4:], format) for dates in df_BBC.release_date]

In [149]:
pd.date_range(min(dates), max(dates), freq = "Y").tolist()[0]

Timestamp('1933-12-31 00:00:00', freq='A-DEC')

In [151]:
list(get_recommendations("Planet Earth II", cosine_sim, indices).index)

[312, 338, 382, 360, 311, 358, 410, 294, 418]

In [170]:
cosine_sim[:, ind] += 0.1

In [171]:
cosine_sim[:, ind]

array([[2.03, 2.08, 2.07, ..., 2.07, 2.07, 2.1 ],
       [2.02, 2.03, 2.03, ..., 2.02, 2.03, 2.03],
       [2.02, 2.03, 2.03, ..., 2.02, 2.03, 2.02],
       ...,
       [2.04, 2.09, 2.08, ..., 2.07, 2.09, 2.07],
       [2.02, 2.04, 2.03, ..., 2.02, 2.04, 2.04],
       [2.04, 2.12, 2.1 , ..., 2.09, 2.1 , 2.08]])