In [16]:
import pandas as pd
from gensim.models import Word2Vec

# Loading the Dataset

In [17]:
csv_lst = ['s1.csv', 's2.csv', 's3.csv', 's4.csv', 's5.csv', 's6.csv', 'interactive_film.csv', 'special.csv']

df_lst = []

for i in range(len(csv_lst)):
    df_lst.append(pd.read_csv('data/' + csv_lst[i]))

s1, s2, s3, s4, s5, s6, interactive_film, special = df_lst[0], df_lst[1], df_lst[2], df_lst[3], df_lst[4], df_lst[5], df_lst[6], df_lst[7]

In [18]:
df = pd.concat([s1, s2, s3, s4, s5, s6, interactive_film, special], ignore_index=True)
df.head(5)

Unnamed: 0,Episode,Text
0,The National Anthem,"\n\n""The National Anthem"" is the series premie..."
1,Fifteen Million Merits,"\n\n""Fifteen Million Merits"" is the second epi..."
2,The Entire History of You,"\n\n""The Entire History of You"" is the third a..."
3,Be Right Back,"\n\n""Be Right Back"" is the first episode of th..."
4,White Bear,"\n\n""White Bear"" is the second episode of the ..."


# Training the Model

In [19]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel

In [34]:
# Step 3: Define key terms
key_terms = ['woman', 'protagonist']

# Step 5: Create TF-IDF vectorizer
tfidf_vectorizer = TfidfVectorizer()

# Step 6: Fit the vectorizer on the preprocessed text
tfidf_matrix = tfidf_vectorizer.fit_transform(df['Text'])

# Step 7: Transform key terms
key_terms_vector = tfidf_vectorizer.transform(key_terms)

# Step 8: Calculate cosine similarity
cosine_similarities = linear_kernel(key_terms_vector, tfidf_matrix)

# Step 9: Create a DataFrame with episode names and their similarity scores
similarity_df = pd.DataFrame({'Episode': df['Episode'], 'similarity': cosine_similarities[0]})

# ... (Previous code)

# Step 10: Sort by similarity in descending order
sorted_df = similarity_df.sort_values(by='similarity', ascending=False)

print("Sorted DataFrame:")
print(sorted_df)

# Step 11: Choose a similarity threshold (e.g., 0.2)
threshold = 0.2
relevant_episodes = sorted_df[sorted_df['similarity'] >= threshold]

relevant_episodes

Sorted DataFrame:
                        Episode  similarity
4                    White Bear    0.021428
14                    Crocodile    0.020418
18              Striking Vipers    0.012277
10             Men Against Fire    0.012063
21                Joan Is Awful    0.008005
13                     Arkangel    0.006475
25                     Demon 79    0.006454
1        Fifteen Million Merits    0.006072
11          Hated in the Nation    0.004432
12                USS Callister    0.004393
24                    Mazey Day    0.004272
0           The National Anthem    0.004068
8             Shut Up and Dance    0.003744
27              White Christmas    0.003734
7                      Playtest    0.003624
23               Beyond the Sea    0.003365
19                  Smithereens    0.003215
9                  San Junipero    0.003012
3                 Be Right Back    0.002924
6                      Nosedive    0.002816
20  Rachel, Jack and Ashley Too    0.002648
15            

Unnamed: 0,Episode,similarity


# Training the Model

In [90]:
sentences = [text.split() for text in df['Text']]
sentences[0]

['"The',
 'National',
 'Anthem"',
 'is',
 'the',
 'series',
 'premiere',
 'of',
 'the',
 'British',
 'science',
 'fiction',
 'anthology',
 'series',
 'Black',
 'Mirror.',
 'Written',
 'by',
 'series',
 'creator',
 'and',
 'showrunner',
 'Charlie',
 'Brooker,',
 'it',
 'was',
 'directed',
 'by',
 'Otto',
 'Bathurst',
 'and',
 'first',
 'aired',
 'on',
 'Channel',
 '4',
 'on',
 '4',
 'December',
 '2011.',
 'In',
 'the',
 'episode,',
 'a',
 'member',
 'of',
 'the',
 'British',
 'royal',
 'family',
 'is',
 'kidnapped',
 'and',
 'will',
 'only',
 'be',
 'released',
 'if',
 'the',
 'British',
 'prime',
 'minister',
 'Michael',
 'Callow',
 '(Rory',
 'Kinnear)',
 'has',
 'sexual',
 'intercourse',
 'with',
 'a',
 'pig',
 'on',
 'live',
 'television.',
 'Scenes',
 'follow',
 'government',
 'attempts',
 'to',
 'track',
 'the',
 'kidnapper,',
 'news',
 'coverage',
 'of',
 'the',
 'unfolding',
 'events',
 'and',
 'public',
 'reaction.',
 '"The',
 'National',
 'Anthem"',
 'had',
 'several',
 'inspir

In [91]:
model = Word2Vec(sentences=sentences, vector_size=100, window=5, min_count=1, workers=4)