In [48]:
from transformers import pipeline
from nltk import sent_tokenize
import nltk
import torch
from glob import glob
import pandas as pd
import numpy as np

In [4]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/kartikeyadatta/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

### Load Model

In [5]:
model_name = "facebook/bart-large-mnli"
device = 0 if torch.cuda.is_available() else "cpu"

In [6]:
def load_model(device):
    theme_classifier = pipeline(
        "zero-shot-classification",
        model=model_name,
        device = device,
        framework="pt"
    )
    return theme_classifier

In [7]:
theme_classifier = load_model(device)

Device set to use cpu


In [8]:
theme_list = ["friendhip", "hope", "sacrifice", "Loss", "battle", "self developmet", "betrayal", "love", "dialogue"]

In [9]:
theme_classifier(
    "I gave him a right hook to the jaw.",
    theme_list,
    multilabel=True
)

{'sequence': 'I gave him a right hook to the jaw.',
 'labels': ['battle',
  'self developmet',
  'Loss',
  'sacrifice',
  'dialogue',
  'friendhip',
  'hope',
  'betrayal',
  'love'],
 'scores': [0.3110542893409729,
  0.2908153235912323,
  0.14647865295410156,
  0.07516748458147049,
  0.06345169991254807,
  0.055162619799375534,
  0.024390211328864098,
  0.023432796820998192,
  0.010046899318695068]}

### Loading the  Dataset

In [11]:
files = glob('../data/Subtitles/*.ass')

In [13]:
files[:5]

['../data/Subtitles/Naruto Season 4 - 94.ass',
 '../data/Subtitles/Naruto Season 4 - 80.ass',
 '../data/Subtitles/Naruto Season 2 - 32.ass',
 '../data/Subtitles/Naruto Season 8 - 185.ass',
 '../data/Subtitles/Naruto Season 8 - 191.ass']

- Lets read the lines form the subtitles

In [21]:
with open(files[0], 'r') as file:
    lines = file.readlines()
    lines = lines[27:] # skip the first 27 lines as the dialogues in the subtitle folder are starting from line 27.
    lines = [','.join(line.split(',')[9:]) for line in lines]
    

In [22]:
lines[:2]

['We are Fighting Dreamers aiming high\n',
 "Fighting Dreamers\\Ndon't care what people think about them\n"]

In [24]:
lines = [ line.replace('\\N' , ' ')for line in lines]

In [25]:
lines[:2]

['We are Fighting Dreamers aiming high\n',
 "Fighting Dreamers don't care what people think about them\n"]

- As theme classifier has a maximum of 512 tokes we can not feed the whole subtitle to the model. We need to split the subtitle into chunks of data. 

In [26]:
" ".join(lines[:10])

"We are Fighting Dreamers aiming high\n Fighting Dreamers don't care what people think about them\n Fighting Dreamers follow what they believe\n Oli Oli Oli Oh! Just go my way\n Right here right now (Bang) Hit it straight like a line drive!\n Right here right now (Burn)\n Down a difficult road filled with endless struggles\n Where do you think you are going following someone else's map?\n An insightful crow comes along to tear up the map\n Now open your eyes and take a look at the truth (Yeah!)\n"

- Now lets try to get the episode number of our series.

In [29]:
int(files[0].split('-')[-1].split('.')[0].strip())

94

In [None]:
def load_subtitles_dataset(dataset_path):
    subtitles_path = glob(dataset_path+'/*.ass')
    
    scripts = []
    episode_num = []
    for path in subtitles_path:
        with open(path, 'r') as file:
            lines = file.readlines()
            lines = lines[27:] # skip the first 27 lines as the dialogues in the subtitle folder are starting from line 27.
            lines = [','.join(line.split(',')[9:]) for line in lines]
            
        lines = [ line.replace('\\N' , ' ')for line in lines]
        script = " ".join(lines)
        
        episode = int(path.split('-')[-1].split('.')[0].strip())
        
        scripts.append(script)
        episode_num.append(episode)
        
    df = pd.DataFrame.from_dict({"episode":episode_num, "script":scripts})
    return df

In [35]:
dataset_path = "../data/Subtitles"
df = load_subtitles_dataset(dataset_path)

In [36]:
df.head()

Unnamed: 0,episode,script
0,94,We are Fighting Dreamers aiming high\n Fightin...
1,80,We are Fighting Dreamers aiming high\n Fightin...
2,32,We are Fighting Dreamers aiming high\n Fightin...
3,185,We are Fighting Dreamers aiming high\n Fightin...
4,191,We are Fighting Dreamers aiming high\n Fightin...


### Lets try running our model

In [38]:
script = df.iloc[0]['script']
print(script)

We are Fighting Dreamers aiming high
 Fighting Dreamers don't care what people think about them
 Fighting Dreamers follow what they believe
 Oli Oli Oli Oh! Just go my way
 Right here right now (Bang) Hit it straight like a line drive!
 Right here right now (Burn)
 Down a difficult road filled with endless struggles
 Where do you think you are going following someone else's map?
 An insightful crow comes along to tear up the map
 Now open your eyes and take a look at the truth (Yeah!)
 There's nothing to lose, so let's GO!!!
 We are Fighting Dreamers aiming high
 Fighting Dreamers don't care what people think about them
 Fighting Dreamers follow what they believe
 Oli Oli Oli Oh! Just go my way
 Right here right now (Bang) Hit it straight like a line drive!
 Right here right now (Burn) We're gonna do it and do our best!
 Right here right now (Bang) Hit it straight like a line drive!
 Right here right now (Burn) We're gonna do it and do our best! BANG!
 My body movements have finally re

In [40]:

nltk.download('punkt_tab')

[nltk_data] Downloading package punkt_tab to
[nltk_data]     /Users/kartikeyadatta/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

In [41]:
script_sentences = sent_tokenize(script)
script_sentences[:3]

["We are Fighting Dreamers aiming high\n Fighting Dreamers don't care what people think about them\n Fighting Dreamers follow what they believe\n Oli Oli Oli Oh!",
 'Just go my way\n Right here right now (Bang) Hit it straight like a line drive!',
 "Right here right now (Burn)\n Down a difficult road filled with endless struggles\n Where do you think you are going following someone else's map?"]

#### Batching the sentences with size of 20

In [42]:
sentence_batch_size = 20
script_batches = []
for index in range(0, len(script_sentences), sentence_batch_size):
    sent = " ".join(script_sentences[index:index+sentence_batch_size])
    script_batches.append(sent)

In [43]:
script_batches[:3]

["We are Fighting Dreamers aiming high\n Fighting Dreamers don't care what people think about them\n Fighting Dreamers follow what they believe\n Oli Oli Oli Oh! Just go my way\n Right here right now (Bang) Hit it straight like a line drive! Right here right now (Burn)\n Down a difficult road filled with endless struggles\n Where do you think you are going following someone else's map? An insightful crow comes along to tear up the map\n Now open your eyes and take a look at the truth (Yeah!) There's nothing to lose, so let's GO!!! We are Fighting Dreamers aiming high\n Fighting Dreamers don't care what people think about them\n Fighting Dreamers follow what they believe\n Oli Oli Oli Oh! Just go my way\n Right here right now (Bang) Hit it straight like a line drive! Right here right now (Burn) We're gonna do it and do our best! Right here right now (Bang) Hit it straight like a line drive! Right here right now (Burn) We're gonna do it and do our best! BANG! My body movements have final

In [45]:
theme_output = theme_classifier(
    script_batches[:2], # taking the first two batches of sentences for classification
    theme_list,
    multilabel=True
)
theme_output

[{'sequence': "We are Fighting Dreamers aiming high\n Fighting Dreamers don't care what people think about them\n Fighting Dreamers follow what they believe\n Oli Oli Oli Oh! Just go my way\n Right here right now (Bang) Hit it straight like a line drive! Right here right now (Burn)\n Down a difficult road filled with endless struggles\n Where do you think you are going following someone else's map? An insightful crow comes along to tear up the map\n Now open your eyes and take a look at the truth (Yeah!) There's nothing to lose, so let's GO!!! We are Fighting Dreamers aiming high\n Fighting Dreamers don't care what people think about them\n Fighting Dreamers follow what they believe\n Oli Oli Oli Oh! Just go my way\n Right here right now (Bang) Hit it straight like a line drive! Right here right now (Burn) We're gonna do it and do our best! Right here right now (Bang) Hit it straight like a line drive! Right here right now (Burn) We're gonna do it and do our best! BANG! My body movemen

In [46]:
# Wrangle output into to batches of themes
themes ={}
for output in theme_output:
    for label, score in zip(output['labels'], output['scores']):
        if label not in themes:
            themes[label] = []
        themes[label].append(score)

In [47]:
themes

{'dialogue': [0.22415779531002045, 0.1533605307340622],
 'battle': [0.21036116778850555, 0.21006415784358978],
 'sacrifice': [0.13561369478702545, 0.25403621792793274],
 'betrayal': [0.127573624253273, 0.09080351889133453],
 'self developmet': [0.10827678442001343, 0.08029769361019135],
 'Loss': [0.06931781768798828, 0.044582854956388474],
 'friendhip': [0.0547311045229435, 0.1033640205860138],
 'hope': [0.05393011122941971, 0.041515812277793884],
 'love': [0.016037868335843086, 0.02197522670030594]}

In [None]:
def get_themes_inference(script):
    script_sentences = sent_tokenize(script)
    
    sentence_batch_size = 20
    script_batches = []
    for index in range(0, len(script_sentences), sentence_batch_size):
        sent = " ".join(script_sentences[index:index+sentence_batch_size])
        script_batches.append(sent)
        
    # Run the model
    theme_output = theme_classifier(
        script_batches[:],
        theme_list,
        multilabel=True
    )
    theme_output
    
    # Wrangle output into to batches of themes
    themes ={}
    for output in theme_output:
        for label, score in zip(output['labels'], output['scores']):
            if label not in themes:
                themes[label] = []
            themes[label].append(score)
            
    themes = {key: np.mean(np.array(value)) for key,value in themes.items()}
    return themes

In [60]:
output_themes = df['script'].apply(get_themes_inference)

In [61]:
output_themes

0    {'dialogue': 0.18875916302204132, 'battle': 0....
1    {'dialogue': 0.18875916302204132, 'battle': 0....
Name: script, dtype: object

In [63]:
print(pd.DataFrame(output_themes.tolist()).head())
theme_df = pd.DataFrame(output_themes.tolist())

   dialogue    battle  sacrifice  betrayal  self developmet     Loss  \
0  0.188759  0.210213   0.194825  0.109189         0.094287  0.05695   
1  0.188759  0.210213   0.194825  0.109189         0.094287  0.05695   

   friendhip      hope      love  
0   0.079048  0.047723  0.019007  
1   0.079048  0.047723  0.019007  


In [64]:
theme_df

Unnamed: 0,dialogue,battle,sacrifice,betrayal,self developmet,Loss,friendhip,hope,love
0,0.188759,0.210213,0.194825,0.109189,0.094287,0.05695,0.079048,0.047723,0.019007
1,0.188759,0.210213,0.194825,0.109189,0.094287,0.05695,0.079048,0.047723,0.019007


In [65]:
df[theme_df.columns] = theme_df
df

Unnamed: 0,episode,script,dialogue,battle,sacrifice,betrayal,self developmet,Loss,friendhip,hope,love
0,94,We are Fighting Dreamers aiming high\n Fightin...,0.188759,0.210213,0.194825,0.109189,0.094287,0.05695,0.079048,0.047723,0.019007
1,80,We are Fighting Dreamers aiming high\n Fightin...,0.188759,0.210213,0.194825,0.109189,0.094287,0.05695,0.079048,0.047723,0.019007
