In [32]:
from transformers import pipeline
from nltk import sent_tokenize
import nltk
import torch
from glob import glob
import pandas as pd

In [4]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/kartikeyadatta/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

### Load Model

In [5]:
model_name = "facebook/bart-large-mnli"
device = 0 if torch.cuda.is_available() else "cpu"

In [6]:
def load_model(device):
    theme_classifier = pipeline(
        "zero-shot-classification",
        model=model_name,
        device = device,
        framework="pt"
    )
    return theme_classifier

In [7]:
theme_classifier = load_model(device)

Device set to use cpu


In [8]:
theme_list = ["friendhip", "hope", "sacrifice", "Loss", "battle", "self developmet", "betrayal", "love", "dialogue"]

In [9]:
theme_classifier(
    "I gave him a right hook to the jaw.",
    theme_list,
    multilabel=True
)

{'sequence': 'I gave him a right hook to the jaw.',
 'labels': ['battle',
  'self developmet',
  'Loss',
  'sacrifice',
  'dialogue',
  'friendhip',
  'hope',
  'betrayal',
  'love'],
 'scores': [0.3110542893409729,
  0.2908153235912323,
  0.14647865295410156,
  0.07516748458147049,
  0.06345169991254807,
  0.055162619799375534,
  0.024390211328864098,
  0.023432796820998192,
  0.010046899318695068]}

### Loading the  Dataset

In [11]:
files = glob('../data/Subtitles/*.ass')

In [13]:
files[:5]

['../data/Subtitles/Naruto Season 4 - 94.ass',
 '../data/Subtitles/Naruto Season 4 - 80.ass',
 '../data/Subtitles/Naruto Season 2 - 32.ass',
 '../data/Subtitles/Naruto Season 8 - 185.ass',
 '../data/Subtitles/Naruto Season 8 - 191.ass']

- Lets read the lines form the subtitles

In [21]:
with open(files[0], 'r') as file:
    lines = file.readlines()
    lines = lines[27:] # skip the first 27 lines as the dialogues in the subtitle folder are starting from line 27.
    lines = [','.join(line.split(',')[9:]) for line in lines]
    

In [22]:
lines[:2]

['We are Fighting Dreamers aiming high\n',
 "Fighting Dreamers\\Ndon't care what people think about them\n"]

In [24]:
lines = [ line.replace('\\N' , ' ')for line in lines]

In [25]:
lines[:2]

['We are Fighting Dreamers aiming high\n',
 "Fighting Dreamers don't care what people think about them\n"]

- As theme classifier has a maximum of 512 tokes we can not feed the whole subtitle to the model. We need to split the subtitle into chunks of data. 

In [26]:
" ".join(lines[:10])

"We are Fighting Dreamers aiming high\n Fighting Dreamers don't care what people think about them\n Fighting Dreamers follow what they believe\n Oli Oli Oli Oh! Just go my way\n Right here right now (Bang) Hit it straight like a line drive!\n Right here right now (Burn)\n Down a difficult road filled with endless struggles\n Where do you think you are going following someone else's map?\n An insightful crow comes along to tear up the map\n Now open your eyes and take a look at the truth (Yeah!)\n"

- Now lets try to get the episode number of our series.

In [29]:
int(files[0].split('-')[-1].split('.')[0].strip())

94

In [34]:
def load_subtitles_dataset(dataset_path):
    subtitles_path = glob(dataset_path+'/*.ass')
    
    scripts = []
    episode_num = []
    for path in subtitles_path:
        with open(files[0], 'r') as file:
            lines = file.readlines()
            lines = lines[27:] # skip the first 27 lines as the dialogues in the subtitle folder are starting from line 27.
            lines = [','.join(line.split(',')[9:]) for line in lines]
            
        lines = [ line.replace('\\N' , ' ')for line in lines]
        script = " ".join(lines)
        
        episode = int(path.split('-')[-1].split('.')[0].strip())
        
        scripts.append(script)
        episode_num.append(episode)
        
    df = pd.DataFrame.from_dict({"episode":episode_num, "script":scripts})
    return df

In [35]:
dataset_path = "../data/Subtitles"
df = load_subtitles_dataset(dataset_path)

In [36]:
df.head()

Unnamed: 0,episode,script
0,94,We are Fighting Dreamers aiming high\n Fightin...
1,80,We are Fighting Dreamers aiming high\n Fightin...
2,32,We are Fighting Dreamers aiming high\n Fightin...
3,185,We are Fighting Dreamers aiming high\n Fightin...
4,191,We are Fighting Dreamers aiming high\n Fightin...
