In [22]:
import nltk
import os
import sys
import math
import pandas as pd
import spacy

In [16]:
MPST_DATASET = "../dataset/mpts/mpst_full_data.csv"
FUNCTIONAL_WORDS_FILE = "../dataset/function_words.txt"

In [37]:
def load_data(file):
    dataframe = pd.read_csv(file)
    all_categories = extract_categories(dataframe)
    movies_and_categories = {}
    for category in all_categories:
        category_movies = []
        category_data = dataframe[dataframe["tags"].str.contains(category)][["title", "plot_synopsis"]].values.tolist()
        for film in category_data:
            film_dict = {"title": film[0], "synopsis": film[1]}
            category_movies.append(film_dict)
        movies_and_categories[category] = category_movies
    return movies_and_categories

def extract_categories(df):
    raw_categories = [[cat for cat in cats.split(", ")] for cats in df["tags"].tolist()]
    categories = []
    for sublist in raw_categories:
        for category in sublist:
            categories.append(category)
    categories = set(categories)
    return categories

def load_functional_words():
    with open(FUNCTIONAL_WORDS_FILE) as f:
        function_words = set(f.read().splitlines())
    return function_words

def get_n_categories(films_with_categories, n):
    categories = list(films_with_categories.keys())
    nb_fils_cat = {}
    for cat in categories:
        nb_films = len(films_with_categories[cat])
        nb_fils_cat[cat] = nb_films
    sorted_cats = dict(sorted(nb_fils_cat.items(), key=lambda item: item[1], reverse=True))
    n_cats = list(sorted_cats.keys())[:n]
    return n_cats


def count_words(films_with_categories, categories):
    number_of_seen_synopsis = 0
    function_words = load_functional_words()
    nlp_en = spacy.load('en_core_web_lg')
    
    titles = {}
    words_count = {}
    
    for category in categories:
        category_titles = []
        for synopsis in films_with_categories[category]:
            doc_en = nlp_en(synopsis["synopsis"])
            number_of_seen_synopsis += 1
            print(number_of_seen_synopsis)
            category_titles.append(synopsis["title"])
            
            contents = [
                word.lower() for word in
                nltk.word_tokenize(synopsis["synopsis"])
                if word.isalpha()
            ]
            if category not in words_count.keys():
                words_count[category] = {}
            
            for word in contents:
                for ent in doc_en.ents:
                    if (word in ent.text) and (ent.label_ == 'PERSON'):
                        continue
                if word in function_words:
                    continue
                elif word not in words_count[category]:
                    words_count[category][word] = 1
                else:
                    words_count[category][word] += 1
        titles[category] = category_titles
    return words_count, titles

In [None]:
all_films = load_data(MPST_DATASET)
categories = get_n_categories(all_films, 10)

words_counts, titles = count_words(all_films, categories)


I tre volti della paura
1
Scarface
2


In [21]:
words_counts["suspenseful"]

{'kyle': 182,
 'pratt': 6,
 'jodie': 9,
 'foster': 63,
 'propulsion': 11,
 'engineer': 72,
 'based': 73,
 'berlin': 44,
 'germany': 61,
 'husband': 409,
 'david': 387,
 'john': 1029,
 'benjamin': 12,
 'hickey': 1,
 'died': 313,
 'falling': 108,
 'roof': 235,
 'avionic': 1,
 'manufacturing': 4,
 'building': 732,
 'six': 171,
 'daughter': 594,
 'julia': 156,
 'marlene': 5,
 'lawston': 1,
 'flying': 148,
 'home': 1434,
 'long': 413,
 'island': 206,
 'bury': 34,
 'stay': 387,
 'parents': 284,
 'fly': 138,
 'aboard': 150,
 'passenger': 90,
 'aircraft': 118,
 'elgin': 2,
 'helped': 78,
 'design': 24,
 'asleep': 99,
 'hours': 199,
 'wakes': 287,
 'find': 1744,
 'missing': 286,
 'trying': 555,
 'remain': 84,
 'calm': 90,
 'begins': 1062,
 'panic': 125,
 'captain': 468,
 'marcus': 154,
 'rich': 88,
 'sean': 73,
 'bean': 6,
 'forced': 298,
 'conduct': 37,
 'search': 317,
 'walks': 658,
 'aisles': 1,
 'questioning': 71,
 'people': 909,
 'fellow': 145,
 'passengers': 157,
 'remember': 119,
 'havin

In [6]:
function_words = load_functional_words()
categories = list(all_films.keys())

s = 0

for category in categories:
    s += len(all_films[category])
s

44462

In [22]:
all_films.keys()

dict_keys(['non fiction', 'comedy', 'adult comedy', 'plot twist', 'grindhouse film', 'romantic', 'home movie', 'thought-provoking', 'dramatic', 'violence', 'dark', 'christian film', 'storytelling', 'suicidal', 'feel-good', 'gothic', 'allegory', 'paranormal', 'brainwashing', 'cruelty', 'realism', 'psychological', 'suspenseful', 'insanity', 'inspiring', 'boring', 'satire', 'sadist', 'psychedelic', 'tragedy', 'entertaining', 'stupid', 'sci-fi', 'anti war', 'murder', 'action', 'avant garde', 'good versus evil', 'historical', 'horror', 'queer', 'claustrophobic', 'whimsical', 'blaxploitation', 'philosophical', 'magical realism', 'flashback', 'mystery', 'prank', 'haunting', 'melodrama', 'neo noir', 'fantasy', 'absurd', 'depressing', 'atmospheric', 'intrigue', 'bleak', 'clever', 'cute', 'historical fiction', 'cult', 'revenge', 'autobiographical', 'humor', 'pornographic', 'alternate reality', 'western', 'comic', 'alternate history', 'sentimental'])