In [5]:
import nltk
import os
import sys
import math
import pandas as pd
import spacy
import re

In [6]:
# CSV file handed to load_data() to build the category -> films mapping.
# NOTE(review): the directory is spelled "mpts" while the file name says "mpst" — confirm the path on disk.
MPST_DATASET = "../dataset/mpts/mpst_full_data.csv"
# Path to a function-words list; not referenced by any of the cells visible here.
FUNCTIONAL_WORDS_FILE = "../dataset/function_words.txt"

In [7]:
def load_data(file):
    """Load the dataset CSV and group its films by category.

    Args:
        file: Anything accepted by ``pandas.read_csv``. The CSV must provide
            the ``title``, ``plot_synopsis`` and ``tags`` columns, where
            ``tags`` holds the film's categories separated by ", ".

    Returns:
        dict: Maps each category to a list of
        ``{"title": ..., "synopsis": ...}`` dicts, in file order. A film
        with several tags appears under each of them.
    """
    dataframe = pd.read_csv(file)
    movies_and_categories = {}
    rows = zip(dataframe["title"], dataframe["plot_synopsis"], dataframe["tags"])
    for title, synopsis, tags in rows:
        if not isinstance(tags, str):
            # Missing tags are read as NaN: such a film belongs to no category.
            continue
        # Match tags exactly. A substring/regex test on the raw tag string
        # would also file e.g. an "adult comedy" film under "comedy".
        # dict.fromkeys drops repeated tags while keeping their order, so a
        # film is listed at most once per category.
        for category in dict.fromkeys(tags.split(", ")):
            movies_and_categories.setdefault(category, []).append(
                {"title": title, "synopsis": synopsis}
            )
    return movies_and_categories

def extract_categories(df):
    """Return the set of distinct categories listed in ``df["tags"]``.

    Every ``tags`` value is expected to be a string of categories separated
    by ", "; each piece becomes one element of the returned set.
    """
    return {
        tag
        for tag_string in df["tags"].tolist()
        for tag in tag_string.split(", ")
    }

def get_n_categories(films_with_categories, n):
    """Return the names of the ``n`` categories that contain the most films.

    Args:
        films_with_categories: dict mapping a category to its list of films.
        n: Number of category names to keep.

    Returns:
        list: Category names ordered by decreasing film count. Categories
        with equal counts keep the order they have in the input dict.
    """
    def film_count(category):
        return len(films_with_categories[category])

    # sorted() is stable, including with reverse=True, so ties stay in dict order.
    ranked = sorted(films_with_categories, key=film_count, reverse=True)
    return ranked[:n]


In [8]:
# Group every film of the dataset by category.
all_films = load_data(MPST_DATASET)
# We only work on the 5 categories that contain the most films.
n_categories = get_n_categories(all_films, 5)
# Restrict the category -> films mapping to those 5 categories.
n_categories_films = { category: all_films[category] for category in n_categories }

In [9]:
# Report how many films each retained category contains.
for category in n_categories_films:
    print(f"{category} - {len(n_categories_films[category])} films")

murder - 5782 films
violence - 4426 films
flashback - 2937 films
romantic - 2906 films
cult - 2647 films


## /!\ Deprecated /!\ - Generate dataset to markovify intros

In [34]:
# Deprecated: dump the synopses of each retained category into one text file
# per category (one synopsis per line, embedded newlines flattened to spaces).
# NOTE(review): the original cell also built a three-sentence intro for every
# synopsis but never wrote it; the full synopsis is still what gets written
# here — confirm which of the two was intended.
for category in n_categories_films.keys():
    category_file = f"../dataset/markovify_train/{category}.txt"
    # Make the cell runnable on a fresh checkout.
    os.makedirs(os.path.dirname(category_file), exist_ok=True)
    # "w" instead of "a": re-running the cell must regenerate the corpus,
    # not append a second copy of every synopsis. The explicit encoding keeps
    # non-ASCII synopses from depending on the platform default.
    with open(category_file, 'w', encoding="utf-8") as file:
        for film in n_categories_films[category]:
            file.write(film["synopsis"].replace("\n", " "))
            file.write("\n")

## Adaptation of the dataset to create a corpus for each of the 5 major movie categories of the MPST dataset
### This notebook is also used to clean the dataset of actors' names specifically, and of all parenthesized content, as they were causing learning problems

In [10]:
# Write one text file per film, grouped in one directory per category, with
# every parenthesized span removed from the synopsis.
for category in n_categories_films.keys():
    # Create the per-category directory so the cell works on a fresh checkout.
    os.makedirs(f"../dataset/transformers_train/{category}", exist_ok=True)
    for i, film in enumerate(n_categories_films[category]):
        film_file = f"../dataset/transformers_train/{category}/{i}.txt"
        clean_synopsis = re.sub(r'\([^)]*\)', '', film["synopsis"])
        # The context manager closes every file. The original only closed the
        # last handle of each category, once the inner loop had finished.
        with open(film_file, 'w', encoding="utf-8") as file:
            file.write(clean_synopsis)