In [2]:
import pandas as pd
import fasttext
import string
import re
from sklearn.model_selection import train_test_split

In [3]:
df = pd.read_csv("../datasets/mpst_full_data.csv")
df.head()

Unnamed: 0,imdb_id,title,plot_synopsis,tags,split,synopsis_source
0,tt0057603,I tre volti della paura,Note: this synopsis is for the orginal Italian...,"cult, horror, gothic, murder, atmospheric",train,imdb
1,tt1733125,Dungeons & Dragons: The Book of Vile Darkness,"Two thousand years ago, Nhagruul the Foul, a s...",violence,train,imdb
2,tt0033045,The Shop Around the Corner,"Matuschek's, a gift store in Budapest, is the ...",romantic,test,imdb
3,tt0113862,Mr. Holland's Opus,"Glenn Holland, not a morning person by anyone'...","inspiring, romantic, stupid, feel-good",train,imdb
4,tt0086250,Scarface,"In May 1980, a Cuban man named Tony Montana (A...","cruelty, murder, dramatic, cult, violence, atm...",val,imdb


In [4]:
df = df.drop(["imdb_id", "title", "synopsis_source", "split"], axis=1)

In [5]:
df.isnull().values.any()

False

In [6]:
df['tags'].values[0]

'cult, horror, gothic, murder, atmospheric'

In [7]:
def preprocess_tags(text):
    text = text.replace(', ', ' __label__')
    return f"__label__{text}"

In [8]:
df['tags'] = df['tags'].map(preprocess_tags)
# preprocess_tags("cult, romance, value")

In [9]:
df['tags'][0]

'__label__cult __label__horror __label__gothic __label__murder __label__atmospheric'

In [10]:
def preprocess_synopsis(text):
    # text = text.replace("\\n", "")
    # return text
    # text = text.replace(",", "")
    # text = text.replace("'", "")
    # text = re.sub(r'[^\w\s\']', '', text)
    for punctuation in string.punctuation:
        if punctuation in text:
            text = text.replace(punctuation, "")
    text = text.replace("\n", "")
    return text.strip().lower()

In [11]:
df["plot_synopsis"] = df["plot_synopsis"].map(preprocess_synopsis)

In [12]:
df.head()

Unnamed: 0,plot_synopsis,tags
0,note this synopsis is for the orginal italian ...,__label__cult __label__horror __label__gothic ...
1,two thousand years ago nhagruul the foul a sor...,__label__violence
2,matuscheks a gift store in budapest is the wor...,__label__romantic
3,glenn holland not a morning person by anyones ...,__label__inspiring __label__romantic __label__...
4,in may 1980 a cuban man named tony montana al ...,__label__cruelty __label__murder __label__dram...


In [13]:
train, test = train_test_split(df) 

In [14]:
train.to_csv("movie.train", columns=['tags', 'plot_synopsis'], sep='\t', index=False, header=False)
train.to_csv("movie.test", columns=['tags', 'plot_synopsis'], sep='\t', index=False, header=False)

In [19]:
model = fasttext.train_supervised(input="movie.train", epoch=25)
model.test("movie.test")

Read 9M words
Number of words:  211436
Number of labels: 69
Progress: 100.0% words/sec/thread: 2118709 lr:  0.000000 avg.loss:  3.407887 ETA:   0h 0m 0s


(11121, 0.5114647963312652, 0.17085185630181424)

In [20]:
model.get_nearest_neighbors("police")

[(0.9985209107398987, 'murdered'),
 (0.9981261491775513, 'crime'),
 (0.9959052205085754, 'killer'),
 (0.9951136708259583, 'murders'),
 (0.994271993637085, 'victim'),
 (0.9935495257377625, 'evidence'),
 (0.9932663440704346, 'shot'),
 (0.9932001829147339, 'death'),
 (0.9931970834732056, 'victims'),
 (0.9914525151252747, 'jim')]