In [1]:
import numpy as np
import pandas as pd
import os
import re
import math
import random

from gensim.summarization import summarize

## Load datasets

In [2]:
# Read the tab-separated raw dataset and report how many rows it contains.
data_path = 'data.csv'
df = pd.read_csv(data_path, encoding='utf-8', sep='\t')
print('total instances from original dataset:', len(df))

total instances from original dataset: 32834


## Clean dataset

TODO: delete anything that starts with backslash \\[a-z]

In [3]:
# Clean the text columns, then keep only rows whose synopsis / plot lengths
# fall inside the configured bounds.

def _normalize_text(col):
    """Apply the shared character-level clean-up to a lowercase text column.

    Steps, in order: hyphens become spaces; every character other than
    a-z, 0-9, comma, period and space is removed; whitespace that directly
    precedes punctuation or a space is dropped (this also collapses runs of
    spaces); leading/trailing whitespace is stripped.

    Args:
        col: pandas Series of strings (expected to be lowercased already,
            because the character whitelist only contains a-z).

    Returns:
        A new Series with the clean-up applied.
    """
    # regex= is passed explicitly on every call: the pandas default for
    # Series.str.replace differs between major versions, and with
    # regex=False the patterns below would be matched as literal text.
    col = col.str.replace("-", " ", regex=False)
    col = col.str.replace(r"[^a-z0-9,.\ ]", "", regex=True)
    col = col.str.replace(r'\s+([?.!,"\ ])', r'\1', regex=True)
    return col.str.strip()


# Synopsis: drop brackets, quotes and periods, keep only the text before the
# first ':', then lowercase.
# NOTE(review): presumably the raw column is a stringified list of synopses
# and this keeps the first one -- confirm against data.csv.
synopsis = (
    df['Synopsis']
    .str.replace(r'\[|\]|\'|\.|\"', '', regex=True)
    .str.split(':').str[0]
    .str.lower()
)
df['Synopsis'] = _normalize_text(synopsis)

# Titles are only lowercased; plots are lowercased and normalized.
df['MovieTitle'] = df['MovieTitle'].str.lower()
df['MoviePlot'] = _normalize_text(df['MoviePlot'].str.lower())

# Drop plots that still mention "http". na=False keeps the mask boolean when
# a plot is missing; such rows are removed by the length filter below.
df = df[~df['MoviePlot'].str.contains("http", regex=False, na=False)]

# Length bounds in characters (all comparisons are strict):
#   MIN_LEN_SYNOPSIS < len(Synopsis)  < MAX_LEN_SYNOPSIS
#   MAX_LEN_SYNOPSIS < len(MoviePlot) < MAX_LEN
MAX_LEN = 5000
MAX_LEN_SYNOPSIS = 280
MIN_LEN_SYNOPSIS = 25
synopsis_len = df['Synopsis'].str.len()
plot_len = df['MoviePlot'].str.len()
keep = (
    (synopsis_len < MAX_LEN_SYNOPSIS)
    & (synopsis_len > MIN_LEN_SYNOPSIS)
    & (plot_len < MAX_LEN)
    & (plot_len > MAX_LEN_SYNOPSIS)
)
# .copy() gives an independent frame so that adding columns to it later does
# not trigger SettingWithCopyWarning.
df = df[keep].copy()

df.head(25)

Unnamed: 0,MovieId,MovieTitle,Year,MoviePlot,Synopsis
0,3217,army of darkness,1992,"after being pulled through a time portal, ash ...","a man is accidentally transported to 1300 ad, ..."
2,3746,blade runner,1982,"hatnote in los angeles, november 2019, retired...",a blade runner must pursue and terminate four ...
5,4227,barry lyndon,1975,by what means redmond barry acquired the style...,an irish rogue wins the heart of a rich widow ...
6,4231,buffy the vampire slayer,1992,buffy summers is introduced as a stereotypical...,a flighty teenage girl learns that she is her ...
7,4560,braveheart,1995,"in the 13th century, after several years of po...",when his secret bride is executed for assaulti...
8,4726,batman,1989,"as a child, bruce wayne witnesses his parents ...",the dark knight of gotham city begins his war ...
9,4727,batman,1966,when batman and robin get a tip that commodore...,the caped crusader and his young ward battle e...
10,4728,batman returns,1992,a deformed baby boy is thrown into gotham city...,when a corrupt businessman and the grotesque p...
11,4729,batman & robin,1997,"in gotham city, batman and robin attempt to st...",batman and robin try to keep their relationshi...
12,4730,batman forever,1995,"in gotham city, batman stops a hostage situati...",batman must battle former district attorney ha...


In [4]:
# Number of rows that survived cleaning.
total = len(df)
print('total instances from cleaned dataset:', total)

total instances from cleaned dataset: 19060


In [5]:
# Spot-check one randomly chosen row: print its position, plot and synopsis.
# random.randint includes both endpoints, so randint(0, total) could return
# `total`, which is out of range for .iloc; randrange(total) draws from
# 0..total-1 only.
n = random.randrange(total)
print(n)
print()
print(df['MoviePlot'].iloc[n])
print()
print(df['Synopsis'].iloc[n])

16625

in the early 1840s, wells fargo employee ramsay mackay comes upon a broken down carriage in the countryside and gives belle justine pryor and her mother a lift into buffalo, new york, though he warns them he is in a hurry to make a delivery of fresh oysters. the ladies endure a very bumpy ride, and he arrives in time to enable his employer, henry wells, to impress some bankers with the speed of his service. wells sends him to set up a branch office in st. louis, which is quite convenient, as the pryors reside there. mackay and justine begin seeing each other, though her mother disapproves, as does justines more socially prominent suitor, talbot carter. impressed with mackay, in 1846, wells sends him to open trails to california. mackay takes along hank york, a frontiersman who only works when he has to, and hanks constant indian companion, pawnee. among his many duties, mackay sets out to transport gold from a mining settlement to san francisco. one of his customers is prospecto

## Create a summary for each movie plot

In [7]:
def augment_summarize(x):
    """Summarize one text with `summarize` and flatten it to a single line.

    Args:
        x: Text to summarize.

    Returns:
        The summary with newlines replaced by spaces, or "" when `x` is not
        a string or when summarization raises an error.
    """
    # Missing / non-text values cannot be summarized; return the same ""
    # sentinel that the error path produces.
    if not isinstance(x, str):
        return ""
    try:
        res = summarize(x)
        return res.replace('\n', ' ')
    except Exception:
        # Deliberately best-effort: a text that cannot be summarized yields
        # "" instead of aborting the whole run. Catching Exception (not a
        # bare `except:`) lets KeyboardInterrupt / SystemExit propagate, so
        # a long-running apply over the dataset can still be interrupted.
        return ""

In [8]:
# Generate one summary per plot and store it in a new column.
df['MoviePlotSummary'] = df['MoviePlot'].apply(augment_summarize)
df.head()

Unnamed: 0,MovieId,MovieTitle,Year,MoviePlot,Synopsis,MoviePlotSummary
0,3217,army of darkness,1992,"after being pulled through a time portal, ash ...","a man is accidentally transported to 1300 ad, ...","after being pulled through a time portal, ash ..."
2,3746,blade runner,1982,"hatnote in los angeles, november 2019, retired...",a blade runner must pursue and terminate four ...,deckard watches a video of another blade runne...
5,4227,barry lyndon,1975,by what means redmond barry acquired the style...,an irish rogue wins the heart of a rich widow ...,nora and her family plan to relieve their pove...
6,4231,buffy the vampire slayer,1992,buffy summers is introduced as a stereotypical...,a flighty teenage girl learns that she is her ...,"meanwhile oliver pike, and best friend benny, ..."
7,4560,braveheart,1995,"in the 13th century, after several years of po...",when his secret bride is executed for assaulti...,"when he returns home, wallace falls in love wi..."


# Keep only rows whose generated summary is longer than 10 characters, then
# refresh the row count.
has_summary = df['MoviePlotSummary'].str.len() > 10
df = df[has_summary]
total = len(df)
print("updated total:", total)

In [14]:
# Spot-check one randomly chosen row: print its summary and its synopsis.
# randrange(len(df)) draws from 0..len(df)-1; the inclusive upper bound of
# random.randint(0, df.shape[0]) could produce an out-of-range position for
# .iloc.
n = random.randrange(len(df))
print(df['MoviePlotSummary'].iloc[n])
print()
print(df['Synopsis'].iloc[n])

en route home after a day trip to the familys summer cottage with his father, timmy purchases a bouquet of roses and suggests john present them to his wife.

a young man returning home from world war ii finds himself caught up in his parents turbulent relationship


## Partition Dataset

In [15]:
# Partition sizes: 95% of the rows go to training; the remainder is split in
# half between test and cv, with cv receiving the extra row when the
# remainder is odd.
training = int(total * 0.95)
testing = total - training
test = int(testing / 2)
cv = testing - test

for label, size in (
    ("training size:", training),
    ("total testing size:", testing),
    ("test size:", test),
    ("cv size:", cv),
):
    print(label, size)

training size: 15014
total testing size: 791
test size: 395
cv size: 396


In [16]:
# Pick the row positions that are held out of training (test + cv combined).
# A dedicated seeded generator makes the split reproducible across kernel
# restarts without altering the global `random` state.
SPLIT_SEED = 42
# range(total) covers every row position 0..total-1; the earlier
# range(0, total-1) excluded the last row, so it could never be held out.
s = random.Random(SPLIT_SEED).sample(range(total), testing)
print("total len:", len(s))
# Preview up to ten sampled positions; slicing stays safe when fewer than
# ten positions were drawn.
for position in s[:10]:
    print(position)

total len: 791
12995
473
4761
13563
6383
10862
10171
1982
10480
12363


In [17]:
# Rows at the sampled positions form the held-out frame; every other row
# stays in the training frame.
held_out_labels = df.index[s]
df_testing = df.iloc[s]
df_train = df.drop(held_out_labels)
for frame in (df_train, df_testing):
    print(frame.shape)

(15014, 6)
(791, 6)


In [18]:
# Pick which positions inside the held-out frame become the test set; the
# remaining held-out rows are used for cv.
# A dedicated seeded generator keeps this selection reproducible without
# altering the global `random` state.
CV_SPLIT_SEED = 43
# range(testing) covers every held-out position 0..testing-1; the earlier
# range(0, testing-1) excluded the last one, so it could never be drawn.
t = random.Random(CV_SPLIT_SEED).sample(range(testing), test)
print("total len:", len(t))
# Preview up to ten sampled positions; slicing stays safe when fewer than
# ten positions were drawn.
for position in t[:10]:
    print(position)

total len: 395
159
377
653
63
671
691
10
273
674
767


In [19]:
# Held-out rows at the sampled positions become the test set; the rest of
# the held-out rows become the cv set.
df_test = df_testing.iloc[t]
df_cv = df_testing.drop(index=df_test.index)

In [20]:
# Preview the first rows of the test split.
df_test.head()

Unnamed: 0,MovieId,MovieTitle,Year,MoviePlot,Synopsis,MoviePlotSummary
9507,4451206,the hillz,2004,the film follows the experiences of a promisin...,a promising college athlete takes a turn for t...,after duff kills a kid who volunteers at the p...
952,202133,the rules of the game,1939,the film begins with the aviator andr jurieux ...,a bourgeois life in france at the onset of wor...,"he is greeted by his friend octave, who reveal..."
16335,10722657,kaun,1999,"in the malhotra household, a young woman is pr...","while alone in the house, a woman hears news o...","she is reluctant to open the door, even to tal..."
10884,5492073,deadly companion,1980,"michael taylor, played by michael sarrazin, is...",mentally tortured photojournalist attempts to ...,it is interesting to note that the cast of sct...
18661,13590180,magic kitchen,2004,yau became the proprietor and head chef of a s...,yau is on the lookout for love in modern day h...,yau cooks according to the large collection of...


In [21]:
# Preview the first rows of the cv split.
df_cv.head()

Unnamed: 0,MovieId,MovieTitle,Year,MoviePlot,Synopsis,MoviePlotSummary
25416,24152307,bulldog drummond escapes,1937,captain high bulldog drummond has just returne...,captain drummond becomes a prisoner when he in...,"as he goes to investigate, the woman drives aw..."
7615,3257958,the santa clause 3: the escape clause,2006,"scott calvinsanta claus, and his wife, carol, ...","santa, aka scott calvin, is faced with double ...",jack frost causes trouble for the family that ...
10584,5274154,storm boy,1976,storm boy likes to wander alone along the fier...,mike is a lonely australian boy living in a co...,"after a pelican mother is shot, storm boy resc..."
20257,15945437,cannibal apocalypse,1980,the film opens with a flashback to the vietnam...,giovanni lambardo radice and john saxon are vi...,the film opens with a flashback to the vietnam...
2923,1011144,the farmer's wife,1928,"tibby, the wife of samuel sweetland dies, and ...","after his daughter weds, a middle aged widower...","tibby, the wife of samuel sweetland dies, and ..."


In [22]:
# Preview the first rows of the training split.
df_train.head()

Unnamed: 0,MovieId,MovieTitle,Year,MoviePlot,Synopsis,MoviePlotSummary
0,3217,army of darkness,1992,"after being pulled through a time portal, ash ...","a man is accidentally transported to 1300 ad, ...","after being pulled through a time portal, ash ..."
2,3746,blade runner,1982,"hatnote in los angeles, november 2019, retired...",a blade runner must pursue and terminate four ...,deckard watches a video of another blade runne...
5,4227,barry lyndon,1975,by what means redmond barry acquired the style...,an irish rogue wins the heart of a rich widow ...,nora and her family plan to relieve their pove...
6,4231,buffy the vampire slayer,1992,buffy summers is introduced as a stereotypical...,a flighty teenage girl learns that she is her ...,"meanwhile oliver pike, and best friend benny, ..."
7,4560,braveheart,1995,"in the 13th century, after several years of po...",when his secret bride is executed for assaulti...,"when he returns home, wallace falls in love wi..."


In [23]:
# Report the row count of each split, in the order train, test, cv.
for split in (df_train, df_test, df_cv):
    print(len(split))

15014
395
396


In [24]:
# Write the full cleaned dataset and the three splits as tab-separated,
# UTF-8 files without the index column.
OUTPUT_DIR = "./data"
# to_csv does not create missing parent directories, so make sure the target
# folder exists before writing (no-op when it is already there).
os.makedirs(OUTPUT_DIR, exist_ok=True)
for frame, filename in (
    (df, "dataset.csv"),
    (df_train, "train.csv"),
    (df_test, "test.csv"),
    (df_cv, "cv.csv"),
):
    frame.to_csv(os.path.join(OUTPUT_DIR, filename), sep='\t', encoding='utf-8', index=False)