# Cornell movies dialogues dataset

In [10]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

import os
DATA_DIR = "../data/cornell_movie_dialogs_corpus"
print(os.listdir(DATA_DIR))

# Any results you write to the current directory are saved as output.

['movie_conversations.txt', 'raw_script_urls.txt', 'movie_lines.txt', 'README.txt', 'chameleons.pdf', 'movie_titles_metadata.txt', 'movie_characters_metadata.txt', '.DS_Store']


In [29]:
from tqdm import tqdm_notebook as tqdm
import pandas as pd

In [30]:
!cat ../data/cornell_movie_dialogs_corpus/README.txt

Cornell Movie-Dialogs Corpus

Distributed together with:

"Chameleons in imagined conversations: A new approach to understanding coordination of linguistic style in dialogs"
Cristian Danescu-Niculescu-Mizil and Lillian Lee
Proceedings of the Workshop on Cognitive Modeling and Computational Linguistics, ACL 2011.

(this paper is included in this zip file)

NOTE: If you have results to report on these corpora, please send email to cristian@cs.cornell.edu or llee@cs.cornell.edu so we can add you to our list of people using this data.  Thanks!


Contents of this README:

	A) Brief description
	B) Files description
	C) Details on the collection procedure
	D) Contact


A) Brief description:

This corpus contains a metadata-rich collection of fictional conversations extracted from raw movie scripts:

- 220,579 conversational exchanges between 10,292 pairs of movie characters
- involves 9,035 characters from 617 movies
- in total 304,713 utterances
- movie metadata

In [31]:
!head -n 1000 ../input/movie_lines.txt

head: no se puede abrir '../input/movie_lines.txt' para lectura: No existe el archivo o el directorio


__Parse dialogue lines into pandas data frame__

In [32]:
def split_line(s_line, delim):
    res = []
    _buff = s_line
    while delim in _buff:
        _index = _buff.index(delim)
        res.append(_buff[:_index].strip())
        _buff = _buff[_index+len(delim):].strip()
    res.append(_buff)
    return res

# Test
# split_line("L578 +++$+++ u2 +++$+++ m0 +++$+++ CAMERON +++$+++ I believe we share an art instructor", "+++$+++")

In [33]:
def movie_lines_to_dataframe(file_path):
    """
    - movie_lines.txt
        - contains the actual text of each utterance
        - fields:
            - lineID
            - characterID (who uttered this phrase)
            - movieID
            - character name
            - text of the utterance
    """
    DELIM = "+++$+++"
    line_ids = []
    character_ids = []
    movie_ids = []
    character_names = []
    utterances = []    
    with open(file_path, encoding="utf-8", errors="ignore") as f:
        for l in tqdm(f.readlines()):
            _parsed_line = split_line(l, DELIM)
            line_ids.append(_parsed_line[0])
            character_ids.append(_parsed_line[1])
            movie_ids.append(_parsed_line[2])
            character_names.append(_parsed_line[3])
            utterances.append(_parsed_line[4])
    res = pd.DataFrame()
    res["LINE_ID"] = line_ids
    res["CHARACTER_ID"] = character_ids
    res["MOVIE_ID"] = movie_ids
    res["CHARACTER_NAME"] = character_names
    res["UTTERANCE"] = utterances
    return res
    

In [34]:
df_movie_lines = movie_lines_to_dataframe(DATA_DIR + "/movie_lines.txt")
# Save to disk
df_movie_lines.to_csv("movie_lines.csv")
df_movie_lines.to_pickle("movie_lines.pick")

Widget Javascript not detected.  It may not be installed or enabled properly.





In [35]:
df_movie_lines.dtypes

LINE_ID           object
CHARACTER_ID      object
MOVIE_ID          object
CHARACTER_NAME    object
UTTERANCE         object
dtype: object

In [36]:
!ls -alh

total 53M
drwxr-xr-x 3 wotan wotan 4,0K abr 14 15:14 .
drwxr-xr-x 4 wotan wotan 4,0K abr 14 14:55 ..
-rw-r--r-- 1 wotan wotan 131K abr 14 15:14 Cornell_movies_dialogues_database_preprocessng.ipynb
drwxr-xr-x 2 wotan wotan 4,0K abr 14 15:00 .ipynb_checkpoints
-rw-r--r-- 1 wotan wotan  26M abr 14 15:14 movie_lines.csv
-rw-r--r-- 1 wotan wotan  27M abr 14 15:14 movie_lines.pick


__Translate to Spanish__

In [37]:
# Count number of characters in utterances
def count_chars(texts_list):
    count = 0
    for l in texts_list:
        count += len(l)
    return count

count_chars(df_movie_lines.UTTERANCE.values)

16838300

In [38]:
# Get the first 2 million chars
def get_max_index(texts_list, max_length, start_index=0):
    count = 0
    index = start_index
    for l in texts_list[start_index:]:
        count += len(l)
        if count >= max_length:
            return index
        index += 1
    return index


In [39]:
# Get first 2 million characters index
_start_index = 0
index = get_max_index(df_movie_lines.UTTERANCE.values, 2e6, start_index=_start_index)
count_chars(df_movie_lines.UTTERANCE.values[_start_index:index])

1999849

In [40]:
# Split dataframe into <2 million chars chunks
split_indices = []
CHARS_PER_CHUNK = 2e6

_split_index = 0
_chars_len = count_chars(df_movie_lines.UTTERANCE.values[_split_index:])
while (_chars_len > CHARS_PER_CHUNK):
    _split_index = get_max_index(df_movie_lines.UTTERANCE.values, CHARS_PER_CHUNK, start_index=_split_index)
    _chars_len = count_chars(df_movie_lines.UTTERANCE.values[_split_index:])
    split_indices.append(_split_index)
if (split_indices[-1] < len(df_movie_lines)-1):
    split_indices.append(len(df_movie_lines)-1)

# Indices to split the dataframe
print("Split indices: {}".format(split_indices))

for _i, _index in enumerate(split_indices):
    if _i > 0:
        print("{} - {} num chars: {}".format(split_indices[_i-1], _index, count_chars(df_movie_lines.UTTERANCE.values[split_indices[_i-1]:_index])))


Split indices: [36222, 72819, 108282, 144941, 182428, 218582, 254564, 290490, 304712]
36222 - 72819 num chars: 1999754
72819 - 108282 num chars: 1999989
108282 - 144941 num chars: 1999953
144941 - 182428 num chars: 1999930
182428 - 218582 num chars: 1999990
218582 - 254564 num chars: 1999873
254564 - 290490 num chars: 1999961
290490 - 304712 num chars: 838927


In [42]:
MOVIE_LINES_CHUNK_PREFIX = "../preprocessed_data/movie_lines_chunk_{}"
for i, index in enumerate(split_indices):
    if i > 0:
        _df = df_movie_lines.loc[split_indices[i-1]:index]
        _df.to_csv(MOVIE_LINES_CHUNK_PREFIX.format(i) + ".csv")
        _df.to_pickle(MOVIE_LINES_CHUNK_PREFIX.format(i) + ".pick")
        

In [44]:
!ls ../preprocessed_data -alh

total 47M
drwxr-xr-x 2 wotan wotan 4,0K abr 14 15:56 .
drwxr-xr-x 5 wotan wotan 4,0K abr 14 15:56 ..
-rw-r--r-- 1 wotan wotan 3,1M abr 14 15:56 movie_lines_chunk_1.csv
-rw-r--r-- 1 wotan wotan 3,2M abr 14 15:56 movie_lines_chunk_1.pick
-rw-r--r-- 1 wotan wotan 3,1M abr 14 15:56 movie_lines_chunk_2.csv
-rw-r--r-- 1 wotan wotan 3,2M abr 14 15:56 movie_lines_chunk_2.pick
-rw-r--r-- 1 wotan wotan 3,1M abr 14 15:56 movie_lines_chunk_3.csv
-rw-r--r-- 1 wotan wotan 3,2M abr 14 15:56 movie_lines_chunk_3.pick
-rw-r--r-- 1 wotan wotan 3,2M abr 14 15:56 movie_lines_chunk_4.csv
-rw-r--r-- 1 wotan wotan 3,3M abr 14 15:56 movie_lines_chunk_4.pick
-rw-r--r-- 1 wotan wotan 3,1M abr 14 15:56 movie_lines_chunk_5.csv
-rw-r--r-- 1 wotan wotan 3,2M abr 14 15:56 movie_lines_chunk_5.pick
-rw-r--r-- 1 wotan wotan 3,1M abr 14 15:56 movie_lines_chunk_6.csv
-rw-r--r-- 1 wotan wotan 3,2M abr 14 15:56 movie_lines_chunk_6.pick
-rw-r--r-- 1 wotan wotan 3,1M abr 14 15:56 movie_lines_chunk_7.csv
-rw-r-