# Cornell movies dialogues dataset

See https://www.cs.cornell.edu/~cristian/Cornell_Movie-Dialogs_Corpus.html

### Preprocess

In [20]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

import os
DATA_DIR = "../data/cornell_movie_dialogs_corpus"
PREPROCESSED_DATA_DIR = "../preprocessed_data"
print(os.listdir(DATA_DIR))

# Any results you write to the current directory are saved as output.

['movie_conversations.txt', 'raw_script_urls.txt', 'movie_lines.txt', 'README.txt', 'chameleons.pdf', 'movie_titles_metadata.txt', 'movie_characters_metadata.txt', '.DS_Store']


In [2]:
from tqdm import tqdm_notebook as tqdm
import pandas as pd
import time

In [3]:
!cat ../data/cornell_movie_dialogs_corpus/README.txt

Cornell Movie-Dialogs Corpus

Distributed together with:

"Chameleons in imagined conversations: A new approach to understanding coordination of linguistic style in dialogs"
Cristian Danescu-Niculescu-Mizil and Lillian Lee
Proceedings of the Workshop on Cognitive Modeling and Computational Linguistics, ACL 2011.

(this paper is included in this zip file)

NOTE: If you have results to report on these corpora, please send email to cristian@cs.cornell.edu or llee@cs.cornell.edu so we can add you to our list of people using this data.  Thanks!


Contents of this README:

	A) Brief description
	B) Files description
	C) Details on the collection procedure
	D) Contact


A) Brief description:

This corpus contains a metadata-rich collection of fictional conversations extracted from raw movie scripts:

- 220,579 conversational exchanges between 10,292 pairs of movie characters
- involves 9,035 characters from 617 movies
- in total 304,713 utterances
- movie metadata

In [4]:
!head -n 1000 ../input/movie_lines.txt

head: no se puede abrir '../input/movie_lines.txt' para lectura: No existe el archivo o el directorio


__Parse dialogue lines into pandas data frame__

In [5]:
def split_line(s_line, delim):
    res = []
    _buff = s_line
    while delim in _buff:
        _index = _buff.index(delim)
        res.append(_buff[:_index].strip())
        _buff = _buff[_index+len(delim):].strip()
    res.append(_buff)
    return res

# Test
# split_line("L578 +++$+++ u2 +++$+++ m0 +++$+++ CAMERON +++$+++ I believe we share an art instructor", "+++$+++")

In [6]:
def movie_lines_to_dataframe(file_path):
    """
    - movie_lines.txt
        - contains the actual text of each utterance
        - fields:
            - lineID
            - characterID (who uttered this phrase)
            - movieID
            - character name
            - text of the utterance
    """
    DELIM = "+++$+++"
    line_ids = []
    character_ids = []
    movie_ids = []
    character_names = []
    utterances = []    
    with open(file_path, encoding="utf-8", errors="ignore") as f:
        for l in tqdm(f.readlines()):
            _parsed_line = split_line(l, DELIM)
            line_ids.append(_parsed_line[0])
            character_ids.append(_parsed_line[1])
            movie_ids.append(_parsed_line[2])
            character_names.append(_parsed_line[3])
            utterances.append(_parsed_line[4])
    res = pd.DataFrame()
    res["LINE_ID"] = line_ids
    res["CHARACTER_ID"] = character_ids
    res["MOVIE_ID"] = movie_ids
    res["CHARACTER_NAME"] = character_names
    res["UTTERANCE"] = utterances
    return res
    

In [7]:
df_movie_lines = movie_lines_to_dataframe(DATA_DIR + "/movie_lines.txt")
# Save to disk
df_movie_lines.to_csv("movie_lines.csv")
df_movie_lines.to_pickle("movie_lines.pick")

Widget Javascript not detected.  It may not be installed or enabled properly.





In [8]:
df_movie_lines.dtypes

LINE_ID           object
CHARACTER_ID      object
MOVIE_ID          object
CHARACTER_NAME    object
UTTERANCE         object
dtype: object

In [9]:
!ls -alh

total 53M
drwxr-xr-x 3 wotan wotan 4,0K abr 14 17:53 .
drwxr-xr-x 5 wotan wotan 4,0K abr 14 15:56 ..
-rw-r--r-- 1 wotan wotan  23K abr 14 17:53 Cornell_movies_dialogues_database_preprocessng.ipynb
drwxr-xr-x 2 wotan wotan 4,0K abr 14 15:00 .ipynb_checkpoints
-rw-r--r-- 1 wotan wotan  26M abr 14 18:38 movie_lines.csv
-rw-r--r-- 1 wotan wotan  27M abr 14 18:38 movie_lines.pick


__Translate to Spanish__

In [10]:
# Count number of characters in utterances
def count_chars(texts_list):
    count = 0
    for l in texts_list:
        count += len(l)
    return count

count_chars(df_movie_lines.UTTERANCE.values)

16838300

In [11]:
# Get the first 2 million chars
def get_max_index(texts_list, max_length, start_index=0):
    count = 0
    index = start_index
    for l in texts_list[start_index:]:
        count += len(l)
        if count >= max_length:
            return index
        index += 1
    return index


In [12]:
# Get first 2 million characters index
_start_index = 0
index = get_max_index(df_movie_lines.UTTERANCE.values, 2e6, start_index=_start_index)
count_chars(df_movie_lines.UTTERANCE.values[_start_index:index])

1999849

In [13]:
# Split dataframe into <2 million chars chunks
split_indices = []
CHARS_PER_CHUNK = 2e6

_split_index = 0
_chars_len = count_chars(df_movie_lines.UTTERANCE.values[_split_index:])
while (_chars_len > CHARS_PER_CHUNK):
    _split_index = get_max_index(df_movie_lines.UTTERANCE.values, CHARS_PER_CHUNK, start_index=_split_index)
    _chars_len = count_chars(df_movie_lines.UTTERANCE.values[_split_index:])
    split_indices.append(_split_index)
if (split_indices[-1] < len(df_movie_lines)-1):
    split_indices.append(len(df_movie_lines)-1)

# Indices to split the dataframe
print("Split indices: {}".format(split_indices))

for _i, _index in enumerate(split_indices):
    if _i > 0:
        print("{} - {} num chars: {}".format(split_indices[_i-1], _index, count_chars(df_movie_lines.UTTERANCE.values[split_indices[_i-1]:_index])))


Split indices: [36222, 72819, 108282, 144941, 182428, 218582, 254564, 290490, 304712]
36222 - 72819 num chars: 1999754
72819 - 108282 num chars: 1999989
108282 - 144941 num chars: 1999953
144941 - 182428 num chars: 1999930
182428 - 218582 num chars: 1999990
218582 - 254564 num chars: 1999873
254564 - 290490 num chars: 1999961
290490 - 304712 num chars: 838927


In [14]:
MOVIE_LINES_CHUNK_PREFIX = "../preprocessed_data/movie_lines_chunk_{}"
for i, index in enumerate(split_indices):
    if i > 0:
        _df = df_movie_lines.loc[split_indices[i-1]:index]
    else:
        _df = df_movie_lines.loc[:index]
    _df.to_csv(MOVIE_LINES_CHUNK_PREFIX.format(i) + ".csv")
    _df.to_pickle(MOVIE_LINES_CHUNK_PREFIX.format(i) + ".pick")

In [15]:
!ls ../preprocessed_data -alh

total 53M
drwxr-xr-x 2 wotan wotan 4,0K abr 14 16:35 .
drwxr-xr-x 5 wotan wotan 4,0K abr 14 15:56 ..
-rw-r--r-- 1 wotan wotan 3,0M abr 14 18:39 movie_lines_chunk_0.csv
-rw-r--r-- 1 wotan wotan 3,1M abr 14 18:39 movie_lines_chunk_0.pick
-rw-r--r-- 1 wotan wotan 3,1M abr 14 18:39 movie_lines_chunk_1.csv
-rw-r--r-- 1 wotan wotan 3,2M abr 14 18:39 movie_lines_chunk_1.pick
-rw-r--r-- 1 wotan wotan 3,1M abr 14 18:39 movie_lines_chunk_2.csv
-rw-r--r-- 1 wotan wotan 3,2M abr 14 18:39 movie_lines_chunk_2.pick
-rw-r--r-- 1 wotan wotan 3,1M abr 14 18:39 movie_lines_chunk_3.csv
-rw-r--r-- 1 wotan wotan 3,2M abr 14 18:39 movie_lines_chunk_3.pick
-rw-r--r-- 1 wotan wotan 3,2M abr 14 18:39 movie_lines_chunk_4.csv
-rw-r--r-- 1 wotan wotan 3,3M abr 14 18:39 movie_lines_chunk_4.pick
-rw-r--r-- 1 wotan wotan 3,1M abr 14 18:39 movie_lines_chunk_5.csv
-rw-r--r-- 1 wotan wotan 3,2M abr 14 18:39 movie_lines_chunk_5.pick
-rw-r--r-- 1 wotan wotan 3,1M abr 14 18:39 movie_lines_chunk_6.csv
-rw-r-

### Translate to Spanish
Translate utterances to Spanish using Azure 

In [16]:
# -*- coding: utf-8 -*-
import os, requests, uuid, json

In [17]:
# Checks to see if the Translator Text subscription key is available
# as an environment variable. If you are setting your subscription key as a
# string, then comment these lines out.
if 'TRANSLATOR_TEXT_KEY' in os.environ:
    subscriptionKey = os.environ['TRANSLATOR_TEXT_KEY']
else:
    print('Environment variable for TRANSLATOR_TEXT_KEY is not set.')
    exit()
# If you want to set your subscription key as a string, uncomment the line
# below and add your subscription key.
#subscriptionKey = 'put_your_key_here'

In [46]:
# Requests elements

base_url = 'https://api.cognitive.microsofttranslator.com'
path = '/translate?api-version=3.0'
params = '&to=es'
constructed_url = base_url + path + params

headers = {
    'Ocp-Apim-Subscription-Key': subscriptionKey,
    'Content-type': 'application/json',
    'X-ClientTraceId': str(uuid.uuid4())
}

In [39]:
def translate_text_azure(texts, headers, language="en", destination_language="es"):
    """
    Translate a list of texts to one language.
    """
#     body = [{
#         'text' : text
#     }]
    body = [{"text":t, "from":language, "to":destination_language} for t in texts]
    request = requests.post(constructed_url, headers=headers, json=body)
    response = request.json()
    # Extract results
    res = []
    for r in response:
        # Take the first translation
        res.append(r["translations"][0]["text"])
    return res, response

__Sample response__

    [
        {
            "detectedLanguage": {
                "language": "en",
                "score": 1.0
            },
            "translations": [
                {
                    "text": "Hallo Welt!",
                    "to": "de"
                },
                {
                    "text": "Salve, mondo!",
                    "to": "it"
                }
            ]
        }
    ]


In [21]:
# Load dataframe to translate
df = pd.read_pickle(PREPROCESSED_DATA_DIR+"/movie_lines_chunk_0.pick")

In [22]:
df

Unnamed: 0,LINE_ID,CHARACTER_ID,MOVIE_ID,CHARACTER_NAME,UTTERANCE
0,L1045,u0,m0,BIANCA,They do not!
1,L1044,u2,m0,CAMERON,They do to!
2,L985,u0,m0,BIANCA,I hope so.
3,L984,u2,m0,CAMERON,She okay?
4,L925,u0,m0,BIANCA,Let's go.
5,L924,u2,m0,CAMERON,Wow
6,L872,u0,m0,BIANCA,Okay -- you're gonna need to learn how to lie.
7,L871,u2,m0,CAMERON,No
8,L870,u0,m0,BIANCA,I'm kidding. You know how sometimes you just ...
9,L869,u0,m0,BIANCA,Like my fear of wearing pastels?


In [36]:
df.iloc[12]

LINE_ID                                                        L866
CHARACTER_ID                                                     u2
MOVIE_ID                                                         m0
CHARACTER_NAME                                              CAMERON
UTTERANCE         I figured you'd get to the good stuff eventually.
Name: 12, dtype: object

In [38]:
_texts = df.UTTERANCE.values[:4]
_texts

array(['They do not!', 'They do to!', 'I hope so.', 'She okay?'],
      dtype=object)

In [47]:
r1, r2 = translate_text_azure(_texts, headers, language="en", destination_language="es")

In [48]:
r2

[{'detectedLanguage': {'language': 'en', 'score': 1.0},
  'translations': [{'text': '¡ No lo hacen!', 'to': 'es'}]},
 {'detectedLanguage': {'language': 'en', 'score': 1.0},
  'translations': [{'text': '¡ Lo hacen!', 'to': 'es'}]},
 {'detectedLanguage': {'language': 'en', 'score': 1.0},
  'translations': [{'text': 'Eso espero.', 'to': 'es'}]},
 {'detectedLanguage': {'language': 'en', 'score': 1.0},
  'translations': [{'text': '¿Está bien?', 'to': 'es'}]}]

In [105]:
class Translator(object):
    _SEPARATOR = "$___$___$"
    
    def __init__(self, language="en", destination_language="es"):
        # Checks to see if the Translator Text subscription key is available
        # as an environment variable. If you are setting your subscription key as a
        # string, then comment these lines out.
        if 'TRANSLATOR_TEXT_KEY' in os.environ:
            subscriptionKey = os.environ['TRANSLATOR_TEXT_KEY']
        else:
            print('Environment variable for TRANSLATOR_TEXT_KEY is not set.')
            exit()

        # If you want to set your subscription key as a string, uncomment the line
        # below and add your subscription key.
        #subscriptionKey = 'put_your_key_here'
        
        base_url = 'https://api.cognitive.microsofttranslator.com'
        path = '/translate?api-version=3.0'
        params = '&to=es'
        self.constructed_url = base_url + path + params

        self.headers = {
            'Ocp-Apim-Subscription-Key': subscriptionKey,
            'Content-type': 'application/json',
            'X-ClientTraceId': str(uuid.uuid4())
        }
        
    def translate_text_azure(self, texts, log_file="./translation_log"):
        """
        Translate a list of texts to one language.
        Args:
            texts: list of texts
            translation_log: file to append translation results
        """
    #     body = [{
    #         'text' : text
    #     }]
        body = [{"text":t} for t in texts]
        request = requests.post(self.constructed_url, headers=self.headers, json=body)
        response = request.json()
        # Extract results
        res = []
        if log_file is not None:
            with open(log_file + "_raw", "a") as f:
                for i, r in enumerate(response):
                    f.write("{} {} {}\n".format(texts[i], Translator._SEPARATOR, r))
        for r in response:
            # Take the first translation
            res.append(r["translations"][0]["text"])
        if log_file is not None:
            with open(log_file, "a") as f:
                for i, r in enumerate(res):
                    f.write("{} {} {}\n".format(texts[i], Translator._SEPARATOR, r))
            with open(log_file + "_raw", "a") as f:
                for i, r in enumerate(response):
                    f.write("{} {} {}\n".format(texts[i], Translator._SEPARATOR, r))
        return res, response     
        

In [106]:
translator = Translator(language="en", destination_language="es")

# r1, r2 = translator.translate_text_azure(_texts)

In [117]:
# translations = []
# complete_responses = []
# _offset = 10035
# for s in tqdm(df.UTTERANCE.values[_offset:]):
#     r1, r2 = translator.translate_text_azure([s])
#     translations.append(r1)
#     complete_responses.append(r2)


In [120]:
_offset = 15346
__indices = []
__num_steps = 20
for i in range(_offset, len(df)):
    if len(__indices) >= __num_steps:    
        r1, r2 = translator.translate_text_azure(df.UTTERANCE.values[__indices])
        __indices = []
        time.sleep(5)
    else:
        __indices.append(i)
    
if len(__indices > 0):
        r1, r2 = translator.translate_text_azure(__indices)


TypeError: string indices must be integers

In [96]:
# df["UTTERANCE_ES"] = translations
# df.to_csv(PREPROCESSED_DATA_DIR + "/movie_lines_chunk_0_translated.csv")
# df.to_pickle(PREPROCESSED_DATA_DIR + "/movie_lines_chunk_0_translated.pick")