In [None]:
import os
import pickle
import time
import xml.etree.ElementTree as ET
from datetime import datetime
from zipfile import ZipFile

import pandas as pd
import spacy
# Load the spaCy 'en_core_web_sm' pipeline once; `nlp` is called further down to
# tokenize subtitle text and obtain a lemma for every token.
nlp = spacy.load('en_core_web_sm')

# Extracting folders from the original zip-file

In [2]:
# Path of the downloaded archive. NOTE: machine-specific absolute path; adjust before running.
zipfile = "D:\\en.zip"
# Year folders 2000 ... 2018 (range end is exclusive).
years = [str(year) for year in range(2000, 2019)]
to_extract = ['OpenSubtitles/xml/en/{}'.format(year) for year in years]

with ZipFile(zipfile, 'r') as archive:
    # ZipFile.extract() handles exactly one archive member and does not recurse into
    # folders, so passing a year folder would not unpack the files stored below it.
    # Instead, collect every member located under one of the requested year folders.
    # The trailing slash prevents '.../2000' from also matching e.g. '.../20001'.
    year_prefixes = tuple(path + '/' for path in to_extract)
    members = [name for name in archive.namelist() if name.startswith(year_prefixes)]
    archive.extractall(members=members)

# Filtering the original archive

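The goal is to delete redundant files that don't need to be kept: a movie folder can contain several alternative subtitle files, and only one of them is retained.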


In [None]:
# Directory that receives every pickle file written by this notebook.
output_dir = 'out'
# exist_ok=True replaces the separate isdir() check and makes the cell safe to re-run.
os.makedirs(output_dir, exist_ok=True)
# Root of the extracted corpus: one sub-folder per year, one sub-folder per movie below it.
path_dir = 'OpenSubtitles/xml/en'
# os.listdir() returns entries in arbitrary order; sort so that years and movies are
# always visited in the same order (a later cell relies on the years being ordered).
years = sorted(os.listdir(path_dir))
# year -> list of movie folder names found for that year
movies = {year: sorted(os.listdir(os.path.join(path_dir, year))) for year in years}

In [1]:
# Variant WITH XML validation.
# It is disabled: the whole body is wrapped in a string literal, so running this cell
# only evaluates (and displays) that string and executes none of the code inside it.
# The wrapped code keeps, per movie, the first alternative that parses as XML and
# puts every other alternative on the deletion list.
'''
to_delete = []
to_keep = []
years_skip = []

for year, year_movies in movies.items():
    if year in years_skip:
        print("Skipping {}...".format(year))
        continue
    print("Year {} ({} movies)".format(year, len(year_movies)))
    for movie in year_movies: 
        movie_path = os.path.join(path_dir, year, movie)
        alternatives = os.listdir(movie_path)
        success = False
        for i, alt in enumerate(alternatives):
            xml_path = os.path.join(movie_path, alt)
            if not success:
                with open(xml_path, 'r', encoding='utf-8') as xml_file:
                    try:
                        tree = ET.parse(xml_file)
                        success = True
                        to_keep.append(xml_path)
                    except Exception as e:
                        to_delete.append(xml_path)
            else:
                to_delete.append(xml_path)
        if not success:
            print("No xml available for {}".format(movie_path))
#print(to_delete)
pickle.dump({'to_delete': to_delete, 'to_keep': to_keep}, open('to_delete_keep.pkl', 'wb'))
'''

'\nto_delete = []\nto_keep = []\nyears_skip = []\n\nfor year, year_movies in movies.items():\n    if year in years_skip:\n        print("Skipping {}...".format(year))\n        continue\n    print("Year {} ({} movies)".format(year, len(year_movies)))\n    for movie in year_movies: \n        movie_path = os.path.join(path_dir, year, movie)\n        alternatives = os.listdir(movie_path)\n        success = False\n        for i, alt in enumerate(alternatives):\n            xml_path = os.path.join(movie_path, alt)\n            if not success:\n                with open(xml_path, \'r\', encoding=\'utf-8\') as xml_file:\n                    try:\n                        tree = ET.parse(xml_file)\n                        success = True\n                        to_keep.append(xml_path)\n                    except Exception as e:\n                        to_delete.append(xml_path)\n            else:\n                to_delete.append(xml_path)\n        if not success:\n            print("No xml av

In [None]:
# Variant WITHOUT XML validation (faster): for every movie keep the first alternative
# and mark all remaining alternatives as redundant.
to_delete = []
to_keep = []
for year, year_movies in movies.items():
    print("Year {} ({} movies)".format(year, len(year_movies)))
    for movie in year_movies:
        movie_path = os.path.join(path_dir, year, movie)
        # Store full paths: the deletion cell passes these entries straight to
        # os.remove(), which cannot locate a bare file name.
        # Sorting makes the choice of the kept file reproducible across runs.
        alternatives = [os.path.join(movie_path, alt) for alt in sorted(os.listdir(movie_path))]
        if not alternatives:
            # An empty movie folder has nothing to keep or delete.
            print("No xml available for {}".format(movie_path))
            continue
        to_keep.append(alternatives[0])
        to_delete += alternatives[1:]

#pickle.dump({'to_delete': to_delete, 'to_keep': to_keep}, open(os.path.join(output_dir, 'to_delete_keep.pkl'), 'wb'))

In [None]:
# Optional (left disabled): restore the keep/delete lists from a previously saved
# pickle instead of recomputing them.
#with open(os.path.join(output_dir, 'to_delete_keep.pkl'), 'rb') as f:
#    dk1 = pickle.load(f)    
#to_delete = dk1['to_delete']
#to_keep = dk1['to_keep']
# Display how many files are kept and how many are scheduled for deletion.
len(to_keep), len(to_delete)

### Delete redundant files

Deletes files previously extracted from the original zip file that don't need to be kept. 

In [None]:
# Remove every redundant file; whenever that leaves its folder empty, remove the folder too.
for file_path in to_delete:
    try:
        os.remove(file_path)
    except FileNotFoundError:
        print('File not found', file_path)
    path, filename = os.path.split(file_path)
    # Nothing more to do unless the parent folder exists and is now empty.
    if not os.path.isdir(path) or os.listdir(path):
        continue
    try:
        os.rmdir(path)
    except FileNotFoundError:
        print('Directory not found', path)
    else:
        print('Directory {} removed'.format(path))

# Parsing and lemmatization

This step takes a long time: on my laptop it ran for about a week.
It produces one pickle file per year. Each file contains a dictionary that maps every movie ID to that movie's word-count dictionary (lemma → number of occurrences).

In [None]:
# --- Run configuration -------------------------------------------------------
print_every = 1000  # report progress every `print_every` processed movies
counter = 0         # number of movies processed so far
limit = -1          # stop after this many movies; values <= 0 disable the limit
num_movies = sum(len(movie_list) for movie_list in movies.values())
to_process = min(limit, num_movies) if limit > 0 else num_movies
print('{} movies to process'.format(to_process))
print('{} movies in total'.format(num_movies))

time_start = time.time()
for year, movie_list in movies.items():
    # Word counts of all movies of the current year: movie ID -> {lemma: count}.
    movie_word_counts = {}
    print('Year {}'.format(year))
    for movie in movie_list:
        if counter % print_every == 0 and counter > 0:
            # The rate is measured over the most recent `print_every` movies only,
            # because time_start is reset after every report.
            rate = print_every/(time.time()-time_start)
            eta = (to_process-counter)/rate
            print('{}/{}, rate={}/s, eta=t+{}s ({})'.format(counter, num_movies, rate, eta, datetime.fromtimestamp(time.time()+int(eta))))
            time_start = time.time()
        movie_path = os.path.join(path_dir, year, movie)
        # Use the first file in the movie folder as the movie's subtitle file.
        xml_path = os.path.join(movie_path, os.listdir(movie_path)[0])
        word_counts = {}
        # Give ElementTree the path so the parser reads the raw bytes and applies the
        # encoding declared in the XML itself. Opening the file in text mode would
        # decode it with the platform's default encoding instead.
        root = ET.parse(xml_path).getroot()
        for child in root:
            # Rebuild the text of this element from its <w> tokens.
            sequence = ''
            for w in child.findall('w'):
                text = w.text
                if not text:
                    # Empty <w/> elements have text None; there is nothing to append.
                    continue
                if text[0] == "'" or not sequence:
                    # Tokens starting with an apostrophe (e.g. "'s") are glued to the
                    # previous token; the first token never gets a leading space.
                    sequence += text
                else:
                    sequence += ' ' + text
            if not sequence:
                # No words in this element, so there is nothing to lemmatize.
                continue
            for token in nlp(sequence):
                if token.is_punct:
                    continue
                lemma = str(token.lemma_).lower()
                word_counts[lemma] = word_counts.get(lemma, 0)+1
        movie_word_counts[movie] = word_counts
        counter += 1
        if counter == limit:
            break
    # Write the (possibly partial) counts of this year; `with` guarantees the file
    # is flushed and closed.
    with open(os.path.join(output_dir, '{}-word-counts.pkl'.format(str(year))), 'wb') as out_file:
        pickle.dump(movie_word_counts, out_file)
    if counter == limit:
        break

# Putting it together

Create a sparse word-document matrix and a vocabulary.

### Creating vocabulary

In [None]:
# All extracted years except 2018. os.listdir() returns entries in arbitrary order, so
# slicing off the last entry does not reliably drop 2018; sort and exclude it explicitly.
years = [year for year in sorted(os.listdir(path_dir)) if year != '2018']
corpus_by_year = {}  # year -> {word: count summed over all movies of that year}
total = 0            # number of counted word occurrences over all years

for year in years:
    word_count = {}
    with open(os.path.join(output_dir, '{}-word-counts.pkl'.format(year)), 'rb') as f:
        # NOTE: this rebinds `movies` (until now: year -> list of movie IDs) to the
        # current year's {movie ID: word counts} dictionary.
        movies = pickle.load(f)
    print('Year {}, {} movies'.format(year, len(movies)))
    for movie, wc in movies.items():
        for word, count in wc.items():
            word_count[word] = word_count.get(word, 0)+count
            total += count
    corpus_by_year[year] = word_count
print('{} words in total'.format(total))

In [None]:
# Load the BLP item list and keep the spellings of all rows whose lexicality is 'W'
# (presumably real words as opposed to nonwords; confirm against the BLP documentation).
# keep_default_na=False: by default read_csv converts cells such as "null", "nan" or
# "NA" to NaN, which would silently remove those spellings from the stimulus list.
stimuli = pd.read_csv('./BLP/blp-items.txt', sep='\t', usecols=['spelling', 'lexicality'],
                      keep_default_na=False)
stimuli = stimuli[stimuli['lexicality']=='W']['spelling'].values.flatten()

# Union of all words seen in any year; update() grows one set in place instead of
# building a new set per year.
vocab = set()
for year, wc in corpus_by_year.items():
    vocab.update(wc.keys())
print(len(vocab))

# Restrict the vocabulary to the BLP spellings; sorting fixes the word -> index mapping.
vocab_blp = sorted(vocab.intersection(stimuli))
word2idx = {w:i for i, w in enumerate(vocab_blp)}
# A bare expression in the middle of a cell displays nothing, so print the size.
print(len(vocab_blp))

# The name `vocab_dict` was never defined, so dumping it raised NameError. word2idx is
# the only vocabulary dictionary built in this cell, so it is saved under that name.
# NOTE(review): confirm that word2idx is the object meant to be stored in vocab.pkl.
vocab_dict = word2idx
with open(os.path.join(output_dir, 'vocab.pkl'), 'wb') as vocab_file:
    pickle.dump(vocab_dict, vocab_file)

### Creating word-document matrix

In [None]:
# Number of documents (movies) stored in each year's word-count pickle, in `years` order.
yearly_ndocs = []
for year in years:
    counts_file = os.path.join(output_dir, '{}-word-counts.pkl'.format(year))
    with open(counts_file, 'rb') as f:
        movies = pickle.load(f)
    yearly_ndocs.append(len(movies))
# Total number of documents over all years.
ndocs = sum(yearly_ndocs)
print('{} documents in total'.format(ndocs))