In [35]:
from sklearn.model_selection import train_test_split
import itertools
import collections

# Upper bound on how many lines of the corpus file are loaded.
NUMBER_OF_SENTENCES = 100

# A special vocabulary entry: a reserved index paired with its marker string.
Token = collections.namedtuple("Token", "index word")

# Reserved markers; their indices match the first three slots of the
# index-to-word list that fetch_vocab builds.
SOS = Token(index=0, word="<sos>")
EOS = Token(index=1, word="<eos>")
PAD = Token(index=2, word="<pad>")

In [36]:
def load_from_big_file(file, sentence_length = 10):
    """Load fixed-length sentences from a text file and split them into train/test.

    At most ``NUMBER_OF_SENTENCES`` lines are read. Each line is stripped of
    surrounding whitespace and trailing periods, split on whitespace, cut to
    ``sentence_length`` words, terminated with a ``"."`` token and right-padded
    with ``PAD.word`` so that every sentence holds exactly
    ``sentence_length`` word slots plus the terminator.

    Args:
        file: Path of the text file; every line is treated as one sentence.
        sentence_length: Number of word slots per sentence (excluding the
            terminating ``"."``). Longer lines are truncated, shorter ones
            are padded.

    Returns:
        tuple: ``(s_train, s_test)`` - two lists of space-separated sentence
        strings, produced by a shuffled 90/10 ``train_test_split`` with a
        fixed ``random_state`` of 42.
    """
    sentences = []

    with open(file) as handle:
        # islice stops after NUMBER_OF_SENTENCES lines, so a large file is
        # never read into memory in full.
        for line in itertools.islice(handle, NUMBER_OF_SENTENCES):
            words = line.strip().rstrip(".").split()

            # Truncate and pad against sentence_length (not a fixed 10) so
            # every sentence has the same number of tokens for any setting.
            kept = words[:sentence_length]
            padding = [PAD.word] * (sentence_length - len(kept))

            # Joining a token list avoids a leading space when the line
            # contained no words at all.
            sentences.append(" ".join(kept + ["."] + padding))

    s_train, s_test = train_test_split(
        sentences, shuffle=True, test_size=0.1, random_state=42
    )
    return s_train, s_test

In [37]:
# Relative path of the text file that load_from_big_file reads line by line.
f = '../data/JAK2New.txt'

In [38]:
# Read the corpus and split it into training and held-out sentences.
s_train, s_test = load_from_big_file(f)

In [39]:
def fetch_vocab(DATA_GERMAN, DATA_ENGLISH, DATA_GERMAN2):
    """Determines the vocabulary, and provides mappings from indices to words and vice versa.

    Every sentence of the three corpora is split on whitespace and each token
    is lower-cased. The reserved tokens ``SOS``, ``EOS`` and ``PAD`` always
    occupy indices 0, 1 and 2; all other words follow in sorted order.

    Args:
        DATA_GERMAN: Iterable of sentence strings.
        DATA_ENGLISH: Iterable of sentence strings.
        DATA_GERMAN2: Iterable of sentence strings.

    Returns:
        tuple: A pair of mappings, index-to-word (``List[str]``) and
        word-to-index (``Dict[str, int]``).
    """
    reserved = [SOS.word, EOS.word, PAD.word]

    # gather all (lower-cased) words that appear in the data
    # NOTE(review): lower-casing merges tokens that differ only by case -
    # confirm that is intended for this corpus.
    all_words = set()
    for sentence in itertools.chain(DATA_GERMAN, DATA_ENGLISH, DATA_GERMAN2):
        # split() without an argument never yields empty-string tokens, even
        # when a sentence has leading or repeated spaces
        all_words.update(word.lower() for word in sentence.split())

    # drop reserved tokens after lower-casing so none of them can show up a
    # second time and shift its index away from 0/1/2 in word_to_idx
    all_words.difference_update(reserved)

    # create mapping from index to word
    idx_to_word = reserved + sorted(all_words)

    # create mapping from word to index
    word_to_idx = {word: idx for idx, word in enumerate(idx_to_word)}

    return idx_to_word, word_to_idx

In [40]:
# Build the vocabulary over both splits. s_train is passed twice; the words are
# collected into a set, so the repeated argument does not change the result.
idx_to_word, word_to_idx = fetch_vocab(s_train, s_train, s_test)

In [42]:
# Inspect the word -> index mapping.
word_to_idx

{'<sos>': 0,
 '<eos>': 1,
 '<pad>': 2,
 '#': 3,
 '(': 4,
 ')': 5,
 '+': 6,
 '-': 7,
 '.': 8,
 '1': 9,
 '2': 10,
 '=': 11,
 '@': 12,
 '[': 13,
 '\\': 14,
 ']': 15,
 'b': 16,
 'c': 17,
 'f': 18,
 'h': 19,
 'l': 20,
 'n': 21,
 'o': 22,
 'r': 23,
 's': 24}