In [1]:
import re
import spacy
from spacy.tokenizer import Tokenizer
from os import listdir
from os.path import isfile, join


In [2]:
# Only tokenization and sentence splitting are used below, so the statistical
# components are switched off. The tagger's pipe name is 'tagger'; the former
# 'tagging' matched no pipe, which left the tagger enabled.
nlp = spacy.load('en', disable=['tagger','parser','ner'])

# With the parser disabled, the rule-based sentencizer supplies the sentence
# boundaries that are later read through doc.sents.
nlp.add_pipe(nlp.create_pipe('sentencizer'))

# Trucks Wiki (alternative input set, currently disabled):
#texts_dir = "/home/johannes/hypernym_discovery_data/fodina/files_for_exjobb_2019-02-04/extracted_text_trucks/"
#save_location = "/home/johannes/thesis_code/data_experimentation/tokenized_texts/tokenized_truck.txt"

# Volvo manual (active input set):
# texts_dir is the directory the input files are listed from; it is also
# concatenated directly with file names elsewhere in this notebook, hence the
# trailing slash. save_location is the file the tokenized sentences are written to.
texts_dir = "/home/johannes/hypernym_discovery_data/fodina/Volvo_FH_till_Johannes_2019-03-07/extracted_text/"
save_location = "/home/johannes/thesis_code/data_experimentation/volvo_data/tokenized_sentences.txt"


# Marker characters that are removed from the text; each occurrence also ends
# the text fragment collected so far (see extract_text).
# The last entry is the control character U+000E, spelled as an escape so that
# it stays visible and survives copy/paste.
IGNORE_TOKENS = ['༔','༅','༒','༖','༗','\x0e']
# Characters that are replaced by a line break when a file is read.
DIVIDERS = ['༎']
# A line containing any of these substrings is skipped entirely.
IGNORE_WHOLE_LINE = ['\\']

In [3]:
def strip_tibet_chars(filename):
    """Read a text file and return its non-empty pieces, split at divider characters.

    Lines containing any entry of IGNORE_WHOLE_LINE are dropped. In the
    remaining lines every DIVIDERS character is turned into a line break, the
    line is split at line breaks, and the non-empty pieces are collected.

    Args:
        filename: path of the text file to read.

    Returns:
        list of str: the non-empty pieces, in file order, without newlines.
    """
    texts_list = []
    # The context manager closes the file even if reading fails part-way.
    with open(filename, 'r') as tf:
        for line in tf:
            if any(ic in line for ic in IGNORE_WHOLE_LINE):
                continue
            for div in DIVIDERS:
                line = line.replace(div, '\n')
            # Splitting also removes the line's own trailing newline.
            texts_list += [piece for piece in line.split('\n') if piece]
    return texts_list

In [4]:
def get_filenames(files_dir):
    """Return the names of the regular files located directly in files_dir.

    Sub-directories are left out. The names carry no directory part and keep
    the order in which listdir yields them.
    """
    names = []
    for entry in listdir(files_dir):
        if not isfile(join(files_dir, entry)):
            continue
        names.append(entry)
    return names

In [5]:
# Names (without directory part) of the regular files directly inside texts_dir
files_list = get_filenames(texts_dir)

In [6]:
def extract_text(texts):
    """Cut every line at the IGNORE_TOKENS markers and return the text fragments.

    A line containing any entry of IGNORE_WHOLE_LINE is skipped. Otherwise the
    line is scanned character by character: marker characters are dropped, and
    whenever a marker follows a fragment that holds more than spaces/newlines,
    that fragment is emitted and a new one is started. A fragment that is
    still blank at a marker is kept and simply continues to grow. Emitted
    fragments have non-breaking spaces turned into plain spaces and
    leading/trailing newlines removed.

    Args:
        texts: iterable of str lines.

    Returns:
        list of str: the emitted fragments, in input order.
    """
    def _has_content(fragment):
        # Same blank test as used for emitting: newlines first, then spaces
        return bool(fragment.strip('\n').strip(' '))

    def _tidy(fragment):
        return fragment.replace(u'\xa0', u' ').strip('\n')

    fragments = []
    for raw in texts:
        if any(marker in raw for marker in IGNORE_WHOLE_LINE):
            continue
        pending = []
        for ch in raw:
            if ch not in IGNORE_TOKENS:
                pending.append(ch)
                continue
            current = ''.join(pending)
            if _has_content(current):
                fragments.append(_tidy(current))
                # Only an emitted fragment is cleared; a blank one carries over
                pending = []
        remainder = ''.join(pending)
        if _has_content(remainder):
            fragments.append(_tidy(remainder))
    return fragments

In [7]:
def make_tokenized_doc_list(texts_list):
    """Run every text through the module-level `nlp` pipeline.

    Returns a list holding one pipeline result per entry of texts_list, in
    the same order.
    """
    docs = []
    for text in texts_list:
        docs.append(nlp(text))
    return docs

In [8]:
def write_tokenized_text_to_file(doc_list, file_path):
    """Write the sentences of all docs to file_path, one sentence per line.

    Within a sentence, tokens whose text is a single space or a tab are
    skipped. Every other token is written followed by one space, except a
    newline token, which is written without the trailing space. A sentence
    that ends up containing only spaces/newlines is not written at all.

    Args:
        doc_list: iterable of objects exposing `.sents`, an iterable of
            sentences; each sentence is an iterable of tokens with `.text`.
        file_path: path of the output file; an existing file is overwritten.
    """
    # The context manager closes the file even if writing fails part-way.
    with open(file_path, 'w') as fp:
        for doc in doc_list:
            for sentence in doc.sents:
                pieces = []
                for token in sentence:
                    if token.text in (' ', '\t'):
                        continue
                    pieces.append(token.text)
                    if token.text != '\n':
                        pieces.append(' ')
                line = ''.join(pieces)
                if line.strip(' ').strip('\n'):
                    fp.write(line)
                    fp.write('\n')

In [9]:
def aggregate_all(files, location, files_dir=None):
    """Read, clean, tokenize and write out the text of all given files.

    Each file is read with strip_tibet_chars, the collected lines are cleaned
    with extract_text, tokenized with make_tokenized_doc_list and finally
    written with write_tokenized_text_to_file.

    Args:
        files: file names relative to the input directory.
        location: path of the output file.
        files_dir: directory containing `files`; defaults to the module-level
            `texts_dir` when omitted.
    """
    if files_dir is None:
        files_dir = texts_dir
    print('Extracting lines from ',len(files), 'files\n')

    raw_lines = []
    for filename in files:
        # join() works whether or not the directory ends with a separator
        raw_lines += strip_tibet_chars(join(files_dir, filename))

    all_lines = extract_text(raw_lines)
    print('Tokenizing ', len(all_lines),'lines\n')
    docs = make_tokenized_doc_list(all_lines)
    print('Writing to file\n')
    write_tokenized_text_to_file(docs, location)

In [10]:
# Process every file in files_list and write the tokenized sentences to save_location
aggregate_all(files_list,save_location)

Extracting lines from  28 files

Tokenizing  52597 lines

Writing to file



In [12]:
def concatenate_files(filename1, filename2, write_filename):
    """Write the lines of filename1 followed by the lines of filename2 to write_filename.

    Every input line is copied exactly once. Progress is printed every
    10,000,000 lines per input file, plus a summary after each file.

    Args:
        filename1: path of the first input file.
        filename2: path of the second input file.
        write_filename: path of the output file; an existing file is overwritten.
    """
    progress_every = 10000000
    # Inputs are opened before the output so a missing input does not leave
    # behind an empty output file; all handles are closed even on error.
    with open(filename1,'r') as f1, open(filename2,'r') as f2, open(write_filename,'w') as wf:
        line_num = 0
        for line in f1:
            line_num += 1
            wf.write(line)
            if((line_num % progress_every) == 0):
                print("Wrote ",str(line_num),"lines\n")
        print("First file done!\n")
        line_num = 0
        for line in f2:
            line_num += 1
            # Single write per line: a second write here duplicated every
            # line of the second file in the output.
            wf.write(line)
            if((line_num % progress_every) == 0):
                print("Wrote ",str(line_num),"lines\n")
        print("Wrote ", str(line_num), "lines from second file\n")

In [13]:
# First input (the name suggests a tokenized UMBC corpus — TODO confirm)
filename1 = "/home/johannes/hypernym_discovery_data/UMBC_tokenized.txt"
# Second input: same path as save_location, i.e. the tokenized sentences written above
filename2 = "/home/johannes/thesis_code/data_experimentation/volvo_data/tokenized_sentences.txt"
# Output: the first input followed by the second
filename3 = "/home/johannes/hypernym_discovery_data/data_for_volvo_run/UMBC_volvo_tokenized.txt"
concatenate_files(filename1,filename2,filename3)

Wrote  10000000 lines

Wrote  20000000 lines

Wrote  30000000 lines

Wrote  40000000 lines

Wrote  50000000 lines

Wrote  60000000 lines

Wrote  70000000 lines

Wrote  80000000 lines

Wrote  90000000 lines

Wrote  100000000 lines

Wrote  110000000 lines

Wrote  120000000 lines

Wrote  130000000 lines

First file done!

Wrote  42903 lines from second file



In [16]:
# Manual spot check on the first input file:
# testboi holds the lines as read from disk, texts_list the cleaned fragments.
testboi = strip_tibet_chars(texts_dir+files_list[0])
texts_list = extract_text(testboi)

In [13]:
# Look at a single line as read from disk (index 10)
print(testboi[10])

584


In [14]:
# Number of cleaned fragments extracted from the first file
print(len(texts_list))

1302


In [20]:
# Print the first print_num entries of both lists for a visual comparison.
# The same index is used for both lists; since extract_text can skip or split
# lines, the two entries of a pair need not stem from the same input line.
print_num = 17
for i in range(print_num):
    print(testboi[i])
    print("\n_________")
    print(texts_list[i])
    print("############\n")

༒༒༒༒༒Bogie

_________
Bogie
############

Variant symbols

_________
Variant symbols
############

SubChapter: ((RADD-GR or RADT-GR or RAPD-GR or RADDT-GR or RAPDD-GR or RADD-G2 or RADDT-G2 or RAPDT-GR or RAPD-G4)) AND

_________
SubChapter: ((RADD-GR or RADT-GR or RAPD-GR or RADDT-GR or RAPDD-GR or RADD-G2 or RADDT-G2 or RAPDT-GR or RAPD-G4)) AND
############

Section: (RSS-AIR) AND

_________
Section: (RSS-AIR) AND
############

Topic: (((RADD-GR or RADT-GR or RAPD-GR or RADDT-GR or RAPDD-GR or RADD-G2 or RADDT-G2 or RAPDT-GR or RAPD-G4) and RSS-AIR)) AND

_________
Topic: (((RADD-GR or RADT-GR or RAPD-GR or RADDT-GR or RAPDD-GR or RADD-G2 or RADDT-G2 or RAPDT-GR or RAPD-G4) and RSS-AIR)) AND
############

Sub-Section: (RAPDT-GR)

_________
Sub-Section: (RAPDT-GR)
############

Axle load distribution

_________
Axle load distribution
############

To obtain optimum traction the air suspension system can redistribute the weight between the driven axle and other axles. How much extra w

In [15]:
''.encode()

b'\x0e'