In [1]:
import csv
import numpy as np
import os
import pandas as pd
import pickle
import random
import string

from sklearn.decomposition import PCA

In [20]:
# Path to the pre-trained GloVe embedding text file read by load_glove()
# (presumably 50-dimensional vectors, judging by the file name).
glove_data_file = '../glove.6B/glove.6B.50d.txt'
# Directory that is listed for the input '.txt' article files.
base_dir = 'wiki3029'
# Root directory under which save_files() creates one sub-folder per split.
new_dir = 'wiki3029_pck'

In [21]:
def load_glove(path=None):
    """Load GloVe word vectors from a whitespace-separated text file.

    Every non-blank line is expected to hold a token followed by the
    components of its embedding vector.

    Parameters
    ----------
    path : str, optional
        File to read. Defaults to the module-level ``glove_data_file``.

    Returns
    -------
    dict
        Maps each token (str) to a 1-D float64 ``numpy`` array.

    Raises
    ------
    ValueError
        If a vector component cannot be parsed as a float.
    """
    if path is None:
        path = glove_data_file
    model = {}
    # Explicit encoding keeps the result independent of the platform default,
    # and the context manager guarantees the handle is closed.
    with open(path, 'r', encoding='utf-8') as f:
        for line in f:
            split_line = line.split()
            if not split_line:
                # Tolerate blank lines (e.g. a trailing newline at end of file).
                continue
            word = split_line[0]
            model[word] = np.array([float(val) for val in split_line[1:]])
    return model

In [22]:
# Load the embedding table once; process_file() looks words up in this
# module-level dict.
glove_model = load_glove()

In [23]:
# Gather the article text files in sorted order so that the seeded shuffle
# applied later starts from the same sequence on every run.
files = sorted(name for name in os.listdir(base_dir) if name.endswith('.txt'))
len(files) # I ignore Aachen because of encoding issues

3028

In [24]:
# Take a copy rather than an alias: good_files is shuffled in place later,
# and sharing the list object would silently reorder `files` as well.
good_files = list(files)
num_files = len(good_files)
print(num_files)

3028


In [25]:
# Fixed seed for a reproducible train/val/test split.
random.seed(931231)
# Re-sort into a fresh list before shuffling so the cell is idempotent:
# re-running it always starts from the same order instead of shuffling an
# already-shuffled list (which would change the split). On a fresh kernel
# the list is already sorted, so the resulting permutation is unchanged.
good_files = sorted(good_files)
random.shuffle(good_files)

In [26]:
# Cut the shuffled list at 70% and 85% -> train / validation / test.
train_end = int(0.7 * num_files)
val_end = int(0.85 * num_files)

train_files = good_files[:train_end]
val_files = good_files[train_end:val_end]
test_files = good_files[val_end:]

In [27]:
def normalize_vector(v, eps=1e-7):
    """Scale ``v`` to unit Euclidean length.

    Parameters
    ----------
    v : numpy.ndarray
        Vector to normalise.
    eps : float, optional
        Norms below this threshold are treated as zero to avoid dividing by
        a vanishing value.

    Returns
    -------
    numpy.ndarray
        ``v / ||v||``, or an all-zero float array of the same shape when the
        norm is smaller than ``eps``.
    """
    norm = np.linalg.norm(v)
    if norm < eps:
        return np.zeros(v.shape)
    # Reuse the norm computed above instead of evaluating it a second time.
    return v / norm

In [28]:
def process_file(file_path, expected_rows=200):
    """Turn every line of a text file into four sentence-level embeddings.

    Each line is stripped of punctuation (hyphens become spaces first, so
    hyphenated compounds are split), tokenised on whitespace, and each token
    is looked up case-insensitively in the module-level ``glove_model``.
    Tokens missing from the model are ignored; a line with no known token
    is represented by a single zero vector and its tokens are printed.

    Parameters
    ----------
    file_path : str
        Text file to read, one sentence per line.
    expected_rows : int or None, optional
        Number of lines the file must contain. Pass ``None`` to skip the
        check.

    Returns
    -------
    tuple of numpy.ndarray
        ``(tokens_sum, tokens_mean, tokens_norm_sum, tokens_norm_mean)``,
        each with one row per input line: the sum / mean of the raw word
        vectors and the sum / mean of the unit-normalised word vectors.

    Raises
    ------
    AssertionError
        If ``expected_rows`` is given and the file has a different number
        of lines.
    """
    print(file_path)
    tokens_sum, tokens_mean, tokens_norm_sum, tokens_norm_mean = [], [], [], []

    # Take the vector width from the loaded model rather than hard-coding it,
    # so other GloVe sizes work too; fall back to 50 if the model is empty.
    sample = next(iter(glove_model.values()), None)
    embedding_dim = 50 if sample is None else len(sample)

    # Loop-invariant: build the punctuation-stripping table once.
    punct_table = str.maketrans('', '', string.punctuation)

    with open(file_path, 'r', encoding='utf-8') as file:
        num_rows = 0
        for row in file:
            sentence = row.replace('-', ' ').replace("'", '').translate(punct_table)
            sentence = sentence.split()
            keys = [w.casefold() for w in sentence]
            embeddings = [glove_model[k] for k in keys if k in glove_model]

            if not embeddings:
                # No token was found, so every token of the line is unknown.
                print(sentence)
                embeddings = [np.zeros((embedding_dim,))]

            embeddings_norm = [normalize_vector(e) for e in embeddings]

            tokens_sum.append(np.sum(embeddings, axis=0))
            tokens_mean.append(np.mean(embeddings, axis=0))
            tokens_norm_sum.append(np.sum(embeddings_norm, axis=0))
            tokens_norm_mean.append(np.mean(embeddings_norm, axis=0))
            num_rows += 1

        print('    ', num_rows)
    print('    ', len(tokens_sum))
    print('    ', len(tokens_mean))
    print('    ', len(tokens_norm_sum))
    print('    ', len(tokens_norm_mean))

    if expected_rows is not None:
        # All four lists grow in lock-step, so they are checked together.
        for values in (tokens_sum, tokens_mean, tokens_norm_sum, tokens_norm_mean):
            assert len(values) == expected_rows, (
                '{}: expected {} rows, got {}'.format(file_path, expected_rows, len(values)))

    return np.array(tokens_sum), np.array(tokens_mean), np.array(tokens_norm_sum), np.array(tokens_norm_mean)

In [29]:
# Smoke-test process_file on a single article before running the full export.
# The path is built from base_dir so it follows the configured input folder.
tokens_sum, tokens_mean, tokens_norm_sum, tokens_norm_mean = process_file(os.path.join(base_dir, 'Abu_Sayyaf.txt'))
print(len(tokens_sum))

wiki3029/Abu_Sayyaf.txt
     200
     200
     200
     200
     200
200


In [30]:
def save_files(my_files, my_set):
    """Embed each listed article and pickle the result into a split folder.

    Parameters
    ----------
    my_files : iterable of str
        File names (relative to ``base_dir``) to process.
    my_set : str
        Name of the split; output goes to ``<new_dir>/<my_set>/``.

    For every input file one ``.pck`` file with the same base name is
    written. It holds a dict with the keys ``tokens_sum``, ``tokens_mean``,
    ``tokens_norm_sum`` and ``tokens_norm_mean`` as returned by
    ``process_file``.
    """
    new_path = os.path.join(new_dir, my_set)
    print(new_path)
    # exist_ok avoids the check-then-create race and makes re-runs harmless.
    os.makedirs(new_path, exist_ok=True)

    for f in my_files:
        tokens_sum, tokens_mean, tokens_norm_sum, tokens_norm_mean = process_file(os.path.join(base_dir, f))

        # Swap only the extension; str.replace would also rewrite a '.txt'
        # occurring in the middle of the name.
        out_name = os.path.splitext(f)[0] + '.pck'
        # Context manager so the handle is flushed and closed for every file.
        with open(os.path.join(new_path, out_name), 'wb') as out_file:
            pickle.dump(
                {
                 'tokens_sum': tokens_sum,
                 'tokens_mean': tokens_mean,
                 'tokens_norm_sum': tokens_norm_sum,
                 'tokens_norm_mean': tokens_norm_mean,
                },
                out_file)

In [31]:
# Export the three splits in order: train, then validation, then test.
for split_name, split_files in (('train', train_files),
                                ('val', val_files),
                                ('test', test_files)):
    save_files(split_files, split_name)

wiki3029_pck/train
wiki3029/English_language.txt
     200
     200
     200
     200
     200
wiki3029/Bulgaria.txt
     200
     200
     200
     200
     200
wiki3029/Dreadnought.txt
     200
     200
     200
     200
     200
wiki3029/Yellow.txt
     200
     200
     200
     200
     200
wiki3029/Permutation.txt
['inbsp', '＼', 'σi']
     200
     200
     200
     200
     200
wiki3029/James_Clerk_Maxwell.txt
     200
     200
     200
     200
     200
wiki3029/Elia_Kazan.txt
     200
     200
     200
     200
     200
wiki3029/Malnutrition.txt
     200
     200
     200
     200
     200
wiki3029/Poland.txt
     200
     200
     200
     200
     200
wiki3029/Everglades.txt
     200
     200
     200
     200
     200
wiki3029/Pipe_organ.txt
     200
     200
     200
     200
     200
wiki3029/Consciousness.txt
     200
     200
     200
     200
     200
wiki3029/Vampire.txt
     200
     200
     200
     200
     200
wiki3029/Afrikaners.txt
     200
     200
     200
   

     200
     200
     200
     200
     200
wiki3029/Western_lowland_gorilla.txt
     200
     200
     200
     200
     200
wiki3029/Somalis.txt
     200
     200
     200
     200
     200
wiki3029/Konrad_Adenauer.txt
     200
     200
     200
     200
     200
wiki3029/Integral.txt
     200
     200
     200
     200
     200
wiki3029/Submarine.txt
     200
     200
     200
     200
     200
wiki3029/Airline.txt
     200
     200
     200
     200
     200
wiki3029/Cannon.txt
     200
     200
     200
     200
     200
wiki3029/Flavian_dynasty.txt
     200
     200
     200
     200
     200
wiki3029/Georges_Clemenceau.txt
     200
     200
     200
     200
     200
wiki3029/Harriet_Tubman.txt
     200
     200
     200
     200
     200
wiki3029/Geisha.txt
     200
     200
     200
     200
     200
wiki3029/Paintball.txt
     200
     200
     200
     200
     200
wiki3029/Dominion.txt
     200
     200
     200
     200
     200
wiki3029/Alcoholism.txt
     200
     200
 

     200
     200
     200
     200
     200
wiki3029/Bayonne.txt
     200
     200
     200
     200
     200
wiki3029/Bear.txt
     200
     200
     200
     200
     200
wiki3029/Arthur_Sullivan.txt
     200
     200
     200
     200
     200
wiki3029/Airport.txt
     200
     200
     200
     200
     200
wiki3029/Tajikistan.txt
     200
     200
     200
     200
     200
wiki3029/Linear_programming.txt
     200
     200
     200
     200
     200
wiki3029/Gene_Kelly.txt
     200
     200
     200
     200
     200
wiki3029/Bucharest.txt
     200
     200
     200
     200
     200
wiki3029/Antarctica.txt
     200
     200
     200
     200
     200
wiki3029/Battle_of_the_Bulge.txt
     200
     200
     200
     200
     200
wiki3029/George_V.txt
     200
     200
     200
     200
     200
wiki3029/Sonar.txt
     200
     200
     200
     200
     200
wiki3029/Stanley_Kubrick.txt
     200
     200
     200
     200
     200
wiki3029/Lisbon.txt
     200
     200
     200
    

     200
     200
     200
     200
     200
wiki3029/Criminal_record.txt
     200
     200
     200
     200
     200
wiki3029/Torture.txt
     200
     200
     200
     200
     200
wiki3029/James_Watt.txt
     200
     200
     200
     200
     200
wiki3029/Force.txt
['PropertyInteraction']
     200
     200
     200
     200
     200
wiki3029/Diesel_engine.txt
     200
     200
     200
     200
     200
wiki3029/Antibody.txt
     200
     200
     200
     200
     200
wiki3029/Insect.txt
     200
     200
     200
     200
     200
wiki3029/Eucalyptus.txt
     200
     200
     200
     200
     200
wiki3029/Cockatoo.txt
     200
     200
     200
     200
     200
wiki3029/Backup.txt
     200
     200
     200
     200
     200
wiki3029/Toga.txt
     200
     200
     200
     200
     200
wiki3029/Skylab.txt
     200
     200
     200
     200
     200
wiki3029/Disgust.txt
     200
     200
     200
     200
     200
wiki3029/Neanderthal.txt
     200
     200
     200
     20

     200
wiki3029/Winter_Olympic_Games.txt
     200
     200
     200
     200
     200
wiki3029/Indonesia.txt
     200
     200
     200
     200
     200
wiki3029/Benghazi.txt
     200
     200
     200
     200
     200
wiki3029/Oscar_Wilde.txt
     200
     200
     200
     200
     200
wiki3029/Entropy.txt
     200
     200
     200
     200
     200
wiki3029/Emanuel_Swedenborg.txt
     200
     200
     200
     200
     200
wiki3029/Calais.txt
     200
     200
     200
     200
     200
wiki3029/Freedom_of_religion.txt
     200
     200
     200
     200
     200
wiki3029/Gold_standard.txt
     200
     200
     200
     200
     200
wiki3029/Joe_Clark.txt
     200
     200
     200
     200
     200
wiki3029/Denmark_Vesey.txt
     200
     200
     200
     200
     200
wiki3029/New_Jersey.txt
['2515', '7760']
['6−3', '4227']
['4−4', '3924']
['4−6', '3922']
['6−2', '4228']
['6−1', '4430']
['1−9', '3416']
     200
     200
     200
     200
     200
wiki3029/Provisional_Irish_

     200
     200
     200
     200
     200
wiki3029/Scott_Joplin.txt
     200
     200
     200
     200
     200
wiki3029/Slow_loris.txt
     200
     200
     200
     200
     200
wiki3029/Cold_fusion.txt
     200
     200
     200
     200
     200
wiki3029/Chartres_Cathedral.txt
     200
     200
     200
     200
     200
wiki3029/Colosseum.txt
     200
     200
     200
     200
     200
wiki3029/Lithuania.txt
     200
     200
     200
     200
     200
wiki3029/Gertrude_Lawrence.txt
     200
     200
     200
     200
     200
wiki3029/United_States_Air_Force_Academy.txt
     200
     200
     200
     200
     200
wiki3029/Opportunism.txt
     200
     200
     200
     200
     200
wiki3029/Rudyard_Kipling.txt
     200
     200
     200
     200
     200
wiki3029/Anthony_Burgess.txt
     200
     200
     200
     200
     200
wiki3029/Orange_Order.txt
     200
     200
     200
     200
     200
wiki3029/Hypnosis.txt
     200
     200
     200
     200
     200
wiki3029/S

     200
     200
     200
     200
     200
wiki3029/Radiocarbon_dating.txt
     200
     200
     200
     200
     200
wiki3029/Fox_hunting.txt
     200
     200
     200
     200
     200
wiki3029/First_Crusade.txt
     200
     200
     200
     200
     200
wiki3029/Military_band.txt
     200
     200
     200
     200
     200
wiki3029/Action_potential.txt
     200
     200
     200
     200
     200
wiki3029/Dortmund.txt
['FileDortmund', 'EllipsonjpgEllipson']
     200
     200
     200
     200
     200
wiki3029/Archimedes.txt
['1114–1187nbspAD']
['Ostomachion']
     200
     200
     200
     200
     200
wiki3029/Loudspeaker.txt
     200
     200
     200
     200
     200
wiki3029/Gaels.txt
     200
     200
     200
     200
     200
wiki3029/Nowruz.txt
     200
     200
     200
     200
     200
wiki3029/Minimum_wage.txt
     200
     200
     200
     200
     200
wiki3029/Taichung.txt
     200
     200
     200
     200
     200
wiki3029/Kazakhstan.txt
     200
     20

     200
     200
     200
     200
     200
wiki3029/Cistercians.txt
     200
     200
     200
     200
     200
wiki3029/Brown_bear.txt
     200
     200
     200
     200
     200
wiki3029/Plesiosauria.txt
     200
     200
     200
     200
     200
wiki3029/Seleucus_I_Nicator.txt
     200
     200
     200
     200
     200
wiki3029/India.txt
     200
     200
     200
     200
     200
wiki3029/Dugong.txt
     200
     200
     200
     200
     200
wiki3029/William_Byrd.txt
     200
     200
     200
     200
     200
wiki3029/Lake_District.txt
     200
     200
     200
     200
     200
wiki3029/Johann_Wolfgang_von_Goethe.txt
     200
     200
     200
     200
     200
wiki3029/Victor_Hugo.txt
['Linstant', 'pleurs', 'superflus']
     200
     200
     200
     200
     200
wiki3029/Provence.txt
     200
     200
     200
     200
     200
wiki3029/Game_theory.txt
     200
     200
     200
     200
     200
wiki3029/Puerto_Rico.txt
     200
     200
     200
     200
     20

     200
     200
     200
     200
     200
wiki3029/Orson_Welles.txt
     200
     200
     200
     200
     200
wiki3029/Bosnia_and_Herzegovina.txt
     200
     200
     200
     200
     200
wiki3029/Manila.txt
     200
     200
     200
     200
     200
wiki3029/Baleen_whale.txt
     200
     200
     200
     200
     200
wiki3029/William_S._Burroughs.txt
     200
     200
     200
     200
     200
wiki3029/International_System_of_Units.txt
     200
     200
     200
     200
     200
wiki3029/Apatosaurus.txt
     200
     200
     200
     200
     200
wiki3029/Opium.txt
     200
     200
     200
     200
     200
wiki3029/Milk.txt
     200
     200
     200
     200
     200
wiki3029/Sukarno.txt
     200
     200
     200
     200
     200
wiki3029/Henry_Kissinger.txt
     200
     200
     200
     200
     200
wiki3029/Nicosia.txt
     200
     200
     200
     200
     200
wiki3029/Potassium.txt
['4nbsp', '2nbsp', '→', '2nbsp', '3nbsp']
     200
     200
     200
     

['8495', '172446832088']
['2187', '3651611211']
['4369', '7973194378']
     200
     200
     200
     200
     200
wiki3029/Tartan.txt
     200
     200
     200
     200
     200
wiki3029/Laurel_and_Hardy.txt
     200
     200
     200
     200
     200
wiki3029/Chongqing.txt
     200
     200
     200
     200
     200
wiki3029/Dating.txt
['httplibrarycqpresscomcqresearcherdocumentphpidcqresrre2015032000']
     200
     200
     200
     200
     200
wiki3029/Bing_Crosby.txt
     200
     200
     200
     200
     200
wiki3029/House_of_Lords.txt
     200
     200
     200
     200
     200
wiki3029/John_Major.txt
     200
     200
     200
     200
     200
wiki3029/Morse_code.txt
     200
     200
     200
     200
     200
wiki3029/Phagocyte.txt
     200
     200
     200
     200
     200
wiki3029/Religion.txt
     200
     200
     200
     200
     200
wiki3029/Sumer.txt
     200
     200
     200
     200
     200
wiki3029/William_Jennings_Bryan.txt
     200
     200
     200

     200
     200
     200
     200
     200
wiki3029/Red_Army_Faction.txt
     200
     200
     200
     200
     200
wiki3029/Trinidad_and_Tobago.txt
     200
     200
     200
     200
     200
wiki3029/Sniper.txt
     200
     200
     200
     200
     200
wiki3029/Piano.txt
     200
     200
     200
     200
     200
wiki3029/Militia.txt
     200
     200
     200
     200
     200
wiki3029/Janis_Joplin.txt
     200
     200
     200
     200
     200
wiki3029/Grover_Cleveland.txt
     200
     200
     200
     200
     200
wiki3029/Common_sense.txt
     200
     200
     200
     200
     200
wiki3029/Reggae.txt
     200
     200
     200
     200
     200
wiki3029/Le_Havre.txt
     200
     200
     200
     200
     200
wiki3029/Benito_Mussolini.txt
     200
     200
     200
     200
     200
wiki3029/Xinjiang.txt
     200
     200
     200
     200
     200
wiki3029/Emu.txt
     200
     200
     200
     200
     200
wiki3029/Mother's_Day.txt
     200
     200
     200
 

['Magnorder', 'Epitheria', 'epitheres']
     200
     200
     200
     200
     200
wiki3029/Camouflage.txt
     200
     200
     200
     200
     200
wiki3029/John_Kenneth_Galbraith.txt
     200
     200
     200
     200
     200
wiki3029/Nablus.txt
     200
     200
     200
     200
     200
wiki3029/Free_love.txt
     200
     200
     200
     200
     200
wiki3029/Halogen.txt
     200
     200
     200
     200
     200
wiki3029/Laser.txt
     200
     200
     200
     200
     200
wiki3029/Creativity.txt
     200
     200
     200
     200
     200
wiki3029/Cougar.txt
     200
     200
     200
     200
     200
wiki3029/Bertrand_Russell.txt
     200
     200
     200
     200
     200
wiki3029/Ecuador.txt
     200
     200
     200
     200
     200
wiki3029/Pain.txt
     200
     200
     200
     200
     200
wiki3029/Ankylosaurus.txt
     200
     200
     200
     200
     200
wiki3029/Sugar_Ray_Robinson.txt
     200
     200
     200
     200
     200
wiki3029/Abortio

     200
     200
     200
     200
     200
wiki3029/Western_culture.txt
     200
     200
     200
     200
     200
wiki3029/Vladimir_Putin.txt
     200
     200
     200
     200
     200
wiki3029/Cigar.txt
     200
     200
     200
     200
     200
wiki3029/Oscilloscope.txt
     200
     200
     200
     200
     200
wiki3029/Limerick.txt
     200
     200
     200
     200
     200
wiki3029/Felix_Mendelssohn.txt
     200
     200
     200
     200
     200
wiki3029/Gray_whale.txt
     200
     200
     200
     200
     200
wiki3029/Tabasco.txt
     200
     200
     200
     200
     200
wiki3029/OLED.txt
     200
     200
     200
     200
     200
wiki3029/Bulletproof_vest.txt
     200
     200
     200
     200
     200
wiki3029/Microcode.txt
     200
     200
     200
     200
     200
wiki3029/Ethan_Allen.txt
     200
     200
     200
     200
     200
wiki3029/Knight.txt
     200
     200
     200
     200
     200
wiki3029/Snakebite.txt
     200
     200
     200
    

     200
     200
     200
     200
     200
wiki3029/Stanley_Baldwin.txt
     200
     200
     200
     200
     200
wiki3029/Lewis_Carroll.txt
     200
     200
     200
     200
     200
wiki3029/Apollo_program.txt
     200
     200
     200
     200
     200
wiki3029/Refrigerator.txt
     200
     200
     200
     200
     200
wiki3029/Albert_Schweitzer.txt
     200
     200
     200
     200
     200
wiki3029/Osama_bin_Laden.txt
     200
     200
     200
     200
     200
wiki3029/Nahuatl.txt
     200
     200
     200
     200
     200
wiki3029/Maharashtra.txt
     200
     200
     200
     200
     200
wiki3029/Cheque.txt
     200
     200
     200
     200
     200
wiki3029/Bed_bug.txt
     200
     200
     200
     200
     200
wiki3029/Amphibian.txt
     200
     200
     200
     200
     200
wiki3029/Violin.txt
     200
     200
     200
     200
     200
wiki3029/Calvin_Coolidge.txt
     200
     200
     200
     200
     200
wiki3029/Jacques_Offenbach.txt
     200
 

     200
     200
     200
     200
     200
wiki3029/Margaret_Thatcher.txt
     200
     200
     200
     200
     200
wiki3029/Trade_union.txt
     200
     200
     200
     200
     200
wiki3029/Steam_locomotive.txt
     200
     200
     200
     200
     200
wiki3029/Oregon.txt
['FilePortland', 'panorama3jpgPortland']
     200
     200
     200
     200
     200
wiki3029/George_Mason.txt
     200
     200
     200
     200
     200
wiki3029/Chester_A._Arthur.txt
     200
     200
     200
     200
     200
wiki3029/Michigan.txt
     200
     200
     200
     200
     200
wiki3029/Vulgate.txt
     200
     200
     200
     200
     200
wiki3029/Bhutan.txt
     200
     200
     200
     200
     200
wiki3029/Star.txt
     200
     200
     200
     200
     200
wiki3029/Monogamy.txt
     200
     200
     200
     200
     200
wiki3029/Boxing.txt
['Filejab7jpgJab']
     200
     200
     200
     200
     200
wiki3029/J._D._Salinger.txt
     200
     200
     200
     200
     

     200
     200
     200
     200
     200
wiki3029/Princeton_University.txt
     200
     200
     200
     200
     200
wiki3029/Headphones.txt
     200
     200
     200
     200
     200
wiki3029/Modernism.txt
     200
     200
     200
     200
     200
wiki3029/Bangladesh.txt
     200
     200
     200
     200
     200
wiki3029/Meningitis.txt
     200
     200
     200
     200
     200
wiki3029/Nursing.txt
     200
     200
     200
     200
     200
wiki3029/Mimicry.txt
     200
     200
     200
     200
     200
wiki3029/Pub.txt
     200
     200
     200
     200
     200
wiki3029/Bishop.txt
     200
     200
     200
     200
     200
wiki3029/Sonata_form.txt
     200
     200
     200
     200
     200
wiki3029/United_Nations.txt
     200
     200
     200
     200
     200
wiki3029/Duel.txt
     200
     200
     200
     200
     200
wiki3029/Near_East.txt
     200
     200
     200
     200
     200
wiki3029/Mandolin.txt
     200
     200
     200
     200
     200
w

     200
     200
     200
     200
     200
wiki3029/Whale.txt
     200
     200
     200
     200
     200
wiki3029/Marketing_research.txt
     200
     200
     200
     200
     200
wiki3029/Stavanger.txt
     200
     200
     200
     200
     200
wiki3029/Photosynthesis.txt
     200
     200
     200
     200
     200
wiki3029/Reform_Judaism.txt
     200
     200
     200
     200
     200
wiki3029/Copenhagen.txt
     200
     200
     200
     200
     200
wiki3029/Henry_Clay.txt
     200
     200
     200
     200
     200
wiki3029/Richard_Burton.txt
     200
     200
     200
     200
     200
wiki3029/Kiev.txt
     200
     200
     200
     200
     200
wiki3029/Library.txt
     200
     200
     200
     200
     200
wiki3029/Manchester.txt
     200
     200
     200
     200
     200
wiki3029/Golden_eagle.txt
     200
     200
     200
     200
     200
wiki3029/P._G._Wodehouse.txt
     200
     200
     200
     200
     200
wiki3029/Speech_perception.txt
     200
     2

     200
     200
     200
     200
     200
wiki3029/Bicycle_wheel.txt
     200
     200
     200
     200
     200
wiki3029/Great_Lakes.txt
['barErie', 'from359', 'till569', 'width49', 'colorblue2']
['idtextoutsidebar', 'valueredorange']
['maxdepth', 'at359', 'text210nbspft', '64nbspm']
['idtextinbar', 'valueyelloworange']
['ImageSize', 'width595', 'height250']
['avgdepth', 'at298', 'shift02', 'text279nbspft', '85nbspm']
['TimeAxis', 'orientationvertical']
['idblue3', 'valuergb0202085']
['idblue2', 'valuergb010108']
['idblue4', 'valuergb030309']
['at243', 'textsurfaceelevation']
['PlotArea', 'width525', 'height200', 'left50', 'bottom15']
     200
     200
     200
     200
     200
wiki3029/Inception.txt
     200
     200
     200
     200
     200
wiki3029/Colima.txt
     200
     200
     200
     200
     200
wiki3029/Detroit.txt
     200
     200
     200
     200
     200
wiki3029/Will_Rogers.txt
     200
     200
     200
     200
     200
wiki3029/Bill_Clinton.txt
     200
   

     200
     200
     200
     200
     200
wiki3029/Robert_the_Bruce.txt
     200
     200
     200
     200
     200
wiki3029/Washington_Monument.txt
     200
     200
     200
     200
     200
wiki3029/Nairobi.txt
     200
     200
     200
     200
     200
wiki3029/Crown_jewels.txt
     200
     200
     200
     200
     200
wiki3029/Amelia_Earhart.txt
     200
     200
     200
     200
     200
wiki3029/Gas_turbine.txt
     200
     200
     200
     200
     200
wiki3029/Progressive_rock.txt
     200
     200
     200
     200
     200
wiki3029/Omar_Bradley.txt
     200
     200
     200
     200
     200
wiki3029/Ian_Smith.txt
     200
     200
     200
     200
     200
wiki3029/Tunisia.txt
     200
     200
     200
     200
     200
wiki3029/Working_memory.txt
     200
     200
     200
     200
     200
wiki3029/Soap_opera.txt
     200
     200
     200
     200
     200
wiki3029/Kuala_Lumpur.txt
     200
     200
     200
     200
     200
wiki3029/Tinamou.txt
     200

     200
     200
     200
     200
     200
wiki3029/Electric_motor.txt
['AsynchronousMachines']
     200
     200
     200
     200
     200
wiki3029/Jesse_James.txt
     200
     200
     200
     200
     200
wiki3029/Sparkling_wine.txt
     200
     200
     200
     200
     200
wiki3029/Roundabout.txt
     200
     200
     200
     200
     200
wiki3029/Zeppelin.txt
     200
     200
     200
     200
     200
wiki3029/Edmonton.txt
     200
     200
     200
     200
     200
wiki3029/Sound_film.txt
     200
     200
     200
     200
     200
wiki3029/Hippeastrum.txt
['Aschamia', 'Salisb']
['Omphalissa', 'Salisb']
     200
     200
     200
     200
     200
wiki3029/Allosaurus.txt
     200
     200
     200
     200
     200
wiki3029/Plasterwork.txt
     200
     200
     200
     200
     200
wiki3029/Dodge.txt
     200
     200
     200
     200
     200
wiki3029/Lycia.txt
     200
     200
     200
     200
     200
wiki3029/Andrei_Tarkovsky.txt
     200
     200
     200


     200
     200
     200
     200
     200
wiki3029/Great_auk.txt
     200
     200
     200
     200
     200
wiki3029/Forgiveness.txt
     200
     200
     200
     200
     200
wiki3029/Heavy_metal_music.txt
     200
     200
     200
     200
     200
wiki3029/Rosetta_Stone.txt
     200
     200
     200
     200
     200
wiki3029/Iron.txt
     200
     200
     200
     200
     200
wiki3029/Krishna.txt
     200
     200
     200
     200
     200
wiki3029/Sundial.txt
     200
     200
     200
     200
     200
wiki3029/Marriage.txt
     200
     200
     200
     200
     200
wiki3029/Napoleon_III.txt
     200
     200
     200
     200
     200
wiki3029/Vannevar_Bush.txt
     200
     200
     200
     200
     200
wiki3029/Offal.txt
     200
     200
     200
     200
     200
wiki3029/Alps.txt
     200
     200
     200
     200
     200
wiki3029/Nanjing.txt
     200
     200
     200
     200
     200
wiki3029/Republic_of_Ireland.txt
     200
     200
     200
     200
  

     200
     200
     200
     200
     200
wiki3029/Momentum.txt
     200
     200
     200
     200
     200
wiki3029/Malawi.txt
     200
     200
     200
     200
     200
wiki3029/Chardonnay.txt
     200
     200
     200
     200
     200
wiki3029/Sam_Houston.txt
     200
     200
     200
     200
     200
wiki3029/Music_hall.txt
     200
     200
     200
     200
     200
wiki3029/Anatomy.txt
     200
     200
     200
     200
     200
wiki3029/Goat.txt
     200
     200
     200
     200
     200
wiki3029/Liberation_Tigers_of_Tamil_Eelam.txt
     200
     200
     200
     200
     200
wiki3029/New_Year's_Eve.txt
     200
     200
     200
     200
     200
wiki3029/Sweden.txt
     200
     200
     200
     200
     200
wiki3029/Tijuana.txt
     200
     200
     200
     200
     200
wiki3029/Butterfly.txt
     200
     200
     200
     200
     200
wiki3029/Afterlife.txt
     200
     200
     200
     200
     200
wiki3029/John_Wesley.txt
     200
     200
     200
   

     200
     200
     200
     200
wiki3029/Kolkata.txt
     200
     200
     200
     200
     200
wiki3029/Navajo.txt
     200
     200
     200
     200
     200
wiki3029/Nuclear_reactor.txt
     200
     200
     200
     200
     200
wiki3029/Hot_Springs_National_Park.txt
     200
     200
     200
     200
     200
wiki3029/Mogadishu.txt
     200
     200
     200
     200
     200
wiki3029/Mummy.txt
     200
     200
     200
     200
     200
wiki3029/Sydney.txt
     200
     200
     200
     200
     200
wiki3029/Theatre.txt
     200
     200
     200
     200
     200
wiki3029/Pedophilia.txt
     200
     200
     200
     200
     200
wiki3029/Michelangelo.txt
     200
     200
     200
     200
     200
wiki3029/United_Arab_Emirates.txt
     200
     200
     200
     200
     200
wiki3029/Cannabis.txt
     200
     200
     200
     200
     200
wiki3029/Jan_van_Eyck.txt
     200
     200
     200
     200
     200
wiki3029/Kola_Peninsula.txt
     200
     200
     200


     200
     200
     200
     200
     200
wiki3029/Fidel_Castro.txt
     200
     200
     200
     200
     200
wiki3029/Mahjong.txt
     200
     200
     200
     200
     200
wiki3029/C._S._Lewis.txt
     200
     200
     200
     200
     200
wiki3029/Gestapo.txt
     200
     200
     200
     200
     200
wiki3029/Aalborg.txt
     200
     200
     200
     200
     200
wiki3029/Gulag.txt
     200
     200
     200
     200
     200
wiki3029/Decolonization.txt
     200
     200
     200
     200
     200
wiki3029/Ulster.txt
     200
     200
     200
     200
     200
wiki3029/Jews.txt
     200
     200
     200
     200
     200
wiki3029/Agatha_Christie.txt
     200
     200
     200
     200
     200
wiki3029/Rama.txt
     200
     200
     200
     200
     200
wiki3029/Latvia.txt
     200
     200
     200
     200
     200
wiki3029/United_States_Military_Academy.txt
     200
     200
     200
     200
     200
wiki3029/Rheumatoid_arthritis.txt
     200
     200
     200

     200
     200
     200
     200
     200
wiki3029/Conquistador.txt
     200
     200
     200
     200
     200
wiki3029/Lithium.txt
     200
     200
     200
     200
     200
wiki3029/University_of_Pennsylvania.txt
     200
     200
     200
     200
     200
wiki3029/Amplifier.txt
     200
     200
     200
     200
     200
wiki3029/Learning.txt
     200
     200
     200
     200
     200
wiki3029/Drill_bit.txt
     200
     200
     200
     200
     200
wiki3029/Kubla_Khan.txt
     200
     200
     200
     200
     200
wiki3029/New_Orleans.txt
['697nbspft', '212nbspm']
['481nbspft', '147nbspm']
     200
     200
     200
     200
     200
wiki3029/Robert_Burns.txt
     200
     200
     200
     200
     200
wiki3029/Wildcat.txt
['Tristrams', 'wildcatF']
     200
     200
     200
     200
     200
wiki3029/People's_Liberation_Army.txt
     200
     200
     200
     200
     200
wiki3029/Radio.txt
     200
     200
     200
     200
     200
wiki3029/Abu_Dhabi.txt
     2

['כל', 'חמירא', 'וחמיעא', 'דאכא', 'ברשותי', 'דלא', 'חמתה', 'ודלא', 'בערתה', 'ודלא', 'ידענא', 'לה', 'לבטל', 'ולהוי', 'הפקר', 'כעפרא', 'דארעא']
     200
     200
     200
     200
     200
wiki3029/Zambia.txt
     200
     200
     200
     200
     200
wiki3029/Beetle.txt
     200
     200
     200
     200
     200
wiki3029/Thomas_Edison.txt
     200
     200
     200
     200
     200
wiki3029/Seed.txt
     200
     200
     200
     200
     200
wiki3029/Mesa_Verde_National_Park.txt
     200
     200
     200
     200
     200
wiki3029/Down_syndrome.txt
     200
     200
     200
     200
     200
wiki3029/Parashurama.txt
     200
     200
     200
     200
     200
wiki3029/Plough.txt
     200
     200
     200
     200
     200
wiki3029/Salt_Lake_City.txt
     200
     200
     200
     200
     200
wiki3029/Doctorate.txt
['MScMBALLMMA']
     200
     200
     200
     200
     200
wiki3029/Mount_St._Helens.txt
     200
     200
     200
     200
     200
wiki3029/Ant.txt
     200


     200
     200
     200
     200
     200
wiki3029/Self-determination.txt
     200
     200
     200
     200
     200
wiki3029/Louis_Leakey.txt
     200
     200
     200
     200
     200
wiki3029/Pharmacist.txt
     200
     200
     200
     200
     200
wiki3029/Richard_Feynman.txt
     200
     200
     200
     200
     200
wiki3029/United_States_Marine_Corps.txt
     200
     200
     200
     200
     200
wiki3029/Hypocorism.txt
['Amalija', '→', 'Malija', 'Malči']
['Frančiška', '→', 'Francka']
['משה', 'Moyshe', '→', 'משהלה', 'Moyshele', 'Moyshele']
     200
     200
     200
     200
     200
wiki3029/Pancreatic_cancer.txt
     200
     200
     200
     200
     200
wiki3029/Daniel_Boone.txt
     200
     200
     200
     200
     200
wiki3029/Gentrification.txt
     200
     200
     200
     200
     200
wiki3029/Bette_Davis.txt
     200
     200
     200
     200
     200
wiki3029/Luftwaffe.txt
     200
     200
     200
     200
     200
wiki3029/Frederick_Delius.txt


     200
     200
     200
     200
     200
wiki3029/Hannibal.txt
     200
     200
     200
     200
     200
wiki3029/Mumbai.txt
     200
     200
     200
     200
     200
wiki3029/Onion.txt
['viviparum', 'Metzg']
     200
     200
     200
     200
     200
wiki3029/Photon.txt
     200
     200
     200
     200
     200
wiki3029/Greenland.txt
     200
     200
     200
     200
     200
wiki3029/St._Johns_River.txt
     200
     200
     200
     200
     200
wiki3029/Frank_Whittle.txt
     200
     200
     200
     200
     200
wiki3029/San_Francisco.txt
     200
     200
     200
     200
     200
wiki3029/Six-Day_War.txt
     200
     200
     200
     200
     200
wiki3029/Thirty_Years'_War.txt
     200
     200
     200
     200
     200
wiki3029/Humphrey_Bogart.txt
     200
     200
     200
     200
     200
wiki3029/Stegosaurus.txt
     200
     200
     200
     200
     200
wiki3029/Moose.txt
     200
     200
     200
     200
     200
wiki3029/Ear.txt
     200
     