In [1]:
import os
import json
import re
import string
import pandas as pd
import numpy as np
from libvoikko import Voikko
from gensim.models import FastText
from gensim.models import Word2Vec

In [2]:
# Shared Voikko instance for Finnish ("fi"); it is passed to
# tokenize_sentences() for sentence splitting.
voikko = Voikko("fi")

def clean(s):
    """Flatten a text onto one line with single spaces between words.

    Newlines become spaces, every run of two or more spaces is collapsed to a
    single space, and leading/trailing whitespace is stripped.

    Args:
        s: The raw text.

    Returns:
        The cleaned text as a new string.
    """
    s = s.replace("\n", " ")
    # Collapse whole runs of spaces; replacing only the literal pair "  "
    # would leave a double space behind for every run of three or more.
    s = re.sub(" {2,}", " ", s)
    return s.strip()

def remove_punct(s):
    """Strip punctuation, digits and one-character tokens, then lower-case.

    Steps, in order:
      1. delete ASCII punctuation and digits;
      2. delete every section sign together with the characters glued to it;
      3. delete tokens that consist of a single character;
      4. delete en dashes and right double quotation marks;
      5. collapse runs of spaces, strip the ends and lower-case the result.

    Args:
        s: The text to normalise (expected to be a single line).

    Returns:
        The normalised, lower-cased text with words separated by one space.
    """
    s = s.translate(str.maketrans('', '', string.punctuation + string.digits))
    # Raw strings keep "\S" a regex escape instead of an invalid string escape.
    s = re.sub(r"§\S*", "", s)
    s = re.sub(r" +", " ", s)
    # A one-character token is a character with no non-space neighbour on
    # either side. Lookarounds do not consume the separating spaces, so every
    # token in a run such as "a b c" is removed, not just every other one.
    s = re.sub(r"(?<![^ ]).(?![^ ])", "", s)
    s = re.sub("–", "", s)
    s = re.sub('”', "", s)
    # The deletions above can leave gaps; close them before returning.
    s = re.sub(r" +", " ", s)
    return s.strip().lower()

def tokenize_sentences(text, voikko):
    """Split `text` into sentences using the given Voikko instance.

    Args:
        text: The text to split.
        voikko: Object whose `sentences(text)` method yields items carrying a
            `sentenceText` attribute.

    Returns:
        A list with the whitespace-stripped text of each sentence, in order.
    """
    stripped = []
    for sentence in voikko.sentences(text):
        stripped.append(sentence.sentenceText.strip())
    return stripped

## FINLEX-data

In [2]:
# Collect the path of every file in the three FINLEX data directories,
# keeping the kho -> kko -> sd directory order.
paths = []
for source_dir in ("data/kho", "data/kko", "data/sd"):
    paths.extend(source_dir + "/" + fname for fname in os.listdir(source_dir))

In [3]:
# Pool the records of every input file into one list, then build a single
# DataFrame from it.
# NOTE(review): assumes each file holds a JSON array of records - confirm.
data = []

for p in paths:
    # The corpus is Finnish text, so decode explicitly as UTF-8 instead of
    # relying on the platform's default encoding.
    with open(p, 'r', encoding='utf-8') as f:
        data.extend(json.load(f))

df = pd.DataFrame(data)
# The raw list is no longer needed once the frame exists; free the memory.
del data

In [4]:
# Order the documents by year and renumber the rows from 0.
# reset_index(drop=True) discards the old index directly, which avoids the
# positional `axis` argument of DataFrame.drop that newer pandas rejects.
df = df.sort_values("year").reset_index(drop=True)

In [12]:
# Build the training corpus: one list of word tokens per sentence.
texts = list(df['text'])

sentences = []
for raw_text in texts:
    for sentence in tokenize_sentences(clean(raw_text), voikko):
        sentences.append(remove_punct(sentence).split())

# The raw texts are not needed after tokenisation.
del texts

In [15]:
# Number of tokenised sentences collected for training.
len(sentences)

2277594

In [16]:
# Train FastText embeddings on the tokenised sentences. Tutorial followed:
# https://radimrehurek.com/gensim/auto_examples/tutorials/run_fasttext.html#sphx-glr-auto-examples-tutorials-run-fasttext-py
model_ft = FastText(
    sentences,
    size=200,
    window=5,
    min_count=5,
)

In [17]:
# Print the string summary of the trained FastText model.
print(model_ft)

FastText(vocab=211136, size=200, alpha=0.025)


In [18]:
# Save the FastText model under models/; later cells load it from this path.
model_ft.save("models/fasttext")

In [19]:
# Train Word2Vec embeddings on the same sentences, with the same
# hyper-parameters as the FastText model.
model_w2v = Word2Vec(
    sentences,
    size=200,
    window=5,
    min_count=5,
)

In [20]:
# Print the string summary of the trained Word2Vec model.
print(model_w2v)

Word2Vec(vocab=211136, size=200, alpha=0.025)


In [21]:
# Save the Word2Vec model under models/; later cells load it from this path.
model_w2v.save("models/w2v")

## Ennustetaan runoaineistolle lakimalliin pohjaavat uudet sanat

In [28]:
# Paths of all cut-up poem files (in the order os.listdir returns them).
cutup_dir = "data/cutups"
paths = [cutup_dir + "/" + fname for fname in os.listdir(cutup_dir)]

In [29]:
# Read each cut-up poem and turn it into a list of normalised word tokens.
data = []

for p in paths:
    # Decode explicitly as UTF-8: the poems are Finnish text, and the
    # platform default encoding is not guaranteed to handle them.
    with open(p, "r", encoding="utf-8") as f:
        text = f.read()

    # The file can be closed before the cleaning starts.
    text = clean(text)
    text = remove_punct(text)

    data.append(text.split())

In [30]:
# Load the Word2Vec and FastText models from the paths they were saved to
# earlier in this notebook.
model_w2v = Word2Vec.load("models/w2v")
model_ft = FastText.load("models/fasttext")

In [35]:
# One list of (word, similarity) predictions per poem. The list is sized from
# the data, so any number of poems works (a fixed size of 12 would raise
# IndexError for more poems and leave empty entries for fewer).
pred = [[] for _ in data]

words = 0
w2v_pred = 0
ft_pred = 0

for runo_pos, runo in enumerate(data):

    for word in runo:

        words += 1

        # Try to infer a new word with the Word2Vec model first.
        try:
            pred[runo_pos].append(model_w2v.wv.most_similar(word)[0])
            w2v_pred += 1

        # gensim raises KeyError when the word is not in the Word2Vec
        # vocabulary; in that case use the FastText model's nearest word.
        # Only KeyError is caught, so genuine failures and Ctrl-C still
        # surface instead of being silently routed to FastText.
        except KeyError:
            pred[runo_pos].append(model_ft.wv.most_similar(word)[0])
            ft_pred += 1

print("Words in total: "+ str(words))
print("Words inferred with W2V model: " + str(w2v_pred))
print("Words inferred with FT model: " + str(ft_pred))

Words in total: 1085
Words inferred with W2V model: 771
Words inferred with FT model: 314


In [36]:
# Write one output file per poem: the predicted words, followed by their
# similarity scores.
# Create the target directory up front so the cell also works on a fresh
# checkout.
os.makedirs("models/cutups", exist_ok=True)

# `pred` is walked in reverse, so file 1 holds the last poem in `pred`.
for i, p in enumerate(reversed(pred), start=1):

    # Each word is followed by a space and each score by ", ".
    text = "".join(word[0] + " " for word in p)
    prob = "".join(str(word[1]) + ", " for word in p)

    # Explicit UTF-8 so Finnish characters are written the same way on every
    # platform.
    with open("models/cutups/" + str(i) + "_kaikki.txt", "w", encoding="utf-8") as f:
        f.write(text + " " + prob)

## Lasketaan sanoilla

In [6]:
import random

In [7]:
# Load the Word2Vec and FastText models from the paths they were saved to
# earlier in this notebook.
model_w2v = Word2Vec.load("models/w2v")
model_ft = FastText.load("models/fasttext")

In [8]:
# Seed words from which random pairs are drawn for the word arithmetic.
seeds = [
    "elämä", "kuolema", "taide", "ilmaisu",
    "sana", "puhe", "runo", "tyhjä",
    "täysi", "totuus", "todellisuus", "kauneus",
    "kaunis", "yhteisö", "yhteiskunta", "mieli",
    "sielu", "ruumiillinen", "himoita", "ääretön",
    "rajaton", "ajaton", "uskomaton", "mainio",
    "ovela", "vilpitön", "aikomus", "halu",
    "tahto", "tarve", "tunne", "luokkataistelu",
    "toimeentulo", "laki", "oikeus", "hallitsija",
    "päättäjä", "koti", "asunto", "asuminen",
]

In [11]:
def laske_sanoilla(sanat):
    """Write a word-arithmetic report for a pair of words to a text file.

    The report lists the words, the model used, the nearest neighbour of each
    word, and the five nearest neighbours of the sum and of the difference of
    the first two words' vectors. The Word2Vec model is used when both words
    are in its vocabulary; otherwise the FastText model is used.

    Args:
        sanat: Sequence of words. The first two are used for the arithmetic
            and the file name; a nearest neighbour is reported for every
            element.

    Returns:
        None. The report is written to
        "models/laskutoimituksia/<first>,<second>.txt".
    """
    first, second = sanat[0], sanat[1]

    model = model_ft
    if (first in model_w2v.wv.vocab) and (second in model_w2v.wv.vocab):
        model = model_w2v

    # Collect the report in pieces and join once at the end.
    parts = [
        "Sanat: " + first + ", " + second + '\n\n',
        "Malli: " + str(model) + '\n\n',
    ]

    for s in sanat:
        parts.append(
            "Lähin merkitys sanalle '" + s + "': "
            + model.wv.most_similar(s)[0][0] + '\n'
        )
    parts.append('\n')

    vec_add = np.add(model.wv[first], model.wv[second])
    vec_sub = np.subtract(model.wv[first], model.wv[second])

    # The sum and the difference are reported in exactly the same format.
    for symbol, vector in ((" + ", vec_add), (" - ", vec_sub)):
        parts.append(first + symbol + second + ":" + '\n')
        for neighbour, score in model.wv.most_similar(positive=[vector], topn=5):
            parts.append(neighbour + ", " + str(score) + '\n')
        parts.append('\n')

    # Create the target directory if needed, and write explicitly as UTF-8 so
    # the Finnish text is stored the same way on every platform.
    os.makedirs("models/laskutoimituksia", exist_ok=True)
    with open("models/laskutoimituksia/" + first + "," + second + ".txt", "w", encoding="utf-8") as f:
        f.write("".join(parts))

In [15]:
# Draw 100 random pairs of distinct seed words and write a report for each.
for _ in range(100):
    laske_sanoilla(random.sample(seeds, 2))