In [310]:
from nltk import ngrams


unigram_table = {}

def createUnigram(input_path="syllables95.txt", output_path="unigram_table.txt"):
    """Count every unigram in a corpus file and dump the counts to a text file.

    The corpus is read line by line. Each line is split on whitespace and
    passed to ``ngrams(tokens, 1)``, and every n-gram produced is used as a
    key of the module-level ``unigram_table`` with its occurrence count as
    the value.

    The table is emptied and refilled on every call, so re-running this
    function (or the notebook cell that calls it) reproduces the same counts
    instead of adding them on top of the previous run's counts.

    Args:
        input_path: Whitespace-tokenised corpus to read.
        output_path: File that receives one ``"<ngram>: <count>"`` line per
            distinct unigram, in first-seen order.

    Returns:
        None. Results are left in ``unigram_table`` and in ``output_path``.
    """
    # Reset in place (rather than rebinding) so any existing reference to the
    # table keeps pointing at the live data.
    unigram_table.clear()

    # NOTE(review): both files are opened with the platform default encoding,
    # exactly as before; confirm the corpus encoding before pinning one.
    with open(input_path) as corpus:
        for line in corpus:
            for unigram in ngrams(line.split(), 1):
                unigram_table[unigram] = unigram_table.get(unigram, 0) + 1

    with open(output_path, "w") as out:
        for unigram, count in unigram_table.items():
            out.write(f"{unigram}: {count}\n")



In [311]:
bigram_table = {}

def createBigram(input_path="syllables95.txt", output_path="bigram_table.txt"):
    """Count every bigram in a corpus file and dump the counts to a text file.

    The corpus is read line by line. Each line is split on whitespace and
    passed to ``ngrams(tokens, 2)``; because every line is tokenised on its
    own, no bigram spans a line break. Every n-gram produced is used as a key
    of the module-level ``bigram_table`` with its occurrence count as the
    value.

    The table is emptied and refilled on every call, so re-running this
    function (or the notebook cell that calls it) reproduces the same counts
    instead of adding them on top of the previous run's counts.

    Args:
        input_path: Whitespace-tokenised corpus to read.
        output_path: File that receives one ``"<ngram>: <count>"`` line per
            distinct bigram, in first-seen order.

    Returns:
        None. Results are left in ``bigram_table`` and in ``output_path``.
    """
    # Reset in place (rather than rebinding) so any existing reference to the
    # table keeps pointing at the live data.
    bigram_table.clear()

    # NOTE(review): both files are opened with the platform default encoding,
    # exactly as before; confirm the corpus encoding before pinning one.
    with open(input_path) as corpus:
        for line in corpus:
            for bigram in ngrams(line.split(), 2):
                bigram_table[bigram] = bigram_table.get(bigram, 0) + 1

    with open(output_path, "w") as out:
        for bigram, count in bigram_table.items():
            out.write(f"{bigram}: {count}\n")


In [312]:
trigram_table = {}

def createTrigram(input_path="syllables95.txt", output_path="trigram_table.txt"):
    """Count every trigram in a corpus file and dump the counts to a text file.

    The corpus is read line by line. Each line is split on whitespace and
    passed to ``ngrams(tokens, 3)``; because every line is tokenised on its
    own, no trigram spans a line break. Every n-gram produced is used as a
    key of the module-level ``trigram_table`` with its occurrence count as
    the value.

    The table is emptied and refilled on every call, so re-running this
    function (or the notebook cell that calls it) reproduces the same counts
    instead of adding them on top of the previous run's counts.

    Args:
        input_path: Whitespace-tokenised corpus to read.
        output_path: File that receives one ``"<ngram>: <count>"`` line per
            distinct trigram, in first-seen order.

    Returns:
        None. Results are left in ``trigram_table`` and in ``output_path``.
    """
    # Reset in place (rather than rebinding) so any existing reference to the
    # table keeps pointing at the live data.
    trigram_table.clear()

    # NOTE(review): both files are opened with the platform default encoding,
    # exactly as before; confirm the corpus encoding before pinning one.
    with open(input_path) as corpus:
        for line in corpus:
            for trigram in ngrams(line.split(), 3):
                trigram_table[trigram] = trigram_table.get(trigram, 0) + 1

    with open(output_path, "w") as out:
        for trigram, count in trigram_table.items():
            out.write(f"{trigram}: {count}\n")


In [313]:
import gensim

def create_word2vec_model(n_gram_table):
    """Train a Word2Vec model using each key of ``n_gram_table`` as a sentence.

    Only the keys are used: every distinct n-gram contributes exactly one
    training sentence, and the counts stored as values are ignored.

    Args:
        n_gram_table: Mapping whose keys are iterables of tokens.

    Returns:
        The ``gensim.models.Word2Vec`` instance built from those sentences
        (100-dimensional vectors, window 5, min_count 1, 8 workers).
    """
    sentences = []
    for n_gram in n_gram_table.keys():
        # Word2Vec is handed a list of tokens per sentence.
        sentences.append(list(n_gram))

    return gensim.models.Word2Vec(
        sentences,
        vector_size=100,
        window=5,
        min_count=1,
        workers=8,
    )




In [314]:
def ngram2Vec():
    """Train one Word2Vec model per n-gram table and save each to disk.

    All three models are built first (unigram, bigram, trigram, in that
    order) from the module-level tables; only then are they saved, in the
    same order, to ``unigram_model.bin``, ``bigram_model.bin`` and
    ``trigram_model.bin``.
    """
    # Dict entries are evaluated top to bottom, so training order is fixed.
    trained_by_path = {
        "unigram_model.bin": create_word2vec_model(unigram_table),
        "bigram_model.bin": create_word2vec_model(bigram_table),
        "trigram_model.bin": create_word2vec_model(trigram_table),
    }

    for path, trained in trained_by_path.items():
        trained.save(path)

In [315]:
import gensim.models.word2vec

def read_models_from_files():
    """Load the three saved Word2Vec models into module-level globals.

    Reads ``unigram_model.bin``, ``bigram_model.bin`` and
    ``trigram_model.bin`` (in that order) and binds the results to the
    globals ``unigram_model``, ``bigram_model`` and ``trigram_model``.
    """
    global unigram_model, bigram_model, trigram_model

    load_model = gensim.models.word2vec.Word2Vec.load

    unigram_model = load_model("unigram_model.bin")
    bigram_model = load_model("bigram_model.bin")
    trigram_model = load_model("trigram_model.bin")


In [316]:
from scipy.stats import pearsonr

def most_similar_pairs(syl1, syl2, model, threshold=0.95):
    """Print vocabulary pairs whose difference vector matches ``syl1 - syl2``.

    For every ordered pair of distinct vocabulary entries ``(word1, word2)``
    of ``model``, the Pearson correlation between ``vec(word1) - vec(word2)``
    and ``vec(syl1) - vec(syl2)`` is computed. Pairs whose correlation is
    strictly greater than ``threshold`` are printed as
    ``"<word1> - <word2>: <correlation>"``, highest correlation first.

    The work is quadratic in the vocabulary size (one correlation per
    ordered pair).

    Args:
        syl1: First token of the reference pair; must be in the model's
            vocabulary.
        syl2: Second token of the reference pair; must be in the model's
            vocabulary.
        model: Object exposing ``model.wv.get_vector(key)`` and
            ``model.wv.index_to_key``. This model is the one queried; the
            function previously ignored it and always read the global
            ``bigram_model``.
        threshold: Minimum correlation (exclusive) for a pair to be reported.

    Returns:
        None. Matches are printed.
    """
    keyed_vectors = model.wv
    reference = keyed_vectors.get_vector(syl1) - keyed_vectors.get_vector(syl2)

    # Fetch each vector once instead of twice per iteration of the V x V loop.
    vector_of = {word: keyed_vectors.get_vector(word) for word in keyed_vectors.index_to_key}

    matches = []
    for word1, vector1 in vector_of.items():
        for word2, vector2 in vector_of.items():
            if word1 == word2:
                # The difference is the zero vector, for which the
                # correlation is undefined; it could never pass the
                # threshold, so skip it.
                continue

            similarity, _p_value = pearsonr(vector1 - vector2, reference)
            if similarity > threshold:
                matches.append((word1, word2, similarity))

    # Highest correlation first; ties keep vocabulary order (stable sort).
    matches.sort(key=lambda match: match[2], reverse=True)

    for word1, word2, similarity in matches:
        print(f"{word1} - {word2}: {similarity}")

In [317]:
def similarity_test(syl1, syl2, syl3, syl4):
    """Print how well ``syl1 - syl2`` correlates with ``syl3 - syl4`` per model.

    For each of the global ``unigram_model``, ``bigram_model`` and
    ``trigram_model`` (in that order) the two difference vectors are built
    from that model's vectors and their Pearson correlation is printed on
    its own line, after a header naming the four tokens.

    Args:
        syl1: Minuend of the first difference.
        syl2: Subtrahend of the first difference.
        syl3: Minuend of the second difference.
        syl4: Subtrahend of the second difference.
    """
    def report(label, model, **print_options):
        # Correlate the two difference vectors taken from one model.
        lookup = model.wv
        first_diff = lookup.get_vector(syl1) - lookup.get_vector(syl2)
        second_diff = lookup.get_vector(syl3) - lookup.get_vector(syl4)
        correlation, _p_value = pearsonr(first_diff, second_diff)
        print(label + str(correlation), **print_options)

    print(f"Similarity between {syl1}-{syl2} and {syl3}-{syl4}: ")

    report("Unigram: ", unigram_model)
    report("Bigram: ", bigram_model)
    # The last line is followed by a blank line to separate successive tests.
    report("Trigram: ", trigram_model, end="\n\n")

In [318]:
def similarity_test_2(syl1, syl2, syl3, syl4, syl5, syl6):
    """Print the correlation of two three-token combinations for each model.

    For each of the global ``unigram_model``, ``bigram_model`` and
    ``trigram_model`` (in that order) the vectors ``syl1 - syl2 - syl3`` and
    ``syl4 - syl5 - syl6`` are built from that model and their Pearson
    correlation is printed on its own line, after a header naming the six
    tokens.

    Args:
        syl1, syl2, syl3: Tokens of the first combination.
        syl4, syl5, syl6: Tokens of the second combination.
    """
    def report(label, model, **print_options):
        # Correlate the two combined vectors taken from one model.
        lookup = model.wv
        first_combo = lookup.get_vector(syl1) - lookup.get_vector(syl2) - lookup.get_vector(syl3)
        second_combo = lookup.get_vector(syl4) - lookup.get_vector(syl5) - lookup.get_vector(syl6)
        correlation, _p_value = pearsonr(first_combo, second_combo)
        print(label + str(correlation), **print_options)

    print(f"Similarity between {syl1}-{syl2}-{syl3} and {syl4}-{syl5}-{syl6}")

    report("Unigram ", unigram_model)
    report("Bigram ", bigram_model)
    # The last line is followed by a blank line to separate successive tests.
    report("Trigram ", trigram_model, end="\n\n")

In [319]:
def similar_syllables(syl):
    """Print the three nearest neighbours of ``syl`` in each n-gram model.

    Queries the global ``unigram_model``, ``bigram_model`` and
    ``trigram_model`` (in that order) with ``wv.most_similar(syl, topn=3)``
    and prints, under a heading per model, one ``"<token>: <score>"`` line
    for each result. The listing ends with two newlines.

    Args:
        syl: Token to look up; concatenated into the header, so it must be a
            string.
    """
    def show_neighbours(heading, model):
        # Query first, then print the heading, then the results.
        neighbours = model.wv.most_similar(syl, topn=3)
        print(heading)
        for neighbour, score in neighbours:
            print(f"{neighbour}: {score}")

    print("Similar syllables to " + syl + ":")

    show_neighbours("Unigram: ", unigram_model)
    show_neighbours("Bigram: ", bigram_model)
    show_neighbours("Trigram: ", trigram_model)

    print("\n")
    

In [325]:



# Build the n-gram count tables, train and save one model per n-gram order,
# then load the saved models back into the module-level globals.
createUnigram()
createBigram()
createTrigram()
ngram2Vec()
read_models_from_files()

# Part 4: nearest neighbours of selected suffix syllables.
print("---\nPart 4: Find similar syllables\n---")
for _caption, _syllable in (
    ("ri from odalari", "ri"),
    ("ler from geldiler", "ler"),
    ("de from bizdeki", "de"),
    ("ma from almadı", "ma"),
    ("yan from almayan", "yan"),
):
    print(_caption)
    similar_syllables(_syllable)

# Part 5: compare syllable-pair differences taken from related word forms.
print("---\nPart 5: Morphology analogy tests\n---")
for _caption, _syllables in (
    ("Example: odalari - odalarim", ("la", "ri", "la", "rim")),
    ("Example: geldiler aldılar", ("di", "ler", "di", "lar")),
    ("Example: bizdeki ondaki", ("de", "ki", "da", "ki")),
    ("Example: almadi vermedi", ("ma", "di", "me", "di")),
    ("Example: almayan gitmeyen", ("ma", "yan", "me", "yen")),
):
    print(_caption)
    similarity_test(*_syllables)

# Part 5 bonus: the same comparison on syllables of whole words
# (two-syllable words use similarity_test, three-syllable ones similarity_test_2).
print("---\nPart 5 Bonus: Word analogy tests\n---")
similarity_test("a", "dam", "ka", "din")
similarity_test_2("is", "pan", "ya", "por", "te", "kiz")
similarity_test_2("mer", "ce", "des", "to", "yo", "ta")
similarity_test("as", "lan", "ke", "di")








---
Part 4: Find similar syllables
---
ri from odalari
Similar syllables to ri:
Unigram: 
yaz: 0.2998822331428528
mog: 0.2973637878894806
ark: 0.29700127243995667
Bigram: 
ra: 0.9984901547431946
te: 0.9983323216438293
re: 0.9983175992965698
Trigram: 
ni: 0.7476686835289001
re: 0.6885415315628052
ra: 0.6720702648162842


ler from geldiler
Similar syllables to ler:
Unigram: 
mond: 0.37578514218330383
ik: 0.32739895582199097
cit: 0.30533266067504883
Bigram: 
ne: 0.9978345036506653
lar: 0.9975080490112305
na: 0.9974647760391235
Trigram: 
le: 0.6296705007553101
len: 0.6084480881690979
den: 0.5954577326774597


de from bizdeki
Similar syllables to de:
Unigram: 
law: 0.33243757486343384
ment: 0.31676486134529114
lun: 0.29563137888908386
Bigram: 
le: 0.9989104270935059
e: 0.9988452792167664
in: 0.9984219670295715
Trigram: 
den: 0.7939417362213135
dey: 0.6487324237823486
e: 0.6194984912872314


ma from almadı
Similar syllables to ma:
Unigram: 
dri: 0.2908521592617035
miss: 0.28723376989364624
m