In [1]:
from gensim.models import Word2Vec, KeyedVectors

from nltk.collocations import *
from nltk.tokenize.regexp import regexp_tokenize
import nltk
from nltk.collocations import BigramCollocationFinder, TrigramCollocationFinder
from nltk.metrics import BigramAssocMeasures, TrigramAssocMeasures
# Fetch the NLTK "punkt" resource at import time; the captured output below
# shows it is skipped when the package is already up to date.
# NOTE(review): the visible cells only call regexp_tokenize — confirm that
# "punkt" is still needed by this notebook.
nltk.download('punkt')

import pandas as pd
from collections import Counter
import re

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [20]:
# Relative locations of the saved Word2Vec model and of the plain-text corpus.
pathtomodel = "../../../outputs/All_BNF/aligned/all.model"
pathtodata = "../../../data/all_data/all.txt"

# Load the saved gensim Word2Vec model from disk.
model = Word2Vec.load(pathtomodel)

# Target terms used to filter the n-gram tables (matched with `.isin`, so
# every spelling variant must be its own list element).
list_of_terms = [
    # cattle vocabulary
    "vache", "vaches",
    "veau", "veaux",
    "taurillon",
    "genisse", "genisses",
    "taureau", "taureaux", "taureaus",
    "boeuf", "boeufs",
    "bestiaux",
    # "immortel" and its inflections
    "immortelles",
    "immortelle",
    "immortel", "immortels",
    # single-"m" spelling variants: previously one comma-joined string,
    # which could never equal a single token
    "imortel", "imortels", "imortelles", "imortelle",
]

# Vocab + frequencies

In [3]:
# Keyed vectors of the loaded model; both lookups below go through them.
word_vectors = model.wv

# Every vocabulary entry, in the iteration order of `key_to_index`.
vocab_list = list(word_vectors.key_to_index)

# Map each vocabulary entry to the "count" attribute stored with its vector.
vocab_frequencies = {
    term: word_vectors.get_vecattr(term, "count")
    for term in vocab_list
}

In [4]:
# Two-column table (vocab, freq): dict keys become the index, which is then
# moved into a regular column and renamed.
vocab_df = (
    pd.DataFrame.from_dict(vocab_frequencies, orient="index", columns=["freq"])
    .reset_index()
    .rename(columns={"index": "vocab"})
)

# n-grams + collocations

In [5]:
# Read the whole corpus into memory as a single string.
with open(pathtodata, "r", encoding = "utf-8") as f :
    text = f.read()
print("text loaded")

# Delete every character that is neither a word character nor whitespace.
# Punctuation is removed, not replaced by a space, so the two sides of an
# apostrophe or hyphen are glued together (e.g. "l'eau" becomes "leau").
text = re.sub(r'[^\w\s]', '', text)

# Split on runs of whitespace (gaps=True: the pattern describes separators).
# The pattern is a raw string so that "\s" is not an invalid str escape.
text_tokens = regexp_tokenize(text, pattern=r'\s+', gaps=True)

text loaded


In [13]:
def ngrams_builder(ngram, tokens, filter) :
    """Build a table of n-grams with their frequency and PMI score.

    Parameters
    ----------
    ngram : int
        Size of the n-grams: 2 (bigrams) or 3 (trigrams).
    tokens : iterable of str
        Token sequence the n-grams are extracted from.
    filter : int
        Minimum frequency handed to the finder's ``apply_freq_filter``.
        (The name shadows the builtin; it is kept so keyword callers still work.)

    Returns
    -------
    pandas.DataFrame
        One row per n-gram left after filtering, with columns
        ``"{ngram}gram"`` (the n-gram tuple), ``"freq"``, ``"score"`` (PMI)
        and ``"term1"`` .. ``"term{ngram}"`` (the tuple's components).
        Rows follow the iteration order of the finder's ``ngram_fd``.
        An empty frame with the same columns is returned when nothing is left.

    Raises
    ------
    ValueError
        If ``ngram`` is neither 2 nor 3.
    """
    if ngram == 2 :
        ngram_measures = BigramAssocMeasures()
        finder_class = BigramCollocationFinder
    elif ngram == 3 :
        ngram_measures = TrigramAssocMeasures()
        finder_class = TrigramCollocationFinder
    else :
        # Fail early and explicitly instead of hitting an unbound `finder`.
        raise ValueError(f"ngram must be 2 or 3, got {ngram!r}")

    # Use the tokens passed in, not the notebook-level `text_tokens` global.
    finder = finder_class.from_words(tokens)
    print("ngrams found")
    finder.apply_freq_filter(filter)

    # score_ngrams yields (ngram, score) pairs; index them by n-gram so each
    # frequency row can pick up its score without a DataFrame merge.
    scores = dict(finder.score_ngrams(ngram_measures.pmi))
    print("filter applied, score and freq calculated")

    term_columns = [f"term{i+1}" for i in range(ngram)]
    # Each record carries the tuple itself plus its unpacked components, which
    # avoids the slow row-wise `apply(pd.Series)` expansion.
    records = [
        (gram, freq, scores[gram], *gram)
        for gram, freq in finder.ngram_fd.items()
        if gram in scores
    ]
    ngrams_df = pd.DataFrame(
        records,
        columns = [f"{ngram}gram", "freq", "score", *term_columns],
    )
    print(f"{ngram}grams Done")
    return ngrams_df

In [14]:
# Bigrams kept from frequency 3 upwards, trigrams from frequency 4 upwards.
bigrams = ngrams_builder(ngram=2, tokens=text_tokens, filter=3)
trigrams = ngrams_builder(ngram=3, tokens=text_tokens, filter=4)

ngrams found
filter applied, score and freq calculated
2grams Done
ngrams found
filter applied, score and freq calculated
3grams Done


In [18]:
# Keep the n-grams in which at least one position is a target term.
# reset_index() (without drop) keeps the row's position in the full table
# as an "index" column.
bigram_mask = bigrams[["term1", "term2"]].isin(list_of_terms).any(axis=1)
bigrams_vaches = bigrams.loc[bigram_mask].reset_index()

trigram_mask = trigrams[["term1", "term2", "term3"]].isin(list_of_terms).any(axis=1)
trigrams_vaches = trigrams.loc[trigram_mask].reset_index()

# Output

In [21]:
# Write every table to its own CSV file in the current working directory.
csv_outputs = {
    "vocab_freq.csv": vocab_df,
    "bigrams.csv": bigrams,
    "trigrams.csv": trigrams,
    "bigrams_vaches.csv": bigrams_vaches,
    "trigrams_vaches.csv": trigrams_vaches,
}
for csv_name, table in csv_outputs.items() :
    table.to_csv(csv_name)

# Corpus-level figures: vocabulary size, and the token count obtained two ways
# (summed model frequencies vs. length of the tokenised text).
list_of_info = [
    f"Longueur du vocabulaire - Modèle Word2Vec : {len(vocab_list)}",
    f"Nombre de tokens - Obtenu en additionnant les fréquences du vocabulaire : {sum(vocab_frequencies.values())}",
    f"Nombre de tokens - Obtenu avec nltk.regexp_tokenize : {len(text_tokens)}",
]

# One figure per line, each terminated by a newline.
with open("general_info_BNF_corpus.txt", "w", encoding = "utf-8") as ff :
    ff.writelines(f"{info}\n" for info in list_of_info)
