In [1]:
from gensim.models import Word2Vec, KeyedVectors

from nltk.collocations import *
from nltk.tokenize.regexp import regexp_tokenize
import nltk
from nltk.collocations import BigramCollocationFinder, TrigramCollocationFinder
from nltk.metrics import BigramAssocMeasures, TrigramAssocMeasures
nltk.download('punkt')

import pandas as pd
from collections import Counter
import re

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [21]:
# Terms of interest: n-grams containing any of these words are extracted further down.
# NOTE(review): "genisse" is written without an accent while the punctuation
# stripping applied to the corpus keeps accented letters -- confirm this is
# the spelling actually found in the corpus.
list_of_terms = [
    "vache", "veau", "taurillon", "genisse",
    "taureau", "boeuf", "bestiaux",
]

# Relative locations of the saved model and of the raw corpus text.
pathtomodel = "../../../outputs/All_BNF/aligned/all.model"
pathtodata = "../../../data/all_data/all.txt"

# Load the saved gensim Word2Vec model from disk.
model = Word2Vec.load(pathtomodel)

# Vocab + frequencies

In [3]:
# Every key of the model's vocabulary, in the order the model stores them.
vocab_list = [*model.wv.key_to_index]

# Pair each vocabulary entry with the "count" attribute stored on its vector.
vocab_frequencies = dict(
    zip(
        vocab_list,
        (model.wv.get_vecattr(term, "count") for term in vocab_list),
    )
)

In [17]:
# Two-column table (vocab, freq): the dictionary keys become the index, the
# index is named "vocab" and then moved back into a regular column.
vocab_df = (
    pd.DataFrame.from_dict(vocab_frequencies, orient="index", columns=["freq"])
    .rename_axis("vocab")
    .reset_index()
)

# n-grams + collocations

In [4]:
# Read the whole corpus into memory as a single string.
with open(pathtodata, "r", encoding="utf-8") as f:
    text = f.read()
print("text loaded")

# Drop every character that is neither a word character nor whitespace.
# This also removes apostrophes and hyphens, so the parts they separated
# end up fused into a single token.
text = re.sub(r"[^\w\s]", "", text)

# Split on runs of whitespace (gaps=True: the pattern describes the
# separators, not the tokens). The pattern is a raw string so that "\s"
# reaches the regex engine instead of being read as an invalid string escape.
text_tokens = regexp_tokenize(text, pattern=r"\s+", gaps=True)

text loaded


In [5]:
# NLTK association-measure helpers (instantiated here; not used in this cell).
bigram_measures = BigramAssocMeasures()
trigram_measures = TrigramAssocMeasures()

# Minimum number of occurrences an n-gram needs in order to be kept.
MIN_BIGRAM_FREQ = 3
MIN_TRIGRAM_FREQ = 4

# Count every bigram and every trigram of the token sequence.
finder_bi = BigramCollocationFinder.from_words(text_tokens)
print("Bigrams found")
finder_tri = TrigramCollocationFinder.from_words(text_tokens)
print("Trigrams found")

# Frequency filter: discard n-grams seen fewer times than the threshold
# (bigrams must appear at least 3 times, trigrams at least 4 times).
finder_bi.apply_freq_filter(MIN_BIGRAM_FREQ)
finder_tri.apply_freq_filter(MIN_TRIGRAM_FREQ)

Bigrams found
Trigrams found


In [6]:
def _most_frequent_first(entry):
    """Sort key for (ngram, count) pairs: highest count first, ties ordered by the n-gram tuple."""
    ngram, count = entry
    return (-count, ngram)


# Lists of (ngram, count) pairs taken from each finder's frequency distribution.
bigrams_info = sorted(finder_bi.ngram_fd.items(), key=_most_frequent_first)
trigrams_info = sorted(finder_tri.ngram_fd.items(), key=_most_frequent_first)

In [7]:
# One table per n-gram size: the n-gram tuple in the first column, its count in "freq".
bigrams_df, trigrams_df = (
    pd.DataFrame(pairs, columns=[ngram_column, "freq"])
    for pairs, ngram_column in (
        (bigrams_info, "bigram"),
        (trigrams_info, "trigram"),
    )
)

In [8]:
# Spread each n-gram tuple over one column per term.
# The tuples are expanded in a single DataFrame construction instead of
# `.apply(pd.Series)`, which builds one Series per row and becomes very slow on
# large n-gram tables. Naming the columns explicitly also keeps the assignment
# valid when a table is empty.
bigrams_df[["term1", "term2"]] = pd.DataFrame(
    bigrams_df["bigram"].tolist(),
    index=bigrams_df.index,  # keep row alignment with the target frame
    columns=["term1", "term2"],
)
trigrams_df[["term1", "term2", "term3"]] = pd.DataFrame(
    trigrams_df["trigram"].tolist(),
    index=trigrams_df.index,  # keep row alignment with the target frame
    columns=["term1", "term2", "term3"],
)


In [33]:
def _rows_mentioning_terms(ngrams, term_columns):
    """Return the rows where at least one of `term_columns` holds a word from `list_of_terms`.

    The result goes through `reset_index()`, so the original row labels are
    kept in an "index" column.
    """
    has_term = ngrams[term_columns].isin(list_of_terms).any(axis=1)
    return ngrams.loc[has_term].reset_index()


bigrams_vaches = _rows_mentioning_terms(bigrams_df, ["term1", "term2"])
trigrams_vaches = _rows_mentioning_terms(trigrams_df, ["term1", "term2", "term3"])

In [34]:
# Display the trigrams in which at least one position is a term from list_of_terms.
trigrams_vaches

Unnamed: 0,index,trigram,freq,term1,term2,term3
0,10813,"(une, vache, et)",33,une,vache,et
1,13654,"(de, lad, vache)",27,de,lad,vache
2,18408,"(que, les, bestiaux)",22,que,les,bestiaux
3,24051,"(une, vache, deux)",18,une,vache,deux
4,25591,"(que, lad, vache)",17,que,lad,vache
...,...,...,...,...,...,...
116,187674,"(vache, dune, part)",4,vache,dune,part
117,187675,"(vache, et, de)",4,vache,et,de
118,187676,"(vache, et, que)",4,vache,et,que
119,187728,"(veau, de, lait)",4,veau,de,lait


# Output

In [48]:
# Write every table to a CSV file in the current working directory.
for table, csv_name in (
    (vocab_df, "vocab_freq.csv"),
    (bigrams_df, "bigrams_freq.csv"),
    (trigrams_df, "trigrams_freq.csv"),
    (bigrams_vaches, "bigrams_vaches.csv"),
    (trigrams_vaches, "trigrams_vaches.csv"),
):
    table.to_csv(csv_name)

# Corpus-level figures: vocabulary size, plus the token count obtained two
# ways (summed vocabulary frequencies vs. length of the NLTK token list).
list_of_info = [
    f"Longueur du vocabulaire - Modèle Word2Vec : {len(vocab_list)}",
    f"Nombre de tokens - Obtenu en additionnant les fréquences du vocabulaire : {sum(vocab_frequencies.values())}",
    f"Nombre de tokens - Obtenu avec nltk.regexp_tokenize : {len(text_tokens)}",
]

# One figure per line in a small text report.
with open("general_info_BNF_corpus.txt", "w", encoding="utf-8") as ff:
    ff.writelines(f"{info}\n" for info in list_of_info)
