In [56]:
import re
from itertools import chain
import pandas as pd
from nltk import ConditionalFreqDist, bigrams, trigrams
from sklearn.feature_extraction.text import TfidfVectorizer
from typing_assistant.indexing import Collection, InvertedIndex, Lexicon
from typing_assistant.config import config
from typing_assistant.context import Context

In [13]:
# Load the first 10,000 captions, indexed by their `id` column.
# `nrows` makes the parser stop after 10,000 data rows, rather than reading the
# entire TSV into memory and discarding everything past the slice.
df = pd.read_csv('../data/captions_0.1.tsv', sep='\t', index_col='id', nrows=10000)
# Map caption id -> caption text; this dict is the corpus the later cells consume.
corpus = df['caption'].to_dict()
len(corpus)

10000

In [6]:
# Build the typing_assistant structures from the caption corpus.
# Each step is handed the objects created before it, so keep the statement order.
context = Context(config.ROOT)  # project context constructed from the configured ROOT
collection = Collection()
collection.build_collection(corpus)  # corpus is the {id: caption} dict built from the TSV
inv_index = InvertedIndex(collection)
inv_index.index_collection()  # presumably populates the index from `collection` — confirm in typing_assistant.indexing
lexicon = Lexicon(context)
lexicon.build_lexicon(collection, inv_index)  # NOTE(review): assumes collection and index are already built at this point — confirm

In [58]:
# Rank vocabulary terms by their TF-IDF weight summed over every caption.
# Tokens are runs of ASCII letters only (digits and punctuation are dropped).
vectorizer = TfidfVectorizer(token_pattern=r'[a-z]+')
tfidf_matrix = vectorizer.fit_transform(list(corpus.values()))
features = vectorizer.get_feature_names_out()
# Column-wise sum: a single row holding one total weight per vocabulary term.
sums = tfidf_matrix.sum(axis=0)
# Pair each term with its total, then order from heaviest to lightest.
term_totals = [(term, sums[0, col]) for col, term in enumerate(features)]
term_totals.sort(key=lambda pair: pair[1], reverse=True)
term_totals[:10]

[('the', 626.1143018026254),
 ('a', 532.32093668446),
 ('of', 395.3645231015813),
 ('on', 326.1175632198149),
 ('in', 312.0174692113382),
 ('and', 277.39850711831275),
 ('person', 240.17365452254754),
 ('with', 236.28727584463584),
 ('at', 195.00368963448693),
 ('during', 192.53522454662016)]

In [78]:
# Collect the bigrams of every document's tokens into one flat list.
per_document_bigrams = (bigrams(document.tokens) for document in collection.documents)
bgs = list(chain.from_iterable(per_document_bigrams))

In [82]:
# Conditional frequency distribution over the bigrams: the first element of each
# pair is the condition, the second is the counted sample.
a = ConditionalFreqDist(bgs)
# Summary of the samples recorded under the condition 'beach'.
beach_dist = a['beach']
print(beach_dist)

<FreqDist with 41 samples and 79 outcomes>
