In [5]:
import pandas as pd
from nltk import ConditionalFreqDist, bigrams, trigrams
from sklearn.feature_extraction.text import TfidfVectorizer
from typing_assistant.indexing import Collection, InvertedIndex, Lexicon
from typing_assistant.config import config
from typing_assistant.context import Context

In [6]:
# Load only the first 10,000 captions. `nrows` makes the parser stop after that
# many data rows instead of reading the entire TSV and slicing the rest away.
df = pd.read_csv('../data/captions_0.1.tsv', sep='\t', index_col='id', nrows=10000)
# Map each caption id (the frame's index) to its caption text.
corpus = df['caption'].to_dict()
# Sanity check: number of captions loaded (ids that repeat would collapse here).
len(corpus)

10000

In [8]:
# Build the typing_assistant structures over the loaded captions.
# NOTE(review): what each class does internally is not visible in this notebook;
# the comments below only describe the wiring — confirm details in typing_assistant.
context = Context(config.ROOT)  # context created from the configured ROOT
collection = Collection()
collection.build_collection(corpus)  # corpus is the {id: caption} dict built from the TSV
inv_index = InvertedIndex(collection)  # the index is constructed over the collection
inv_index.index_collection()
lexicon = Lexicon(context)  # the lexicon is the only object here that receives the context
lexicon.build_lexicon(collection, inv_index)  # consumes both the collection and its index

In [10]:
# Rank unigrams by their TF-IDF weight accumulated over the whole corpus.
# token_pattern=r'\w+' also keeps one-character tokens such as 'a' and 's'.
vectorizer = TfidfVectorizer(token_pattern=r'\w+', ngram_range=(1, 1), analyzer='word')
tfidf_matrix = vectorizer.fit_transform(corpus.values())
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2.
# Prefer get_feature_names_out() and fall back only on older installs;
# list(...) keeps `features` a plain list, as it was before.
features = list(
    vectorizer.get_feature_names_out()
    if hasattr(vectorizer, 'get_feature_names_out')
    else vectorizer.get_feature_names()
)
# Column-wise sum: one accumulated TF-IDF weight per vocabulary term.
sums = tfidf_matrix.sum(axis=0)
# Show the 100 terms with the largest accumulated weight, highest first.
sorted({term: sums[0, col] for col, term in enumerate(features)}.items(), key=lambda x: x[1], reverse=True)[: 100]

[('the', 624.2778225105985),
 ('a', 531.1542926055173),
 ('of', 394.1603911971106),
 ('on', 325.6256597061032),
 ('in', 311.4396563250166),
 ('and', 277.02504262038576),
 ('person', 239.60072866678902),
 ('with', 236.04490675312425),
 ('at', 194.4727102813845),
 ('during', 192.05227336830671),
 ('for', 190.9879733406835),
 ('to', 179.52116686604128),
 ('actor', 161.74788920480472),
 ('background', 138.26573749406876),
 ('artist', 132.5163431426272),
 ('premiere', 130.63877180862312),
 ('is', 130.49600013928386),
 ('white', 120.80534603908025),
 ('from', 112.50375825426137),
 ('this', 109.86143591209326),
 ('player', 100.16712169938268),
 ('attends', 99.68427279960339),
 ('an', 97.80490121337738),
 ('by', 97.53337655385236),
 ('stage', 91.09506574400048),
 ('illustration', 90.02114670412753),
 ('vector', 83.98487550812928),
 ('image', 80.97412097370358),
 ('football', 80.8138660504819),
 ('view', 80.14739111571137),
 ('performs', 78.31032740277381),
 ('s', 73.02787555373345),
 ('team', 

In [60]:
# Same ranking as the plain unigram TF-IDF run, but max_df=0.2 drops every term
# that occurs in more than 20% of the captions, removing the most common words
# ('the', 'a', 'of', ...) from the vocabulary.
vectorizer = TfidfVectorizer(token_pattern=r'\w+', max_df=0.2)
tfidf_matrix = vectorizer.fit_transform(corpus.values())
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2.
# Prefer get_feature_names_out() and fall back only on older installs;
# list(...) keeps `features` a plain list, as it was before.
features = list(
    vectorizer.get_feature_names_out()
    if hasattr(vectorizer, 'get_feature_names_out')
    else vectorizer.get_feature_names()
)
# Column-wise sum: one accumulated TF-IDF weight per remaining vocabulary term.
sums = tfidf_matrix.sum(axis=0)
# Show the 100 terms with the largest accumulated weight, highest first.
sorted({term: sums[0, col] for col, term in enumerate(features)}.items(), key=lambda x: x[1], reverse=True)[: 100]

[('and', 280.9012727623414),
 ('person', 243.91465968927469),
 ('with', 239.62885185375404),
 ('at', 197.31822368516123),
 ('during', 194.7700961545337),
 ('for', 193.1056695251096),
 ('to', 181.43211625196338),
 ('actor', 164.10174689772478),
 ('background', 142.5836135984406),
 ('artist', 134.30028440724584),
 ('premiere', 132.6043400506801),
 ('is', 132.32388368419396),
 ('white', 124.28205295845505),
 ('from', 114.32548066873207),
 ('this', 110.95860701546009),
 ('player', 101.35686994622776),
 ('attends', 100.87555246731038),
 ('an', 98.9910616025526),
 ('by', 98.66062159940832),
 ('stage', 93.3660614558754),
 ('illustration', 92.52644554160447),
 ('vector', 86.14206816487979),
 ('image', 82.7210033923051),
 ('view', 82.16815225290357),
 ('football', 81.82472556170704),
 ('performs', 79.79028103821334),
 ('s', 73.89676444242744),
 ('team', 73.49238306995603),
 ('i', 72.48511159383601),
 ('as', 68.72705635284484),
 ('festival', 66.8523124801337),
 ('his', 66.53135577324018),
 ('pla