In [33]:
import json
import math

# Load the inverted index (term -> postings list) from disk.
# JSON text is UTF-8 by specification, so the encoding is passed explicitly:
# without it open() falls back to the platform's locale encoding, which can
# mis-decode or reject non-ASCII terms on some systems.
with open('inverted_index.json', 'r', encoding='utf-8') as f:
    inverted_index = json.load(f)

# Number of distinct documents referenced anywhere in the index. Whatever
# iterating a postings list yields is treated as a document id.
num_docs = len({doc_id for postings_list in inverted_index.values() for doc_id in postings_list})

# Vocabulary size: the number of distinct terms (top-level keys of the index).
num_terms = len(inverted_index)

# Total number of postings, i.e. the sum of len(postings_list) over all terms.
# NOTE(review): this equals the total number of term occurrences only if the
# index stores one posting per occurrence. If it stores one posting per
# (term, document) pair, this is the sum of document frequencies instead --
# confirm against the code that builds inverted_index.json.
total_term_freq = sum(len(postings_list) for postings_list in inverted_index.values())

# Average number of postings per document (see the note above on whether this
# is a true document length). An index that references no documents reports
# 0.0 instead of raising ZeroDivisionError.
avg_doc_len = total_term_freq / num_docs if num_docs else 0.0

# Document frequency (DF) and inverse document frequency (IDF) for each term,
# with IDF = ln(num_docs / DF).
dfs = {}
idfs = {}
for term, postings_list in inverted_index.items():
    dfs[term] = len(postings_list)
    # A term with an empty postings list matches no document; give it an IDF
    # of 0.0 rather than dividing by zero.
    idfs[term] = math.log(num_docs / dfs[term]) if dfs[term] else 0.0


# Collection-level summary followed by the heading of the first ranking,
# written out in one call.
summary_lines = (
    f'Number of documents: {num_docs}',
    f'Number of terms: {num_terms}',
    f'Total term frequency: {total_term_freq}',
    f'Average document length: {avg_doc_len:.2f}',
    'Top 10 Term Frequencies:',
)
print('\n'.join(summary_lines))

# The values ranked under the heading above are the per-term entries of
# `dfs` (document frequencies), largest first; ties keep index order.
for term in sorted(dfs, key=dfs.get, reverse=True)[:10]:
    df = dfs[term]
    print(f'{term}: {df}')

# Same ranking for the entries of `idfs`, largest first.
print('Top 10 Inverse Document Frequencies:')
for term in sorted(idfs, key=idfs.get, reverse=True)[:10]:
    idf = idfs[term]
    print(f'{term}: {idf:.2f}')



Number of documents: 475
Number of terms: 40169
Total term frequency: 347727
Average document length: 732.06
Top 10 Term Frequencies:
one: 432
time: 409
like: 384
look: 368
see: 360
go: 359
day: 354
get: 354
back: 352
said: 347
Top 10 Inverse Document Frequencies:
adlertxt: 6.16
dunubian: 6.16
carbonium: 6.16
belitsheri: 6.16
prisca: 6.16
unmention: 6.16
commitedli: 6.16
fadler: 6.16
bragi: 6.16
celin: 6.16


In [31]:
from collections import defaultdict

# Histogram of IDF values: each distinct IDF maps to the number of terms
# that have exactly that value.
idf_counts = defaultdict(int)
for term in idfs:
    idf_counts[idfs[term]] += 1

print('IDF statistic:')
# Walk the distinct IDF values from the largest down to the smallest.
for idf in sorted(idf_counts, reverse=True):
    count = idf_counts[idf]
    print(f'{count} terms have an IDF of {idf:.2f}')


IDF statistic:
22509 terms have an IDF of 6.16
4728 terms have an IDF of 5.47
2149 terms have an IDF of 5.06
1404 terms have an IDF of 4.78
993 terms have an IDF of 4.55
795 terms have an IDF of 4.37
621 terms have an IDF of 4.22
524 terms have an IDF of 4.08
428 terms have an IDF of 3.97
391 terms have an IDF of 3.86
318 terms have an IDF of 3.77
273 terms have an IDF of 3.68
251 terms have an IDF of 3.60
223 terms have an IDF of 3.52
194 terms have an IDF of 3.46
188 terms have an IDF of 3.39
168 terms have an IDF of 3.33
132 terms have an IDF of 3.27
132 terms have an IDF of 3.22
123 terms have an IDF of 3.17
140 terms have an IDF of 3.12
109 terms have an IDF of 3.07
107 terms have an IDF of 3.03
103 terms have an IDF of 2.99
92 terms have an IDF of 2.94
83 terms have an IDF of 2.91
76 terms have an IDF of 2.87
96 terms have an IDF of 2.83
56 terms have an IDF of 2.80
71 terms have an IDF of 2.76
68 terms have an IDF of 2.73
69 terms have an IDF of 2.70
73 terms have an IDF of 2.67