In [1]:
import pandas as pd
import numpy as np
# Load the news data from CSV into a DataFrame.
# NOTE(review): the df.head() preview lists an `Unnamed: 0` column, presumably a
# row index that was saved with the CSV. It is kept here as a regular column, so
# the `sum(axis=0) > 0` selections in later cells treat it like a term -- confirm
# whether `index_col=0` is wanted.
df = pd.read_csv(filepath_or_buffer="data/news.csv")

In [2]:
# Preview the first five rows to see the column layout.
df.head(n=5)

Unnamed: 0,david,algorithm,classifi,mykotronx,data,cryptosystem,de,clear,nsa,traffic,...,leviticu,adulter,theolog,genesi,obei,dealt,abraham,kiefer,convict,dishonest
0,1,3,4,2,1,4,3,1,1,1,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,1,0,2,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


# How are the entries sorted in the matrix?

 This is background information I would like to have for task 1b.
 By looking at the terms that make up the df's columns, I can conclude the order of the 4 known topics (they are also given in the source code).
 I would however like to know for each topic block in which order the documents appear.
 
 After some trial and error, this is the logic I reverse-engineered:
 
**I - Logic on Topic Level**
 1. Calculate for each topic block the number of terms that occur in this topic block.
 
     1.1. Select the topic with the smallest number of distinct terms as first topic.
     
     
 2. Calculate, for each of the remaining topics, the number of terms it shares with (and the number of terms that differ from) the first topic
 
     2.1. Sort the remaining topics by the number of intersecting terms, in descending order
 
**II - Logic on Document Level**
 1. Given a topic and the set of all terms it contains, determine for each of its documents the set of the terms it contains
 2. Sort the documents in ascending order, i.e. start with the document whose set of terms has the least overlap with all of the topic's terms.
 


## I - Logic on Topic Level

In [3]:
# For every block of 100 consecutive rows, collect the set of column labels whose
# column sum within that block is positive, keyed by block number (0..3).
terms_per_topic_block = {}

for block_id, first_row in enumerate(range(0, 400, 100)):
    df_tmp = df.iloc[first_row:first_row + 100]
    column_totals = df_tmp.sum(axis=0)
    terms_in_topic_block = set(df.columns[column_totals > 0])
    terms_per_topic_block[block_id] = terms_in_topic_block

    # Number of columns with at least one positive entry inside this block.
    has_positive_entry = (df_tmp > 0).sum(axis=0) > 0
    print(f"\nHow many different terms are being covered by topic block {block_id}?")
    print(has_positive_entry.sum())



How many different terms are being covered by topic block 0?
383

How many different terms are being covered by topic block 1?
488

How many different terms are being covered by topic block 2?
451

How many different terms are being covered by topic block 3?
454


In [4]:
# Size of the overlap between block 0's term set and each block's term set
# (block 0 itself included); the printed sizes show an ordering.
# NOTE(review): a later cell rebinds `terms_per_topic_block` to cumulative sets,
# so this cell gives different numbers if it is re-run after that one.
reference_terms = terms_per_topic_block[0]
for block_id in range(4):
    shared_terms = reference_terms & terms_per_topic_block[block_id]
    print(len(shared_terms))

383
262
240
238


In [5]:
# Number of block 0's terms that are missing from each block's term set
# (block 0 itself included); the printed sizes show an ordering.
# NOTE(review): a later cell rebinds `terms_per_topic_block` to cumulative sets,
# so this cell gives different numbers if it is re-run after that one.
reference_terms = terms_per_topic_block[0]
for block_id in range(4):
    only_in_reference = reference_terms - terms_per_topic_block[block_id]
    print(len(only_in_reference))

0
121
143
145


In [6]:
# Cumulative summary statistics: for each block, look at all rows from the top of
# the frame up to and including that block.
# Needed to derive coordinates for boxes that are being additionally plotted.
# NOTE(review): this rebinds `terms_per_topic_block` (previously per-block sets)
# to cumulative sets, and leaves `df_tmp` bound to `df.iloc[:400]` after the loop.
terms_per_topic_block = {}
for block_id, last_row_exclusive in enumerate(range(100, 500, 100)):
    df_tmp = df.iloc[:last_row_exclusive]
    column_totals = df_tmp.sum(axis=0)
    terms_in_topic_block = set(df.columns[column_totals > 0])
    terms_per_topic_block[block_id] = terms_in_topic_block

    # Number of columns with at least one positive entry in the rows seen so far.
    has_positive_entry = (df_tmp > 0).sum(axis=0) > 0
    print(f"\nHow many different terms are being covered by topic block {block_id} and topic blocks below?")
    print(has_positive_entry.sum())




How many different terms are being covered by topic block 0 and topic blocks below?
383

How many different terms are being covered by topic block 1 and topic blocks below?
609

How many different terms are being covered by topic block 2 and topic blocks below?
718

How many different terms are being covered by topic block 3 and topic blocks below?
800


## II - Logic on Document Level (i.e. given a topic)

In [247]:
# Let's look into one topic block: the first 100 rows (topic block 0).
#
# The block is selected explicitly instead of reusing `df_tmp`, which is whatever
# the previously executed loop left behind (after the cumulative-statistics cell
# that is `df.iloc[:400]`, i.e. all four blocks, not a single one).
topic_block = df.iloc[:100]
n_docs = len(topic_block)

# Entry i holds the column labels whose sum over the first i documents of the
# block is positive, so entry 0 is the empty set and the sets are cumulative
# prefixes rather than the terms of a single document.
terms_per_doc_in_topic_block = {}
for i in range(n_docs):
    docs_so_far = topic_block.iloc[:i]
    terms_per_doc_in_topic_block[i] = set(topic_block.columns[docs_so_far.sum(axis=0) > 0])

# Print how the number of covered terms grows as documents are added.
# The former `.intersection(df_tmp)` intersected each set with the frame's column
# labels, which every set is already a subset of, so the plain length is the same.
for i in range(n_docs - 1):
    print(len(terms_per_doc_in_topic_block[i]))


0
37
74
89
108
132
150
154
157
183
187
194
211
212
219
221
226
226
229
236
240
243
248
251
256
271
271
275
275
278
287
287
290
290
295
296
297
299
301
303
303
303
303
305
306
307
309
309
311
315
317
318
333
333
333
333
335
336
336
336
337
337
337
337
337
337
341
354
354
354
354
355
356
356
356
356
359
359
361
361
361
364
366
368
368
368
370
371
372
373
376
377
377
379
379
379
379
382
383
