In [1]:
from collections import Counter
import pandas as pd
import math

In [2]:
def tf_idf(corpus):
    """Compute TF-IDF weights for every text stored in ``corpus``.

    Parameters
    ----------
    corpus : pandas.DataFrame
        Frame with two-level columns. Every entry of the first column level
        (a category) must provide a ``'Text'`` sub-column whose cells are
        strings of whitespace-separated tokens.

    Returns
    -------
    list of dict
        One ``{token: tf * idf}`` dict per text. Texts are ordered by
        category (in ``corpus.columns.levels[0]`` order) and then by row.
        ``tf`` is the token count divided by the text length and ``idf`` is
        ``log10(number_of_texts / number_of_texts_containing_token)``.
    """
    def tf(text):
        """Return the relative frequency of each token within ``text``."""
        total = float(len(text))
        # An empty text yields an empty Counter, so no division takes place.
        return {token: count / total for token, count in Counter(text).items()}

    def idf(corpus_texts):
        """Return the inverse document frequency of every token in the corpus."""
        # Count each token at most once per text in a single pass instead of
        # scanning every text again for every vocabulary entry.
        doc_freq = Counter()
        for text in corpus_texts:
            doc_freq.update(set(text))

        n_texts = len(corpus_texts)
        return {token: math.log10(n_texts / float(freq))
                for token, freq in doc_freq.items()}

    # Flatten all categories into one list of tokenized texts.
    corpus_texts = [item.split()
                    for category in corpus.columns.levels[0]
                    for item in corpus[category]['Text']]

    idf_val = idf(corpus_texts)

    corpus_val = []
    for text in corpus_texts:
        corpus_val.append({word: freq * idf_val[word]
                           for word, freq in tf(text).items()})

    return corpus_val

In [2]:
# Load the corpus table: the first two CSV rows form a two-level column
# header and the first CSV column is used as the row index.
data = pd.read_csv("laba2.csv", header=[0, 1], index_col=0, encoding='utf-8')

In [4]:
# One {token: weight} dict per text, taken from the 'Text' column of every category.
tf_idf_val = tf_idf(data)

In [5]:
# Inspect the TF-IDF weights computed for the first text.
tf_idf_val[0]

{'4455': 0.012418300645007992,
 '3692': 0.02248940839915431,
 '830': 0.024742258017892274,
 '871': 0.03006322599294705,
 '2281': 0.010100023222004177,
 '3509': 0.01961192194563784,
 '2011': 0.017226451519655513,
 '1827': 0.013544487209739314,
 '3471': 0.01961192194563784,
 '4274': 0.05883576583691351,
 '2787': 0.024742258017892274,
 '530': 0.009181418971201743,
 '872': 0.024742258017892274,
 '363': 0.013138158598025338,
 '18': 0.024742258017892274,
 '406': 0.021505376344086023,
 '127': 0.024742258017892274,
 '245': 0.024742258017892274,
 '1397': 0.03275008054366318,
 '4284': 0.0989690320715691,
 '1723': 0.018268494670279772,
 '2002': 0.01637504027183159,
 '5619': 0.013138158598025338,
 '5551': 0.012096115447401076,
 '2202': 0.007070723862126816,
 '4660': 0.012418300645007992,
 '505': 0.020200046444008354,
 '2464': 0.07422677405367681,
 '2024': 0.0989690320715691,
 '1863': 0.013544487209739314,
 '4283': 0.019421290042837498,
 '5450': 0.012418300645007992,
 '5282': 0.012057361423005106,


In [6]:
# Report the largest and the smallest TF-IDF weight found across all texts.
print(max(max(weights.values()) for weights in tf_idf_val))
print(min(min(weights.values()) for weights in tf_idf_val))

0.19175249963866509
0.0010549953123676148


In [3]:
# Display the loaded corpus table.
data

None,политика,политика,кино,кино,технологии,технологии,культура,культура
None,Title,Text,Title,Text,Title,Text,Title,Text
0,4045 3180 953 945 4676,1182 1259 1504 954 5604 4675 5579 4677 4856 91...,2464 2024 4154 1546 1839 5282 1487 871,4455 3692 830 871 2281 3509 2011 1827 3471 427...,442 418 3692 4533 1851,1967 1937 2257 4517 2735 3107 4478 418 5168 32...,5142 4261 4755 1839 3801 3757 2675 5288,4258 2700 3844 4070 1170 3361 3644 1812 4860 2...
1,4045 3609 3460 3115 1297,1857 1182 4192 3759 3115 2327 3175 3462 4788 5...,4455 3692 5029 2933 5455 5512 871,443 3585 4614 3439 2065 2815 3585 4526 3691 29...,2596 815 1742,4398 4981 653 752 5232 1572 3136 5649 1984 60 ...,5592 3265 5175 945 1166 5412 1049 964,4575 2356 4867 651 1079 4324 4782 5041 3644 18...
2,1184 2072 4045 2726 3314 3155 5124 4796,3314 1481 1106 4283 3155 5124 1259 1776 3773 9...,1783 485 1398 2648 1069 4373 3897 5540,5251 3619 505 1043 758 3007 5097 4901 3668 530...,344 3351 4064 4756 208,4117 4436 5430 1439 1776 3782 1313 3462 2145 2...,2992 3585 4613,443 3585 4614 3439 2065 2815 3585 4526 3691 29...
3,4045 552 3328 3319 4517 2834,3773 4319 910 4045 4517 2834 1028 3787 577 292...,5588 391 929 2418 4206,3942 3203 5550 2366 4134 4064 3619 4444 4801 1...,3692 4059 3584 179 295 1593 1385 5585,3685 3903 1677 3491 3056 2985 1437 1242 5662 5...,5142 4206 3918 4710 2948 2471 1443 2021,4025 505 3960 3599 1350 2933 583 2560 3688 521...
4,4045 3183 981 5385 3706 2596,3773 4295 910 4045 5369 733 3795 5475 3183 981...,2992 3585 4613,2933 2189 1787 964 80 398 257 3496 1309 5252 4...,4534 3054 5256 2726 2441 750 4296,2397 3489 3168 320 3056 4062 1039 21 336 266 2...,2485 5519 4573 2150 3369 338 130 5669 871,3619 4294 4962 3220 4556 4878 5519 917 1543 18...
5,2992 3585 4613,4294 2397 910 4045 2726 4348 1184 4593 5221 11...,1397 734 3738 4398 5018,443 3585 4614 3439 2065 2815 3585 4526 3691 29...,2992 3585 4613,2612 5592 4298 5405 4391 2850 2622 3106 1837 1...,3321 4254 4154 5251 2933 521,4520 3321 1785 3686 2684 2281 3391 711 2495 35...
6,3556 3357 1183 5302 3795 4045,2552 1442 5441 2595 4294 2397 5183 3183 982 37...,2186 3557 4308 4766 2397 4294 3961,443 3585 4614 3439 2065 2815 3585 4526 3691 29...,111 2811 1358 4546 5269 2907,4679 4661 1526 4546 5428 3584 4901 4718 4679 7...,2591 3454 1998 2222 2441 2682,443 3585 4614 3439 2065 2815 3585 4526 3691 29...
7,2817 4475 768 4326 5592 3265 1263 4045 1421,4294 2397 3561 3209 3759 4546 3691 657 3577 46...,2992 3585 4613,4860 2933 5282 1481 2757 2188 5551 4828 859 40...,2726 2017 179 21 1087,461 4241 5103 2343 1261 3771 4517 1212 1261 29...,3636 4930 3630 1744 3564 3038 5655,4520 1696 2591 3276 4772 1945 1742 3325 1351 4...
8,3984 2850 3554 4172 4458 4045 5483 1719,4294 2397 3024 1624 1514 5513 4857 5403 1659 4...,5571 2263 2726 3056 3416 5282,2111 1623 363 154 309 1048 3416 5282 5221 1209...,2940 21 3963 4484 1193 179 295,894 21 566 2973 1558 4259 3183 1592 3541 2933 ...,2169 4296 1048 5381 505,1269 4796 1048 2687 2402 4824 1715 3087 762 42...
9,3773 3713 3370 1776 4045 703 5281 5349,3797 3773 4319 1443 3370 4154 2659 3187 588 28...,5282 485 2397 530 3961,4403 528 5254 717 1069 4294 5263 2128 485 1638...,424 3761 2644 4534 54 1267 2269,778 2626 5096 3139 5399 2330 1098 4119 5586 37...,741 1184 2697 4796 3428 5381 3449 4273,2776 581 4295 2608 741 3223 1350 4385 4273 371...


In [4]:
# Build a mapping from the first whitespace-separated field of every line to
# the third one (presumably "word -> id" lines; confirm against the writer of
# common.dict). The values are later matched against tokens of the 'Text'
# cells. The encoding is explicit so the result does not depend on the
# platform default.
tokens = {}
with open("common.dict", "r", encoding="utf-8") as f:
    for line in f:
        fields = line.split()
        if not fields:
            # Skip blank lines (e.g. a trailing newline at the end of the file).
            continue
        tokens[fields[0]] = fields[2]

In [5]:
import collections

In [6]:
# Inverted index: dictionary token -> list of (category, row position) pairs
# for every 'Text' cell that contains the token's id.
#
# First collect postings per id with one pass over the documents, so each
# text is split only once instead of once per dictionary token.
postings_by_id = collections.defaultdict(list)

for category in data.columns.levels[0]:
    for index, item in enumerate(data[category]['Text']):
        # set(): an id repeated inside one text is recorded once per text.
        for token_id in set(item.split()):
            postings_by_id[token_id].append((category, index))

invert_index = collections.defaultdict(list)

# Keep the dictionary's token order and only create entries for tokens that
# occur in at least one text.
for token, i in tokens.items():
    if i in postings_by_id:
        invert_index[token].extend(postings_by_id[i])

In [21]:
# Display the inverted index: token -> list of (category, row position) pairs.
invert_index

defaultdict(list,
            {'кардашьян': [('технологии', 40)],
             'нормальный': [('культура', 22), ('технологии', 8)],
             'ольга': [('кино', 39), ('культура', 8)],
             'рбкактёр': [('кино', 18)],
             'иметься': [('технологии', 0), ('технологии', 22)],
             'повышенный': [('технологии', 30)],
             'хабаровский': [('политика', 23)],
             'многолетний': [('политика', 42)],
             'статуэтка': [('культура', 29)],
             'шнуров': [('культура', 47)],
             'табаковезамкнуть': [('технологии', 20)],
             'региональный': [('политика', 6), ('политика', 7)],
             'сейчасть': [('политика', 34)],
             'система': [('кино', 17),
              ('кино', 22),
              ('политика', 11),
              ('политика', 35),
              ('технологии', 0),
              ('технологии', 1),
              ('технологии', 2),
              ('технологии', 30),
              ('технологии', 35),
          

In [24]:
# Persist the inverted index, one "token -> [(category, position), ...]" line
# per token. The encoding is explicit because the tokens are non-ASCII and the
# platform default encoding may not be able to represent them.
with open("invert.index", "w", encoding="utf-8") as f:
    for token, locations in invert_index.items():
        f.write("{} -> {}\n".format(token, locations))

In [None]:
# Write every mapping in `dictionaries` to its own "<key>.dict" file, one
# "word -> value" line per entry, encoded as UTF-8.
# NOTE(review): neither `dictionaries` nor the `codecs` module is defined or
# imported in the visible part of this notebook, and this cell has no
# execution count. Confirm where they come from, or drop the cell.
for key, item in dictionaries.items():
    with codecs.open("{}.dict".format(key), "w", "utf-8") as f:
        for word, val in item.items():
            f.write(u"{} -> {}\n".format(word, val))