## Лаба 5. Определение принадлежности текстов к заданной тематике

### Задача

Имея набор текстов определенной базовой тематики и набор текстов неизвестной тематики определить, относятся ли тексты к заданной тематике или нет.

Схожесть вакансий может использоваться в рамках content-based рекомендательной системы.



### Read docs

In [2]:
from bs4 import BeautifulSoup
import pymorphy2
from nltk.corpus import stopwords
import re

# creating list of russian stopwords
morph = pymorphy2.MorphAnalyzer()
stop_words_rus = set(stopwords.words('russian'))

# general list of docs
in_texts = []

# bringing each word to normal form and weeding out stopwords 
def normalise(word):
    normal = morph.parse(word)[0].normal_form
    if normal not in stop_words_rus:
        return normal
    else:
        return ''

# regular expression pattern for separating optional characters
rx = re.compile('[\W_]+')
    
# reading base files and text clearing
def files_reader(num_files, file_mask):
    for num in range(1,num_files):
        with open(file_mask+str(num)+'.txt') as file:
            t = ''
            for line in file:
                t += line.strip().lower()
        tt = ' '.join(normalise(rx.sub(r' ', x)) for x in BeautifulSoup(t, "lxml").text.split())
        in_texts.append(' '.join(tt.split()))

files_reader(21, 'lab05/base_')
files_reader(3961, 'lab05/test_')
      

In [3]:
in_texts

['искать прекрасный программист 1с дружный отдел сотрудник сфера it неутомительный режим работы 1с 8 2 8 3 бухгалтерия строительный организации жкх зуп самописные делать доработать существующий конфигурации разработать новое конфигураций отчеты база данных осуществлять поддержка наш продвинуть пользователь режим работа 10 18 вторник пятница строго',
 'предлагать уникальный возможность присоединиться команде разрабатывать выпускать следующий поколение корпоративный антивирусный продуктов отвести важный роль разработка основный продукт смежный инфраструктурный проект кастомизиций прийтись заниматься участие разработка основной линейка антивирусный продукт лаборатория касперский работа ключевой команда опытный разработчик лк нужно знать уметь отличный знание c net framework отличный знание c опыт практический использование win32 понимание парадигма ооп умение разбираться чужое код технический английский ответственность исполнительность инициативность внимание деталям умение самостоятельно

### Text processing

In [4]:
#counting words n-grams or character sequences
from sklearn.feature_extraction.text import CountVectorizer 
#эта штука превращает подсчитанныее вектора в частотны
from sklearn.feature_extraction.text import TfidfTransformer 

In [5]:
vectorizer = CountVectorizer() #указываем, что текст мы будем по буквам разбирать

In [6]:
cv = vectorizer.fit_transform(in_texts) #создаем словарь и все матрицы

In [7]:
# sorted(vectorizer.vocabulary_.items(), key=lambda x: x[1],reverse=True)

In [8]:
cv

<3980x21242 sparse matrix of type '<class 'numpy.int64'>'
	with 463836 stored elements in Compressed Sparse Row format>

In [9]:
# pd.DataFrame(cv.toarray())

### Term Frequency times Inverse Document Frequency

In [10]:
transformer = TfidfTransformer(smooth_idf=False)

In [11]:
matrix = transformer.fit_transform(cv.toarray()).toarray()

### Normalize matrix and clear diagonal

In [15]:
from sklearn.preprocessing import normalize
from scipy.sparse import spdiags
import pandas as pd

In [13]:
# косинусная мера вычисляется как отношение скалярного произведения векторов(числитель) 
# к произведению длины векторов(знаменатель)

# нормализуем исходную матрицу 
# (данное действие соответствует приведению знаменателя в формуле косинусной меры к 1)
normalized_matrix = normalize(matrix)

# вычисляем скалярное произведение
cosine_sim_matrix = normalized_matrix.dot(normalized_matrix.T)

# обнуляем диагональ, чтобы исключить ее из рекомендаций
# быстрое обнуление диагонали
diag = spdiags(-cosine_sim_matrix.diagonal(), [0], *cosine_sim_matrix.shape, format='csr')
cosine_sim_matrix = cosine_sim_matrix + diag

### Create result DataFrame

In [16]:
# result matrix
resdf = pd.DataFrame(cosine_sim_matrix)

In [17]:
resdf.head(20)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,3970,3971,3972,3973,3974,3975,3976,3977,3978,3979
0,0.0,0.001676,0.003962,0.024998,0.0,0.011783,0.0,0.006462,0.006462,0.006462,...,0.005267,0.016752,0.070672,0.004255,0.151952,0.034624,0.016812,0.02358,0.004638,0.004023
1,0.001676,0.0,0.0,0.083132,0.0,0.022269,0.0,0.038348,0.038348,0.038348,...,0.019339,0.037703,0.051481,0.071179,0.00958,0.034444,0.036446,0.032594,0.007745,0.043386
2,0.003962,0.0,0.0,0.039379,0.668968,0.026204,0.691258,0.027158,0.027158,0.027158,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.003979
3,0.024998,0.083132,0.039379,0.0,0.057625,0.445645,0.061034,0.096106,0.096106,0.096106,...,0.048179,0.054169,0.118054,0.051145,0.038257,0.057121,0.052421,0.064592,0.020082,0.043584
4,0.0,0.0,0.668968,0.057625,0.0,0.03542,0.909532,0.027817,0.027817,0.027817,...,0.0,0.0,0.0,0.0,0.013362,0.0,0.0,0.0,0.0,0.005015
5,0.011783,0.022269,0.026204,0.445645,0.03542,0.0,0.034264,0.032994,0.032994,0.032994,...,0.00497,0.050749,0.076821,0.039915,0.017101,0.033175,0.043931,0.057963,0.03656,0.0208
6,0.0,0.0,0.691258,0.061034,0.909532,0.034264,0.0,0.027466,0.027466,0.027466,...,0.0,0.0,0.0,0.0,0.008617,0.0,0.0,0.0,0.0,0.004851
7,0.006462,0.038348,0.027158,0.096106,0.027817,0.032994,0.027466,0.0,1.0,1.0,...,0.045449,0.031923,0.051214,0.012173,0.037546,0.032999,0.023235,0.043152,0.01,0.033205
8,0.006462,0.038348,0.027158,0.096106,0.027817,0.032994,0.027466,1.0,0.0,1.0,...,0.045449,0.031923,0.051214,0.012173,0.037546,0.032999,0.023235,0.043152,0.01,0.033205
9,0.006462,0.038348,0.027158,0.096106,0.027817,0.032994,0.027466,1.0,1.0,0.0,...,0.045449,0.031923,0.051214,0.012173,0.037546,0.032999,0.023235,0.043152,0.01,0.033205


In [18]:
type(resdf.iloc[:20,3979])

pandas.core.series.Series

### Output

Выходной файл должен быть расположен в корне вашей директории в файле lab05.json

Пример решения:
```
{
    “defined” : [1,3,5,7,9],
    “other” : [2,4,6,8,10]
}
```

In [27]:
# list test docs from lk
val_list = [2049, 5, 1740, 3593, 3085, 3086, 3600, 1553, 2067, 22, 3607, 536, 25, 538, 3612, 2079, 2592, 2084, 1232, 2602, 3627, 1072, 777, 1076, 3125, 56, 570, 572, 1599, 3649, 1602, 2628, 1608, 586, 587, 78, 1550, 3665, 85, 89, 1626, 1115, 1131, 1647, 2676, 633, 1658, 2174, 1152, 1710, 3715, 2692, 645, 646, 1160, 1674, 3212, 3730, 1174, 2200, 3738, 1692, 1695, 1188, 1302, 678, 3239, 174, 1200, 177, 3763, 180, 3253, 2230, 2744, 1209, 1723, 701, 1214, 1217, 196, 887, 3272, 3788, 2582, 2255, 720, 3795, 1748, 1146, 2265, 1244, 1759, 3808, 3299, 3300, 3818, 3822, 241, 1269, 1270, 2688, 1277, 766, 2816, 3842, 262, 1289, 3850, 3340, 3343, 275, 2326, 1305, 2332, 1311, 3362, 3292, 808, 298, 2861, 2350, 3375, 2526, 2355, 822, 301, 1849, 3387, 1347, 1860, 3398, 3913, 2386, 3925, 2902, 3415, 2618, 869, 2918, 2450, 3433, 2411, 3438, 1911, 888, 383, 1920, 2436, 2950, 907, 909, 1423, 1936, 1425, 402, 836, 3067, 2457, 496, 2973, 414, 2374, 1441, 2467, 668, 1961, 1963, 943, 2377, 2634, 3001, 2492, 3007, 3009, 450, 1991, 2504, 971, 3030, 3545, 3547, 1616, 989, 1019, 479, 482, 2812, 307, 2030, 1519, 3052, 498, 2035, 1262, 503, 504, 2043, 2044, 3564]

In [20]:
# creating lists of answers
defined = []
other = []
rate = 0.1  #подобрал  опытным путем
for vacancy in val_list:
    if max(resdf.iloc[:20,vacancy+19]) >= rate:
        defined.append(vacancy)
    else:
        other.append(vacancy)
    

In [26]:
len(defined), len(other)

(103, 97)

In [23]:
import json

In [28]:
# creating output json file
with open('lab05.json','w') as file:
    json.dump({ 'defined' : defined, 'other' : other }, file)