# In [None]:
"""
Classification is performed with kNN.
Neighbours are found with a modified inverted-index search algorithm:
looking up neighbours works the same way as looking up documents,
and relevance is used as the similarity measure.
"""

import requests
import lxml.html
import lxml.etree
import pymorphy2
import sqlite3
import re
import bisect
import math
import time
import os.path
from collections import namedtuple, Counter, defaultdict
from tqdm import tqdm_notebook
# update_id of the newest Telegram update received so far; get_updates() advances it
# and sends last_update + 1 as the polling offset.
last_update = -1
# Shared morphological analyzer used to reduce words to their normal form.
morph = pymorphy2.MorphAnalyzer()
# One posting of the inverted index: a document id plus the word positions inside it.
DocEntry = namedtuple('DocEntry', ['doc_id', 'positions'])
# Base URL for Telegram Bot API calls; the method name is appended by req().
# NOTE(review): the bot token is embedded in source control — consider loading it
# from configuration/environment and rotating the token.
bot_url = "https://api.telegram.org/bot458383276:AAHrdxZWt89T4KVi_ote8k_g3CXNs7HOmgc/"

# Loading news
def _lemmatize(fragment):
    """Return the words of ``fragment`` in normal form, each followed by a space.

    The text is split on runs of non-word characters and every piece (including
    the empty pieces the split may produce at the edges) is passed through the
    shared ``morph`` analyzer, so the produced string matches what earlier
    versions of the loader stored.
    """
    return ''.join(
        morph.parse(word)[0].normal_form + ' '
        for word in re.split(r'\W+', fragment)
    )


def _load_day(conn, cursor, dump, year, month, day):
    """Download every not-yet-stored article published on one calendar day.

    Articles are requested by consecutive three-digit numbers starting at 001.
    Numbers already present in the ``news`` table are skipped without a
    request; the first response whose status is not 200 ends the day.

    Args:
        conn: open sqlite3 connection (used for commits).
        cursor: cursor created from ``conn``.
        dump: text file opened for writing; receives a plain-text copy.
        year, month, day: integers identifying the date.
    """
    s_year = str(year)
    s_mon = '{:02d}'.format(month)
    s_day = '{:02d}'.format(day)
    num = 0
    while True:
        num += 1
        number = '{:03d}'.format(num)
        # Row id is the concatenation YYYYMMDD + article number.
        db_id = s_year + s_mon + s_day + number
        cursor.execute('select id from news where id = ?', (int(db_id),))
        if cursor.fetchall():
            continue

        response = requests.get(
            'http://www.fontanka.ru/' + s_year + '/' + s_mon + '/' + s_day + '/' + number
        )
        if response.status_code != 200:
            break
        tree = lxml.html.fromstring(response.text)

        # Category; pages without one are skipped.
        found = tree.xpath('//span[contains(@class, "light")]/text()')
        if not found:
            continue
        # The quote substitution is kept so that names stay equal to the ones
        # already stored by earlier runs, even though the queries below are
        # parameterized and no longer need it for safety.
        category_name = found[0].replace('"', '´´').replace("'", "´")
        cursor.execute('insert or ignore into categories (name) values (?)', (category_name,))
        conn.commit()
        cursor.execute('select id from categories where name = ?', (category_name,))
        category_id = cursor.fetchall()[0][0]

        # Title
        found = tree.xpath('//h1[contains(@class, "article_title")]/text()')
        title = _lemmatize(found[0]) if found else ''

        # Body text: all paragraph text nodes, normalized and concatenated.
        found = tree.xpath('//div[contains(@class, "article_fulltext")]/p/text()')
        article = ''.join(_lemmatize(part) for part in found)

        dump.write(db_id + "\n" + str(category_id) + "\n" + title + '\n' + article + '\n\n')
        cursor.execute(
            'insert into news values (?, ?, ?, ?)',
            (int(db_id), category_id, title, article),
        )
        conn.commit()


def load_news(path, db_name, start_year, fin_year):
    """Scrape fontanka.ru articles into an SQLite database and a text dump.

    Creates the ``categories`` and ``news`` tables in ``path + db_name + '.db'``
    if they are missing, (re)creates ``path + db_name + '.txt'`` and walks
    every day of every year in ``[start_year, fin_year]``, storing the
    normalized title and text of each article not yet in the database.

    Args:
        path: directory prefix, concatenated as-is with ``db_name``.
        db_name: base name of the ``.db`` and ``.txt`` files.
        start_year: first year to load (inclusive).
        fin_year: last year to load (inclusive).

    Returns:
        True when the whole range was processed, False when any error
        (network, database, parsing, file) interrupted it. The database
        connection and the dump file are closed in both cases.
        KeyboardInterrupt and SystemExit are not swallowed.
    """
    try:
        conn = sqlite3.connect(path + db_name + '.db')
        try:
            cursor = conn.cursor()
            cursor.execute('create table if not exists categories (id integer not null, name text unique not null, primary key (id))')
            cursor.execute('create table if not exists news (id integer not null, category integer, title text, article text, primary key (id), foreign key (category) references categories(id))')
            conn.commit()
            with open(path + db_name + '.txt', 'w', encoding='utf8') as dump:
                for year in tqdm_notebook(range(start_year, fin_year + 1)):
                    for month in tqdm_notebook(range(1, 13), leave=False):
                        # February is tried up to the 29th, every other month
                        # up to the 31st. Dates that do not exist are still
                        # requested; presumably the site answers them with a
                        # non-200 status, which ends that day — TODO confirm.
                        last_day = 29 if month == 2 else 31
                        for day in tqdm_notebook(range(1, last_day + 1), leave=False):
                            _load_day(conn, cursor, dump, year, month, day)
        finally:
            conn.close()
    except Exception:
        return False
    return True

def load_news_retr(path, db_name, start_year, fin_year, c):
    """Call load_news() until it succeeds, making at most ``c`` attempts.

    Every failed attempt is followed by a 5 second pause. Nothing is returned,
    whether or not one of the attempts succeeded.
    """
    failures = 0
    while failures < c:
        if load_news(path, db_name, start_year, fin_year):
            return
        failures += 1
        time.sleep(5)
    
# Removing news that belong to small categories
def clear_cat(path, db_name, threshold=500):
    """Delete the news of every category that has fewer than ``threshold`` items.

    The category rows themselves are kept; only rows of the ``news`` table
    are removed.

    Args:
        path: directory prefix, concatenated as-is with ``db_name``.
        db_name: base name of the ``.db`` file.
        threshold: minimal number of news a category needs to be kept.

    Returns:
        True on success, False when a database error occurred. The connection
        is closed in both cases. KeyboardInterrupt and SystemExit are not
        swallowed.
    """
    try:
        conn = sqlite3.connect(path + db_name + '.db')
        try:
            cursor = conn.cursor()
            # Iterate over the ids that actually exist instead of assuming
            # they form the contiguous range 1..count(*).
            cursor.execute('select id from categories order by id')
            category_ids = [row[0] for row in cursor.fetchall()]
            for category_id in tqdm_notebook(category_ids):
                cursor.execute('select count(*) from news where category = ?', (category_id,))
                if cursor.fetchall()[0][0] < threshold:
                    cursor.execute('delete from news where category = ?', (category_id,))
                    conn.commit()
        finally:
            conn.close()
    except Exception:
        return False
    return True
    
def parse_text(text):
    """Split ``text`` into words and return the list of their normal forms.

    Words are the non-empty pieces obtained by splitting on runs of non-word
    characters; each one is replaced by the first normal form reported by the
    shared ``morph`` analyzer. Text without any word yields an empty list.
    """
    # Raw string: '\W' in a plain literal is an invalid escape sequence.
    return [morph.normal_forms(word)[0] for word in re.split(r'\W+', text) if word]

class InvertedIndex:
    """Positional inverted index over documents with a kNN classifier on top.

    Documents are ranked against a query by a sum of tf * idf over the query
    words; the category of a query is the most frequent category among its
    best-ranked documents.
    """

    def __init__(self):
        # word -> list of DocEntry postings, kept sorted
        self.dict = defaultdict(list)
        # doc_id -> (number of words in the document, category)
        self.texts = dict()
        # number of add() calls; used as the collection size in idf
        self.l = 0
        # category id -> category name
        self.categories = {}

    def add(self, text, category, doc_id, is_normalized):
        """Index one document.

        Args:
            text: document text.
            category: category label stored with the document.
            doc_id: identifier of the document.
            is_normalized: when true, ``text`` is taken to be already
                normalized and is only split on whitespace; otherwise it goes
                through parse_text().
        """
        word_count = sum(1 for word in re.split(r'\W+', text) if word)
        self.texts[doc_id] = (word_count, category)
        self.l += 1

        words = text.split() if is_normalized else parse_text(text)

        positions_by_word = defaultdict(list)
        for pos, word in enumerate(words):
            positions_by_word[word].append(pos)

        for word, positions in positions_by_word.items():
            bisect.insort_left(self.dict[word], DocEntry(doc_id, positions))

    def update_categories(self, cat_list):
        """Register category names from an iterable of (id, name) pairs."""
        for item in cat_list:
            self.categories[item[0]] = item[1]

    def get_postings(self, word):
        """Return the postings list of ``word`` (empty list when unknown).

        Unknown words are looked up without being inserted, so queries do not
        grow the index with empty entries.
        """
        return self.dict.get(word, [])

    def search(self, text):
        """Rank indexed documents by relevance to ``text``.

        Returns:
            List of (doc_id, score) pairs sorted by descending score; only
            documents sharing at least one word with the query are present.
        """
        scores = {}
        for word in parse_text(text):
            postings = self.get_postings(word)
            if not postings:
                continue
            idf = math.log10(self.l / len(postings))
            for doc_id, positions in postings:
                tf = len(positions) / self.texts[doc_id][0]
                scores[doc_id] = scores.get(doc_id, 0) + tf * idf
        return sorted(scores.items(), key=lambda item: -item[1])

    def _vote(self, ranked, num):
        """Return the majority category among the first ``num`` ranked docs.

        Uses however many neighbours are available when there are fewer than
        ``num``; returns None when there are none. Ties go to the category
        met first in ranking order.
        """
        neighbours = ranked[:max(num, 0)]
        votes = Counter(self.texts[doc_id][1] for doc_id, _ in neighbours)
        if not votes:
            return None
        return max(votes.items(), key=lambda item: item[1])[0]

    def classify(self, text, num):
        """Return the category id chosen by the ``num`` nearest documents.

        Returns None when no indexed document shares a word with ``text``.
        """
        return self._vote(self.search(text), num)

    def get_category(self, text, neighbors):
        """Return the category name for ``text``, or 'None' when unknown."""
        category = self.classify(text, neighbors)
        return self.categories.get(category, 'None')
    
# Training
def learning(path, db_name):
    """Build an InvertedIndex from the news stored in the database.

    Every row of the ``news`` table whose article text is longer than 7
    characters is added to the index as already-normalized text; rows with a
    NULL article are skipped. Category names are then loaded from the
    ``categories`` table.

    Args:
        path: directory prefix, concatenated as-is with ``db_name``.
        db_name: base name of the ``.db`` file.

    Returns:
        The populated InvertedIndex.

    Raises:
        sqlite3.Error: when the database cannot be read (for example the
            tables do not exist). The connection is closed before the error
            propagates.
    """
    conn = sqlite3.connect(path + db_name + '.db')
    try:
        cursor = conn.cursor()
        index = InvertedIndex()
        cursor.execute('select id, category, article from news')
        for doc_id, category, article in tqdm_notebook(cursor.fetchall()):
            if article is not None and len(article) > 7:
                index.add(article, category, doc_id, True)
        cursor.execute('select id, name from categories')
        index.update_categories(cursor.fetchall())
    finally:
        conn.close()
    return index

#Telegram-bot
def req(method, params):
    """POST ``params`` as form data to the Bot API ``method`` and return the response."""
    endpoint = bot_url + method
    return requests.post(endpoint, data=params)

def get_updates(timeout = 30):
    """Poll the bot API for updates newer than the last one seen.

    Sends ``last_update + 1`` as the offset and, when updates arrive, stores
    the ``update_id`` of the last one in the module-level ``last_update``.

    Returns:
        The decoded JSON reply when its 'ok' field is truthy, otherwise False.
    """
    global last_update
    reply = req('getUpdates', {'timeout': timeout, 'offset': last_update + 1}).json()
    if not reply['ok']:
        return False
    updates = reply['result']
    if updates:
        last_update = updates[-1]['update_id']
    return reply

def send_message(chat_id, text):
    """Send ``text`` to the chat ``chat_id`` and return the decoded JSON reply."""
    payload = dict(chat_id=chat_id, text=text)
    reply = req('sendMessage', payload)
    return reply.json()

def main():
    """Run the bot: prepare the index, then answer messages until stopped.

    On start the news database is downloaded if its file is missing and the
    index is built from it. Afterwards the function polls for updates forever.
    Only private chats (sender id equal to chat id) are served. The master
    user may send the commands ``load_news``, ``clear_categories``,
    ``news_learning`` and ``stop_bot`` (case-insensitive); ``stop_bot`` makes
    the function return. Any other text is classified and the category name
    is sent back.
    """
    path = 'D:\\news\\'
    db_name = 'news'
    master_id = 82389568
    start_year = 2015
    fin_year = 2017
    neighbors = 10

    if not os.path.isfile(path + db_name + '.db'):
        print('Loading news...')
        load_news_retr(path, db_name, start_year, fin_year, 10)
    print('Learning...')
    index = learning(path, db_name)
    print('Bot started')

    while True:
        r = get_updates()
        if not r:
            continue
        for update in r['result']:
            # Not every update carries a text message (per the Bot API the
            # 'message' and 'text' fields are optional); such updates are
            # skipped instead of crashing the whole bot with a KeyError.
            message = update.get('message')
            if not message or 'text' not in message:
                continue
            text = message['text']
            chat_id = message['chat']['id']
            u_id = message.get('from', {}).get('id')
            if u_id != chat_id:
                continue

            if u_id == master_id:
                t_low = text.lower()
                if t_low == 'load_news':
                    send_message(chat_id, 'Загрузка новостей...')
                    if load_news(path, db_name, start_year, fin_year):
                        send_message(chat_id, 'Успешно')
                    else:
                        send_message(chat_id, 'Ошибка')
                    continue
                if t_low == 'clear_categories':
                    send_message(chat_id, 'Очистка категорий...')
                    if clear_cat(path, db_name):
                        send_message(chat_id, 'Успешно')
                    else:
                        send_message(chat_id, 'Ошибка')
                    continue
                if t_low == 'news_learning':
                    send_message(chat_id, 'Обучение...')
                    try:
                        index = learning(path, db_name)
                    except Exception:
                        send_message(chat_id, 'Ошибка')
                    else:
                        send_message(chat_id, 'Успешно')
                    continue
                if t_low == 'stop_bot':
                    send_message(chat_id, 'Остановка бота.')
                    # One more short poll sends the offset past this update;
                    # presumably this keeps the stop command from being
                    # delivered again on the next start — TODO confirm.
                    get_updates(1)
                    return

            if text == '/start':
                send_message(chat_id, 'Отправьте боту текст новости для получения категории')
                continue
            try:
                cat = index.get_category(text, neighbors)
                send_message(chat_id, cat)
            except Exception:
                send_message(chat_id, 'Ошибка')

# Script entry point: run the bot when the file is executed directly.
if __name__ == '__main__':  
    try:
        main()
    except KeyboardInterrupt:
        # Ctrl+C ends the program without printing a traceback.
        exit()