In [1]:
# Install the third-party packages that the import cell needs and that this
# cell provides: transformers, newsapi-python and flask-ngrok.
# NOTE(review): no versions are pinned, so a re-run may resolve different
# releases; the captured output below shows transformers 2.7.0 was downloaded
# when this notebook was last executed.
!pip install transformers
!pip install newsapi-python
!pip install flask-ngrok

Collecting transformers
[?25l  Downloading https://files.pythonhosted.org/packages/37/ba/dda44bbf35b071441635708a3dd568a5ca6bf29f77389f7c7c6818ae9498/transformers-2.7.0-py3-none-any.whl (544kB)
[K     |████████████████████████████████| 552kB 7.5MB/s 
[?25hCollecting sacremoses
[?25l  Downloading https://files.pythonhosted.org/packages/a6/b4/7a41d630547a4afd58143597d5a49e07bfd4c42914d8335b2a5657efc14b/sacremoses-0.0.38.tar.gz (860kB)
[K     |████████████████████████████████| 870kB 45.8MB/s 
Collecting tokenizers==0.5.2
[?25l  Downloading https://files.pythonhosted.org/packages/d1/3f/73c881ea4723e43c1e9acf317cf407fab3a278daab3a69c98dcac511c04f/tokenizers-0.5.2-cp36-cp36m-manylinux1_x86_64.whl (3.7MB)
[K     |████████████████████████████████| 3.7MB 41.2MB/s 
Collecting sentencepiece
[?25l  Downloading https://files.pythonhosted.org/packages/74/f4/2d5214cbf13d06e7cb2c20d84115ca25b53ea76fa1f0ade0e3c9749de214/sentencepiece-0.1.85-cp36-cp36m-manylinux1_x86_64.whl (1.0MB)
[K     |████

In [0]:
import datetime
import json
import os

import numpy as np
import requests
import torch
from bs4 import BeautifulSoup
from flask import Flask
from flask_ngrok import run_with_ngrok
from newsapi import NewsApiClient
from transformers import GPT2Tokenizer, GPT2LMHeadModel

In [3]:
# Load the tokenizer and the language-model head from the same pretrained
# checkpoint so their vocabularies agree.
_GPT2_CHECKPOINT = "gpt2"
tokenizer = GPT2Tokenizer.from_pretrained(_GPT2_CHECKPOINT)
model = GPT2LMHeadModel.from_pretrained(_GPT2_CHECKPOINT)

HBox(children=(IntProgress(value=0, description='Downloading', max=1042301, style=ProgressStyle(description_wi…




HBox(children=(IntProgress(value=0, description='Downloading', max=456318, style=ProgressStyle(description_wid…




HBox(children=(IntProgress(value=0, description='Downloading', max=224, style=ProgressStyle(description_width=…




HBox(children=(IntProgress(value=0, description='Downloading', max=548118077, style=ProgressStyle(description_…




In [0]:
def match_class(target):
    """Build a predicate that accepts a tag carrying every class in `target`.

    The returned callable reads the tag's classes with ``tag.get('class', [])``
    and returns True only when each entry of `target` is found among them.
    """
    def do_match(tag):
        tag_classes = tag.get('class', [])
        for wanted in target:
            if wanted not in tag_classes:
                return False
        return True

    return do_match


def get_important_info(articles):
    """Reduce a news response to the fields the client needs.

    Args:
        articles: mapping with an ``'articles'`` list of per-article dicts.

    Returns:
        ``{'articles': [...]}`` where each entry has ``title``,
        ``description``, ``image``, ``date`` (the first 10 characters of
        ``publishedAt``) and ``link``. An article is skipped when any of the
        source fields is missing or ``None``.
    """
    required_fields = ('title', 'description', 'urlToImage', 'publishedAt', 'url')
    important = {'articles': []}
    for art in articles['articles']:
        # Test the raw value: slicing first (``publishedAt[:10]``) raises
        # TypeError on None and can never yield None itself.
        if any(art.get(field) is None for field in required_fields):
            continue
        important['articles'].append({
            'title': art['title'],
            'description': art['description'],
            'image': art['urlToImage'],
            'date': art['publishedAt'][:10],
            'link': art['url'],
        })
    return important


def get_prev_day():
    """Return yesterday's date (relative to ``date.today()``) as a string."""
    one_day = datetime.timedelta(days=1)
    yesterday = datetime.date.today() - one_day
    return str(yesterday)


def _trim_to_last_sentence(passage):
    """Cut `passage` after its final '.'; return it unchanged if it has none."""
    last_period = passage.rfind('.')
    if last_period == -1:
        return passage
    return passage[:last_period + 1]


def generate_text(text, maxlen, model, tokenizer):
    """Continue `text` with the language model and return whole sentences.

    Args:
        text: prompt to continue.
        maxlen: value passed to ``model.generate`` as ``max_length``.
        model: object exposing ``to(device)`` and ``generate(...)``.
        tokenizer: object exposing ``encode(...)`` and ``decode(...)``.

    Returns:
        The decoded first output sequence, cut after its last ``'.'``. If the
        decoded text contains no ``'.'`` it is returned untrimmed.

    The decoded output is used directly. The earlier version reassigned `text`
    inside its loop, which appended the continuation twice and then relied on
    ``clear()`` to strip the duplicate heuristically; nothing is duplicated
    here, so this function does not go through ``clear()``.
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)
    prompt_ids = tokenizer.encode(text, add_special_tokens=False, return_tensors='pt')
    prompt_ids = prompt_ids.to(device)
    output_sequences = model.generate(
        input_ids=prompt_ids,
        max_length=maxlen,
        temperature=1,
        top_k=0,
        top_p=0.9,
        repetition_penalty=1.0,
        do_sample=True,
        num_return_sequences=1,
    )
    # Drop extra singleton dimensions if the output is not already 2-D.
    if len(output_sequences.shape) > 2:
        output_sequences.squeeze_()
    # Only one sequence is requested, so decode the first row.
    # Presumably the output ids begin with the prompt ids (the earlier code
    # sliced the prompt off by length) - verify against the model's docs.
    generated = tokenizer.decode(output_sequences[0].tolist(),
                                 clean_up_tokenization_spaces=True)
    return _trim_to_last_sentence(generated)


def clear(string):
    """Strip a repeated tail from `string`, then cut it after the last '.'.

    The scan walks backwards from the end and stops at the first index ``i``
    whose suffix ``string[i:]`` does not already occur in ``string[:i]``;
    everything after index ``i`` is treated as a repeat and dropped.

    Args:
        string: text that may end with a copy of an earlier part of itself.

    Returns:
        The de-duplicated text truncated after its final ``'.'``. If no
        ``'.'`` is present the de-duplicated text is returned as is, rather
        than being replaced by a bare ``'.'``. An empty input gives ``''``.
    """
    cleared = ''
    # Include i == 0: stopping at 1 left `cleared` empty whenever no later
    # index matched (e.g. a single-character input).
    for i in range(len(string) - 1, -1, -1):
        if string[i:] not in string[:i]:
            cleared = string[:i + 1]
            break
    last_period = cleared.rfind('.')
    if last_period == -1:
        return cleared
    return cleared[:last_period + 1]

In [5]:
app = Flask(__name__)
run_with_ngrok(app)

# Read the NewsAPI key from the environment so it is never stored in the
# notebook. Set it before running this cell, e.g.
# os.environ['NEWSAPI_KEY'] = getpass.getpass().
_news_api_key = os.environ.get('NEWSAPI_KEY')
if not _news_api_key:
    raise RuntimeError(
        "Set the NEWSAPI_KEY environment variable before starting the server."
    )
newsapi = NewsApiClient(api_key=_news_api_key)


@app.route('/')
def index():
    """Serve the root URL with a fixed greeting string."""
    greeting = "Hello, World!"
    return greeting


def get_top_articles():
    """Fetch the first page (20 items) of English top headlines as JSON text.

    The raw response is reduced with ``get_important_info`` before being
    serialised.
    """
    headlines = newsapi.get_top_headlines(language='en', page=1, page_size=20)
    return json.dumps(get_important_info(headlines))


@app.route('/getTags', methods=['GET'])
def get_tags():
    """Scrape the Yandex News export page and return its feed tags as JSON.

    Each element matched by the ``export__item`` class contributes
    ``{'title': <element text>, 'tag': <last path segment of its first child's
    href, without '.rss'>}``. Collection stops at the second ``'index'`` tag,
    or at the first element whose first child has no usable ``href``.

    Returns:
        JSON array text, with non-ASCII characters left unescaped.
    """
    url = "https://yandex.ru/news/export"
    # Bound the wait so a stalled upstream cannot hang this request forever.
    response = requests.get(url, timeout=10)
    soup = BeautifulSoup(response.content, 'html')
    items = soup.find_all(match_class(['export__item']))
    seen_index = False
    tags = []
    for item in items:
        try:
            tag = item.contents[0].attrs['href'].split('.rss')[0].split('/')[-1]
        except (AttributeError, IndexError, KeyError, TypeError):
            # Malformed item: keep the original stop-here behaviour, but no
            # longer swallow unrelated errors such as KeyboardInterrupt.
            break
        if tag == 'index':
            if seen_index:
                break
            seen_index = True
        tags.append({'title': item.text, 'tag': tag})

    return json.dumps(tags, ensure_ascii=False)


@app.route('/getNewsByTag/<string:newsTag>', methods=['GET'])
def get_news_by_tag(newsTag):
    """Return JSON text with articles for `newsTag`.

    The tag is lower-cased first. ``'top'`` is delegated to
    ``get_top_articles``; any other value is used as the search query for
    English articles dated from ``get_prev_day()`` onward, sorted by
    relevancy (first page, 20 items).
    """
    query = str(newsTag).lower()
    if query == 'top':
        return get_top_articles()
    found = newsapi.get_everything(
        q=query,
        language='en',
        sort_by='relevancy',
        page=1,
        page_size=20,
        from_param=get_prev_day(),
    )
    return json.dumps(get_important_info(found))


@app.route('/genText/<string:prefix>', methods=['GET'])
def gen_text(prefix):
    """Generate a continuation of `prefix` and return it as a JSON string.

    The length limit passed to ``generate_text`` is drawn at random from
    ``np.random.randint(100, 200)`` on every request.
    """
    length_limit = np.random.randint(100, 200)
    continuation = generate_text(prefix, length_limit, model, tokenizer)
    return json.dumps(continuation)


# Start the Flask server only when this code runs as the main module.
if __name__ == "__main__":
    app.run()

 * Serving Flask app "__main__" (lazy loading)
 * Environment: production
[2m   Use a production WSGI server instead.[0m
 * Debug mode: off


 * Running on http://127.0.0.1:5000/ (Press CTRL+C to quit)


 * Running on http://dfcdfcfd.ngrok.io
 * Traffic stats available on http://127.0.0.1:4040


127.0.0.1 - - [03/Apr/2020 07:37:59] "[37mGET /getTags HTTP/1.1[0m" 200 -
127.0.0.1 - - [03/Apr/2020 07:37:59] "[37mGET /getNewsByTag/Top HTTP/1.1[0m" 200 -
Setting `pad_token_id` to 50256 (first `eos_token_id`) to generate sequence
127.0.0.1 - - [03/Apr/2020 07:38:23] "[37mGET /genText/Sales%20are%20soaring%20but%20supply%20chain%20disruptions%20mean%20shortages%20loom HTTP/1.1[0m" 200 -
Setting `pad_token_id` to 50256 (first `eos_token_id`) to generate sequence
127.0.0.1 - - [03/Apr/2020 07:38:42] "[37mGET /genText/The%20ACT%20will%20begin%20actively%20looking%20for%20community%20transmission%20of%20coronavirus%20in%20Canberra. HTTP/1.1[0m" 200 -
127.0.0.1 - - [03/Apr/2020 07:39:01] "[37mGET /getNewsByTag/auto HTTP/1.1[0m" 200 -
127.0.0.1 - - [03/Apr/2020 07:39:12] "[37mGET /getNewsByTag/world HTTP/1.1[0m" 200 -
Setting `pad_token_id` to 50256 (first `eos_token_id`) to generate sequence
127.0.0.1 - - [03/Apr/2020 07:39:19] "[37mGET /genText/While%20people%20in%20Europe%2