In [2]:
from bs4 import BeautifulSoup
import requests
import pandas as pd
import numpy as np
import re
import nltk
from nltk import sent_tokenize
nltk.download('punkt')
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
from setfit import SetFitModel, SetFitTrainer
from datasets import load_dataset
from sentence_transformers.losses import CosineSimilarityLoss
import datasets
from datasets import Dataset, DatasetDict
from sklearn.metrics.pairwise import cosine_similarity
import trafilatura
from transformers import pipeline
from transformers import AutoTokenizer, AutoModelForTokenClassification
from transformers import (
    TokenClassificationPipeline,
    AutoModelForTokenClassification,
    AutoTokenizer)
from transformers.pipelines import AggregationStrategy
import numpy as np
import pickle
import re
import boto3
import json

tokenizer = AutoTokenizer.from_pretrained("Jean-Baptiste/camembert-ner")
model_ner = AutoModelForTokenClassification.from_pretrained("Jean-Baptiste/camembert-ner")
model_sim = SentenceTransformer('bert-base-nli-mean-tokens')
nlp = pipeline('ner', model=model_ner, tokenizer=tokenizer, aggregation_strategy="simple")
model = SetFitModel.from_pretrained("kowshik/upsc-classification-model-v1")
model_name = "ml6team/keyphrase-extraction-kbir-inspec"
map_ = {'agriculture': 0,'culture': 1,'defence': 2,'economy': 3,'environment': 4,'geography': 5,'governance': 6,
'health': 7,'history': 8,'international relations': 9,'polity': 10,'science&technology': 11,'society': 12,'sports': 13}
inv_map = {v: k for k, v in map_.items()}

__TableName__ = 'prod1_app_data'
client  = boto3.client('dynamodb',region_name = 'ap-south-1')
DB  = boto3.resource('dynamodb',region_name = 'ap-south-1')
table = DB.Table(__TableName__)

summarizer = pipeline("summarization", model="facebook/bart-large-cnn")
def get_summary(url,pid,text,summarizer,th=120):
    if len(text) > 1200:
        summary = summarizer(text, max_length= th, min_length=120, do_sample=False)[0]['summary_text']
        flag = 3 + pid*10
        data = summary
        item_summary = create_item(url,flag, data)
        response = table.put_item(Item  = item_summary)
        return(summary)
    else:
        flag = 3 + pid*10
        data = text
        item_summary = create_item(url,flag, data)
        response = table.put_item(Item  = item_summary)
        return(text)

f = open('rm_model.pkl', 'rb')
clf = pickle.load(f)
f.close()

def create_item(url,flag, data):
    '''  
    flag = 0 > text
    flag = 1 > sentence
    flag = 2 > key Phrase 
    flag = 3 > summary 
    '''
    item = {
        'url': url,
        'flag':flag,
        'data': data,
    }
    return(item)


def get_data_url(url):
    downloaded = trafilatura.fetch_url(url)
    text_original = trafilatura.extract(downloaded)
    text_extracted = text_original.replace('\n',' ')
    flag = 0
    data = text_original
    item_complete = create_item(url,flag, data)
    response = table.put_item(Item  = item_complete)
    return(text_extracted, text_original)

def get_label(word,model_sim):
    labels = ['Environment','Geography','International Relations',
    'Polity','Governance','Health','Society','Economy','Science&Technology','Agriculture','sports']
    labels = [i.lower() for i in labels]
    embeddings_tags = model_sim.encode(labels)
    embeddings_key = model_sim.encode(word)
    probs = cosine_similarity([embeddings_key],embeddings_tags)
    label_index = np.argmax(probs)
    return(labels[label_index])

class KeyphraseExtractionPipeline(TokenClassificationPipeline):
    def __init__(self, model, *args, **kwargs):
        super().__init__(
            model=AutoModelForTokenClassification.from_pretrained(model),
            tokenizer=AutoTokenizer.from_pretrained(model),
            *args,
            **kwargs
        )

    def postprocess(self, model_outputs):
        results = super().postprocess(
            model_outputs=model_outputs,
            aggregation_strategy=AggregationStrategy.SIMPLE,
        )
        return np.unique([result.get("word").strip() for result in results])
extractor = KeyphraseExtractionPipeline(model=model_name)

def get_keywords_text(url,pid, sentences, extractor):
    keywords_ = []
    for te in sentences:
        keywords_ = keywords_+ list(extractor(te))
    keywords_unq = np.unique(keywords_)
    flag = 2 +pid*10
    data = json.dumps(list(keywords_unq))
    item_key = create_item(url,  flag, data,)
    response = table.put_item(Item  = item_key)
    return(keywords_unq)

alphabets= "([A-Za-z])"
prefixes = "(Mr|St|Mrs|Ms|Dr)[.]"
suffixes = "(Inc|Ltd|Jr|Sr|Co)"
starters = "(Mr|Mrs|Ms|Dr|He\s|She\s|It\s|They\s|Their\s|Our\s|We\s|But\s|However\s|That\s|This\s|Wherever)"
acronyms = "([A-Z][.][A-Z][.](?:[A-Z][.])?)"
websites = "[.](com|net|org|io|gov)"
digits = "([0-9])"

def split_into_sentences(text):
    text = " " + text + "  "
    text = text.replace("\n"," ")
    text = re.sub(prefixes,"\\1<prd>",text)
    text = re.sub(websites,"<prd>\\1",text)
    text = re.sub(digits + "[.]" + digits,"\\1<prd>\\2",text)
    if "..." in text: text = text.replace("...","<prd><prd><prd>")
    if "Ph.D" in text: text = text.replace("Ph.D.","Ph<prd>D<prd>")
    text = re.sub("\s" + alphabets + "[.] "," \\1<prd> ",text)
    text = re.sub(acronyms+" "+starters,"\\1<stop> \\2",text)
    text = re.sub(alphabets + "[.]" + alphabets + "[.]" + alphabets + "[.]","\\1<prd>\\2<prd>\\3<prd>",text)
    text = re.sub(alphabets + "[.]" + alphabets + "[.]","\\1<prd>\\2<prd>",text)
    text = re.sub(" "+suffixes+"[.] "+starters," \\1<stop> \\2",text)
    text = re.sub(" "+suffixes+"[.]"," \\1<prd>",text)
    text = re.sub(" " + alphabets + "[.]"," \\1<prd>",text)
    if "”" in text: text = text.replace(".”","”.")
    if "\"" in text: text = text.replace(".\"","\".")
    if "!" in text: text = text.replace("!\"","\"!")
    if "?" in text: text = text.replace("?\"","\"?")
    text = text.replace(".",".<stop>")
    text = text.replace("?","?<stop>")
    text = text.replace("!","!<stop>")
    text = text.replace("<prd>",".")
    sentences = text.split("<stop>")
    sentences = sentences[:-1]
    sentences = [s.strip() for s in sentences]

    sentences_final = []
    for sent in sentences:
        if len(sent)>= 120:
            if ',' in sent:
                pos_comma = np.array([i for i in range(len(sent)) if sent.startswith(',', i)])
                to_split = np.argmin(np.abs(pos_comma - (len(sent) - pos_comma)))
                if (pos_comma[to_split])<= 50 or (len(sent)-pos_comma[to_split] <= 50):
                    sentences_final.append(sent)
                else:
                    sentences_final.append(sent[:pos_comma[to_split]])
                    sentences_final.append(sent[pos_comma[to_split] +1:])
            else:
                sentences_final.append(sent)
        else:
            sentences_final.append(sent)

    return sentences_final


def get_sentence_labels(url, pid, sentences, clf, model, sent_no=4):
    prediction_probas = clf.predict_proba(model.predict_proba(sentences))
    df = pd.DataFrame()
    df['sentences'] = sentences
    df['labels_1'] = np.argmax(prediction_probas,axis=1)
    df['prob_1'] = np.max(prediction_probas,axis=1)
    df['label_text_1'] = df['labels_1'].replace(inv_map)
    df['labels_2'] = [[list(p).index(i) for i in sorted(p, reverse=True)][1]  for p in prediction_probas]
    df['prob_2'] = [p[[list(p).index(i) for i in sorted(p, reverse=True)][1]]  for p in prediction_probas]
    df['label_text_2'] = df['labels_2'].replace(inv_map)
    df = df.sort_values('prob_1',ascending=False)
    labels = df[['sentences','label_text_1','label_text_2']][:sent_no]
    flag = 1  + pid*10
    data = json.dumps(labels.set_index('sentences').to_dict('index'))
    item_sentence = create_item(url, flag, data)
    response = table.put_item(Item  = item_sentence)

    return(labels)



def get_cuts(text, sentences_all):
    if len(text) > 3000:
        cumsum_ = np.cumsum([len(i) for i in sentences_all])
        chunks = np.round(len(text)/2500)
        cutoff_ = int(len(text)/chunks)
        cuts = [0]
        for i in np.arange(1,chunks):
            cutoff = cutoff_*i
            cut = np.argmin(np.abs(cumsum_ - cutoff))
            cuts.append(cut)
        cuts.append(len(sentences_all))
        
        sentences_chunks = []
        for c in range(0,len(cuts)-1):
            sentences_chunks.append(sentences_all[cuts[c]:cuts[c+1]])
        return(sentences_chunks)
    else:
        return([sentences_all])


[nltk_data] Downloading package punkt to /home/ubuntu/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
  from .autonotebook import tqdm as notebook_tqdm
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


In [None]:
'''  
import boto3
df = pd.read_pickle('/home/ubuntu/scrape_upsc_db/data/article_links.pkl')
links = df[:100]
items_list = []
for i in links:
    items = {
            'user': 'kowshikchilamkurthy@gmail.com',
            'url':i,
        }
    items_list.append(items)

__TableName__ = 'prod1_user_data'
client_user  = boto3.client('dynamodb',region_name = 'ap-south-1')
DB_user  = boto3.resource('dynamodb',region_name = 'ap-south-1')
table_user = DB_user.Table(__TableName__)
for item_ in items_list:
    response = table_user.put_item(Item  = item_)
'''

In [3]:
url = 'https://www.thehindu.com/news/national/all-party-meet-ahead-of-budget-session-opposition-raises-adani-issue-ysr-congress-calls-for-caste-based-economic-census/article66450221.ec'
text, text_act = get_data_url(url)
if len(text) > 120:
    sentences_all = split_into_sentences(text)
    sentences_chunks = get_cuts(text, sentences_all)
    pid = 0
    for payload in sentences_chunks:
        sentence_labels = get_sentence_labels(url,pid,payload,clf,model,sent_no=4)
        sentence_keywords = list(set(payload) -  set(sentence_labels.sentences.values))
        keyphrases = get_keywords_text(url,pid,sentence_keywords,extractor)
        text = ' '.join(payload)
        summary = get_summary(url,pid,text, summarizer, th= min(int(len(text)/10),240))
        pid = pid+1

TypeError: ('incompatible input type', <class 'NoneType'>)

Traceback (most recent call last):
  File "/usr/lib/python3.10/runpy.py", line 196, in _run_module_as_main
    return _run_code(code, main_globals, None,
  File "/usr/lib/python3.10/runpy.py", line 86, in _run_code
    exec(code, run_globals)
  File "/home/ubuntu/upsc_models_deployment/deployenv/lib/python3.10/site-packages/ipykernel_launcher.py", line 17, in <module>
    app.launch_new_instance()
  File "/home/ubuntu/upsc_models_deployment/deployenv/lib/python3.10/site-packages/traitlets/config/application.py", line 982, in launch_instance
    app.start()
  File "/home/ubuntu/upsc_models_deployment/deployenv/lib/python3.10/site-packages/ipykernel/kernelapp.py", line 712, in start
    self.io_loop.start()
  File "/home/ubuntu/upsc_models_deployment/deployenv/lib/python3.10/site-packages/tornado/platform/asyncio.py", line 215, in start
    self.asyncio_loop.run_forever()
  File "/usr/lib/python3.10/asyncio/base_events.py", line 600, in run_forever
    self._run_once()
  File "/usr/lib/p

: 

In [None]:
summary

In [None]:
str_ = 'Policy Change Request Effective 05/29/2020 Please remove the PO Box from the insured’s mailing address.  They no longer have it.  The mailing address should read: 4 South Main Street Haydenville, MA  01039 All other aspects of the policy should remain unchanged.'

In [None]:
extractor(str_, )

In [None]:
for i in keyphrases:
    print(i,';',inv_map[clf.predict(model.predict_proba([i]))[0]] )

In [None]:
clf.predict_proba(model.predict_proba(sentences))

In [None]:
summarizer(text[:5000], max_length= 240, min_length=120, do_sample=False)

In [None]:
import json
json.dumps(list(keyphrases))

In [None]:
import boto3
__TableName__ = 'prod1_app_data'
client  = boto3.client('dynamodb',region_name = 'ap-south-1')
DB  = boto3.resource('dynamodb',region_name = 'ap-south-1')
table = DB.Table(__TableName__)

In [None]:
url = 'https://www.hindustantimes.com/india-news/expect-russia-to-be-part-of-all-processes-says-india-on-g20-presidency-101669906231427.html'
auth = '1'
flag = 1
text = sentence_labels[0][0]
label_1 = sentence_labels[0][1]
label_2 = sentence_labels[0][2]

In [None]:


item = create_item(url,auth, flag,text, label_1,label_2)
response = table.put_item(Item  = item)

In [None]:
{"label_text_2":{"M":{" the Global South, such as food, fuel and fertilisers,” he said.":{"S":"society"},"However, Prime Minister Narendra Modi told Russian President Vladimir Putin at a meeting in September that today’s era is “not of war”.":{"S":"international relations"},"From time to time, both countries indicate areas of interest or priority that they may be looking at”.":{"S":"geography"},"India said on Thursday it expects Russia to be part of all the processes of G20 as it assumed the presidency of the grouping against the backdrop of persisting differences among its members over the Ukraine war.":{"S":"defence"},"Expect Russia to be part of all processes, says India on G20 presidency India, which began its year-long G20 presidency on Thursday, and Indonesia":{"S":"geography"}}},"label_text_1":{"M":{" the Global South, such as food, fuel and fertilisers,” he said.":{"S":"agriculture"},"However, Prime Minister Narendra Modi told Russian President Vladimir Putin at a meeting in September that today’s era is “not of war”.":{"S":"defence"},"From time to time, both countries indicate areas of interest or priority that they may be looking at”.":{"S":"international relations"},"India said on Thursday it expects Russia to be part of all the processes of G20 as it assumed the presidency of the grouping against the backdrop of persisting differences among its members over the Ukraine war.":{"S":"international relations"},"Expect Russia to be part of all processes, says India on G20 presidency India, which began its year-long G20 presidency on Thursday, and Indonesia":{"S":"international relations"}}}}

In [None]:


model.predict_proba(['Expect Russia to be part of all processes, says India on G20 presidency India, which began its year-long G20 presidency on Thursday, and Indonesia'])



In [None]:
model.predict_proba(['The decisive moment will be September [2023] when the [G20] summit comes together.'])

In [None]:
map_

In [None]:
len('Expect Russia to be part of all processes, says India on G20 presidency India, which began its year-long G20 presidency on Thursday, and Indonesia, the previous president, played a key role in finalising a joint communique at the Bali summit amid deep divisions between Russia and the West')

In [None]:
len('Expect Russia to be part of all processes')

In [None]:
sum([len(i) for i in sentences_final])

In [None]:
sum([len(i) for i in sentences])

In [None]:
len(sentences_final)

In [None]:
s =  'Expect Russia to be part of all processes, says India on G20 presidency India which began its year-long G20 presidency on Thursday '

In [None]:
pos_comma = s.split(',', 1)


In [None]:
pos_comma = np.array([i for i in range(len(s)) if s.startswith(',', i)])

In [None]:
np.argmin(np.abs(pos_comma - (len(s) - pos_comma)))

In [None]:
pos_comma[3]

In [None]:
s[:146]

In [None]:
s[147:]

In [None]:
 (len(s) - pos_comma)

In [None]:
def get_predictions_model(text, model, map_):
    predictions = model.predict_proba(text.split('.'))
    predictions_label = []
    for i in predictions:
        if np.max(i)> th:
            predictions_label.append(np.argmax(predictions[0])) 
        else:
            predictions_label.append(None) 
    return(text.split('.'), predictions_label)

In [None]:
predictions = model.predict(['environment'])

In [None]:
predictions

In [None]:
sentences

In [None]:
sentences

In [None]:

keywords = []
for sent in text.split('.'):
    for i in nlp(sent):
        keywords.append(i['word'])



In [None]:
np.unique(keywords)

In [None]:
text

In [None]:
from collections import OrderedDict
import numpy as np
import spacy
from spacy.lang.en.stop_words import STOP_WORDS
nlp = spacy.load('en_core_web_sm')

class TextRank4Keyword():
    """Extract keywords from text"""
    
    def __init__(self):
        self.d = 0.85 # damping coefficient, usually is .85
        self.min_diff = 1e-5 # convergence threshold
        self.steps = 10 # iteration steps
        self.node_weight = None # save keywords and its weight

    
    def set_stopwords(self, stopwords):  
        """Set stop words"""
        for word in STOP_WORDS.union(set(stopwords)):
            lexeme = nlp.vocab[word]
            lexeme.is_stop = True
    
    def sentence_segment(self, doc, candidate_pos, lower):
        """Store those words only in cadidate_pos"""
        sentences = []
        for sent in doc.sents:
            selected_words = []
            for token in sent:
                # Store words only with cadidate POS tag
                if token.pos_ in candidate_pos and token.is_stop is False:
                    if lower is True:
                        selected_words.append(token.text.lower())
                    else:
                        selected_words.append(token.text)
            sentences.append(selected_words)
        return sentences
        
    def get_vocab(self, sentences):
        """Get all tokens"""
        vocab = OrderedDict()
        i = 0
        for sentence in sentences:
            for word in sentence:
                if word not in vocab:
                    vocab[word] = i
                    i += 1
        return vocab
    
    def get_token_pairs(self, window_size, sentences):
        """Build token_pairs from windows in sentences"""
        token_pairs = list()
        for sentence in sentences:
            for i, word in enumerate(sentence):
                for j in range(i+1, i+window_size):
                    if j >= len(sentence):
                        break
                    pair = (word, sentence[j])
                    if pair not in token_pairs:
                        token_pairs.append(pair)
        return token_pairs
        
    def symmetrize(self, a):
        return a + a.T - np.diag(a.diagonal())
    
    def get_matrix(self, vocab, token_pairs):
        """Get normalized matrix"""
        # Build matrix
        vocab_size = len(vocab)
        g = np.zeros((vocab_size, vocab_size), dtype='float')
        for word1, word2 in token_pairs:
            i, j = vocab[word1], vocab[word2]
            g[i][j] = 1
            
        # Get Symmeric matrix
        g = self.symmetrize(g)
        
        # Normalize matrix by column
        norm = np.sum(g, axis=0)
        g_norm = np.divide(g, norm, where=norm!=0) # this is ignore the 0 element in norm
        
        return g_norm

    
    def get_keywords(self, number=10):
        """Print top number keywords"""
        node_weight = OrderedDict(sorted(self.node_weight.items(), key=lambda t: t[1], reverse=True))
        for i, (key, value) in enumerate(node_weight.items()):
            print(key + ' - ' + str(value))
            if i > number:
                break
        
        
    def analyze(self, text, 
                candidate_pos=['NOUN', 'PROPN'], 
                window_size=4, lower=False, stopwords=list()):
        """Main function to analyze text"""
        
        # Set stop words
        self.set_stopwords(stopwords)
        
        # Pare text by spaCy
        doc = nlp(text)
        
        # Filter sentences
        sentences = self.sentence_segment(doc, candidate_pos, lower) # list of list of words
        
        # Build vocabulary
        vocab = self.get_vocab(sentences)
        
        # Get token_pairs from windows
        token_pairs = self.get_token_pairs(window_size, sentences)
        
        # Get normalized matrix
        g = self.get_matrix(vocab, token_pairs)
        
        # Initionlization for weight(pagerank value)
        pr = np.array([1] * len(vocab))
        
        # Iteration
        previous_pr = 0
        for epoch in range(self.steps):
            pr = (1-self.d) + self.d * np.dot(g, pr)
            if abs(previous_pr - sum(pr))  < self.min_diff:
                break
            else:
                previous_pr = sum(pr)

        # Get weight for each node
        node_weight = dict()
        for word, index in vocab.items():
            node_weight[word] = pr[index]
        
        self.node_weight = node_weight

In [None]:
tr4w = TextRank4Keyword()
tr4w.analyze(text, candidate_pos = ['NOUN','PROPN', 'VERB'], window_size=10, lower=False)
tr4w.get_keywords(10)

In [None]:
for i in nlp(df.headings.values[14]):
    print(i['word'], get_label(i['word'],model_sim))

In [None]:
get_label('apple',model_sim)

In [None]:
df.headings.values[14]

In [None]:
df = pd.read_pickle('data/training_data.pkl')

In [None]:
pip install trafilatura

In [None]:
text_extracted.replace('\n',' ')

In [None]:
import pydantic

In [None]:
ENTRYPOINT ["uvicorn main:app --host", "0.0.0.0", "--port", "80"]
