In [24]:
from bs4 import BeautifulSoup
import requests
import pandas as pd
import numpy as np
import re
import nltk
from nltk import sent_tokenize
nltk.download('punkt')
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
from setfit import SetFitModel, SetFitTrainer
from datasets import load_dataset
from sentence_transformers.losses import CosineSimilarityLoss
import datasets
from datasets import Dataset, DatasetDict
from sklearn.metrics.pairwise import cosine_similarity
import trafilatura
from transformers import pipeline
from transformers import AutoTokenizer, AutoModelForTokenClassification
from transformers import (
    TokenClassificationPipeline,
    AutoModelForTokenClassification,
    AutoTokenizer)
from transformers.pipelines import AggregationStrategy
import numpy as np
import pickle
import re
import boto3
import json

tokenizer = AutoTokenizer.from_pretrained("Jean-Baptiste/camembert-ner")
model_ner = AutoModelForTokenClassification.from_pretrained("Jean-Baptiste/camembert-ner")
model_sim = SentenceTransformer('bert-base-nli-mean-tokens')
nlp = pipeline('ner', model=model_ner, tokenizer=tokenizer, aggregation_strategy="simple")
model = SetFitModel.from_pretrained("kowshik/upsc-classification-model-v1")
model_name = "ml6team/keyphrase-extraction-kbir-inspec"
map_ = {'agriculture': 0,'culture': 1,'defence': 2,'economy': 3,'environment': 4,'geography': 5,'governance': 6,
'health': 7,'history': 8,'international relations': 9,'polity': 10,'science&technology': 11,'society': 12,'sports': 13}
inv_map = {v: k for k, v in map_.items()}

__TableName__ = 'prod1_app_data'
client  = boto3.client('dynamodb',region_name = 'ap-south-1')
DB  = boto3.resource('dynamodb',region_name = 'ap-south-1')
table = DB.Table(__TableName__)

summarizer = pipeline("summarization", model="facebook/bart-large-cnn")
def get_summary(url,pid,text,summarizer,th=120):
    if len(text) > 2500:
        summary = summarizer(text, max_length= th, min_length=120, do_sample=False)[0]['summary_text']
        flag = 3 + pid*10
        data = summary
        item_summary = create_item(url,flag, data)
        response = table.put_item(Item  = item_summary)
        return(summary)
    else:
        return(text)

f = open('rm_model.pkl', 'rb')
clf = pickle.load(f)
f.close()

def create_item(url,flag, data):
    '''  
    flag = 0 > text
    flag = 1 > sentence
    flag = 2 > key Phrase 
    flag = 3 > summary 
    '''
    item = {
        'url': url,
        'flag':flag,
        'data': data,
    }
    return(item)


def get_data_url(url):
    downloaded = trafilatura.fetch_url(url)
    text_original = trafilatura.extract(downloaded)
    text_extracted = text_original.replace('\n',' ')
    flag = 0
    data = text_original
    item_complete = create_item(url,flag, data)
    response = table.put_item(Item  = item_complete)
    return(text_extracted, text_original)

def get_label(word,model_sim):
    labels = ['Environment','Geography','International Relations',
    'Polity','Governance','Health','Society','Economy','Science&Technology','Agriculture','sports']
    labels = [i.lower() for i in labels]
    embeddings_tags = model_sim.encode(labels)
    embeddings_key = model_sim.encode(word)
    probs = cosine_similarity([embeddings_key],embeddings_tags)
    label_index = np.argmax(probs)
    return(labels[label_index])

class KeyphraseExtractionPipeline(TokenClassificationPipeline):
    def __init__(self, model, *args, **kwargs):
        super().__init__(
            model=AutoModelForTokenClassification.from_pretrained(model),
            tokenizer=AutoTokenizer.from_pretrained(model),
            *args,
            **kwargs
        )

    def postprocess(self, model_outputs):
        results = super().postprocess(
            model_outputs=model_outputs,
            aggregation_strategy=AggregationStrategy.SIMPLE,
        )
        return np.unique([result.get("word").strip() for result in results])
extractor = KeyphraseExtractionPipeline(model=model_name)

def get_keywords_text(url,pid, sentences, extractor):
    keywords_ = []
    for te in sentences:
        keywords_ = keywords_+ list(extractor(te))
    keywords_unq = np.unique(keywords_)
    flag = 2 +pid*10
    data = json.dumps(list(keywords_unq))
    item_key = create_item(url,  flag, data,)
    response = table.put_item(Item  = item_key)
    return(keywords_unq)

alphabets= "([A-Za-z])"
prefixes = "(Mr|St|Mrs|Ms|Dr)[.]"
suffixes = "(Inc|Ltd|Jr|Sr|Co)"
starters = "(Mr|Mrs|Ms|Dr|He\s|She\s|It\s|They\s|Their\s|Our\s|We\s|But\s|However\s|That\s|This\s|Wherever)"
acronyms = "([A-Z][.][A-Z][.](?:[A-Z][.])?)"
websites = "[.](com|net|org|io|gov)"
digits = "([0-9])"

def split_into_sentences(text):
    text = " " + text + "  "
    text = text.replace("\n"," ")
    text = re.sub(prefixes,"\\1<prd>",text)
    text = re.sub(websites,"<prd>\\1",text)
    text = re.sub(digits + "[.]" + digits,"\\1<prd>\\2",text)
    if "..." in text: text = text.replace("...","<prd><prd><prd>")
    if "Ph.D" in text: text = text.replace("Ph.D.","Ph<prd>D<prd>")
    text = re.sub("\s" + alphabets + "[.] "," \\1<prd> ",text)
    text = re.sub(acronyms+" "+starters,"\\1<stop> \\2",text)
    text = re.sub(alphabets + "[.]" + alphabets + "[.]" + alphabets + "[.]","\\1<prd>\\2<prd>\\3<prd>",text)
    text = re.sub(alphabets + "[.]" + alphabets + "[.]","\\1<prd>\\2<prd>",text)
    text = re.sub(" "+suffixes+"[.] "+starters," \\1<stop> \\2",text)
    text = re.sub(" "+suffixes+"[.]"," \\1<prd>",text)
    text = re.sub(" " + alphabets + "[.]"," \\1<prd>",text)
    if "”" in text: text = text.replace(".”","”.")
    if "\"" in text: text = text.replace(".\"","\".")
    if "!" in text: text = text.replace("!\"","\"!")
    if "?" in text: text = text.replace("?\"","\"?")
    text = text.replace(".",".<stop>")
    text = text.replace("?","?<stop>")
    text = text.replace("!","!<stop>")
    text = text.replace("<prd>",".")
    sentences = text.split("<stop>")
    sentences = sentences[:-1]
    sentences = [s.strip() for s in sentences]

    sentences_final = []
    for sent in sentences:
        if len(sent)>= 120:
            if ',' in sent:
                pos_comma = np.array([i for i in range(len(sent)) if sent.startswith(',', i)])
                to_split = np.argmin(np.abs(pos_comma - (len(sent) - pos_comma)))
                if (pos_comma[to_split])<= 50 or (len(sent)-pos_comma[to_split] <= 50):
                    sentences_final.append(sent)
                else:
                    sentences_final.append(sent[:pos_comma[to_split]])
                    sentences_final.append(sent[pos_comma[to_split] +1:])
            else:
                sentences_final.append(sent)
        else:
            sentences_final.append(sent)

    return sentences_final


def get_sentence_labels(url, pid, sentences, clf, model, sent_no=4):
    prediction_probas = clf.predict_proba(model.predict_proba(sentences))
    df = pd.DataFrame()
    df['sentences'] = sentences
    df['labels_1'] = np.argmax(prediction_probas,axis=1)
    df['prob_1'] = np.max(prediction_probas,axis=1)
    df['label_text_1'] = df['labels_1'].replace(inv_map)
    df['labels_2'] = [[list(p).index(i) for i in sorted(p, reverse=True)][1]  for p in prediction_probas]
    df['prob_2'] = [p[[list(p).index(i) for i in sorted(p, reverse=True)][1]]  for p in prediction_probas]
    df['label_text_2'] = df['labels_2'].replace(inv_map)
    df = df.sort_values('prob_1',ascending=False)
    labels = df[['sentences','label_text_1','label_text_2']][:sent_no]
    flag = 1  + pid*10
    data = json.dumps(labels.set_index('sentences').to_dict('index'))
    item_sentence = create_item(url, flag, data)
    response = table.put_item(Item  = item_sentence)

    return(labels)



def get_cuts(text, sentences_all):
    if len(text) > 3000:
        cumsum_ = np.cumsum([len(i) for i in sentences_all])
        chunks = np.round(len(text)/2500)
        cutoff_ = int(len(text)/chunks)
        cuts = [0]
        for i in np.arange(1,chunks):
            cutoff = cutoff_*i
            cut = np.argmin(np.abs(cumsum_ - cutoff))
            cuts.append(cut)
        cuts.append(len(sentences_all))

        sentences_chunks = []
        for c in range(0,len(cuts)-1):
            sentences_chunks.append(sentences_all[cuts[c]:cuts[c+1]])
        return(sentences_chunks)
    else:
        return([sentences_all])



[nltk_data] Downloading package punkt to /home/ubuntu/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


In [28]:
url = 'https://www.gktoday.in/topic/rbi-unveils-4-tiered-regulatory-framework-for-urban-cooperative-banks/'
text, text_act = get_data_url(url)
if len(text) > 120:
    sentences_all = split_into_sentences(text)
    sentences_chunks = get_cuts(text, sentences_all)
    pid = 0
    for payload in sentences_chunks:
        sentence_labels = get_sentence_labels(url,pid,payload,clf,model,sent_no=4)
        sentence_keywords = list(set(payload) -  set(sentence_labels.sentences.values))
        keyphrases = get_keywords_text(url,pid,sentence_keywords,extractor)
        text = ' '.join(payload)
        summary = get_summary(url,pid,text, summarizer, th= min(int(len(text)/10),240))
        pid = pid+1

In [32]:
summary

'RBI has released a four-tiered regulatory framework for categorization of the Urban Cooperative Banks (UCBs) It had also announced norms concerning the net worth and capital adequacy of these banks. This regulatory framework, based on the size of the deposits in the UCBs, seeks to balance the spirit of mutuality and cooperation found in smaller banks and those having limited areas of operations. It also aims to ensure the effective regulation of the cooperative banking sector, which is heterogeneous in nature. The previous regulatory framework classified USCs into Tier I and Tier II. The new four- tiered framework ensures differentiated regulations that will strengthen the financial soundness of UCBs.'

In [33]:

text


'RBI Unveils 4-tiered Regulatory Framework for Urban Cooperative Banks A four-tiered regulatory framework for categorization of the Urban Cooperative Banks (UCBs) was recently released by the Reserve Bank of India (RBI). It had also announced norms concerning the net worth and capital adequacy of these banks. Why is the four-tiered framework needed? The previous regulatory framework classified USCs into Tier I and Tier II. The new four-tiered framework was announced to ensure the effective regulation of the cooperative banking sector, which is heterogeneous in nature. This regulatory framework, based on the size of the deposits in the UCBs  seeks to balance the spirit of mutuality and cooperation found in smaller banks and those having limited areas of operations vis-à-vis the growth ambitions of the larger UCBs and those involved in more complex business activities. The categorization of the UCBs into four tiers ensures differentiated regulations that will strengthen the financial sou

In [13]:
clf.predict_proba(model.predict_proba(sentences))

['BTM Layout Ext Devarachikkana Halli – 5 Unit Building For Sale on 20×30 A Khata – Near RTO MULTI UNIT BUILDING FOR SALE,… ₹15,000,000 MULTI UNIT BUILDING FOR SALE,… ₹15,000,000 By EBPS 4 RENTAL INCOME BUILDING FOR SALE,… ₹95,000,000 By EBPS 4 INDEPENDENT HOUSE FOR SALE',
 ' 3BHK… ₹24,000,000 By EBPS 4 INDEPENDENT HOUSE FOR SALE, 4BHK… ₹19,900,000 By EBPS 4 MULTI UNIT PROPERTY FOR SALE,… ₹18,500,000 By EBPS 4 APARTMENT FOR SALE, 2BHK Condo… ₹4,700,000 By EBPS 1 Owning a home is a keystone of wealth… both financial affluence and emotional security.']

In [96]:
summarizer(text[:5000], max_length= 240, min_length=120, do_sample=False)

[{'summary_text': 'The 2022 Lancet Countdown on Health and Climate Change: Health at the Mercy of Fossil Fuels points out that the world’s reliance on fossil fuels increases the risk of disease, food insecurity and other illnesses related to heat. The WHO has predicted that between 2030 and 2050, climate change is expected to cause approximately 2,50,000 additional deaths per year, from malnutrition, malaria, diarrhoea and heat stress. A health-centred response to the coexisting climate, energy, and cost-of-living crises provides an opportunity to deliver a healthy, low-carbon future.'}]

In [52]:
import json
json.dumps(list(keyphrases))

'["Bali summit", "G20", "G20 Summit", "G20 presidency", "German ambassador", "India", "Indian government", "Indonesia", "Russia", "Ukraine", "consensus", "developing world", "external", "joint communique", "media briefing", "news feed", "world order"]'

In [15]:
import boto3
__TableName__ = 'prod1_app_data'
client  = boto3.client('dynamodb',region_name = 'ap-south-1')
DB  = boto3.resource('dynamodb',region_name = 'ap-south-1')
table = DB.Table(__TableName__)

In [30]:
url = 'https://www.hindustantimes.com/india-news/expect-russia-to-be-part-of-all-processes-says-india-on-g20-presidency-101669906231427.html'
auth = '1'
flag = 1
text = sentence_labels[0][0]
label_1 = sentence_labels[0][1]
label_2 = sentence_labels[0][2]

In [31]:


item = create_item(url,auth, flag,text, label_1,label_2)
response = table.put_item(Item  = item)

In [27]:
{"label_text_2":{"M":{" the Global South, such as food, fuel and fertilisers,” he said.":{"S":"society"},"However, Prime Minister Narendra Modi told Russian President Vladimir Putin at a meeting in September that today’s era is “not of war”.":{"S":"international relations"},"From time to time, both countries indicate areas of interest or priority that they may be looking at”.":{"S":"geography"},"India said on Thursday it expects Russia to be part of all the processes of G20 as it assumed the presidency of the grouping against the backdrop of persisting differences among its members over the Ukraine war.":{"S":"defence"},"Expect Russia to be part of all processes, says India on G20 presidency India, which began its year-long G20 presidency on Thursday, and Indonesia":{"S":"geography"}}},"label_text_1":{"M":{" the Global South, such as food, fuel and fertilisers,” he said.":{"S":"agriculture"},"However, Prime Minister Narendra Modi told Russian President Vladimir Putin at a meeting in September that today’s era is “not of war”.":{"S":"defence"},"From time to time, both countries indicate areas of interest or priority that they may be looking at”.":{"S":"international relations"},"India said on Thursday it expects Russia to be part of all the processes of G20 as it assumed the presidency of the grouping against the backdrop of persisting differences among its members over the Ukraine war.":{"S":"international relations"},"Expect Russia to be part of all processes, says India on G20 presidency India, which began its year-long G20 presidency on Thursday, and Indonesia":{"S":"international relations"}}}}

'{" the Global South, such as food, fuel and fertilisers,\\u201d he said.": {"label_text_1": "agriculture", "label_text_2": "society"}, "From time to time, both countries indicate areas of interest or priority that they may be looking at\\u201d.": {"label_text_1": "international relations", "label_text_2": "geography"}, "However, Prime Minister Narendra Modi told Russian President Vladimir Putin at a meeting in September that today\\u2019s era is \\u201cnot of war\\u201d.": {"label_text_1": "defence", "label_text_2": "international relations"}, "Expect Russia to be part of all processes, says India on G20 presidency India, which began its year-long G20 presidency on Thursday, and Indonesia": {"label_text_1": "international relations", "label_text_2": "geography"}, "India said on Thursday it expects Russia to be part of all the processes of G20 as it assumed the presidency of the grouping against the backdrop of persisting differences among its members over the Ukraine war.": {"label_te

{0: 'agriculture',
 1: 'culture',
 2: 'defence',
 3: 'economy',
 4: 'environment',
 5: 'geography',
 6: 'governance',
 7: 'health',
 8: 'history',
 9: 'international relations',
 10: 'polity',
 11: 'science&technology',
 12: 'society',
 13: 'sports'}

In [139]:


model.predict_proba(['Expect Russia to be part of all processes, says India on G20 presidency India, which began its year-long G20 presidency on Thursday, and Indonesia'])



array([[0.04873253, 0.09007206, 0.07610773, 0.03673253, 0.04833547,
        0.05789667, 0.2159922 , 0.05296119, 0.0213202 , 0.10592079,
        0.04914273, 0.04692862, 0.10922971, 0.04062758]])

In [36]:
model.predict_proba(['The decisive moment will be September [2023] when the [G20] summit comes together.'])

array([[0.05802507, 0.24399796, 0.04785446, 0.04873429, 0.03565255,
        0.065207  , 0.09973415, 0.07628193, 0.01154642, 0.0455373 ,
        0.06359865, 0.06889346, 0.09427411, 0.04066266]])

In [37]:
map_

{'agriculture': 0,
 'culture': 1,
 'defence': 2,
 'economy': 3,
 'environment': 4,
 'geography': 5,
 'governance': 6,
 'health': 7,
 'history': 8,
 'international relations': 9,
 'polity': 10,
 'science&technology': 11,
 'society': 12,
 'sports': 13}

In [22]:
len('Expect Russia to be part of all processes, says India on G20 presidency India, which began its year-long G20 presidency on Thursday, and Indonesia, the previous president, played a key role in finalising a joint communique at the Bali summit amid deep divisions between Russia and the West')

289

In [61]:
len('Expect Russia to be part of all processes')

41

In [93]:
sum([len(i) for i in sentences_final])

3178

In [94]:
sum([len(i) for i in sentences])

3186

In [96]:
len(sentences_final)

29

In [78]:
s =  'Expect Russia to be part of all processes, says India on G20 presidency India which began its year-long G20 presidency on Thursday '

In [79]:
pos_comma = s.split(',', 1)


In [80]:
pos_comma = np.array([i for i in range(len(s)) if s.startswith(',', i)])

In [82]:
np.argmin(np.abs(pos_comma - (len(s) - pos_comma)))

0

In [74]:
pos_comma[3]

146

In [75]:
s[:146]

'Expect Russia to be part of all processes, says India on G20 presidency India, which began its year-long G20 presidency on Thursday, and Indonesia'

In [77]:
s[147:]

' the previous president, played a key role in finalising a joint communique at the Bali summit amid deep divisions between Russia and the West'

In [71]:
 (len(s) - pos_comma)

array([248, 212, 158, 143, 119])

In [None]:
def get_predictions_model(text, model, map_):
    predictions = model.predict_proba(text.split('.'))
    predictions_label = []
    for i in predictions:
        if np.max(i)> th:
            predictions_label.append(np.argmax(predictions[0])) 
        else:
            predictions_label.append(None) 
    return(text.split('.'), predictions_label)

In [17]:
predictions = model.predict(['environment'])

{'agriculture': 0,
 'culture': 1,
 'defence': 2,
 'economy': 3,
 'environment': 4,
 'geography': 5,
 'governance': 6,
 'health': 7,
 'history': 8,
 'international relations': 9,
 'polity': 10,
 'science&technology': 11,
 'society': 12,
 'sports': 13}

In [18]:
predictions

array([4])

In [12]:
sentences

['Expect Russia to be part of all processes, says India on G20 presidency India, which began its year-long G20 presidency on Thursday, and Indonesia, the previous president, played a key role in finalising a joint communique at the Bali summit amid deep divisions between Russia and the West',
 ' India said on Thursday it expects Russia to be part of all the processes of G20 as it assumed the presidency of the grouping against the backdrop of persisting differences among its members over the Ukraine war',
 ' “Russia is a member of the G20 and hence we would expect them to be participating in these processes,” external affairs ministry spokesperson Arindam Bagchi told a weekly media briefing while responding to questions about the divisions within the G20 over Russia’s invasion of Ukraine',
 ' Bagchi noted that the G20 works on the important principle of consensus, and India’s efforts as president of the grouping of the world’s 20 largest economies will be aimed at building consensus',
 

In [5]:
sentences

['PMLA Amendments In its recent amendment, the Central Government listed 15 government bodies that are required to share information with the Enforcement Directorate under the Prevention of Money Laundering Act (PMLA)',
 ' The list was amended in accordance with Section 66 of the PMLA',
 ' Contents Which are the agencies that were recently included in the list? The government entities that were recently included to share information with the Enforcement Directorate are: - National Investigating Agency (NIA) - Serious Fraud Investigation Office (SFIO) - State Police - Director General of Foreign Trade (DGFT) - Ministry of External Affairs (MEA) - National Intelligence Grid - Central Vigilance Commission (CVC) - Defence Intelligence Agency - National Technical Research Organisation (NTRO) - Military Intelligence - Wildlife Crime Control Bureau - Competition Commission of India (CCI) Under Section 66 of the Act, 15 entities are bound to disclose and share case information to the ED if the

In [57]:

keywords = []
for sent in text.split('.'):
    for i in nlp(sent):
        keywords.append(i['word'])



In [61]:
np.unique(keywords)

array(['Besides', 'Covid', 'Disaster risk management', 'G20', 'G20 forum',
       'Global South', 'India', 'India’', 'Modi administration',
       'NEW DELHI', 'Sherpa Amitabh Kant', 'Sources', 'TOI', 'Thursday'],
      dtype='<U24')

In [62]:
text



In [40]:
from collections import OrderedDict
import numpy as np
import spacy
from spacy.lang.en.stop_words import STOP_WORDS
nlp = spacy.load('en_core_web_sm')

class TextRank4Keyword():
    """Extract keywords from text"""
    
    def __init__(self):
        self.d = 0.85 # damping coefficient, usually is .85
        self.min_diff = 1e-5 # convergence threshold
        self.steps = 10 # iteration steps
        self.node_weight = None # save keywords and its weight

    
    def set_stopwords(self, stopwords):  
        """Set stop words"""
        for word in STOP_WORDS.union(set(stopwords)):
            lexeme = nlp.vocab[word]
            lexeme.is_stop = True
    
    def sentence_segment(self, doc, candidate_pos, lower):
        """Store those words only in cadidate_pos"""
        sentences = []
        for sent in doc.sents:
            selected_words = []
            for token in sent:
                # Store words only with cadidate POS tag
                if token.pos_ in candidate_pos and token.is_stop is False:
                    if lower is True:
                        selected_words.append(token.text.lower())
                    else:
                        selected_words.append(token.text)
            sentences.append(selected_words)
        return sentences
        
    def get_vocab(self, sentences):
        """Get all tokens"""
        vocab = OrderedDict()
        i = 0
        for sentence in sentences:
            for word in sentence:
                if word not in vocab:
                    vocab[word] = i
                    i += 1
        return vocab
    
    def get_token_pairs(self, window_size, sentences):
        """Build token_pairs from windows in sentences"""
        token_pairs = list()
        for sentence in sentences:
            for i, word in enumerate(sentence):
                for j in range(i+1, i+window_size):
                    if j >= len(sentence):
                        break
                    pair = (word, sentence[j])
                    if pair not in token_pairs:
                        token_pairs.append(pair)
        return token_pairs
        
    def symmetrize(self, a):
        return a + a.T - np.diag(a.diagonal())
    
    def get_matrix(self, vocab, token_pairs):
        """Get normalized matrix"""
        # Build matrix
        vocab_size = len(vocab)
        g = np.zeros((vocab_size, vocab_size), dtype='float')
        for word1, word2 in token_pairs:
            i, j = vocab[word1], vocab[word2]
            g[i][j] = 1
            
        # Get Symmeric matrix
        g = self.symmetrize(g)
        
        # Normalize matrix by column
        norm = np.sum(g, axis=0)
        g_norm = np.divide(g, norm, where=norm!=0) # this is ignore the 0 element in norm
        
        return g_norm

    
    def get_keywords(self, number=10):
        """Print top number keywords"""
        node_weight = OrderedDict(sorted(self.node_weight.items(), key=lambda t: t[1], reverse=True))
        for i, (key, value) in enumerate(node_weight.items()):
            print(key + ' - ' + str(value))
            if i > number:
                break
        
        
    def analyze(self, text, 
                candidate_pos=['NOUN', 'PROPN'], 
                window_size=4, lower=False, stopwords=list()):
        """Main function to analyze text"""
        
        # Set stop words
        self.set_stopwords(stopwords)
        
        # Pare text by spaCy
        doc = nlp(text)
        
        # Filter sentences
        sentences = self.sentence_segment(doc, candidate_pos, lower) # list of list of words
        
        # Build vocabulary
        vocab = self.get_vocab(sentences)
        
        # Get token_pairs from windows
        token_pairs = self.get_token_pairs(window_size, sentences)
        
        # Get normalized matrix
        g = self.get_matrix(vocab, token_pairs)
        
        # Initionlization for weight(pagerank value)
        pr = np.array([1] * len(vocab))
        
        # Iteration
        previous_pr = 0
        for epoch in range(self.steps):
            pr = (1-self.d) + self.d * np.dot(g, pr)
            if abs(previous_pr - sum(pr))  < self.min_diff:
                break
            else:
                previous_pr = sum(pr)

        # Get weight for each node
        node_weight = dict()
        for word, index in vocab.items():
            node_weight[word] = pr[index]
        
        self.node_weight = node_weight

In [48]:
tr4w = TextRank4Keyword()
tr4w.analyze(text, candidate_pos = ['NOUN','PROPN', 'VERB'], window_size=10, lower=False)
tr4w.get_keywords(10)

India - 4.475863058211749
G20 - 3.210650043769627
agenda - 2.7650726053998467
countries - 2.482918651168669
issues - 2.4047668706634053
climate - 2.0973085173341097
time - 1.8610261827381995
world - 1.7904226189397052
areas - 1.674571110650254
debt - 1.6570284554390828
crises - 1.596109029543607
seek - 1.5788841729121734




In [11]:
for i in nlp(df.headings.values[14]):
    print(i['word'], get_label(i['word'],model_sim))

TypeError: get_label() missing 1 required positional argument: 'model_sim'

In [14]:
get_label('apple',model_sim)

'agriculture'

In [10]:
df.headings.values[14]

'Bangladesh allows commercial cultivation of Bt Brinjal'

In [4]:
df = pd.read_pickle('data/training_data.pkl')

In [15]:
pip install trafilatura

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Collecting trafilatura
  Downloading trafilatura-1.4.0-py3-none-any.whl (1.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.0/1.0 MB[0m [31m15.4 MB/s[0m eta [36m0:00:00[0m [36m0:00:01[0m
[?25hCollecting justext>=3.0.0
  Downloading jusText-3.0.0-py2.py3-none-any.whl (837 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m837.8/837.8 kB[0m [31m44.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting htmldate>=1.3.2
  Downloading htmldate-1.4.0-py3-none-any.whl (33 kB)
Collecting lxml>=4.6.4
  Downloading lxml-4.9.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_24_x86_64.whl (6.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m

In [22]:
text_extracted.replace('\n',' ')

'I\'m writing some code (Python) to scrape text from web pages. My goal is to find a way to filter/delete the paragraphs on webpages that are not in the main article (e.g. advertisement, links to other articles, etc.). So far I\'ve been using the .find_all("p") command to extract only paragraphs from the text, which although successful also scrapes a lot of rudimentary paragraphs which are not in the main/body of each article. This is my code now: from urllib.request import Request, urlopen from bs4 import BeautifulSoup URLs = [ "https://www.elsoldetoluca.com.mx/local/proponen-sistemas-para-captar-agua-pluvial-en-el-edomex-6585661.html", "https://www.elsoldetoluca.com.mx/local/agua-de-acuifero-del-valle-de-toluca-solo-debe-ser-para-uso-de-consumo-humano-especialista-4146232.html" ] for url in URLs: req = Request(url, headers={"User-Agent": \'Mozilla/5.0\'}) page = urlopen(req) paragraphs = [] htmlParse = BeautifulSoup(page.read(), \'lxml\') for para in htmlParse.find_all("p"): paragrap