In [36]:
import pandas as pd
from anytree import Node, RenderTree, PreOrderIter
import yake

# Create the pytextrank TextRank object whose pipeline component is attached
# to the spaCy pipeline below.
import pytextrank
tr = pytextrank.TextRank()

# Load the 'en_core_web_sm' spaCy model with its 'ner' component disabled and
# register the TextRank component as the last pipeline stage, named 'textrank'.
# pytextrank_extraction() later reads the ranked phrases from `doc._.phrases`.
# NOTE(review): passing a component callable to add_pipe with name=/last= looks
# like the spaCy 2.x / pytextrank 2.x API -- confirm the pinned versions.
import spacy
nlp = spacy.load('en_core_web_sm', disable=['ner'])
nlp.add_pipe(tr.PipelineComponent, name='textrank', last=True)

# Load tree and data

In [2]:
import pickle
def save_obj(obj, name):
    """Pickle ``obj`` to ``../data/<name>.pkl``.

    Parameters
    ----------
    obj : object
        Any picklable object.
    name : str
        File stem; the ``.pkl`` suffix and the ``../data/`` directory are
        added here.
    """
    target_path = '../data/' + name + '.pkl'
    with open(target_path, 'wb') as handle:
        # Always write with the newest protocol the running interpreter offers.
        pickle.dump(obj, handle, pickle.HIGHEST_PROTOCOL)

def load_obj(name):
    """Return the object unpickled from ``../data/<name>.pkl``.

    Parameters
    ----------
    name : str
        File stem; the ``.pkl`` suffix and the ``../data/`` directory are
        added here.

    Notes
    -----
    Unpickling can execute arbitrary code, so only use this on files that
    were produced by a trusted source (for example by ``save_obj``).
    """
    source_path = '../data/' + name + '.pkl'
    with open(source_path, 'rb') as handle:
        loaded = pickle.load(handle)
    return loaded

In [53]:
# Unpickle the saved tree root from ../data/root.pkl (see load_obj).
tree_saved = load_obj("root")

In [4]:
# News sample; extract_keywords_tree() reads this module-level `df` by row
# position and uses its `clean_news` column.
df = pd.read_csv('../data/news_sample_10000.csv')

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 4 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   level_0     10000 non-null  int64 
 1   index       10000 non-null  int64 
 2   News        10000 non-null  object
 3   clean_news  10000 non-null  object
dtypes: int64(2), object(2)
memory usage: 312.6+ KB


In [6]:
df.head()

Unnamed: 0,level_0,index,News,clean_news
0,142992,143119,'Human skull found near house in Vizag create...,'human skull found near house vizag creates fu...
1,192109,192274,'#TheatreLove: OTT will never be a threat to ...,'#theatrelove: ott will never threat theatres
2,40444,40511,'Hyderabad: Haneef chacha’s auto doubles up a...,'hyderabad: haneef chacha’s auto doubles free ...
3,115071,115187,'Kochi: Facing supply squeeze,'kochi: facing supply squeeze
4,83454,83550,'Beed district collector’s order results in c...,'beed district collector’s order results chaos


In [7]:
def print_tree(root):
    """Print the tree under ``root``, one node per line.

    Each line is the branch prefix produced by anytree's ``RenderTree``
    immediately followed by the node's ``name``.
    """
    for prefix, _fill, item in RenderTree(root):
        # sep="" keeps the prefix and the name glued together, as one string.
        print(prefix, item.name, sep="")

In [10]:
print_tree(tree_saved)

RangeIndex(start=0, stop=10000, step=1)
├── Int64Index([   0,    1,    2,    3,    4,    5,    6,    8,    9,   10,
            ...
            9989, 9990, 9991, 9992, 9993, 9995, 9996, 9997, 9998, 9999],
           dtype='int64', length=8762)
│   ├── Int64Index([   6,   26,   44,   45,   59,   61,   63,   68,   74,   81,
            ...
            9914, 9927, 9937, 9948, 9966, 9970, 9973, 9976, 9989, 9991],
           dtype='int64', length=1563)
│   │   ├── Int64Index([ 123,  249,  303,  389, 1225, 1682, 2445, 2745, 2775, 2786, 3105,
            3436, 3467, 3847, 4279, 4494, 4523, 4568, 4688, 4748, 4823, 4848,
            5216, 5324, 5365, 5377, 5814, 6013, 6213, 6891, 7041, 7610, 7892,
            8069, 8096, 8302, 8809, 9052, 9258, 9296, 9420, 9690],
           dtype='int64')
│   │   ├── Int64Index([  26,   59,   81,  114,  169,  227,  228,  457,  496,  574,
            ...
            9379, 9444, 9471, 9518, 9519, 9620, 9669, 9758, 9864, 9976],
           dtype='int64', length=181

# YAKE Theme Extraction

In [47]:
def remove_dup_words(text):
    """Drop repeated words/items while keeping first-occurrence order.

    Parameters
    ----------
    text : str or list
        A string is split on whitespace and de-duplicated word by word.
        A list is de-duplicated element by element.

    Returns
    -------
    str or list
        For a string: the surviving words joined by single spaces.
        For a list: a new list holding the surviving elements.
        For any other input type: an empty string.
    """
    if isinstance(text, str):
        # dict keys keep insertion order, so this is a linear-time,
        # order-preserving de-duplication of whole words.
        return ' '.join(dict.fromkeys(text.split()))

    if isinstance(text, list):
        try:
            return list(dict.fromkeys(text))
        except TypeError:
            # Some element is unhashable (e.g. a nested list): fall back to
            # an equality-based scan so such inputs keep working.
            unique_items = []
            for item in text:
                if item not in unique_items:
                    unique_items.append(item)
            return unique_items

    return ''

In [52]:
def yake_extraction(text, ngram_size, words_per_theme=5):
    """Summarise ``text`` as a comma-separated string of YAKE keywords.

    Parameters
    ----------
    text : str
        Text to extract keywords from.
    ngram_size : int
        Maximum n-gram length, passed to YAKE as ``n``.
    words_per_theme : int, optional
        Number of keywords requested from YAKE (``top``). Default 5.

    Returns
    -------
    str
        The extracted keywords joined with ``", "``. Repeated words inside a
        keyword and repeated keywords are removed via ``remove_dup_words``,
        so fewer than ``words_per_theme`` entries may be returned.
    """
    extractor = yake.KeywordExtractor(
        lan="en",
        n=ngram_size,
        dedupLim=0.9,
        dedupFunc='seqm',
        windowsSize=1,
        top=words_per_theme,
        features=None,
    )

    phrases = []
    for pair in extractor.extract_keywords(text):
        # NOTE(review): the position of the keyword inside each result pair
        # appears to depend on the installed YAKE release -- (score, keyword)
        # versus (keyword, score). Picking the string member works with both
        # instead of hard-coding index 1.
        phrase = next((part for part in pair if isinstance(part, str)), None)
        if phrase is not None:
            phrases.append(remove_dup_words(phrase))

    # Collapsing words inside a phrase can make two phrases identical, so
    # de-duplicate once more at the phrase level.
    return ", ".join(remove_dup_words(phrases))

In [49]:
def extract_keywords_tree(root):
    """Relabel every node under ``root`` with its YAKE themes and size.

    Each node's ``name`` is used as a collection of row positions into the
    module-level ``df``. The ``clean_news`` values at those positions are
    joined with ``'. '``, passed to ``yake_extraction`` (n-gram size 2), and
    the node's ``name`` is overwritten in place with
    ``"<themes> | SIZE: <number of rows>"``.

    Nodes whose ``name`` is already a string are skipped. Without that guard
    a second run would iterate over the label's characters and fail inside
    ``iloc``; with it, re-running on an already labelled tree is a no-op.

    Parameters
    ----------
    root : anytree node
        Root of the tree to relabel; visited in pre-order.
    """
    headlines = df['clean_news']
    for node in PreOrderIter(root):
        if isinstance(node.name, str):
            # Already relabelled by an earlier call.
            continue

        # One positional lookup on the column instead of building a full row
        # Series per headline.
        texts = headlines.iloc[list(node.name)].tolist()
        pure_text = '. '.join(texts)

        node.name = yake_extraction(pure_text, 2) + ' | SIZE: ' + str(len(texts))
        # Alternative labelling with TextRank instead of YAKE:
        # node.name = ', '.join(pytextrank_extraction(pure_text, 4)) + ' | SIZE: ' + str(len(texts))

In [54]:
# Relabels tree_saved in place: every node.name becomes a theme string.
# Reload the tree with load_obj("root") to get the original names back.
extract_keywords_tree(tree_saved)

In [55]:
print_tree(tree_saved)

cases, covid, covid cases, man, delhi | SIZE: 10000
├── man, govt, delhi, tamil nadu, covid | SIZE: 8762
│   ├── tamil nadu, man, held, arrested, nadu | SIZE: 1563
│   │   ├── man, lakh, msu, youth, rituals | SIZE: 42
│   │   ├── tamil nadu, nadu, tamil, man, man held | SIZE: 181
│   │   ├── man, held, arrested, woman, wife | SIZE: 1074
│   │   │   ├── injured, driver, accident, truck, wait | SIZE: 149
│   │   │   ├── wife, man, kills, death, son | SIZE: 145
│   │   │   ├── complaint, dogs, freshworks stealing, accuses, kin | SIZE: 27
│   │   │   ├── shot, rescued, loses, leopard, trader | SIZE: 70
│   │   │   ├── dies, tiger, infected, minati mishra, couple | SIZE: 63
│   │   │   ├── arrested, held, murder, murder case, woman | SIZE: 269
│   │   │   ├── mumbai, bail, drug, arrest, bank | SIZE: 112
│   │   │   └── man, woman, family, found, killed | SIZE: 239
│   │   ├── fir, jack dorsey, twitter chief, chief jack, quashes fir | SIZE: 13
│   │   ├── seized, worth, lakh, liquor, stolen 

# TextRank Theme Extraction

In [33]:
def remove_redundant_words(text):
    """Remove repeated words inside each phrase, keeping first-seen order.

    Parameters
    ----------
    text : iterable of str
        Phrases to clean; each one is split on whitespace.

    Returns
    -------
    list of str
        One cleaned phrase per input phrase, words joined by single spaces.
    """
    # dict.fromkeys keeps only the first occurrence of each word, in order.
    return [' '.join(dict.fromkeys(phrase.split())) for phrase in text]

In [34]:
def pytextrank_extraction(text, words_per_theme=10):
    """Return the leading TextRank phrases of ``text``.

    Runs ``text`` through the module-level ``nlp`` pipeline, takes the first
    ``words_per_theme`` entries of ``doc._.phrases`` and strips repeated
    words inside each phrase with ``remove_redundant_words``.

    Parameters
    ----------
    text : str
        Text to analyse.
    words_per_theme : int, optional
        Maximum number of phrases to keep. Default 10.

    Returns
    -------
    list of str
        The cleaned phrase texts, in the order the pipeline reported them.
    """
    parsed = nlp(text)
    leading_phrases = [phrase.text for phrase in parsed._.phrases[:words_per_theme]]
    return remove_redundant_words(leading_phrases)

In [41]:
print_tree(tree_saved)

new covid cases, new covid-19 cases, new cases day, covid-19 death cases, covid cases, highest covid cases | SIZE: 10000
├── covid cases, covid positive case, murder case police, new corona cases, covid deaths days, new corona cases kerala | SIZE: 8762
│   ├── old man, wife bites man, bihar man, arrested mumbai police, death woman, bail man | SIZE: 1563
│   │   ├── liver 50-yr-old man, 57-yr-old school, 54-yr-old man, 17-yr-old girl, 80-yr-old gadag patient, 83-yr-old covid survivor | SIZE: 42
│   │   ├── wife tamil, booze tamil, kin tamil, tamil nadu readies, tamil nadu records, tamil nadu minister anbazhagan | SIZE: 181
│   │   ├── old man, wife bites man, arrested mumbai police, murder case police, pre-arrest bail second case, wife death | SIZE: 1074
│   │   │   ├── truck driver, injured accident, auto driver, truck driver damages barricades, surat rto trace1.5l auto drivers, highway accident | SIZE: 149
│   │   │   ├── wife death, wife bites man, wife cancer, pregnant wife, wife, o

In [45]:
print_tree(tree_saved)

new covid cases, new covid-19 cases, new cases day, covid-19 death cases | SIZE: 10000
├── covid cases, covid positive case, murder case police, new corona cases | SIZE: 8762
│   ├── old man, wife bites man, bihar man, arrested mumbai police | SIZE: 1563
│   │   ├── liver 50-yr-old man, 57-yr-old school, 54-yr-old man, 17-yr-old girl | SIZE: 42
│   │   ├── wife tamil, booze tamil, kin tamil, tamil nadu readies | SIZE: 181
│   │   ├── old man, wife bites man, arrested mumbai police, murder case police | SIZE: 1074
│   │   │   ├── truck driver, injured accident, auto driver, truck driver damages barricades | SIZE: 149
│   │   │   ├── wife death, wife bites man, wife cancer, pregnant wife | SIZE: 145
│   │   │   ├── can dogs, dogs, stray dogs, police complaint | SIZE: 27
│   │   │   ├── leopard attack, leopard attacks bagdogra, dead woman, leopard | SIZE: 70
│   │   │   ├── eturnagaram tiger reserve, faulty street lamps tiger reserve, tiger corridor, tiger | SIZE: 63
│   │   │   ├── murde