In [1]:
# RAKE (Rapid Automatic Keyword Extraction)
# Assumption: keyword sequences contain uncommon words that usually appear together

In [2]:
import pandas as pd
import nltk
from rake_nltk import Metric, Rake

nltk.download('punkt') # nltk uses punkt to tokenize the words in English

[nltk_data] Downloading package punkt to /Users/isavchuk/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [3]:
# Sample corpus: eight short quotations, one string per entry.
# NOTE: "yto" in the sixth quotation is a typo for "to". It is left as-is because
# the string is runtime data and 'yto' shows up in the recorded outputs below.
text_list = [
    "Be yourself; everyone else is already taken.",
    "A room without books is like a body without a soul.",
    "Be the change that you wish to see in the world.",
    "If you tell the truth, you don't need to remember anything.",
    "Always forgive your enemies; nothing annoys them so much.",
    "It is better to be hated for what you are than yto be loved for what you are not.",
    "Live as you were to die tomorrow. Learn as you were to live forever.",
    "In three words I can sum up everything I've learnt about life: it goes on.",
]

In [4]:
# Build an extractor with rake_nltk's default settings (no arguments passed).
rake = Rake()

# Extracting keywords: text_list is handed over as a list of sentences.
# The results are read back through the rake.get_* calls in the following cells.
rake.extract_keywords_from_sentences(text_list)

In [5]:
# Extracted phrases, listed from the highest-scoring to the lowest-scoring one.
rake.get_ranked_phrases()

['room without books',
 'body without',
 'three words',
 'remember anything',
 'nothing annoys',
 'everyone else',
 'die tomorrow',
 'always forgive',
 'already taken',
 'live forever',
 'live',
 'yto',
 'world',
 'wish',
 'truth',
 'tell',
 'sum',
 'soul',
 'see',
 'need',
 'much',
 'loved',
 'like',
 'life',
 'learnt',
 'learn',
 'hated',
 'goes',
 'everything',
 'enemies',
 'change',
 'better']

In [6]:
# Same ranking, returned as (score, phrase) tuples.
rake.get_ranked_phrases_with_scores()

[(8.5, 'room without books'),
 (4.5, 'body without'),
 (4.0, 'three words'),
 (4.0, 'remember anything'),
 (4.0, 'nothing annoys'),
 (4.0, 'everyone else'),
 (4.0, 'die tomorrow'),
 (4.0, 'always forgive'),
 (4.0, 'already taken'),
 (3.5, 'live forever'),
 (1.5, 'live'),
 (1.0, 'yto'),
 (1.0, 'world'),
 (1.0, 'wish'),
 (1.0, 'truth'),
 (1.0, 'tell'),
 (1.0, 'sum'),
 (1.0, 'soul'),
 (1.0, 'see'),
 (1.0, 'need'),
 (1.0, 'much'),
 (1.0, 'loved'),
 (1.0, 'like'),
 (1.0, 'life'),
 (1.0, 'learnt'),
 (1.0, 'learn'),
 (1.0, 'hated'),
 (1.0, 'goes'),
 (1.0, 'everything'),
 (1.0, 'enemies'),
 (1.0, 'change'),
 (1.0, 'better')]

In [7]:
# Counter mapping each word of the extracted phrases to its number of occurrences.
rake.get_word_frequency_distribution()

Counter({'everyone': 1,
         'else': 1,
         'already': 1,
         'taken': 1,
         'room': 1,
         'without': 2,
         'books': 1,
         'like': 1,
         'body': 1,
         'soul': 1,
         'change': 1,
         'wish': 1,
         'see': 1,
         'world': 1,
         'tell': 1,
         'truth': 1,
         'need': 1,
         'remember': 1,
         'anything': 1,
         'always': 1,
         'forgive': 1,
         'enemies': 1,
         'nothing': 1,
         'annoys': 1,
         'much': 1,
         'better': 1,
         'hated': 1,
         'yto': 1,
         'loved': 1,
         'live': 2,
         'die': 1,
         'tomorrow': 1,
         'learn': 1,
         'forever': 1,
         'three': 1,
         'words': 1,
         'sum': 1,
         'everything': 1,
         'learnt': 1,
         'life': 1,
         'goes': 1})

In [8]:
# Degree of a word: the sum of its co-occurrences with the words of the phrases it
# appears in, the word itself included (e.g. 'without': 3 + 2 = 5 in the output below).
rake.get_word_degrees()

defaultdict(<function rake_nltk.rake.Rake._build_word_co_occurance_graph.<locals>.<lambda>()>,
            {'everyone': 2,
             'else': 2,
             'already': 2,
             'taken': 2,
             'room': 3,
             'without': 5,
             'books': 3,
             'like': 1,
             'body': 2,
             'soul': 1,
             'change': 1,
             'wish': 1,
             'see': 1,
             'world': 1,
             'tell': 1,
             'truth': 1,
             'need': 1,
             'remember': 2,
             'anything': 2,
             'always': 2,
             'forgive': 2,
             'enemies': 1,
             'nothing': 2,
             'annoys': 2,
             'much': 1,
             'better': 1,
             'hated': 1,
             'yto': 1,
             'loved': 1,
             'live': 3,
             'die': 2,
             'tomorrow': 2,
             'learn': 1,
             'forever': 2,
             'three': 2,
             'word

In [9]:
# The default behaviour of the RAKE object can be changed via its constructor.
# With Metric.WORD_FREQUENCY the recorded scores equal the summed frequencies of a
# phrase's words (e.g. 'room without books': 1 + 2 + 1 = 4.0).
rake = Rake(ranking_metric=Metric.WORD_FREQUENCY)
rake.extract_keywords_from_sentences(text_list)

rake.get_ranked_phrases_with_scores()

[(4.0, 'room without books'),
 (3.0, 'live forever'),
 (3.0, 'body without'),
 (2.0, 'three words'),
 (2.0, 'remember anything'),
 (2.0, 'nothing annoys'),
 (2.0, 'live'),
 (2.0, 'everyone else'),
 (2.0, 'die tomorrow'),
 (2.0, 'always forgive'),
 (2.0, 'already taken'),
 (1.0, 'yto'),
 (1.0, 'world'),
 (1.0, 'wish'),
 (1.0, 'truth'),
 (1.0, 'tell'),
 (1.0, 'sum'),
 (1.0, 'soul'),
 (1.0, 'see'),
 (1.0, 'need'),
 (1.0, 'much'),
 (1.0, 'loved'),
 (1.0, 'like'),
 (1.0, 'life'),
 (1.0, 'learnt'),
 (1.0, 'learn'),
 (1.0, 'hated'),
 (1.0, 'goes'),
 (1.0, 'everything'),
 (1.0, 'enemies'),
 (1.0, 'change'),
 (1.0, 'better')]

In [10]:
# With Metric.WORD_DEGREE the recorded scores equal the summed degrees of a
# phrase's words (e.g. 'room without books': 3 + 5 + 3 = 11.0).
rake = Rake(ranking_metric=Metric.WORD_DEGREE)
rake.extract_keywords_from_sentences(text_list)

rake.get_ranked_phrases_with_scores()

[(11.0, 'room without books'),
 (7.0, 'body without'),
 (5.0, 'live forever'),
 (4.0, 'three words'),
 (4.0, 'remember anything'),
 (4.0, 'nothing annoys'),
 (4.0, 'everyone else'),
 (4.0, 'die tomorrow'),
 (4.0, 'always forgive'),
 (4.0, 'already taken'),
 (3.0, 'live'),
 (1.0, 'yto'),
 (1.0, 'world'),
 (1.0, 'wish'),
 (1.0, 'truth'),
 (1.0, 'tell'),
 (1.0, 'sum'),
 (1.0, 'soul'),
 (1.0, 'see'),
 (1.0, 'need'),
 (1.0, 'much'),
 (1.0, 'loved'),
 (1.0, 'like'),
 (1.0, 'life'),
 (1.0, 'learnt'),
 (1.0, 'learn'),
 (1.0, 'hated'),
 (1.0, 'goes'),
 (1.0, 'everything'),
 (1.0, 'enemies'),
 (1.0, 'change'),
 (1.0, 'better')]

In [11]:
# Let's use another dataset: DBPEDIA_train.csv, read from a path relative to the
# notebook's working directory.
dbpedia_df = pd.read_csv('./datasets/dbpedia/DBPEDIA_train.csv')

NUM_SAMPLES = 10000
RANDOM_SEED = 1000  # shared by every sampling step so a re-run selects the same rows

# Keep a fixed random subset (drawn without replacement) and renumber its index from 0.
dbpedia_df = dbpedia_df.sample(NUM_SAMPLES, random_state=RANDOM_SEED, replace=False).reset_index(drop=True)

# Preview a few rows. The preview is seeded too, so the displayed rows are the same
# on every Restart & Run All (an unseeded sample shows different rows each run).
dbpedia_df.sample(5, random_state=RANDOM_SEED)

Unnamed: 0,text,l1,l2,l3
171,The 2009–10 Indonesia Super League U-21 Final ...,Event,SportsEvent,FootballMatch
4586,Susan Banks is a fictional character on NBC's ...,Agent,FictionalCharacter,SoapCharacter
827,The 2011 Central and Western District Council ...,Event,SocietalEvent,Election
2557,The 2015 American Athletic Conference Women's ...,Event,Tournament,SoccerTournament
7845,Robert L. Green (c. 1922 – 1997) was the fashi...,Agent,Artist,FashionDesigner


In [12]:
def get_documents_for_topic(df, label, label_column='l2', text_column='text'):
    """Join the texts of every row carrying a given label into one string.

    Args:
        df: DataFrame holding at least ``label_column`` and ``text_column``.
        label: Value ``label_column`` must equal for a row to be selected.
        label_column: Column compared against ``label``; defaults to ``'l2'``.
        text_column: Column whose values are concatenated; defaults to ``'text'``.

    Returns:
        The selected texts joined with newlines, in row order. Rows whose text
        is missing are skipped; an empty string is returned when nothing matches.
    """
    topic_df = df.loc[df[label_column] == label]
    # Printed preview of the selected rows, kept as a quick sanity check.
    print(topic_df.head())

    # str.join raises TypeError on NaN (a float), so drop rows without text first.
    documents = topic_df[text_column].dropna().astype(str)
    return "\n".join(documents)

In [13]:
# Gather the text of every sampled row labelled 'Person' into one newline-joined string.
label_of_interest = 'Person'
data = get_documents_for_topic(dbpedia_df, label_of_interest)

                                                 text     l1      l2  \
5   John Warren Bettis (October 24, 1924 – June 23...  Agent  Person   
11  Aktisanes is a Nubian king who is mentioned by...  Agent  Person   
35  Frank Wilson Pritt III was a salesperson and p...  Agent  Person   
56  Sakyong Jamgon Mipham Rinpoche, Jampal Trinley...  Agent  Person   
65  Isabel Dalley (born 28 November 1997) is a Jam...  Agent  Person   

                l3  
5            Judge  
11         Monarch  
35  BusinessPerson  
56       Religious  
65     BeautyQueen  


In [14]:
import pprint
# Shared pretty-printer used to display ranked phrases.
pp = pprint.PrettyPrinter(indent=4)

def get_top_phrases(data, max_length=15, top_n=20):
    """Pretty-print the highest-scoring RAKE phrases extracted from a text.

    Args:
        data: Text to extract keyword phrases from.
        max_length: Forwarded to ``Rake(max_length=...)``; per the rake_nltk
            documentation this caps the number of words in a phrase.
        top_n: How many leading ``(score, phrase)`` entries to print. Defaults
            to 20; ``None`` prints every entry.

    Returns:
        None. The entries are printed through the module-level ``pp`` printer.
    """
    extractor = Rake(max_length=max_length)
    extractor.extract_keywords_from_text(data)
    pp.pprint(extractor.get_ranked_phrases_with_scores()[:top_n])

In [15]:
# NOTE(review): in the recorded output the top scores go to names written in
# non-Latin scripts that appear split into many single-character tokens.
get_top_phrases(data)

[   (225.0, 'ព ្ រ ះ ប ា ទយស ោ វរ ្ ម ័ នទ ី ១'),
    (196.0, 'வ ீ ரன ் ச ு ந ் தரல ி ங ் கம ்),'),
    (154.60714285714286, 'ณ ั ฐพ ิ มล ฟาร ิ ด ้ า ว ั ลเลอร ์)'),
    (135.5, 'क ै प ् टन अब ् ब ा स अल ी)'),
    (116.66666666666667, 'ရ ှ င ် စန ္ ဒလင ်္ က ာ, pronounced'),
    (   104.93008365508366,
        'gabrielle de rochechouart de mortemart married claude leonor de damas '
        'marquis de thianges'),
    (104.17857142857143, 'ณ ั ฐพ ิ มล นาฏยล ั กษณ ์; rtgs'),
    (   94.75969756140029,
        'iga wyrwał ([ ˈiɡa ˈvɨrvaw ]; born 20 february 1989 ), also known'),
    (   94.63257575757576,
        'nur ibn mujahid ibn ‘ ali ibn ‘ abdullah al dhuhi suha'),
    (   84.10000000000001,
        'سعود الكبير بن عبد العزيز بن محمد آل سعود \u200e\u200e)'),
    (81.0, 'ʃɪ ̀ ɴ sàɴda ̰ lɪ ̀ ɴgà ])'),
    (   79.74636422806697,
        '\u200b[ valeʁi ʒiskaʁ destɛ ̃]; born 2 february 1926 ), also known'),
    (79.5, 'михаи ́ л ио ́ сифович гуре ́ вич'),
    (79.5, 'и ́ горь бори ́ сови