In [1]:
from collections import OrderedDict
import numpy as np
import pandas as pd
import spacy
from spacy.lang.en.stop_words import STOP_WORDS

nlp = spacy.load('en_core_web_sm')

class TextRank4Keyword():
    """Extract keywords from text"""
    
    def __init__(self):
        self.d = 0.85 # damping coefficient, usually is .85
        self.min_diff = 1e-5 # convergence threshold
        self.steps = 10 # iteration steps
        self.node_weight = None # save keywords and its weight

    
    def set_stopwords(self, stopwords):  
        """Set stop words"""
        for word in STOP_WORDS.union(set(stopwords)):
            lexeme = nlp.vocab[word]
            lexeme.is_stop = True
    
    def sentence_segment(self, doc, candidate_pos, lower):
        """Store those words only in cadidate_pos"""
        sentences = []
        for sent in doc.sents:
            selected_words = []
            for token in sent:
                # Store words only with cadidate POS tag
                if token.pos_ in candidate_pos and token.is_stop is False:
                    if lower is True:
                        selected_words.append(token.text.lower())
                    else:
                        selected_words.append(token.text)
            sentences.append(selected_words)
        return sentences
        
    def get_vocab(self, sentences):
        """Get all tokens"""
        vocab = OrderedDict()
        i = 0
        for sentence in sentences:
            for word in sentence:
                if word not in vocab:
                    vocab[word] = i
                    i += 1
        return vocab
    
    def get_token_pairs(self, window_size, sentences):
        """Build token_pairs from windows in sentences"""
        token_pairs = list()
        for sentence in sentences:
            for i, word in enumerate(sentence):
                for j in range(i+1, i+window_size):
                    if j >= len(sentence):
                        break
                    pair = (word, sentence[j])
                    if pair not in token_pairs:
                        token_pairs.append(pair)
        return token_pairs
        
    def symmetrize(self, a):
        return a + a.T - np.diag(a.diagonal())
    
    def get_matrix(self, vocab, token_pairs):
        """Get normalized matrix"""
        # Build matrix
        vocab_size = len(vocab)
        g = np.zeros((vocab_size, vocab_size), dtype='float')
        for word1, word2 in token_pairs:
            i, j = vocab[word1], vocab[word2]
            g[i][j] = 1
            
        # Get Symmeric matrix
        g = self.symmetrize(g)
        
        # Normalize matrix by column
        norm = np.sum(g, axis=0)
        g_norm = np.divide(g, norm, where=norm!=0) # this is ignore the 0 element in norm
        
        return g_norm

    
    def get_keywords(self, number=10):
        """Print top number keywords"""
        node_weight = OrderedDict(sorted(self.node_weight.items(), key=lambda t: t[1], reverse=True))
        for i, (key, value) in enumerate(node_weight.items()):
            print(key + ' - ' + str(value))
            if i > number:
                break
        
        
    def analyze(self, text, 
                candidate_pos=['NOUN', 'PROPN'], 
                window_size=4, lower=False, stopwords=list()):
        """Main function to analyze text"""
        
        # Set stop words
        self.set_stopwords(stopwords)
        
        # Pare text by spaCy
        doc = nlp(text)
        
        # Filter sentences
        sentences = self.sentence_segment(doc, candidate_pos, lower) # list of list of words
        
        # Build vocabulary
        vocab = self.get_vocab(sentences)
        
        # Get token_pairs from windows
        token_pairs = self.get_token_pairs(window_size, sentences)
        
        # Get normalized matrix
        g = self.get_matrix(vocab, token_pairs)
        
        # Initionlization for weight(pagerank value)
        pr = np.array([1] * len(vocab))
        
        # Iteration
        previous_pr = 0
        for epoch in range(self.steps):
            pr = (1-self.d) + self.d * np.dot(g, pr)
            if abs(previous_pr - sum(pr))  < self.min_diff:
                break
            else:
                previous_pr = sum(pr)

        # Get weight for each node
        node_weight = dict()
        for word, index in vocab.items():
            node_weight[word] = pr[index]
        
        self.node_weight = node_weight

In [2]:
text = pd.read_fwf("Data/tweety.txt", index=False)
text.head()

Unnamed: 0,0,Unnamed: 1,Unnamed: 2,Unnamed: 3,Unnamed: 4,Unnamed: 5
0,"""regardez """" Furie """" sur netflix , mon cÅ“ur ...",,,,,
1,"""sair ou ficar em casa na netflix""",,,,,
2,"""Digam me filmes de romance na Netflix para ch...",,,,,
3,"""Just finished watching @netflix #AmericanSon ...",,,,,
4,"""å¤§é˜ªã�§é£²ã‚“ã�§ã‚‹ã�¨ã��ã�«ã‚«ã‚¦ãƒ³ã‚¿ãƒ¼...",ï¼�,ã€�,ã�£ã�¦è¨€ã�£ã�¦ã�Ÿã�®ã�§,ã€�,"ã��ã�£ã�¡ã�®äººã�‹ã�£ã�¦æ€�ã�£ã�Ÿï½—"""


In [3]:
tweet_list = list(text['0'])
tweet_list

['"regardez "" Furie "" sur netflix , mon cÅ“ur en miettes lÃ\xa0"',
 '"sair ou ficar em casa na netflix"',
 '"Digam me filmes de romance na Netflix para chorar"',
 '"Just finished watching @netflix #AmericanSon makes me not want to have kids . You can do everything right , still be killed because of the way you look or who you hang with ."',
 '"å¤§é˜ªã�§é£²ã‚“ã�§ã‚‹ã�¨ã��ã�«ã‚«ã‚¦ãƒ³ã‚¿ãƒ¼ã�®éš£ã�«ã�„ã�Ÿã‚«ãƒƒãƒ—ãƒ«ã�®ç”·æ€§ã�Œè©±ã�—ã�‹ã�‘ã�¦ã��ã‚Œã�¦ ã€Œ åŒ—æµ·é�“ã�¨è¨€ã�ˆã�° ã€� æ°´æ›œã�©ã�†ã�§ã�—ã‚‡ã�†ã�§ã�™ã‚ˆã�\xad ï¼Ÿ Netflixã�§ä½•å›žã‚‚è¦‹ã�¦ã‚‹ ï¼� ã€� ã�£ã�¦è¨€ã‚�ã‚Œã�¦ ã€� ã�“ã�®ç•ªçµ„ã�®è²¢çŒ®åº¦ã�¯ã�™ã�”ã�„ã�ªã�¨æ€�ã�£ã�Ÿã‚“ã�\xa0ã�‘ã�© ã€Œ ãƒ�ãƒ£ãƒ³ãƒ�ãƒ«ã�¯ã��ã�®ã�¾ã�¾ã‚‚é�¢ç™½ã�‹ã�£ã�Ÿ',
 '"Long flight . Meeting prep or watch Netflix . How to decide ?"',
 '"hey eric , how many episodes you participate ? ?"',
 '"x4 sfa o netflix premium autopagable ."',
 '"AtÃ© que enfim , 3 temporada de the crown ! ! ! Obgda Netflix .."',
 '"@TenembaumOk Ernesto por favor vea y recomien

In [4]:
short_tweets = []
for i in range(50543):
    short_tweets.append(tweet_list[i])
short_tweets

['"regardez "" Furie "" sur netflix , mon cÅ“ur en miettes lÃ\xa0"',
 '"sair ou ficar em casa na netflix"',
 '"Digam me filmes de romance na Netflix para chorar"',
 '"Just finished watching @netflix #AmericanSon makes me not want to have kids . You can do everything right , still be killed because of the way you look or who you hang with ."',
 '"å¤§é˜ªã�§é£²ã‚“ã�§ã‚‹ã�¨ã��ã�«ã‚«ã‚¦ãƒ³ã‚¿ãƒ¼ã�®éš£ã�«ã�„ã�Ÿã‚«ãƒƒãƒ—ãƒ«ã�®ç”·æ€§ã�Œè©±ã�—ã�‹ã�‘ã�¦ã��ã‚Œã�¦ ã€Œ åŒ—æµ·é�“ã�¨è¨€ã�ˆã�° ã€� æ°´æ›œã�©ã�†ã�§ã�—ã‚‡ã�†ã�§ã�™ã‚ˆã�\xad ï¼Ÿ Netflixã�§ä½•å›žã‚‚è¦‹ã�¦ã‚‹ ï¼� ã€� ã�£ã�¦è¨€ã‚�ã‚Œã�¦ ã€� ã�“ã�®ç•ªçµ„ã�®è²¢çŒ®åº¦ã�¯ã�™ã�”ã�„ã�ªã�¨æ€�ã�£ã�Ÿã‚“ã�\xa0ã�‘ã�© ã€Œ ãƒ�ãƒ£ãƒ³ãƒ�ãƒ«ã�¯ã��ã�®ã�¾ã�¾ã‚‚é�¢ç™½ã�‹ã�£ã�Ÿ',
 '"Long flight . Meeting prep or watch Netflix . How to decide ?"',
 '"hey eric , how many episodes you participate ? ?"',
 '"x4 sfa o netflix premium autopagable ."',
 '"AtÃ© que enfim , 3 temporada de the crown ! ! ! Obgda Netflix .."',
 '"@TenembaumOk Ernesto por favor vea y recomien

In [5]:
# bin_list = [0, 1500, 3000, 5000,]
chunks = [short_tweets[x:x+5000] for x in range(0, len(short_tweets), 5000)]

In [6]:
# def stripped_chunks(chunk):
#     stripped = ''
#     for chunk in chunks:
#         stripped = str(chunk).strip('[]')
#     return stripped
# stripped_0 = stripped_chunks(chunks[0])
# stripped_1 = stripped_chunks(chunks[1])
# stripped_2 = stripped_chunks(chunks[2])
# stripped_3 = stripped_chunks(chunks[3])
# stripped_4 = stripped_chunks(chunks[4])
# stripped_5 = stripped_chunks(chunks[5])
# stripped_6 = stripped_chunks(chunks[6])
# stripped_7 = stripped_chunks(chunks[7])
# stripped_8 = stripped_chunks(chunks[8])
# stripped_9 = stripped_chunks(chunks[9])
# stripped_10 = stripped_chunks(chunks[10])

In [34]:
stripped_0 = str(chunks[0]).strip('[]')
stripped_1 = str(chunks[0]).strip('[]')
stripped_0 = str(chunks[0]).strip('[]')
stripped_0 = str(chunks[0]).strip('[]')
stripped_0 = str(chunks[0]).strip('[]')
stripped_0 = str(chunks[0]).strip('[]')
stripped_0 = str(chunks[0]).strip('[]')
stripped_0 = str(chunks[0]).strip('[]')
stripped_0 = str(chunks[0]).strip('[]')
stripped_0 = str(chunks[0]).strip('[]')
stripped_0 = str(chunks[0]).strip('[]')

TypeError: strip() takes at most 1 argument (2 given)

176

In [31]:
tr4w = TextRank4Keyword()
tr4w.analyze(stripped_0, candidate_pos = ['NOUN', 'PROPN'], window_size=4, lower=False)
tr4w.get_keywords(10)

Pizza - 1.0
Wings - 1.0
Netflix - 1.0
tonight - 1.0


In [8]:
tr4w = TextRank4Keyword()
tr4w.analyze(stripped_0, candidate_pos = ['NOUN', 'PROPN'], window_size=4, lower=False)
tr4w.get_keywords(10)

Netflix - 84.51232754404933
â€ - 35.74255252174052
netflix - 31.680837105297247
# - 25.091104115822954
™ - 20.493762880468342
que - 20.091347090569318
¦ - 20.039556096555614
© - 19.880802552444027
de - 15.812756287611343
y - 13.375750760461575
� - 12.529456201487912
e - 12.259727091324988


In [9]:
tr4w = TextRank4Keyword()
tr4w.analyze(stripped_1, candidate_pos = ['NOUN', 'PROPN'], window_size=4, lower=False)
tr4w.get_keywords(10)

Netflix - 84.51232754404933
â€ - 35.74255252174052
netflix - 31.680837105297247
# - 25.091104115822954
™ - 20.493762880468342
que - 20.091347090569318
¦ - 20.039556096555614
© - 19.880802552444027
de - 15.812756287611343
y - 13.375750760461575
� - 12.529456201487912
e - 12.259727091324988


In [10]:
tr4w = TextRank4Keyword()
tr4w.analyze(stripped_2, candidate_pos = ['NOUN', 'PROPN'], window_size=4, lower=False)
tr4w.get_keywords(10)

Netflix - 84.51232754404933
â€ - 35.74255252174052
netflix - 31.680837105297247
# - 25.091104115822954
™ - 20.493762880468342
que - 20.091347090569318
¦ - 20.039556096555614
© - 19.880802552444027
de - 15.812756287611343
y - 13.375750760461575
� - 12.529456201487912
e - 12.259727091324988


In [11]:
tr4w = TextRank4Keyword()
tr4w.analyze(stripped_3, candidate_pos = ['NOUN', 'PROPN'], window_size=4, lower=False)
tr4w.get_keywords(10)

Netflix - 84.51232754404933
â€ - 35.74255252174052
netflix - 31.680837105297247
# - 25.091104115822954
™ - 20.493762880468342
que - 20.091347090569318
¦ - 20.039556096555614
© - 19.880802552444027
de - 15.812756287611343
y - 13.375750760461575
� - 12.529456201487912
e - 12.259727091324988


In [12]:
tr4w = TextRank4Keyword()
tr4w.analyze(stripped_4, candidate_pos = ['NOUN', 'PROPN'], window_size=4, lower=False)
tr4w.get_keywords(10)

Netflix - 84.51232754404933
â€ - 35.74255252174052
netflix - 31.680837105297247
# - 25.091104115822954
™ - 20.493762880468342
que - 20.091347090569318
¦ - 20.039556096555614
© - 19.880802552444027
de - 15.812756287611343
y - 13.375750760461575
� - 12.529456201487912
e - 12.259727091324988


In [13]:
tr4w = TextRank4Keyword()
tr4w.analyze(stripped_5, candidate_pos = ['NOUN', 'PROPN'], window_size=4, lower=False)
tr4w.get_keywords(10)

Netflix - 84.51232754404933
â€ - 35.74255252174052
netflix - 31.680837105297247
# - 25.091104115822954
™ - 20.493762880468342
que - 20.091347090569318
¦ - 20.039556096555614
© - 19.880802552444027
de - 15.812756287611343
y - 13.375750760461575
� - 12.529456201487912
e - 12.259727091324988


In [14]:
tr4w = TextRank4Keyword()
tr4w.analyze(stripped_6, candidate_pos = ['NOUN', 'PROPN'], window_size=4, lower=False)
tr4w.get_keywords(10)

Netflix - 84.51232754404933
â€ - 35.74255252174052
netflix - 31.680837105297247
# - 25.091104115822954
™ - 20.493762880468342
que - 20.091347090569318
¦ - 20.039556096555614
© - 19.880802552444027
de - 15.812756287611343
y - 13.375750760461575
� - 12.529456201487912
e - 12.259727091324988


In [15]:
tr4w = TextRank4Keyword()
tr4w.analyze(stripped_7, candidate_pos = ['NOUN', 'PROPN'], window_size=4, lower=False)
tr4w.get_keywords(10)

Netflix - 84.51232754404933
â€ - 35.74255252174052
netflix - 31.680837105297247
# - 25.091104115822954
™ - 20.493762880468342
que - 20.091347090569318
¦ - 20.039556096555614
© - 19.880802552444027
de - 15.812756287611343
y - 13.375750760461575
� - 12.529456201487912
e - 12.259727091324988


In [16]:
tr4w = TextRank4Keyword()
tr4w.analyze(stripped_8, candidate_pos = ['NOUN', 'PROPN'], window_size=4, lower=False)
tr4w.get_keywords(10)

Netflix - 84.51232754404933
â€ - 35.74255252174052
netflix - 31.680837105297247
# - 25.091104115822954
™ - 20.493762880468342
que - 20.091347090569318
¦ - 20.039556096555614
© - 19.880802552444027
de - 15.812756287611343
y - 13.375750760461575
� - 12.529456201487912
e - 12.259727091324988


In [17]:
tr4w = TextRank4Keyword()
tr4w.analyze(stripped_9, candidate_pos = ['NOUN', 'PROPN'], window_size=4, lower=False)
tr4w.get_keywords(10)

Netflix - 84.51232754404933
â€ - 35.74255252174052
netflix - 31.680837105297247
# - 25.091104115822954
™ - 20.493762880468342
que - 20.091347090569318
¦ - 20.039556096555614
© - 19.880802552444027
de - 15.812756287611343
y - 13.375750760461575
� - 12.529456201487912
e - 12.259727091324988


In [18]:
tr4w = TextRank4Keyword()
tr4w.analyze(stripped_10, candidate_pos = ['NOUN', 'PROPN'], window_size=4, lower=False)
tr4w.get_keywords(10)

Netflix - 84.51232754404933
â€ - 35.74255252174052
netflix - 31.680837105297247
# - 25.091104115822954
™ - 20.493762880468342
que - 20.091347090569318
¦ - 20.039556096555614
© - 19.880802552444027
de - 15.812756287611343
y - 13.375750760461575
� - 12.529456201487912
e - 12.259727091324988


In [19]:
tr4w = TextRank4Keyword()
tr4w.analyze(stripped_0, candidate_pos = ['NOUN', 'PROPN'], window_size=4, lower=False)
tr4w.get_keywords(10)

Netflix - 84.51232754404933
â€ - 35.74255252174052
netflix - 31.680837105297247
# - 25.091104115822954
™ - 20.493762880468342
que - 20.091347090569318
¦ - 20.039556096555614
© - 19.880802552444027
de - 15.812756287611343
y - 13.375750760461575
� - 12.529456201487912
e - 12.259727091324988


In [20]:
tr4w = TextRank4Keyword()
tr4w.analyze(stripped_0, candidate_pos = ['NOUN', 'PROPN'], window_size=4, lower=False)
tr4w.get_keywords(10)

Netflix - 84.51232754404933
â€ - 35.74255252174052
netflix - 31.680837105297247
# - 25.091104115822954
™ - 20.493762880468342
que - 20.091347090569318
¦ - 20.039556096555614
© - 19.880802552444027
de - 15.812756287611343
y - 13.375750760461575
� - 12.529456201487912
e - 12.259727091324988
