In [1]:
# Python requisites
from collections import Counter
from scipy.sparse import csr_matrix, hstack
from tqdm import tqdm

import numpy as np
import time
import pandas as pd
import os
import re

import pickle
import matplotlib.pyplot as plt


# Graph / Visualization
from networkx.drawing.nx_agraph import graphviz_layout
import networkx as nx
import pygraphviz


## Tweet preprocessor
import preprocessor as p

## NLTK tokenization / lemmatization
import nltk

from nltk.tokenize import treebank
from nltk.tokenize import word_tokenize
from nltk.tokenize import TweetTokenizer

from nltk.corpus import stopwords
from nltk.corpus import wordnet as wn
from nltk.corpus.reader.wordnet import Synset
from nltk.corpus import sentiwordnet as swn
from nltk.corpus.reader import WordListCorpusReader
from nltk.corpus import opinion_lexicon

from nltk.wsd import lesk
from nltk.stem.wordnet import WordNetLemmatizer
import nltk.sentiment


In [2]:
# Build a graph with one node per WordNet synset name and one node per lemma
# name, with an edge between every synset and each of its lemmas.
synset_list = list(wn.all_synsets())
wordnet_graph_synset = nx.Graph(engine='sfdp', pack=True)

seen = set()  # lemma names that have already been added as nodes
for ss in tqdm(synset_list):
    wordnet_graph_synset.add_node(ss.name())
    for lm in ss.lemmas():
        _lm = lm.name()
        if _lm not in seen:
            seen.add(_lm)
            wordnet_graph_synset.add_node(_lm)
        wordnet_graph_synset.add_edge(_lm, ss.name())

# Persist the graph. nx.write_gpickle was deprecated in networkx 2.6 and
# removed in 3.0; pickling directly works on every networkx version.
path = './wordnet_graph_synset.p'
with open(path, 'wb') as f:
    pickle.dump(wordnet_graph_synset, f, pickle.HIGHEST_PROTOCOL)

100%|██████████| 117659/117659 [00:01<00:00, 81441.59it/s]


In [3]:
# Derive a second graph whose nodes are the synset names only: two synsets are
# linked when they are two hops apart in wordnet_graph_synset
# (synset -> lemma -> synset), i.e. when they share a lemma.
wordnet_graph_synset_cleaned = nx.Graph(engine='sfdp', pack=True)

for ss in tqdm(synset_list):
    wordnet_graph_synset_cleaned.add_node(ss.name())

for ss in tqdm(synset_list):
    # Everything reachable in exactly two hops from this synset.
    # NOTE(review): the synset itself is among its own two-hop neighbours, so
    # this also adds a self-loop per synset -- confirm whether that is wanted.
    nb_list = [k
               for m in wordnet_graph_synset.neighbors(ss.name())
               for k in wordnet_graph_synset.neighbors(m)]
    for nb in nb_list:
        if not wordnet_graph_synset_cleaned.has_edge(ss.name(), nb):
            wordnet_graph_synset_cleaned.add_edge(ss.name(), nb)

# Persist the graph. nx.write_gpickle was deprecated in networkx 2.6 and
# removed in 3.0; pickling directly works on every networkx version.
path = './wordnet_graph_synset_cleaned.p'
with open(path, 'wb') as f:
    pickle.dump(wordnet_graph_synset_cleaned, f, pickle.HIGHEST_PROTOCOL)

100%|██████████| 117659/117659 [00:00<00:00, 350459.37it/s]
100%|██████████| 117659/117659 [00:02<00:00, 58450.56it/s]


In [7]:
# Sample text for the tokenizer / WSD experiments below: the definition (gloss)
# of one synset. Note that `df` is a plain string here, not a DataFrame.
df = wn.synset('amazing.s.02').definition()

In [26]:
from nltk.tokenize import word_tokenize
from nltk.tokenize import treebank
from nltk.tokenize import TweetTokenizer

In [16]:
# Show the sample definition string.
df

'inspiring awe or admiration or wonder'

In [11]:
# Tokenize the sample definition with NLTK's word_tokenize.
word_tokenize(df)

['inspiring', 'awe', 'or', 'admiration', 'or', 'wonder']

In [15]:
# Same sample through the Treebank tokenizer, to compare against word_tokenize.
tb_tkn = treebank.TreebankWordTokenizer()
tb_tkn.tokenize(df)

['inspiring', 'awe', 'or', 'admiration', 'or', 'wonder']

In [30]:
# Same sample through the tweet-oriented tokenizer, for comparison.
tw_tkn = TweetTokenizer()
tw_tkn.tokenize(df)

['inspiring', 'awe', 'or', 'admiration', 'or', 'wonder']

In [27]:
# Negation

# Negated contractions (lower-case spelling) mapped to their expanded form.
replace_dict = {
    "don't": "do not",
    "won't": "will not",
    "didn't": "did not",
    "doesn't": "does not",
    "can't": "can not",
    "couldn't": "could not",
    "isn't": "is not",
    "shouldn't": "should not",
    "wouldn't": "would not",
    "wasn't": "was not",
    "weren't": "were not",
    "haven't": "have not",
    "ain't": "is not",
    "aren't": "are not",
}

def replace_word(text):
    """Expand the negated contractions listed in ``replace_dict``.

    Two spellings of each contraction are handled: all lower-case
    (``don't`` -> ``do not``) and capitalised first letter
    (``Don't`` -> ``Do not``). Any other casing (e.g. ``DON'T``) is left
    untouched. Matching is plain substring replacement.

    Args:
        text: The string to process.

    Returns:
        A new string with every occurrence of both spellings expanded.
    """
    for word, expansion in replace_dict.items():
        # Lower-case spelling.
        text = text.replace(word, expansion)
        # Capitalised spelling. This is applied unconditionally: guarding it
        # with `elif` would skip "Don't" whenever "don't" also occurs in the
        # same text.
        text = text.replace(word[0].title() + word[1:],
                            expansion[0].title() + expansion[1:])

    return text

def neg_tagging(word_list):
    """Mark the words that follow a negation cue with a ``NEG_`` prefix.

    The tokens are joined with single spaces. Starting at each cue word
    (``not``, ``never`` or ``no``, case-insensitive), every following word is
    prefixed with ``NEG_`` until the first character that is neither a word
    character nor whitespace, or until the end of the string. The cue word
    itself is not prefixed.

    Args:
        word_list: Iterable of token strings.

    Returns:
        The joined string with ``NEG_`` prefixes inserted.
    """
    string = ' '.join(word_list)
    # The scope ends at punctuation OR at end of input; without the `$`
    # alternative a negation in text with no trailing punctuation would never
    # be tagged.
    transformed = re.sub(r'\b(?:not|never|no)\b[\w\s]+(?:[^\w\s]|$)',
           lambda match: re.sub(r'(\s+)(\w+)', r'\1NEG_\2', match.group(0)),
           string,
           flags=re.IGNORECASE)

    return transformed

In [28]:
def negate(word_list):
    """Split negation-tagged text back into tokens plus negation flags.

    Runs ``neg_tagging`` on ``word_list``, splits the result on whitespace and
    strips the ``NEG_`` marker from every token that starts with it.

    Returns:
        A ``(tokens, token_pair_list)`` tuple: ``tokens`` is the list of
        marker-free tokens and ``token_pair_list`` holds ``(token, negated)``
        pairs, where ``negated`` is True for tokens that carried the marker.
    """
    marker = 'NEG_'
    token_pair_list = [
        (piece[len(marker):], True) if piece.startswith(marker) else (piece, False)
        for piece in neg_tagging(word_list).split()
    ]
    tokens = [token for token, _ in token_pair_list]

    return tokens, token_pair_list

In [31]:
# Expand negated contractions in the sample text first, then tokenize it.
df_tk = word_tokenize(replace_word(df))

In [32]:
# Show the resulting token list.
df_tk

['inspiring', 'awe', 'or', 'admiration', 'or', 'wonder']

In [34]:
from pywsd.lesk import simple_lesk, original_lesk, cosine_lesk, adapted_lesk
from pywsd import disambiguate
from pywsd.similarity import max_similarity

In [37]:
# Word-sense-disambiguate the sample definition with pywsd, passing cosine_lesk
# as the algorithm; the result pairs each token with a Synset or None.
df_pair = disambiguate(df, cosine_lesk)

In [38]:
# Show the (token, synset) pairs.
df_pair

[('inspiring', Synset('inspire.v.01')),
 ('awe', Synset('awe.n.01')),
 ('or', None),
 ('admiration', Synset('admiration.n.01')),
 ('or', None),
 ('wonder', Synset('wonder.n.01'))]

In [40]:
# All WordNet synsets as a list. This is the same expression the
# graph-building cell uses; it is re-evaluated here.
synset_list = list(wn.all_synsets())

In [45]:
from tqdm import tqdm

In [None]:
# Disambiguate the definition of every synset and collect all resulting
# (token, synset-or-None) pairs into one flat list.
# NOTE(review): this is very slow -- the recorded progress bar shows about
# 11 it/s over 117659 synsets -- and nothing is saved until the loop finishes.
# NOTE(review): the loop rebinds the notebook globals `df` and `df_pair` that
# earlier cells set and displayed.
all_pairs_from_definition = []
for ss in tqdm(synset_list):
    df = ss.definition()  # definition text of the current synset
    df_pair = disambiguate(df, cosine_lesk)
    all_pairs_from_definition.extend(df_pair)

 25%|██▍       | 29110/117659 [43:31<2:12:24, 11.15it/s]