In [1]:
# Import packages
%time
import os
import sys
import warnings
if not sys.warnoptions:
    warnings.simplefilter("ignore") 
import numpy as np
import pandas as pd
import sklearn
import plotly.offline as py # Plotly imports
py.init_notebook_mode(connected=True)
import plotly.graph_objs as go
import plotly.tools as tls
import string # Libraries and packages for text (pre-)processing 
import re
import nltk
import matplotlib.pyplot as plt #Data Visualization 
import spacy
import gensim
import gensim.corpora as corpora
import tqdm
import pyLDAvis.gensim_models as gensimvis
import pickle 
import pyLDAvis

from pprint import pprint
from gensim.models import CoherenceModel
from gensim.utils import simple_preprocess
from nltk.tokenize import word_tokenize
from nltk.stem import SnowballStemmer
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from wordcloud import WordCloud, STOPWORDS
from sklearn.decomposition import LatentDirichletAllocation as LDA # for LDA modeling
from sklearn.model_selection import train_test_split
from scipy.cluster import hierarchy

Wall time: 0 ns


In [2]:
# Import data (dtype=str: every column is read as text, so 'Supprime' holds strings such as '0')
df = pd.read_excel('../dataGlobaleVF - Copie.xlsx', dtype=str)

# Keep the rows whose 'Supprime' flag is not '0'.
# NOTE(review): the original comment describes these as the articles that contain at least
# XAI, bias or fairness -- confirm against the spreadsheet.
# .copy() makes df_analyse an independent frame, so adding a column below does not write
# into a slice of df (chained assignment).
df_analyse = df[df['Supprime'] != '0'].copy()

# Create a column that concatenates Abstract and keywords
# (the result is NaN when either part is missing; the title is not included)
df_analyse['data_tot'] = df_analyse['Abstract'] + " " + df_analyse['keywords']

In [3]:
# Capitalization / Lower case
def Lower(data, feature):
    """
        Lower-case the text of several columns of a DataFrame.

        data    : DataFrame that is modified in place and returned
        feature : iterable of column names to convert

        Missing values are left as they are instead of raising.
    """
    for feat in feature:
        # the vectorised string accessor propagates NaN/None, unlike calling x.lower() on each cell
        data[feat] = data[feat].str.lower()
    return data

# Remove url in the text
def remove_url(text):
    """
        Return `text` with every http(s):// or www. link deleted
    """
    url_pattern = re.compile(r"https?://\S+|www\.\S+")
    return url_pattern.sub("", text)

def remove_URL(data, feature):
    """
        Apply remove_url to every cell of the listed columns (in place) and return the frame
    """
    for column in feature:
        without_links = data[column].apply(remove_url)
        data[column] = without_links
    return data

# Removing html
def remove_html(text):
    """
        Delete HTML tags (<...>) and character entities (&name; &#123; &#x1f;) from a string.
        The entity part of the pattern only lists lower-case letters.
    """
    markup = r"<.*?>|&([a-z0-9]+|#[0-9]{1,6}|#x[0-9a-f]{1,6});"
    return re.compile(markup).sub("", text)

def remove_HTML(data, feature):
    """
        Apply remove_html to every cell of the listed columns (in place) and return the frame
    """
    for column in feature:
        without_markup = data[column].apply(remove_html)
        data[column] = without_markup
    return data

# Remove Non-ASCI:
def remove_non_ascii(text):
    """
        Drop every character outside the ASCII range (code points 0x00-0x7f)
    """
    non_ascii = re.compile(r'[^\x00-\x7f]')
    return non_ascii.sub(r'', text)

def remove_non_ASCII(data, feature):
    """
        Apply remove_non_ascii to every cell of the listed columns (in place) and return the frame
    """
    for column in feature:
        ascii_only = data[column].apply(remove_non_ascii)
        data[column] = ascii_only
    return data

# Remove special characters
def remove_special_characters(text):
    """
        Delete symbols, emojis and other graphic characters listed in the ranges below.
        Note that the last range runs from U+24C2 to U+1F251, so every code point in
        between (not only emojis) is removed as well.
    """
    graphic_ranges = (
        u'\U0001F600-\U0001F64F'  # emoticons
        u'\U0001F300-\U0001F5FF'  # symbols & pictographs
        u'\U0001F680-\U0001F6FF'  # transport & map symbols
        u'\U0001F1E0-\U0001F1FF'  # flags (iOS)
        u'\U00002702-\U000027B0'
        u'\U000024C2-\U0001F251'
    )
    # one character class, matched greedily so runs of symbols disappear in one substitution
    emoji_pattern = re.compile('[' + graphic_ranges + ']+', flags=re.UNICODE)
    return emoji_pattern.sub(r'', text)

def remove_special_Characters(data, feature):
    """
        Apply remove_special_characters to every cell of the listed columns (in place)
        and return the frame
    """
    for column in feature:
        without_symbols = data[column].apply(remove_special_characters)
        data[column] = without_symbols
    return data

# Remove punctuations
def remove_punct(text):
    """
        Delete every character of string.punctuation from the text
    """
    # translation table that maps each punctuation character to "deleted"
    drop_table = str.maketrans('', '', string.punctuation)
    return text.translate(drop_table)

def remove_Punct(data, feature):
    """
        Apply remove_punct to every cell of the listed columns (in place) and return the frame
    """
    for column in feature:
        without_punct = data[column].apply(remove_punct)
        data[column] = without_punct
    return data
# TODO: expand contractions (e.g. "I'll" --> "I will")

In [4]:
%time
#Tokenization
# Function to tokenize more than one column
def Tokenize(data, feature):
    """
        Replace the text of each listed column by its list of tokens (word_tokenize),
        in place, and return the frame
    """
    for column in feature:
        tokens = data[column].apply(word_tokenize)
        data[column] = tokens
    return data

# Removing stopword like 'the'
nltk.download("stopwords")
# Start from the NLTK English stop-word list ...
stop = set(stopwords.words('english'))
# ... and add words specific to this corpus.
list_stopw_add = ['also','also','made','3d','Data','68w30 ','ai','voltage','machine','learning',
                  'intelligence','data','dataset','datasets','model','modeling','modelisation',
                  'using','system','artificial','covid19','training', 'film', 'temperature', 'copyright', 'set',
                 'from', 'subject', 're', 'edu', 'use',
                   'method', 'study','find', 'model', 'effect',
                   'show', 'approach', 'result', 'deep', 'network','propose', 'proposition', 'low',
                   'proposes', 'return', 'base', 'et', 'etc', 'high','paper', 'provide', 'intrus_detect',
                   'current', 'however', 'spinal_nerve','tax_evasion', 'cbia', 'galaxy', 'consonant', 
                   'virtualization', 'virtual', 'grasp', 'ulnar_nerve', 'time', 'proposition', 'propose',
                   'vote', 'voter', 'always', 'set', 'youtube', 'stutter', 'codon_usage', 'party', 'signal',
                   'system', 'increase', 'sequence', 'musical','music', 'expression']
# In this notebook the text is lower-cased before it is tokenized, so entries are normalised
# the same way: as written, 'Data' and '68w30 ' (trailing space) could never equal a token.
# set.update adds everything in one call instead of a list comprehension run only for its
# side effects (which also built a throw-away list of None).
stop.update(word.strip().lower() for word in list_stopw_add)

def remove_stopword(data, feature):
    """
        Filter the token lists of the listed columns (in place), keeping only the tokens
        that are not in the module-level `stop` set, and return the frame
    """
    def _keep_content_words(tokens):
        return [token for token in tokens if token not in stop]

    for column in feature:
        data[column] = data[column].apply(_keep_content_words)
    return data

# Stemming (regroup all word which have the same meaning) 
def snowball_stemmer(text):
    """
        Return the English Snowball stem of every token in `text` (a list of tokens)
    """
    english_stemmer = nltk.SnowballStemmer("english")
    return list(map(english_stemmer.stem, text))

def snowball_STEMMER(data, feature):
    """
        Apply snowball_stemmer to every cell of the listed columns (in place) and return the frame
    """
    for column in feature:
        stemmed = data[column].apply(snowball_stemmer)
        data[column] = stemmed
    return data

# Lemmatization 
from nltk.stem import WordNetLemmatizer

def lemmatize_word(text):
    """
        Lemmatize tagged tokens with WordNet.

        `text` is iterated as (word, tag) pairs; each tag is handed to
        WordNetLemmatizer.lemmatize as its second argument.
    """
    wordnet = WordNetLemmatizer()
    lemmas = []
    for word, tag in text:
        lemmas.append(wordnet.lemmatize(word, tag))
    return lemmas

# Test without POS Tagging
# One lemmatizer instance shared by every call of lemmatize_WORD
lemmatizer = WordNetLemmatizer()

def lemmatize_WORD(data, feature):
    """
        Lemmatize the token lists of the listed columns (in place) without POS tags
        and return the frame
    """
    def _lemmatize_tokens(tokens):
        return [lemmatizer.lemmatize(token) for token in tokens]

    for column in feature:
        data[column] = data[column].apply(_lemmatize_tokens)
    return data

# Vectorization
from sklearn.feature_extraction.text import TfidfVectorizer
def vectorize(text, maxx_features):
    """
        Fit a TF-IDF vectorizer limited to `maxx_features` terms on `text`
        and return the transformed matrix
    """
    tfidf = TfidfVectorizer(max_features=maxx_features)
    return tfidf.fit_transform(text)

# Vocabulary size cap for the TF-IDF matrix (4096 terms)
max_features = 2 ** 12
#X = vectorize(data[feature], max_features)

Wall time: 0 ns


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\mlndao\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [5]:
# Text cleaning
# Drop the articles whose combined text is missing. .copy() gives an independent frame so
# the column assignments below do not write into a slice of df_analyse.
df_nan = df_analyse[~df_analyse['data_tot'].isnull()].copy()
# Lower case
df_nan = Lower(df_nan, ['data_tot'])
# Remove URLs
df_nan = remove_URL(df_nan, ['data_tot'])
# Remove HTML tags and entities
# NOTE(review): remove_non_ASCII is defined in this notebook but is not applied in this
# pipeline -- confirm whether that is intended.
df_nan = remove_HTML(df_nan, ['data_tot'])
# Remove special characters
df_nan = remove_special_Characters(df_nan, ['data_tot'])
# Remove punctuation
df_nan = remove_Punct(df_nan, ['data_tot'])

# Preprocessing
# Tokenize the cleaned text into a new column (data_tot keeps the untokenized string)
nltk.download('punkt')
df_nan['data_tot_clean'] = df_nan['data_tot'].apply(word_tokenize)
# Remove stop words
df_nan = remove_stopword(df_nan, ['data_tot_clean'])
# Lemmatize with WordNet (no POS tags); the Snowball stemmer call is kept as an alternative
nltk.download('wordnet')
nltk.download('omw-1.4')
#df_nan = snowball_STEMMER(df_nan, ['data_tot_clean'])
df_nan = lemmatize_WORD(df_nan, ['data_tot_clean'])
df_nan[['data_tot_clean', 'data_tot']]

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\mlndao\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\mlndao\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\mlndao\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


Unnamed: 0,data_tot_clean,data_tot
1,"[disparity, estimation, scene, complex, geomet...",disparity estimation for a scene with complex ...
3,"[smart, hydrogel, biomedical, application, hig...",smart hydrogels for biomedical applications ar...
12,"[deploying, small, cell, pico, femto, relay, m...",deploying small cells pico femto or relays in ...
13,"[participant, electricity, market, expect, fai...",participants in an electricity market expect t...
16,"[novel, paradigm, offer, highly, biologically,...",the novel deep learning paradigm offers a high...
...,...,...
24959,"[clinical, practice, algorithmic, prediction, ...",in clinical practice algorithmic predictions m...
24968,"[explainableinterpretable, able, make, reasoni...",an explainableinterpretable machine learning m...
24970,"[nowadays, become, fundamental, component, hea...",nowadays artificial intelligence ai has become...
24977,"[recent, surge, earthquake, engineering, metho...",the recent surge in earthquake engineering is ...


In [6]:
def sent_to_words(sentences):
    """
        Yield one list of gensim-preprocessed tokens per input document.

        sentences : iterable of documents; each document is either a string or an
                    already tokenized list/tuple of words
    """
    for sentence in sentences:
        if isinstance(sentence, (list, tuple)):
            # join the tokens instead of tokenizing the Python repr of the list
            # ("['a', 'b']"), whose quotes, commas and escape sequences are not text
            raw_text = " ".join(map(str, sentence))
        else:
            raw_text = str(sentence)
        # deacc=True strips accent marks from the tokens
        yield gensim.utils.simple_preprocess(raw_text, deacc=True)

# One token list per article, taken from the cleaned column
data = df_nan['data_tot_clean'].values.tolist()
data_words = [tokens for tokens in sent_to_words(data)]

# Preview: first 30 tokens of the first document
print(data_words[0][:30])

['disparity', 'estimation', 'scene', 'complex', 'geometric', 'characteristic', 'slanted', 'highly', 'curved', 'surface', 'basic', 'important', 'issue', 'stereo', 'matching', 'traditional', 'method', 'often', 'firstorder', 'smoothness', 'prior', 'lead', 'lowcurvature', 'frontalparallel', 'disparity', 'map', 'stereo', 'framework', 'view', 'scene']


In [9]:
# Phrase detection: learn frequent word pairs first, then run a second pass over the
# bigram-merged documents to learn triples. A higher threshold yields fewer phrases.
bigram = gensim.models.Phrases(
    data_words,
    min_count=5,
    threshold=100,
)
trigram = gensim.models.Phrases(
    bigram[data_words],
    threshold=100,
)

# Phraser wrappers around the trained models, used to transform documents
bigram_mod = gensim.models.phrases.Phraser(bigram)
trigram_mod = gensim.models.phrases.Phraser(trigram)

In [10]:
# Additional cleaning
#Stopword
# NLTK English stop words followed by corpus-specific additions (kept as a list)
stop_words = stopwords.words('english') + [
    'from', 'subject', 're', 'edu', 'use',
    'method', 'study','find', 'model', 'effect',
    'show', 'approach', 'result', 'deep', 'network','propose', 'proposition', 'low',
    'proposes', 'return', 'base', 'et', 'etc', 'high','paper', 'provide', 'intrus_detect',
    'current', 'however', 'spinal_nerve','tax_evasion', 'cbia', 'galaxy', 'consonant',
    'virtualization', 'virtual', 'grasp', 'ulnar_nerve', 'time', 'proposition', 'propose',
    'vote', 'voter', 'always', 'set', 'youtube', 'stutter', 'codon_usage', 'party', 'signal',
    'system', 'increase', 'sequence', 'musical','music', 'expression',
]

# Define functions for stopwords, bigrams, trigrams and lemmatization
def remove_stopwords(texts):
    """
        Re-tokenize every document with gensim's simple_preprocess and drop the tokens
        listed in the module-level `stop_words`.

        texts : iterable of documents (strings or token lists; each is passed through str())
        Returns a list with one filtered token list per document.
    """
    # stop_words is a list: build a set once so each membership test is O(1)
    # instead of a scan of the whole list for every token
    blocked = set(stop_words)
    return [
        [word for word in simple_preprocess(str(doc)) if word not in blocked]
        for doc in texts
    ]

def make_bigrams(texts):
    """
        Run every document through the module-level bigram Phraser (bigram_mod)
    """
    merged = []
    for doc in texts:
        merged.append(bigram_mod[doc])
    return merged

def make_trigrams(texts):
    """
        Run every document through the bigram Phraser and then the trigram Phraser
        (module-level bigram_mod and trigram_mod)
    """
    merged = []
    for doc in texts:
        with_bigrams = bigram_mod[doc]
        merged.append(trigram_mod[with_bigrams])
    return merged

def lemmatization(texts, allowed_postags=('NOUN', 'ADJ', 'VERB')):
    """
        Lemmatize tokenized documents with the module-level spaCy pipeline `nlp`,
        keeping only the tokens whose coarse POS tag is in `allowed_postags`.

        texts           : iterable of token lists
        allowed_postags : iterable of spaCy `pos_` values to keep
        Returns a list with one list of lemmas per document.

        See https://spacy.io/api/annotation for the tag set.
    """
    # immutable default above avoids the shared-mutable-default pitfall;
    # a frozenset gives constant-time membership tests
    kept_tags = frozenset(allowed_postags)
    # spaCy processes whitespace-joined text; nlp.pipe batches the documents
    # instead of invoking the pipeline once per document
    joined_docs = (" ".join(sent) for sent in texts)
    return [
        [token.lemma_ for token in doc if token.pos_ in kept_tags]
        for doc in nlp.pipe(joined_docs)
    ]

In [11]:
# Drop stop words from the tokenized documents
data_words_nostops = remove_stopwords(data_words)

# Merge the detected word pairs into single bigram tokens
data_words_bigrams = make_bigrams(data_words_nostops)

# Load spaCy's small English pipeline with the parser and NER components disabled
nlp = spacy.load("en_core_web_sm", disable=['parser', 'ner'])

# Lemmatize, keeping only nouns, adjectives and verbs
data_lemmatized = lemmatization(
    data_words_bigrams,
    allowed_postags=['NOUN', 'ADJ', 'VERB'],
)

# Preview: first 30 lemmas of the first document
print(data_lemmatized[0][:30])

['disparity', 'estimation', 'scene', 'complex', 'geometric', 'characteristic', 'slant', 'curve', 'surface', 'basic', 'important', 'issue', 'stereo', 'match', 'traditional', 'firstorder', 'smoothness', 'prior', 'lead', 'lowcurvature', 'frontalparallel', 'disparity', 'map', 'framework', 'view', 'scene', 'entity', 'compact', 'smooth', 'disparity']


In [12]:
# Map every distinct lemma to an integer id
id2word = corpora.Dictionary(data_lemmatized)

# The lemmatized documents are the texts of the corpus
texts = data_lemmatized

# Bag-of-words representation: one list of (token id, count) pairs per document
corpus = list(map(id2word.doc2bow, texts))

# Preview: first 30 (id, count) pairs of the first document
print(corpus[0][:30])

[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1), (6, 1), (7, 1), (8, 1), (9, 1), (10, 1), (11, 1), (12, 1), (13, 1), (14, 2), (15, 1), (16, 2), (17, 1), (18, 1), (19, 1), (20, 1), (21, 1), (22, 9), (23, 2), (24, 1), (25, 1), (26, 5), (27, 1), (28, 3), (29, 1)]


In [13]:
# supporting function
def compute_coherence_values(corpus, dictionary, k, a, b, texts=None):
    """
        Train an LDA model and return its c_v coherence score.

        corpus     : bag-of-words corpus used to train the model
        dictionary : gensim Dictionary used both for training and for the coherence computation
        k          : number of topics
        a          : alpha prior (number or a gensim keyword such as 'symmetric')
        b          : eta prior (number or a gensim keyword such as 'symmetric')
        texts      : tokenized documents for the coherence model; defaults to the
                     module-level `data_lemmatized`, as before
    """
    if texts is None:
        texts = data_lemmatized

    lda_model = gensim.models.LdaMulticore(corpus=corpus,
                                           id2word=dictionary,
                                           num_topics=k,
                                           random_state=100,
                                           chunksize=100,
                                           passes=10,
                                           alpha=a,
                                           eta=b)

    # use the dictionary that was passed in (the global id2word was used here before,
    # silently ignoring the argument)
    coherence_model_lda = CoherenceModel(model=lda_model, texts=texts,
                                         dictionary=dictionary, coherence='c_v')

    return coherence_model_lda.get_coherence()

In [None]:
# Grid search over number of topics, alpha and eta, on two corpus sizes

# NOTE(review): `grid` is not read anywhere in this cell; kept in case a later cell uses it.
grid = {}
grid['Validation_Set'] = {}

# Topics range
min_topics = 2
max_topics = 11
step_size = 1
topics_range = range(min_topics, max_topics, step_size)

# Alpha parameter: numeric values from np.arange(0.01, 1, 0.3) plus gensim's keyword priors
alpha = list(np.arange(0.01, 1, 0.3))
alpha.append('symmetric')
alpha.append('asymmetric')

# Beta parameter (passed to gensim as eta)
beta = list(np.arange(0.01, 1, 0.3))
beta.append('symmetric')

# Validation sets: the first 75% of the documents, and the full corpus
num_of_docs = len(corpus)
corpus_sets = [gensim.utils.ClippedCorpus(corpus, int(num_of_docs*0.75)), 
               corpus]

corpus_title = ['75% Corpus', '100% Corpus']

model_results = {'Validation_Set': [],
                 'Topics': [],
                 'Alpha': [],
                 'Beta': [],
                 'Coherence': []
                }

# Can take a long time to run: set to False to skip the search
run_grid_search = True
if run_grid_search:
    total_runs = len(corpus_sets) * len(topics_range) * len(alpha) * len(beta)
    # the context manager closes the progress bar even if a run raises
    with tqdm.tqdm(total=total_runs) as pbar:
        # iterate through the validation corpora together with their labels
        for title, corpus_subset in zip(corpus_title, corpus_sets):
            # iterate through number of topics
            for k in topics_range:
                # iterate through alpha values
                for a in alpha:
                    # iterate through beta values
                    for b in beta:
                        # get the coherence score for the given parameters
                        cv = compute_coherence_values(corpus=corpus_subset, dictionary=id2word,
                                                      k=k, a=a, b=b)
                        # Save the model results
                        model_results['Validation_Set'].append(title)
                        model_results['Topics'].append(k)
                        model_results['Alpha'].append(a)
                        model_results['Beta'].append(b)
                        model_results['Coherence'].append(cv)

                        pbar.update(1)
    pd.DataFrame(model_results).to_csv('lda_tuning_results_ak.csv', index=False) # Save results

In [14]:
# Final model, trained on the full corpus with the selected hyper-parameters
num_topics = 8
lda_model = gensim.models.LdaMulticore(
    corpus=corpus,
    id2word=id2word,
    num_topics=num_topics,
    random_state=100,
    chunksize=100,
    passes=10,
    alpha=0.91,
    eta=0.91,
)

In [17]:
# Apply the trained model to the corpus (topic distribution of each document)
doc_lda = lda_model[corpus]
# Show the weighted keywords of each topic
pprint(lda_model.print_topics())

[(0,
  '0.043*"bias" + 0.008*"cognitive" + 0.007*"attention" + 0.007*"participant" '
  '+ 0.006*"negative" + 0.006*"attentional" + 0.006*"individual" + '
  '0.005*"task" + 0.005*"find" + 0.005*"positive"'),
 (1,
  '0.002*"spam" + 0.002*"email" + 0.001*"election" + 0.001*"wage" + '
  '0.001*"corrupt" + 0.001*"spammer" + 0.001*"metarule" + '
  '0.001*"income_inequality" + 0.001*"skillbiase" + 0.001*"rd"'),
 (2,
  '0.019*"bias" + 0.006*"field" + 0.006*"magnetic" + 0.005*"property" + '
  '0.005*"exchange" + 0.005*"structure" + 0.004*"cell" + 0.004*"increase" + '
  '0.004*"device" + 0.004*"decrease"'),
 (3,
  '0.013*"feature" + 0.012*"propose" + 0.011*"performance" + '
  '0.010*"classification" + 0.009*"image" + 0.008*"problem" + 0.008*"accuracy" '
  '+ 0.007*"bias" + 0.007*"base" + 0.007*"neural"'),
 (4,
  '0.006*"traffic" + 0.006*"resource" + 0.005*"fairness" + '
  '0.005*"communication" + 0.004*"distribute" + 0.004*"game" + 0.004*"device" '
  '+ 0.004*"service" + 0.004*"user" + 0.004*"pr

In [18]:
# Visualize the topics
pyLDAvis.enable_notebook()

# The pickle and the HTML export share one base path:
# results/abstract_kw/ldavis_tuned_ak_test<num_topics>
LDAvis_results_dir = os.path.join('results', 'abstract_kw')
# create the output folder if needed so the writes below cannot fail on a fresh checkout
os.makedirs(LDAvis_results_dir, exist_ok=True)
LDAvis_data_filepath = os.path.join(LDAvis_results_dir, 'ldavis_tuned_ak_test' + str(num_topics))

# Preparing the visualization is a bit time consuming: set to False to reuse
# the file already saved on disk
prepare_ldavis = True
if prepare_ldavis:
    LDAvis_prepared = gensimvis.prepare(lda_model, corpus, id2word, R=25, sort_topics=False)
    with open(LDAvis_data_filepath, 'wb') as f:
        pickle.dump(LDAvis_prepared, f)

# load the pre-prepared pyLDAvis data from disk
# NOTE: pickle.load executes arbitrary code from the file; only load files written by this notebook
with open(LDAvis_data_filepath, 'rb') as f:
    LDAvis_prepared = pickle.load(f)

pyLDAvis.save_html(LDAvis_prepared, LDAvis_data_filepath + '.html')

LDAvis_prepared


the imp module is deprecated in favour of importlib; see the module's documentation for alternative uses

