In [491]:
import pandas as pd
import re

# Load the raw publication export. keep_default_na=False keeps empty cells as ''
# (not NaN), which the later `!= ''` checks in this notebook rely on.
data = pd.read_csv(
    'data.tsv',
    sep='\t',
    keep_default_na=False,
    low_memory=False,
)

# Metadata columns carried through to the final modelling table.
data_req = data[[
    'article_title',
    'authors',
    'source_title',
    'times_cited_all',
    'times_cited_wos',
    '180_days_usage',
    'since_2013_usage',
    'publication_year',
    'number_of_pages',
    'wos_categories',
    'research_areas',
    'highly_cited_status',
    'hot_paper_status',
    'funding_text',
]]

# Switch off pandas' chained-assignment warning for the whole session.
pd.set_option('mode.chained_assignment', None)

In [39]:
import dill
# Checkpoint: serialise the current interpreter session to 'notebook_env.db' so it can be restored later.
dill.dump_session('notebook_env.db') # Save session to dump file

In [11]:
import dill
# NOTE(review): restoring a dill session unpickles arbitrary objects - only load dump files you created yourself.
dill.load_session('notebook_env.db') # Restore session from dump file

In [31]:
# TODO Abstract preprocess

# Normalise abstracts for gensim in a single pass: drop every character that is
# neither a word character nor whitespace, then lower-case the remainder.
gensim_data = data.abstract.map(lambda text: re.sub(r'[^\w\s]', '', text).lower())
gensim_data.to_csv('gensim_data_prepped.tsv', sep="\t")
gensim_data.head()  # kept as the last expression so the preview is actually rendered

In [None]:
# Lemmatization

import spacy

nlp = spacy.load("en_core_web_sm")

# The preprocessing cell leaves `gensim_data` as a Series, which has no `.abst`
# attribute; a DataFrame with an `abst` column is still accepted.
# NOTE(review): confirm which of the two shapes is the intended input here.
abstract_texts = getattr(gensim_data, 'abst', gensim_data)

# Replace every token by its lemma and re-join into one space-separated string per abstract.
abstracts_isolated = abstract_texts.apply(lambda text: " ".join(token.lemma_ for token in nlp(text)))

In [None]:
# Initialize data preprocessing dataset, extract addresses and parties from publication metadata

from countrygroups import EUROPEAN_UNION

# Work on a copy so the list handed out by `countrygroups` is never extended in
# place (re-running the cell would otherwise append the same names again if
# `.names` returns a shared list - NOTE(review): not verified against the package).
countries = list(EUROPEAN_UNION.names)
countries.extend(['England','Wales','Scotland','Switzerland','Norway']) # Extend non-EU countries to match continental Europe
data_calc = data[['article_title','addresses']]
# .copy() so the column assignments below write into an independent frame rather than a slice of `data`.
data_calc = data_calc[data_calc.addresses != ''].copy() # Filter out publications with no address submitted
# Raw string: the pattern is unchanged, but `\[` / `\]` are no longer invalid escapes in a normal str literal.
data_calc['addresses'] = data_calc['addresses'].str.replace(r' *\[[^\]]*] ', '', regex=True) # Remove [Author/Authors]
data_calc['addresses'] = data_calc['addresses'].str.split(';') # Create a list of addresses
# One row per address; every row keeps the index label of its publication.
data_calc = data_calc.explode('addresses')
# data_calc = data_calc['addresses'].str.extractall(r'(?<=\] )(.*?)(?=\;|$)|(?=^[^\[])(.*?)(?=\;|$)').droplevel(1).drop(labels=1, axis=1)
# Institution = text before the first comma; country = text after the last ", ".
# Addresses without a comma yield NaN in both columns.
data_calc['university'] = data_calc['addresses'].str.extract(r'^(.+?),')
data_calc['country'] = data_calc['addresses'].str.extract(r'^.*\, (.*)$')
# data_calc = data_calc[data_calc.groupby(level=0)['country'].transform(lambda x : x.isin(countries).any())]

In [523]:
# Match extracted parties to FT-list European business schools

from fuzzywuzzy import process

# FT-list school names, read without a header row so the names live in column 0.
schools_ft = pd.read_csv('schools_ft.csv', header=None, sep='\t')
# Fuzzy-match every extracted institution against the FT list with a score cutoff of 88.
# Each result is wrapped in a one-element list; extract_school() below unwraps it and
# maps a missing match (None) to ''.
# NOTE(review): `university` can be NaN when no institution was extracted - confirm how extractOne treats that.
data_calc['bs']=data_calc['university'].apply(lambda x : [process.extractOne(x, schools_ft[0], score_cutoff=88)])

def extract_school(data): # NaN value fix
  """Unwrap one-element match lists into plain school names.

  Every item of `data` is a one-element list holding either a match tuple
  (its first field is taken as the school name) or None. None becomes ''.
  Returns a list with one string per item.
  """
  return [wrapped[0][0] if wrapped[0] is not None else '' for wrapped in data]

# Replace the wrapped match results with plain school names ('' where nothing matched).
data_calc['bs'] = extract_school(data_calc['bs'])

In [524]:
# Create binaries true/false for each party of each publication

import numpy as np

# Attach each publication's funding text to its party rows (join on the shared index).
# NOTE(review): re-running this cell on the already-joined frame would presumably fail,
# because 'funding_text' then exists on both sides and no suffix is given.
data_calc = data_calc.join(data[['funding_text']])

def bs_tf(row): # Business school
  """Return True when the row has a matched FT-list school, i.e. 'bs' is not ''."""
  return not row['bs'] == ''

def univ_tf(row): # University
  """Return True when the institution name contains 'Univ' or 'univ'.

  `row['university']` is NaN (a float) whenever no institution could be
  extracted from the address. Such non-string values count as "not a
  university" instead of raising TypeError on the `in` test.
  """
  name = row['university']
  if not isinstance(name, str):
    return False
  return 'Univ' in name or 'univ' in name

# Row-wise flags per party: matched FT-list school / institution name mentions "Univ".
data_calc['bs_tf'] = data_calc.apply(bs_tf, axis=1) # Apply
data_calc['univ_tf'] = data_calc.apply(univ_tf, axis=1) # Apply

def univ_bs_tf(row): # Whether business school is university-based
  """Return True only when both the 'bs_tf' and the 'univ_tf' flag of the row are truthy."""
  return bool(row['bs_tf'] and row['univ_tf'])

def fund_tf(row): # Funding text
  """Return True when the row's 'funding_text' differs from the empty string."""
  return bool(row['funding_text'] != '')

# Derived row-wise flags: FT-list school that is also a university / publication has funding text.
data_calc['univ_bs_tf'] = data_calc.apply(univ_bs_tf, axis=1) # Apply
data_calc['fund_tf'] = data_calc.apply(fund_tf, axis=1) # Apply

In [None]:
# Completion of parties data preparation

# `data_calc` has one row per party and repeats the publication's index label,
# so grouping on index level 0 folds the parties back into publications.
_per_publication = data_calc.groupby(level=0)
_bs_total = _per_publication['bs_tf'].sum()
_univ_total = _per_publication['univ_tf'].sum()
_univ_bs_total = _per_publication['univ_bs_tf'].sum()
_fund_total = _per_publication['fund_tf'].sum()

data_fin = pd.DataFrame({
    'count': _per_publication.size(),                   # Total parties involved
    'bs_count': _bs_total,                              # Total FT-list business schools
    'bs_bin': np.where(_bs_total > 0, 1, 0),            # 1 if at least one FT-list school
    'univ_count': _univ_total,                          # Total universities
    'univ_bin': np.where(_univ_total > 0, 1, 0),        # 1 if at least one university
    'univ_bs_count': _univ_bs_total,                    # Total university-based FT-list schools
    'univ_bs_bin': np.where(_univ_bs_total > 0, 1, 0),  # 1 if at least one of those
    'fund_bin': np.where(_fund_total > 0, 1, 0),        # 1 if funding text present (value transfer)
})

# FT-list school involved, but none of the FT-list schools is university-based.
_ft_without_univ = (data_fin.bs_bin == 1) & (data_fin.univ_bs_bin == 0)
data_fin['notuniv_bs_bin'] = np.where(_ft_without_univ, 1, 0)

# Final prepared data: the right join keeps only publications that have party rows.
data_fin = data_req.join(data_fin, how='right')

In [79]:
# Save calculated/matched business schools to all authors for all papers

# Persist the per-party table. The file is tab-separated despite the .csv extension,
# missing values are written as empty strings, and the index goes out as the first column.
data_calc.to_csv('data_calc_fuzzy.csv', sep = '\t', na_rep='')

In [103]:
# Load saved data

# index_col=0 restores the publication index that to_csv wrote as the first
# column; the later groupby(level=0) and join calls align on it.
# keep_default_na=False keeps the '' placeholders as strings instead of NaN, so
# checks such as `row['bs'] == ''` still work after the round trip (same
# convention as the initial data.tsv load).
data_calc = pd.read_csv('data_calc_fuzzy.csv', sep = '\t', index_col=0, keep_default_na=False)

def del_none(data): # Function in case of [None]
  """Return the items of `data` as a list, with every literal string '[None]' replaced by ''."""
  return ['' if entry == '[None]' else entry for entry in data]

# Clean the reloaded 'bs' column: the literal string '[None]' becomes ''.
data_calc['bs'] = del_none(data_calc['bs']) # Fix in case of [None]

In [477]:
# Load previously lemmed abstracts

# Read the cached lemmatised abstracts, using the saved 'Unnamed: 0' column as the index.
abstracts_isolated = pd.read_csv('abstracts_lemmed.tsv', sep='\t', index_col='Unnamed: 0')
# NOTE(review): this writes the frame straight back to the file it was just read from -
# looks redundant; confirm it is intentional before relying on the cache file.
abstracts_isolated.to_csv('abstracts_lemmed.tsv', sep = '\t')

In [478]:
# Prepare abstracts: tokenization and stop-word removal 

from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS

# Project-specific stop words added on top of the imported STOPWORDS set.
# The literal list contains a few repeated entries (e.g. 'suggest', 'elsevi'); they
# collapse harmlessly because the list is turned into a set first.
# NOTE(review): entries such as 'datum' look like lemma forms, presumably to match the lemmatised abstracts - confirm.
my_stop_words = STOPWORDS.union(set(['high','new','finding','theory','propose','article','set','focus','elsevi','consider','offer','present','suggest','test','impact','level','relationship','implication','support','suggest','analysis','subject','re','edu','use','research','study','result','firm','model','effect','problem','performance','datum','paper','information','examine','role','develop','increase','provide','relationship','author','affect','demonstrate','include','evidence','explore','literature','relate','base','investigate','manager','management','approach','strong','influence','practice','potential','framework','difference','argue','time','process','practice','low','empirical','issue','negative','measure','create','lead','sample','outcome','type','positive','discuss','associate','design','change','significant','period','factor','analyze','individual','organisation','organization','organisational','organizational','decision','right','elsevi','reserve','john','wiley','sons','shed','light','doi','jibs','highlight','importance','wide','range','international','journal','summary']))

def rm_stop(text):
  """Tokenise every document and drop the words listed in `my_stop_words`.

  Each element of `text` is converted with str() before it is handed to
  gensim's simple_preprocess, so non-string elements (token lists, NaN) are
  tokenised from their string representation - a NaN ends up as ['nan'].
  Returns one list of kept tokens per document, in input order.
  """
  cleaned_docs = []
  for doc in text:
    tokens = simple_preprocess(str(doc))
    cleaned_docs.append([word for word in tokens if word not in my_stop_words])
  return cleaned_docs

# Tokenise and filter the lemmatised abstracts. This rebinds `abstracts_isolated` from a
# DataFrame to a plain list of token lists, so the cell cannot be re-run without first
# re-running the load cell above.
abstracts_isolated = rm_stop(abstracts_isolated.abst.str.split(' '))

In [575]:
# Initialize model dataset and apply content filters

# Attach the token lists and the DOI. `abstracts_isolated` is a plain list, so its
# frame gets a 0..n-1 index and the join lines up on index labels.
model_calc = data_fin.join(pd.DataFrame({'abst': abstracts_isolated}))
model_calc = model_calc.join(data[['doi','article_title']], rsuffix='_data')

# Empty abstracts removal (a missing abstract was tokenised to ['nan'])
_has_abstract = ~model_calc.abst.isin([['nan']])
model_calc = model_calc[_has_abstract]

# Duplicates by DOI removal (hand-picked)
doi_filter = [
    '10.1002/smj.499',
    '10.1007/s10551-009-0354-z',
    '10.1177/1042258719890991',
    '10.1287/mnsc.2020.3636',
    '10.25300/MISQ/2018/11914',
    '10.1287/mnsc.2020.3587',
    '10.1007/s10551-010-0523-0',
    '10.1007/s10551-010-0497-y',
    '10.1007/s10551-010-0663-2',
    '10.1007/s10551-010-0697-5',
]
model_calc = model_calc[~model_calc.doi.isin(doi_filter)]

# Duplicates by title removal (hand-picked)
title_filter = [
    'Political Cycles and Stock Returns',
    'Firm Volatility in Granular Networks',
    'Copyrights and Creativity: Evidence from Italian Opera in the Napoleonic Age',
    'Productivity and Organization in Portuguese Firms',
    'Occupational Licensing and Maternal Health: Evidence from Early Midwifery Laws',
    'Equilibrium Labor Market Search and Health Insurance Reform',
    "The Influence of Retail Management's Use of Social Power on Corporate Ethical Values, Employee Commitment, and Performance",
    'You Support Diversity, But Are You Ethical? Examining the Interactive Effects of Diversity and Ethical Climate Perceptions on Turnover Intentions',
    'Advertising Spending and Media Bias: Evidence from News Coverage of Car Safety Recalls',
]
model_calc = model_calc[~model_calc.article_title.isin(title_filter)]

# The suffixed copy of the title was only a by-product of the DOI join.
model_calc = model_calc.drop(columns='article_title_data')

# A single .loc[mask, column] assignment writes into model_calc itself. The
# chained form `model_calc['publication_year'].loc[mask] = ...` assigns through
# an intermediate Series and is not guaranteed to update the frame.
model_calc.loc[model_calc.publication_year == 0, 'publication_year'] = 2022 # Make early-access a 2022 publication
model_calc = model_calc[model_calc.publication_year >= 2005] # Keep publications from 2005 onwards (inclusive)

In [576]:
# Extract wos categories and turn into area binaries

# Turn the '; '-separated strings into Python lists so they can be exploded below.
# NOTE(review): the columns are overwritten in place, so re-running this cell would
# apply .str.split to lists instead of strings - re-run the previous cell first.
model_calc.research_areas = model_calc.research_areas.str.split('; ') # Break RA into lists
model_calc.wos_categories = model_calc.wos_categories.str.split('; ') # Break WOS cat into lists

def extract_wos_areas(data):
  """Build per-publication count columns for WoS research areas and categories.

  Parameters
  ----------
  data : DataFrame with list-valued 'research_areas' and 'wos_categories' columns.
      Side effect: an 'index' column (a copy of the row index) is added to
      `data` in place; the calling cell drops it again after the join.

  Returns
  -------
  DataFrame indexed by the original row labels with one count column per
  research area and per WoS category (a category whose name clashes with an
  area name carries the suffix '_2'), plus the 0/1 columns 'phys_sci',
  'soc_sci' and 'tech'.

  A category named in the groupings below that does not occur in `data` counts
  as zero instead of raising KeyError.
  """
  data['index'] = data.index
  cat1 = data.explode('research_areas').pivot_table(index='index', columns='research_areas', aggfunc="size", fill_value=0)
  cat2 = data.explode('wos_categories').pivot_table(index='index', columns='wos_categories', aggfunc='size', fill_value=0)

  areas = cat1.join(cat2, rsuffix='_2') # Compose WOS area dataset

  # Source columns that make up each broad area.
  groupings = {
    'phys_sci': ['Mathematics', 'Mathematics, Interdisciplinary Applications', 'Statistics & Probability'],
    'soc_sci': ['Business & Economics', 'Mathematical Methods In Social Sciences', 'Psychology',
                'Social Sciences - Other Topics', 'Business', 'Business, Finance', 'Economics', 'Ethics',
                'Management', 'Psychology, Applied', 'Psychology, Social',
                'Social Sciences, Interdisciplinary', 'Social Sciences, Mathematical Methods'],
    'tech': ['Computer Science', 'Engineering', 'Information Science & Library Science',
             'Information Science & Library Science_2', 'Operations Research & Management Science',
             'Operations Research & Management Science_2', 'Computer Science, Information Systems',
             'Engineering, Manufacturing'],
  }

  # Compute every total before adding the new columns. reindex() supplies an
  # all-zero column for absent categories; skipna=False keeps NaN propagation
  # identical to adding the columns one by one.
  totals = {
    area: areas.reindex(columns=members, fill_value=0).sum(axis=1, skipna=False)
    for area, members in groupings.items()
  }
  for area, total in totals.items():
    areas[area] = np.where(total > 0, 1, 0) # 1 when any member category is present
  return areas

# extract_wos_areas() adds a helper 'index' column to model_calc in place; it is dropped again once the area columns are joined.
model_calc = model_calc.join(extract_wos_areas(model_calc)).drop('index', axis=1) # Join with model dataset

In [577]:
# Save for transfer

# Tab-separated export of the model dataset (before topic assignment); a later cell writes to the same file again.
model_calc.to_csv('data_to_model.csv', sep = '\t')

In [582]:
# Number of publications for every combination of year and the three broad-area flags.
# The result is only stored in `aaa`; nothing is displayed by this cell.
aaa = model_calc.groupby(['publication_year','soc_sci','phys_sci','tech']).size()

In [480]:
# Initialize the gensim model from the finalized publications dataset

import gensim.corpora as corpora
from gensim.models import ldamodel, CoherenceModel, LdaMulticore, Phrases, phrases

# Vocabulary over all token lists, then one bag-of-words vector per abstract.
id2word = corpora.Dictionary(model_calc.abst)
corpus = list(map(id2word.doc2bow, model_calc.abst))

# 20-topic LDA; chunksize=len(corpus) feeds the whole corpus as a single chunk,
# and random_state pins the seed.
lda_model = ldamodel.LdaModel(
    corpus=corpus,
    id2word=id2word,
    num_topics=20,
    random_state=100,
    chunksize=len(corpus),
    per_word_topics=True,
)

In [None]:
# Wordcloud of Top N words in each topic

from matplotlib import pyplot as plt
from wordcloud import WordCloud, STOPWORDS

# One colour per topic panel (22 entries; the loops below index it by topic number).
cols = ['#e6194b', '#3cb44b', '#ffe119', '#4363d8', '#f58231', '#911eb4', '#46f0f0', '#f032e6', '#bcf60c', '#fabebe', '#008080', '#e6beff', '#9a6324', '#fffac8', '#800000', '#aaffc3', '#808000', '#ffd8b1', '#000075', '#808080', '#ffffff', '#000000']
# Top 15 (word, weight) pairs for each of the 20 topics, unformatted.
topics = lda_model.show_topics(20, 15, formatted=False)

# A single WordCloud object is reused for every panel.
# color_func looks up the global loop variable `i` when it is called, so the
# cloud can only be generated inside the `for i, ax ...` loop below.
cloud = WordCloud(stopwords=my_stop_words,
                  background_color='white',
                  width=2500,
                  height=1800,
                  max_words=15,
                  colormap='tab20',
                  color_func=lambda *args, **kwargs: cols[i],
                  prefer_horizontal=1.0)

# 4 x 5 grid: one panel per topic.
fig, axes = plt.subplots(4,5, figsize=(10,10), sharex=True, sharey=True)

for i, ax in enumerate(axes.flatten()):
    # Make `ax` the current axes so the plt.gca() calls below draw into it.
    fig.add_subplot(ax)
    # word -> weight mapping of topic i, used as the cloud's frequencies.
    topic_words = dict(topics[i][1])
    cloud.generate_from_frequencies(topic_words, max_font_size=300)
    plt.gca().imshow(cloud)
    plt.gca().set_title('Topic ' + str(i), fontdict=dict(size=16))
    plt.gca().axis('off')

plt.subplots_adjust(wspace=0, hspace=0)
plt.axis('off')
plt.margins(x=0, y=0)
plt.tight_layout()
plt.show()

In [None]:
# Word count and keyword importance for topics

from collections import Counter

# Flatten all token lists into one list and count how often each word occurs in the corpus.
data_flat = [w for w_list in model_calc.abst for w in w_list]
counter = Counter(data_flat)

def word_importance(data, word_counts=None):
    """Flatten topic keyword lists into rows of [word, topic_id, weight, corpus count].

    Parameters
    ----------
    data : iterable of (topic_id, [(word, weight), ...]) pairs, e.g. the result
        of `show_topics(..., formatted=False)`.
    word_counts : optional mapping word -> corpus frequency. When omitted, the
        notebook-level `counter` is used, which keeps existing calls unchanged.

    Returns
    -------
    list of [word, topic_id, weight, count] rows in input order. Words missing
    from the mapping get a count of 0.
    """
    if word_counts is None:
        word_counts = counter  # backward-compatible fallback to the global Counter
    return [
        [word, topic_id, weight, word_counts.get(word, 0)]
        for topic_id, topic in data
        for word, weight in topic
    ]

# Long-format table: one row per (topic, keyword) with its weight and corpus count.
df = pd.DataFrame(word_importance(topics), columns=['word', 'topic_id', 'importance', 'word_count'])
fig, axes = plt.subplots(4, 5, figsize=(16,10), sharey=True, dpi=160)

for i, ax in enumerate(axes.flatten()):
    # Wide translucent bars: corpus word count (left axis).
    ax.bar(x='word', height="word_count", data=df.loc[df.topic_id==i, :], color=cols[i], width=0.5, alpha=0.3, label='Word Count')
    # Narrow bars on a twin y-axis: topic weight of the same words.
    ax_twin = ax.twinx()
    ax_twin.bar(x='word', height="importance", data=df.loc[df.topic_id==i, :], color=cols[i], width=0.2, label='Weights')
    ax.set_ylabel('Word Count', color=cols[i])
    # Fixed axis limits so all panels are comparable; values outside 0-0.020 / 0-25000 are clipped.
    ax_twin.set_ylim(0, 0.020); ax.set_ylim(0, 25000)
    ax.set_title('Topic: ' + str(i), color=cols[i], fontsize=16)
    ax.tick_params(axis='y', left=False)
    # Set the tick positions explicitly before assigning the rotated labels.
    ax.xaxis.set_ticks(df.loc[df.topic_id==i, 'word'])
    ax.set_xticklabels(df.loc[df.topic_id==i, 'word'], rotation=90, horizontalalignment= 'right')
    ax.legend(loc='upper left'); ax_twin.legend(loc='upper right')

fig.tight_layout(w_pad=2)    
fig.suptitle('Word Count and Importance of Topic Keywords', fontsize=22, y=1.05)    
plt.show()

In [410]:
# Extract the best-matching topic for each publication

def topic_assignment(data, id2word, ldamodel):
    """Return the id of the most probable topic for every document.

    Parameters
    ----------
    data : either an object with an `abst` attribute (e.g. a DataFrame with an
        'abst' column) or directly an iterable of token lists (e.g. that
        column itself).
    id2word : dictionary object providing `doc2bow(tokens)`.
    ldamodel : model providing `get_document_topics(bow)`, which yields
        (topic_id, probability) pairs.

    Returns
    -------
    list with one topic id per document, in input order. A document for which
    the model reports no topic at all gets None.

    The pairs are not ordered by probability, so the winner is chosen with
    max() on the probability rather than by taking the first pair.
    """
    documents = getattr(data, 'abst', data)
    top_topics = []
    for tokens in documents:
        bow = id2word.doc2bow(tokens)
        doc_topics = ldamodel.get_document_topics(bow)
        best_id, _ = max(doc_topics, key=lambda pair: pair[1], default=(None, 0.0))
        top_topics.append(best_id)
    return top_topics

# Pass the frame itself: topic_assignment() reads the token lists from its `abst`
# column, so handing over `model_calc.abst` would make it look up `.abst` on a Series.
model_calc['topic'] = topic_assignment(model_calc, id2word, lda_model)

In [548]:
# Save as final

# Overwrites the earlier 'data_to_model.csv' export with the current state of model_calc.
model_calc.to_csv('data_to_model.csv', sep = '\t')

In [130]:
# Coherence analysis to achieve N perfect topics

import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
from gensim.models import LdaModel, CoherenceModel, LdaMulticore

# Candidate topic counts for the sweep; with these bounds the expression evaluates to [11, 12, 13].
# NOTE(review): the saved cell output below lists models for 4-13 topics, so it was produced
# with a different range - confirm which sweep is intended.
num_topics = list(range(10,14)[1:])

def multiple_models_init(topicnumberlist, corpus=corpus, id2word=id2word):
  """Train one LDA model per requested topic count and collect its keywords.

  Parameters
  ----------
  topicnumberlist : iterable of topic counts to train.
  corpus, id2word : bag-of-words corpus and dictionary; the defaults are the
      notebook-level objects as they exist when this cell is executed.

  Returns
  -------
  (LDA_models, LDA_topics): two dicts keyed by topic count. The first holds
  the trained models, the second a list (one entry per topic) of that topic's
  15 top words.
  """
  LDA_models = {}
  LDA_topics = {} # must be created here; it was previously expected to exist as a global
  for i in topicnumberlist:
    print(i, end=' ')
    LDA_models[i] = LdaModel(corpus=corpus, id2word=id2word, num_topics=i, eval_every=1, chunksize=len(corpus), passes=2, random_state=42)
    print('model built')
    shown_topics = LDA_models[i].show_topics(num_topics=i,
                                             num_words=15,
                                             formatted=False)
    # Keep only the words; the weights are not needed for the overlap analysis.
    LDA_topics[i] = [[word[0] for word in topic[1]] for topic in shown_topics]

  return LDA_models, LDA_topics

# Train the whole sweep; both results are dicts keyed by topic count.
LDA_models, LDA_topics = multiple_models_init(num_topics, corpus, id2word)
  

4 model built
5 model built
6 model built
7 model built
8 model built
9 model built
10 model built
11 model built
12 model built
13 model built


In [131]:
def jaccard_similarity(topic_1, topic_2):
    """
    Derives the Jaccard similarity of two topics

    Jaccard similarity:
    - A statistic used for comparing the similarity and diversity of sample sets
    - J(A,B) = |A ∩ B| / |A ∪ B|
    - Goal is low Jaccard scores for coverage of the diverse elements

    Both arguments are iterables of words; duplicates are ignored.
    Returns a float in [0, 1]. Two empty topics have no words in common and
    yield 0.0 instead of raising ZeroDivisionError.
    """
    words_1, words_2 = set(topic_1), set(topic_2)
    union = words_1 | words_2
    if not union:
        return 0.0
    return len(words_1 & words_2) / len(union)

def lda_stability(data, topicnumberlist):
    """Pairwise keyword overlap between models with consecutive topic counts.

    `data` maps a topic count to that model's topics (each an iterable of
    words). For every entry of `topicnumberlist` except the last, the result
    holds a matrix (list of lists) with the Jaccard similarity of each topic of
    that model against each topic of the next model in the list.
    """
    stability = {}
    for current, following in zip(topicnumberlist, topicnumberlist[1:]):
        stability[current] = [
            [jaccard_similarity(topic_a, topic_b) for topic_b in data[following]]
            for topic_a in data[current]
        ]
    return stability
                
# lda_stability() compares keyword lists, so it needs LDA_topics (word lists per
# model), not the model objects in LDA_models. The table is built once and then
# averaged per topic count.
_stability_by_k = lda_stability(LDA_topics, num_topics)
mean_stabilities = [np.array(_stability_by_k[i]).mean() for i in num_topics[:-1]]
# c_v coherence per model; the last topic count is skipped because it has no successor to compare with.
coherences = [CoherenceModel(model=LDA_models[i], texts=model_calc.abst, dictionary=id2word, coherence='c_v').get_coherence()
              for i in num_topics[:-1]]

def ideal_topics(coherence, ldastability, topicnumberlist):
    """
    Picks the topic count with the largest gap between coherence and overlap.

    `coherence` and `ldastability` are read position by position for all
    entries of `topicnumberlist` except the last. On a tie the earliest
    position (fewest topics) wins.
    """
    gaps = []
    for position in range(0, len(topicnumberlist) - 1):
        gaps.append(coherence[position] - ldastability[position])

    best_positions = []
    for position, gap in enumerate(gaps):
        if gap == max(gaps):
            best_positions.append(position)

    return topicnumberlist[best_positions[0]]

# Topic count with the best coherence-minus-overlap score; used to mark the plot below.
ideal_topic_num = ideal_topics(coherences, mean_stabilities, num_topics)

In [None]:
plt.figure(figsize=(20,10))
# Both curves share one axes; the x values are the evaluated topic counts.
ax = sns.lineplot(x=num_topics[:-1], y=mean_stabilities, label='Average Topic Overlap')
ax = sns.lineplot(x=num_topics[:-1], y=coherences, label='Topic Coherence')

# Mark the selected topic count and shade one step on either side of it.
ax.axvline(x=ideal_topic_num, label='Ideal Number of Topics', color='black')
ax.axvspan(xmin=ideal_topic_num - 1, xmax=ideal_topic_num + 1, alpha=0.5, facecolor='grey')

# 10 % headroom above the highest plotted value.
y_peak = max(max(mean_stabilities), max(coherences))
ax.set_ylim([0, y_peak + (0.10 * y_peak)])
# Start one step left of the first evaluated topic count instead of a fixed 3,
# so the x-range follows whatever sweep `num_topics` defines (a sweep starting
# at 4 still starts the axis at 3).
ax.set_xlim([num_topics[0] - 1, num_topics[-1]])

ax.axes.set_title('Model Metrics per Number of Topics', fontsize=25)
ax.set_ylabel('Metric Level', fontsize=20)
ax.set_xlabel('Number of Topics', fontsize=20)
plt.legend(fontsize=20)
plt.show()