In [166]:
import pandas as pd
import re

data = pd.read_csv('data.tsv', sep = '\t', keep_default_na = False, low_memory=False)
data_req = data[['article_title','times_cited_all','times_cited_wos','180_days_usage','since_2013_usage','publication_year','number_of_pages','wos_categories','research_areas','highly_cited_status','hot_paper_status','funding_text']]
pd.options.mode.chained_assignment = None

In [2]:
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer, SnowballStemmer, PorterStemmer

def delSW(text):
  stop_words = stopwords.words('english')
  stop_words.extend([',','.',';',':','(',')',"'",'s'])
  return [word for word in text if word not in stop_words]

def lem(text):
  wnl = WordNetLemmatizer()
  return [wnl.lemmatize(w) for w in text]

def stem(text):
  array = []
  for word in text:
    stemmer = SnowballStemmer('english')
    word = stemmer.stem(word)
    array.append(word)
  return array

nltk_data = data

# nltk_data['tokened'] = nltk_data.abstract.str.lower().apply(word_tokenize).apply(delSW)
# nltk_data['lemmed'] = nltk_data.tokened.apply(lem)
# nltk_data['stemmed'] = nltk_data.tokened.apply(stem)

nltk_data.to_csv('nltk_data_prepped.tsv', sep="\t")

In [7]:
from collections import Counter
Counter(''.join(str(v).replace('"','')
  .replace("'",'')
  .replace('[','')
  .replace(']','')
  .replace(',','') for v in nltk_data.lemmed).split()).most_common(10)

[('firm', 59073),
 ('study', 35965),
 ('s', 35157),
 ('model', 34665),
 ('effect', 33078),
 ('result', 29513),
 ('research', 25971),
 ('market', 25653),
 ('find', 24762),
 ('performance', 24605)]

In [3]:
gensim_data = data
gensim_data['abst'] = gensim_data.abstract.map(lambda x: re.sub(r'[^\w\s]', '', x))
gensim_data['abst'] = gensim_data.abst.map(lambda x: x.lower())
gensim_data.abst.head()
gensim_data.to_csv('gensim_data_prepped.tsv', sep="\t")

In [100]:
import gensim
import spacy
from gensim.utils import simple_preprocess
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

stop_words_gensim = stopwords.words('english')
stop_words_gensim.extend(['from', 'subject', 're', 'edu', 'use', 'research', 'study'])

def sent_to_words(sentences):
  for sentence in sentences:
    yield(gensim.utils.simple_preprocess(str(sentence), deacc=True))

def remove_stopwords(text):
  return [[word for word in simple_preprocess(str(doc)) if word not in stop_words_gensim] for doc in text]

nlp = spacy.load("en_core_web_sm")
abstracts_isolated = gensim_data.abst.apply(lambda x: " ".join([y.lemma_ for y in nlp(x)]))

In [210]:
abstracts_isolated = pd.read_csv('abstracts_lemmed.tsv', sep = '\t').drop('Unnamed: 0', axis = 'columns')

In [103]:
abstracts_isolated_edited = abstracts_isolated.values.tolist()
abstracts_words_isolated = list(sent_to_words(abstracts_isolated_edited))
abstracts_words_isolated = remove_stopwords(abstracts_isolated_edited)

In [209]:
abstracts_isolated.to_csv('abstracts_lemmed.tsv', sep = '\t')

NameError: name 'abstracts_isolated' is not defined

In [104]:
import gensim.corpora as corpora

id2word = corpora.Dictionary(abstracts_words_isolated)
corpus = [id2word.doc2bow(text) for text in abstracts_words_isolated]
lda_model = gensim.models.LdaMulticore(corpus = corpus, id2word = id2word, num_topics = 10)

In [None]:
from pyLDAvis import gensim_models as gensimvis
from pyLDAvis import enable_notebook, save_html
enable_notebook()

model = gensimvis.prepare(lda_model, corpus, id2word)
save_html(model, 'viz.html')
model

In [142]:
from countrygroups import EUROPEAN_UNION

countries = EUROPEAN_UNION.names
countries.extend(['England','Wales','Scotland','Switzerland','Norway'])
data_calc = data[['article_title','addresses']]
data_calc = data_calc['addresses'].str.extractall(r'(?<=\] )(.*?)(?=\;|$)').droplevel(1)
data_calc['university'] = data_calc[0].str.extract(r'^(.+?),')
data_calc['country'] = data_calc[0].str.extract(r'^.*\, (.*)$')
data_calc = data_calc[data_calc.groupby(level=0)['country'].transform(lambda x : x.isin(countries).any())]

In [143]:
from fuzzywuzzy import process

data_calc['bs']=data_calc['university'].apply(lambda x : [process.extractOne(x, schools_ft[0], score_cutoff=88)])

def extract_school(data):
  res = []
  for bs in data:
    if bs[0] is not None:
      res.append(bs[0][0])
    else:
      res.append(bs)

  return res

data_calc['bs'] = extract_school(data_calc['bs'])


In [169]:
data_calc = data_calc.join(data_req)

In [81]:
data_calc.to_csv('data_calc_init.csv', sep = '\t')

In [171]:
def bs_tf(row):
  if row['bs'] != [None]:
    return True
  else:
    return False

def univ_tf(row):
  if 'Univ' in row['university'] or 'univ' in row['university']:
    return True
  else:
    return False

def frn_tf(row):
  if 'USA' in row['country'] or 'U.S.' in row['country'] or 'Canada' in row['country'] or 'China' in row['country'] or 'Japan' in row['country'] or 'Australia' in row['country'] or 'Korea' in row['country'] or 'Singapore' in row['country'] or 'India' in row['country'] or 'Pakistan' in row['country']:
    return True
  else:
    return False

data_calc['bs_tf'] = data_calc.apply(bs_tf, axis=1)
data_calc['univ_tf'] = data_calc.apply(univ_tf, axis=1)
data_calc['frn_tf'] = data_calc.apply(frn_tf, axis=1)

def univ_bs_tf(row):
  if row['bs_tf'] and row['univ_tf']:
    return True
  else:
    return False

def fund_tf(row):
  if row['funding_text'] != '':
    return True
  else:
    return False

data_calc['univ_bs_tf'] = data_calc.apply(univ_bs_tf, axis=1)
data_calc['fund_tf'] = data_calc.apply(fund_tf, axis=1)

In [177]:
data_calc.to_csv('data_calc_stage1.csv', sep = '\t')

In [211]:
import numpy as np

data_binary = data_calc[['article_title','funding_text','bs_tf','univ_tf','frn_tf','univ_bs_tf','fund_tf']]
test = pd.DataFrame({'count' : data_binary.groupby(level=0).size(), 
                    'bs_count': data_binary.groupby(level=0)['bs_tf'].sum(),
                    'bs_bin': np.where(data_binary.groupby(level=0)['bs_tf'].sum() > 0, 1, 0),
                    'univ_count': data_binary.groupby(level=0)['univ_tf'].sum(),
                    'univ_bs_count': data_binary.groupby(level=0)['univ_bs_tf'].sum(),
                    'fund_bin': np.where(data_binary.groupby(level=0)['fund_tf'].sum() > 0, 1, 0),
                    'frn_bin': np.where(data_binary.groupby(level=0)['frn_tf'].sum() > 0, 1, 0)})

data_preregr = data_req.join(test, how='right').join(abstracts_isolated)
data_preregr.to_csv('data_preregr.csv', sep = '\t')

In [39]:
from fuzzywuzzy import fuzz

def extract_unique_schools(strings):
  uniques = []

  for string in strings:
    if not uniques:
      uniques.append(string)

    for unique in uniques:
      if fuzz.partial_ratio(unique.lower().strip(), string.lower().strip()) > 90:
        break
    else:
        uniques.append(string)

  return uniques

schools_unique = extract_unique_schools(schools)


In [53]:
schools_unique = pd.DataFrame(schools_unique).sort_values(0).reset_index(drop=True)
schools_unique.to_csv('schools_unique.csv', sep = '\t')

In [16]:
schools_ft = pd.read_csv('schools_ft.csv', sep = '\t', header = None)
schools_unique = pd.read_csv('schools_unique.csv', sep = '\t', index_col = None, names = ['university'])

In [None]:
regr_data = pd.read_csv('data_fuzzy_prepped.tsv', sep = '\t')

In [268]:
from fuzzywuzzy import process

regr_data['bs']=regr_data['university'].apply(lambda x : [process.extractOne(x, schools_ft[0], score_cutoff=88)])

def extract_school(data):
  res = []
  for bs in data:
    if bs[0] is not None:
      res.append(bs[0][0])
    else:
      res.append(bs)

  return res

regr_data['bs'] = extract_school(regr_data['bs'])

In [269]:
regr_data.to_csv('data_fuzzy_prepped.tsv', sep = '\t')