In [1]:
import re
import numpy as np
import pandas as pd
from collections import Counter
import matplotlib.pyplot as plt
import seaborn as sns
import json
import os
import math
from nltk.stem.porter import PorterStemmer

# Create dataframe that matches paper abstract with subjects

In [2]:
# Collect the id and abstract of every English-language record that carries
# all of the metadata fields listed in `required_fields`.
paper_id = []
abstract = []

source_path = 'aminer_2013.txt'
required_fields = ('year', 'keywords', 'abstract', 'lang', 'references', 'issn')

# `with` closes the handle even when a line fails to parse.
with open(source_path, 'r', encoding='utf8') as f:
    f.readline()  # the first line of the file is skipped
    for i, line in enumerate(f):
        if (i+2) % 250000 == 0:
            # The original message referenced an undefined name `file`
            # (NameError at the first progress tick); report the path instead.
            # NOTE(review): the percentage assumes about 1,000,000 lines -- confirm.
            print('file '+source_path+': ',round((i+2)/1000000*100,1),"%")
        json_line = json.loads(line)
        has_all_fields = all(field in json_line for field in required_fields)
        if has_all_fields and json_line['lang'] == 'en':
            ## store paper info, later use to get the subject of the paper
            paper_id.append(json_line['id'])
            abstract.append(json_line['abstract'])

In [3]:
# Assemble the id/abstract table in a single constructor call. The trailing
# expression only displays an id-indexed view; `df` itself keeps `id` as a column.
df = pd.DataFrame({'id': paper_id, 'abstract': abstract})
df.set_index('id')

Unnamed: 0_level_0,abstract
id,Unnamed: 1_level_1
53e99796b7602d9701f5c805,Scientific publishing has become synonymous wi...
53e99796b7602d9701f5e179,Summary We present a model of succession in a ...
53e997a2b7602d9701f720e9,"The U.S. patent system is overdue for reform, ..."
53e997aab7602d9701f82f8c,Professor Barabási's talk described how the to...
53e997bdb7602d9701fab322,We characterize ``visual textures'' as realiza...
53e997c6b7602d9701fb400a,Brainbow is a genetic engineering technique th...
53e997c6b7602d9701fb8dab,Niels Bohr's (1885–1962) ► atomic model initia...
53e997cbb7602d9701fbb959,Patient safety is a global challenge that requ...
53e997d1b7602d9701fc73fd,Di- and tri-phosphate nucleotides are essentia...
53e997d7b7602d9701fcc002,This paper presents the design and optimizatio...


In [4]:
df.head()

Unnamed: 0,id,abstract
0,53e99796b7602d9701f5c805,Scientific publishing has become synonymous wi...
1,53e99796b7602d9701f5e179,Summary We present a model of succession in a ...
2,53e997a2b7602d9701f720e9,"The U.S. patent system is overdue for reform, ..."
3,53e997aab7602d9701f82f8c,Professor Barabási's talk described how the to...
4,53e997bdb7602d9701fab322,We characterize ``visual textures'' as realiza...


In [5]:
# Load paper_subject_match_subfield.csv, using its `id` column as the row index.
subject = pd.read_csv('paper_subject_match_subfield.csv',index_col = 'id')

In [6]:
# Join abstracts with their subject rows on `id`; ids present on only one side are dropped.
tm = df.merge(subject, on='id')

In [7]:
tm.isnull().sum()

id          0
abstract    0
subfield    0
year        0
dtype: int64

In [8]:
len(tm)

242322

In [9]:
# Drop any row with a missing value (the null counts displayed above are all zero).
tm = tm.dropna()

In [10]:
# Only `abstract` and `subfield` are needed from here on.
tm = tm.drop(['id', 'year'], axis=1)

In [11]:
tm.head()

Unnamed: 0,abstract,subfield
0,Scientific publishing has become synonymous wi...,1307
1,Summary We present a model of succession in a ...,1000
2,"The U.S. patent system is overdue for reform, ...",2719
3,Professor Barabási's talk described how the to...,1000
4,We characterize ``visual textures'' as realiza...,1705


In [12]:
# Store subfield codes as 64-bit integers.
tm['subfield'] = tm['subfield'].astype(np.int64)

In [13]:
len(tm.subfield.unique())

292

In [14]:
tm.head()

Unnamed: 0,abstract,subfield
0,Scientific publishing has become synonymous wi...,1307
1,Summary We present a model of succession in a ...,1000
2,"The U.S. patent system is overdue for reform, ...",2719
3,Professor Barabási's talk described how the to...,1000
4,We characterize ``visual textures'' as realiza...,1705


In [15]:
# Group the abstracts by subfield: {subfield code: [abstract, ...]}.
# NOTE: this rebinds `df` from the DataFrame built earlier to a plain dict.
df = tm.groupby('subfield')['abstract'].apply(list).to_dict()

In [16]:
# Libraries for text preprocessing
import nltk
#nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
#nltk.download('wordnet') 
from nltk.stem.wordnet import WordNetLemmatizer

## Stop-word set: NLTK's English list extended with a few custom terms.
## NOTE(review): this set is later applied to stemmed text; confirm that
## unstemmed entries such as "using" or "previously" still match anything.
new_words = ["using", "show", "result", "large", "also", "iv", "one", "two", "new", "previously", "shown"]
stop_words = set(stopwords.words("english")).union(new_words)

In [17]:
#import sys
#!{sys.executable} -m pip install gensim
import gensim
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.stem.porter import *
# Seeds NumPy's global RNG only; this does not seed the stdlib `random` module,
# whose `random.sample` calls are used later in the notebook.
np.random.seed(2015)

In [18]:
def lemmatize_stemming(text):
    """Lemmatize ``text`` as a verb with WordNet, then stem the lemma.

    Relies on the module-level ``stemmer`` being bound before the first call.
    """
    verb_lemma = WordNetLemmatizer().lemmatize(text, pos='v')  # default pos would be noun
    return stemmer.stem(verb_lemma)

def preprocess(text):
    """Tokenise ``text`` and return the lemmatised, stemmed tokens that survive filtering.

    A token is kept when it is longer than three characters and is not in
    gensim's STOPWORDS collection.
    """
    tokens = gensim.utils.simple_preprocess(text)
    blocked = gensim.parsing.preprocessing.STOPWORDS
    return [
        lemmatize_stemming(tok)
        for tok in tokens
        if len(tok) > 3 and tok not in blocked
    ]

In [19]:
# Stemmer read by lemmatize_stemming(); it must be bound before preprocess() is first called.
stemmer = SnowballStemmer('english')

In [20]:
from sklearn.feature_extraction.text import CountVectorizer

from sklearn.feature_extraction.text import TfidfTransformer

In [21]:
from scipy.sparse import coo_matrix
def sort_coo(coo_matrix):
    """Return the matrix's (column, value) pairs sorted by value, then column, both descending."""
    pairs = list(zip(coo_matrix.col, coo_matrix.data))
    pairs.sort(key=lambda pair: (pair[1], pair[0]), reverse=True)
    return pairs

#Function mapping the leading (index, score) pairs to feature names
def extract_topn_from_vector(feature_names, sorted_items, topn=10):
    """Return ``{feature name: score}`` for the first ``topn`` entries of ``sorted_items``.

    ``sorted_items`` is a sequence of ``(feature index, score)`` pairs, used in
    the order given. Each score is rounded to three decimals, and the dict
    preserves the order of the pairs.
    """
    return {
        feature_names[idx]: round(score, 3)
        for idx, score in sorted_items[:topn]
    }

In [22]:
# Tokenise, filter and stem every abstract, keeping the per-subfield grouping.
processed_docs = {
    subfield: [preprocess(text) for text in texts]
    for subfield, texts in df.items()
    if texts  # a subfield with no abstracts gets no entry
}

In [23]:
# Re-join each token list into one space-separated string per document.
df_take = {
    subfield: [' '.join(tokens) for tokens in docs]
    for subfield, docs in processed_docs.items()
    if docs  # a subfield with no documents gets no entry
}

In [24]:
## do not run
#df_new = {}
#for i in processed_docs.keys():
    #for j in range(int(len(processed_docs[i])/2)):
        #df_new.setdefault(i, []).append(processed_docs[i][j])

In [25]:
import random

In [26]:
# For every subfield, build a small corpus with one concatenated document per
# subfield (up to 150 of the target's abstracts, up to 10 from each other
# subfield) and keep the target's 10 highest-scoring TF-IDF terms.
new_result = []
index_save = list(df_take.keys())

# Dedicated, seeded generator: the draws are reproducible across runs and the
# global `random` state is left untouched.
sampler = random.Random(2015)

# scikit-learn validates `stop_words` as a list; recent releases reject a set.
stop_word_list = sorted(stop_words)

for index, i in enumerate(df_take):
    corpus = []
    for j in index_save:
        docs = df_take[j]
        limit = 150 if j == i else 10
        if len(docs) > limit:
            docs = sampler.sample(docs, limit)
        corpus.append(' '.join(docs))

    cv = CountVectorizer(max_df=0.8, stop_words=stop_word_list, max_features=10000)
    X = cv.fit_transform(corpus)

    tfidf_transformer = TfidfTransformer(smooth_idf=True, use_idf=True)
    tfidf_transformer.fit(X)

    # get_feature_names() was removed in scikit-learn 1.2; keep it only as a
    # fallback for installations that predate get_feature_names_out().
    if hasattr(cv, 'get_feature_names_out'):
        feature_names = cv.get_feature_names_out()
    else:
        feature_names = cv.get_feature_names()

    # Row `index` of X already holds the count vector of the target subfield's
    # document (corpus order follows df_take), so it need not be vectorised again.
    tf_idf_vector = tfidf_transformer.transform(X[index])

    # sort the tf-idf scores in descending order and keep the top 10 terms
    sorted_items = sort_coo(tf_idf_vector.tocoo())
    keywords = extract_topn_from_vector(feature_names, sorted_items, 10)

    new_result.append(list(keywords))

In [27]:
#*********# SONG

In [28]:
# Flatten the per-subfield keyword lists into one list (duplicates are kept).
all_word_list = []
for keyword_group in new_result:
    all_word_list.extend(keyword_group)

In [29]:
# Work on a shallow copy and discard every subfield with fewer than 30 documents.
proc_copy = processed_docs.copy()
undersized = [key for key in df if len(proc_copy[key]) < 30]
for key in undersized:
    proc_copy.pop(key)

In [30]:
# Draw 30 documents per remaining subfield (fixed seed) into one flat training list.
train = []
import random
random.seed(2)
for docs in proc_copy.values():
    picks = random.sample(range(len(docs)), 30)
    train.extend(docs[pos] for pos in picks)

In [31]:
# Keep only the tokens that appear among the extracted keywords.
# Membership is tested against a set: `all_word_list` is a flat list with
# duplicates, so `k in all_word_list` would rescan it for every token.
keyword_vocab = set(all_word_list)
new_doc = [[k for k in doc if k in keyword_vocab] for doc in train]

In [32]:
from gensim import corpora, models
def LDA_TF_word(doc, num_topics=20, passes=2, workers=4, random_state=None):
    """Fit an LDA topic model on the TF-IDF weighting of ``doc``.

    Parameters
    ----------
    doc : list of list of str
        Tokenised documents.
    num_topics, passes, workers :
        Forwarded to ``gensim.models.LdaMulticore``; the defaults are the
        values that were previously hard-coded.
    random_state :
        Forwarded to ``LdaMulticore``. ``None`` (the default) keeps the
        previous behaviour; pass an int to make the fit repeatable.

    Returns
    -------
    The trained ``LdaMulticore`` model. ``minimum_probability=0`` is set so
    that no topic is filtered out of a document's topic distribution.
    """
    dictionary = gensim.corpora.Dictionary(doc)
    bow_corpus = [dictionary.doc2bow(d) for d in doc]
    corpus_tfidf = models.TfidfModel(bow_corpus)[bow_corpus]
    return gensim.models.LdaMulticore(
        corpus_tfidf,
        num_topics=num_topics,
        id2word=dictionary,
        passes=passes,
        workers=workers,
        minimum_probability=0,
        random_state=random_state,
    )

In [33]:
# Shallow copy of the filtered training documents (the inner token lists are shared).
word_list = list(new_doc)

In [34]:
# Train the topic model on the keyword-filtered training documents.
rep_model = LDA_TF_word(word_list)

In [35]:
# Print each topic index followed by its term-weight string.
for topic_id, terms in rep_model.print_topics(-1):
    print('Topic: {} Word: {}'.format(topic_id, terms))

Topic: 0 Word: 0.008*"activ" + 0.008*"patient" + 0.005*"risk" + 0.005*"data" + 0.005*"structur" + 0.005*"interact" + 0.005*"test" + 0.005*"level" + 0.005*"method" + 0.005*"treatment"
Topic: 1 Word: 0.015*"patient" + 0.008*"trial" + 0.007*"group" + 0.007*"clinic" + 0.006*"control" + 0.006*"sequenc" + 0.006*"treatment" + 0.006*"nerv" + 0.005*"cell" + 0.005*"intervent"
Topic: 2 Word: 0.013*"cell" + 0.008*"activ" + 0.006*"imag" + 0.006*"protein" + 0.006*"model" + 0.005*"patient" + 0.005*"express" + 0.005*"treatment" + 0.005*"compound" + 0.005*"speci"
Topic: 3 Word: 0.007*"group" + 0.006*"health" + 0.006*"method" + 0.006*"factor" + 0.006*"data" + 0.006*"intervent" + 0.005*"problem" + 0.005*"children" + 0.005*"mental" + 0.005*"nurs"
Topic: 4 Word: 0.017*"patient" + 0.009*"clinic" + 0.007*"care" + 0.007*"temperatur" + 0.007*"hospit" + 0.006*"sexual" + 0.006*"treatment" + 0.006*"injuri" + 0.006*"trauma" + 0.006*"cancer"
Topic: 5 Word: 0.013*"patient" + 0.009*"diabet" + 0.008*"level" + 0.008*"c

In [36]:
# Second shallow copy of the processed documents, used for the scoring sample.
pd_copy = dict(processed_docs)

In [37]:
#dropped if less than 30
too_few = [key for key in df if len(pd_copy[key]) < 30]
for key in too_few:
    pd_copy.pop(key)

In [38]:
#sample 30 from each subfield (fixed seed), keeping the per-subfield grouping
sample_bysub = {}
import random
random.seed(2014)
for subfield, docs in pd_copy.items():
    chosen = random.sample(range(len(docs)), 30)
    sample_bysub[subfield] = [docs[pos] for pos in chosen]

In [39]:
sample_bysub.keys()

dict_keys([1000, 1100, 1102, 1103, 1104, 1105, 1106, 1108, 1109, 1110, 1111, 1201, 1300, 1302, 1303, 1304, 1305, 1306, 1307, 1308, 1309, 1310, 1311, 1312, 1313, 1314, 1315, 1500, 1502, 1503, 1505, 1600, 1602, 1603, 1604, 1605, 1606, 1607, 1700, 1702, 1704, 1705, 1706, 1707, 1708, 1710, 1711, 1712, 1803, 1900, 1902, 1906, 1907, 1910, 1912, 2002, 2103, 2200, 2201, 2204, 2205, 2207, 2208, 2209, 2210, 2211, 2213, 2300, 2303, 2304, 2305, 2306, 2307, 2308, 2310, 2312, 2400, 2401, 2402, 2403, 2404, 2405, 2406, 2500, 2502, 2504, 2505, 2507, 2508, 2600, 2601, 2602, 2603, 2604, 2605, 2606, 2607, 2610, 2611, 2613, 2700, 2701, 2702, 2703, 2704, 2705, 2706, 2707, 2708, 2710, 2711, 2712, 2713, 2714, 2715, 2716, 2717, 2718, 2719, 2720, 2721, 2722, 2723, 2724, 2725, 2726, 2727, 2728, 2729, 2730, 2731, 2732, 2733, 2734, 2735, 2736, 2737, 2738, 2739, 2740, 2741, 2742, 2743, 2745, 2746, 2747, 2748, 2800, 2802, 2803, 2804, 2805, 2806, 2807, 2808, 2809, 2900, 2902, 2907, 2909, 2910, 2911, 2913, 2916, 2919,

In [40]:
len(sample_bysub[1000])

30

In [41]:
# Rebuild the token-to-id dictionary from the same documents that were passed to LDA_TF_word.
dictionary_rep = gensim.corpora.Dictionary(word_list)

In [42]:
def feed_models(text_str):
    """Return the topic weights that ``rep_model`` assigns to one tokenised document.

    ``text_str`` is a list of tokens; it is converted to a bag-of-words vector
    with ``dictionary_rep`` and the second element of every entry returned by
    the model is collected, in order.

    NOTE(review): the model was trained on TF-IDF-weighted vectors
    (see LDA_TF_word) while raw counts are passed here -- confirm intended.
    """
    bow_vector = dictionary_rep.doc2bow(text_str)
    return [entry[1] for entry in rep_model[bow_vector]]

In [43]:
# Topic-weight vector of every sampled document, grouped by subfield.
store_score = {
    subfield: [feed_models(doc) for doc in docs]
    for subfield, docs in sample_bysub.items()
    if docs  # a subfield with no sampled documents gets no entry
}

In [44]:
import numpy as np
from scipy.optimize import minimize
from scipy.spatial.distance import cdist


def geometric_median(points, method='auto', options={}):
    """
    Compute the geometric median of a 2D array of points (one point per row).

    ``method`` selects the solver:
        * 'auto' -- 'weiszfeld' when the points have more than two
          coordinates, otherwise 'minimize'
        * 'minimize' -- scipy.optimize the sum of distances
        * 'weiszfeld' -- Weiszfeld's algorithm

    ``options`` is passed through to the chosen solver.
    Raises ValueError when ``points`` is one-dimensional.
    """

    points = np.asarray(points)

    # A flat sequence is ambiguous (one n-D point, or n scalars?), so it is
    # refused; np.median is the right tool for scalars.
    if points.ndim == 1:
        raise ValueError("Expected 2D array")

    if method == 'auto':
        # weiszfeld tends to converge faster in higher dimensions
        method = 'weiszfeld' if points.shape[1] > 2 else 'minimize'

    solver = _methods[method]
    return solver(points, options)


def minimize_method(points, options={}):
    """
    Geometric median by direct minimisation of the summed distances.

    Starts from the centroid and runs scipy's COBYLA optimiser.
    ``options`` is accepted for interface parity but is not used.
    """

    def total_distance(candidate):
        # summed Euclidean distance from the candidate to every point
        return cdist([candidate], points).sum()

    start = points.mean(axis=0)  # initial guess: centroid
    return minimize(total_distance, start, method='COBYLA').x


def weiszfeld_method(points, options={}):
    """
    Geometric median by Weiszfeld's iteratively re-weighted averaging.

    ``options`` may override 'maxiter' (default 1000) and 'tol' (default 1e-7).
    Iteration starts at the centroid and stops once an update moves the
    estimate by no more than 'tol', or after 'maxiter' updates.
    """

    settings = {'maxiter': 1000, 'tol': 1e-7}
    settings.update(options)

    # initial guess: centroid
    estimate = points.mean(axis=0)

    step = 0
    while step < settings['maxiter']:
        # column vector holding the distance from the estimate to each point
        dists = cdist([estimate], points).T

        # catch divide by zero
        # TODO: Wikipedia cites how to deal with distance 0
        dists = np.where(dists == 0, 1, dists)

        # inverse-distance weighted mean of the points
        updated = (points/dists).sum(axis=0) / (1./dists).sum(axis=0)

        shift = np.sqrt(((estimate - updated)**2).sum())
        estimate = updated

        if shift <= settings['tol']:
            break

        step += 1

    return estimate


# Dispatch table read by geometric_median(): method name -> solver function.
_methods = {
    'minimize': minimize_method,
    'weiszfeld': weiszfeld_method,
}

In [45]:
# One geometric median per subfield, stored as a single-element list.
median_score = {
    subfield: [geometric_median(scores)]
    for subfield, scores in store_score.items()
}

In [46]:
from scipy.spatial import distance
# Pairwise Euclidean distance between the subfield medians:
# dist_store[a][b] is the distance from subfield a's median to subfield b's.
#
# Each median is stored wrapped in a one-element list, i.e. as 2-D input.
# np.ravel flattens it to the 1-D vector that distance.euclidean() expects;
# newer SciPy releases raise "Input vector should be 1-D" rather than
# squeezing such input.
flat_medians = {key: np.ravel(value) for key, value in median_score.items()}

dist_store = {}
for i, vec_i in flat_medians.items():
    dist_store[i] = {
        j: distance.euclidean(vec_i, vec_j)
        for j, vec_j in flat_medians.items()
    }

In [47]:
# Sorted list of every distinct subfield code in the subject table.
l_df = sorted(subject["subfield"].unique().tolist())

In [48]:
# Cartesian product of the subfield codes: sub1 varies slowest, sub2 fastest.
# The repeat count comes from the list itself instead of a hard-coded 309, so
# the frame always has len(l_df)**2 rows and stays aligned with the distance
# list built from the same nested iteration over l_df.
n_subfields = len(l_df)
df1 = pd.DataFrame({
    "sub1": np.repeat(l_df, n_subfields).tolist(),
    "sub2": l_df * n_subfields,
})

In [49]:
# Distance for every ordered (i, j) pair of subfield codes; None when either
# code has no entry in dist_store.
dist = [
    dist_store.get(i, {}).get(j)
    for i in l_df
    for j in l_df
]

In [50]:
# Attach the distances; `dist` was built in the same (sub1, sub2) order as the rows of df1.
df1["distance"] = dist

In [53]:
# Rows whose first subfield is 2002, displayed in ascending order of distance.
econ_df = df1.loc[df1["sub1"] == 2002]
econ_df.sort_values(['distance'])

Unnamed: 0,sub1,sub2,distance
28830,2002,2002,0.000000
29012,2002,3312,0.086777
28999,2002,3207,0.129757
29001,2002,3301,0.154026
29007,2002,3307,0.179051
29005,2002,3305,0.184566
29017,2002,3317,0.205703
28850,2002,2213,0.236648
29006,2002,3306,0.238677
28996,2002,3204,0.258080


In [52]:
# Write the full pairwise-distance table as CSV to a file named "dist_2013" (no extension).
df1.to_csv("dist_2013")