In [1]:
import re
import numpy as np
import pandas as pd
from collections import Counter
import matplotlib.pyplot as plt
import seaborn as sns
import json
import os
import math
from nltk.stem.porter import PorterStemmer

# Libraries for text preprocessing
import nltk
#nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
#nltk.download('wordnet') 
from nltk.stem.wordnet import WordNetLemmatizer

# Stop-word set: NLTK's English list extended with the custom words below.
new_words = ["using", "show", "result", "large", "also", "iv", "one", "two", "new", "previously", "shown"]
stop_words = set(stopwords.words("english")) | set(new_words)


#import sys
#!{sys.executable} -m pip install gensim
import gensim
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.stem.porter import *
# Seed NumPy's global random generator.
np.random.seed(seed=2015)

# Module-level Snowball stemmer, used by lemmatize_stemming().
stemmer = SnowballStemmer(language='english')

from sklearn.feature_extraction.text import CountVectorizer

from sklearn.feature_extraction.text import TfidfTransformer

import numpy as np
from scipy.optimize import minimize
from scipy.spatial.distance import cdist

import random



# Create dataframe that matches paper abstract with subjects

In [2]:
# Paper-to-subject lookup table, indexed by the CSV's 'id' column.
subject = pd.read_csv(
    '../data/by_year/paper_subject_match_subfield.csv',
    index_col='id',
)

In [3]:
def lemmatize_stemming(text):
    """Lemmatize ``text`` as a verb with WordNet, then stem the lemma.

    Stemming uses the module-level ``stemmer``.
    """
    # pos='v' treats the token as a verb (the lemmatizer's default is noun).
    verb_lemma = WordNetLemmatizer().lemmatize(text, pos='v')
    return stemmer.stem(verb_lemma)

def preprocess(text):
    """Tokenize ``text`` and return the normalized tokens that are kept.

    Tokens come from gensim's ``simple_preprocess``. A token is kept only when
    it is longer than three characters and is not in gensim's ``STOPWORDS``;
    each kept token is passed through ``lemmatize_stemming``.
    """
    gensim_stopwords = gensim.parsing.preprocessing.STOPWORDS
    return [
        lemmatize_stemming(token)
        for token in gensim.utils.simple_preprocess(text)
        if len(token) > 3 and token not in gensim_stopwords
    ]

from scipy.sparse import coo_matrix
def sort_coo(coo_matrix):
    """Return the ``(column, value)`` pairs of a COO matrix, largest value first.

    Pairs are read from the ``col`` and ``data`` attributes. Ties on value are
    broken by column index, also in descending order.
    """
    pairs = list(zip(coo_matrix.col, coo_matrix.data))
    pairs.sort(key=lambda pair: (pair[1], pair[0]), reverse=True)
    return pairs

#Function for picking the top-n (feature name, score) entries from already-sorted items
def extract_topn_from_vector(feature_names, sorted_items, topn=10):
    """Map the first ``topn`` entries of ``sorted_items`` to ``{feature name: score}``.

    ``sorted_items`` is a sequence of ``(feature index, score)`` pairs; each
    index is looked up in ``feature_names`` and each score is rounded to three
    decimals. If a feature name repeats, the later score replaces the earlier one.
    """
    return {
        feature_names[idx]: round(score, 3)
        for idx, score in sorted_items[:topn]
    }

In [4]:
# os.listdir() returns entries in arbitrary order. Everything below selects
# files by position, so sort the listing to make those positions stable.
file_names = sorted(os.listdir("../data/by_year"))
file_names = file_names[:26]

# Preview, for each target file, the five-file window that ends at it.
for i in range(14,26):
    print(file_names[i])
    files = file_names[i-4:i+1]
    print(files)

aminer_2004.txt
['aminer_2000.txt', 'aminer_2001.txt', 'aminer_2002.txt', 'aminer_2003.txt', 'aminer_2004.txt']
aminer_2005.txt
['aminer_2001.txt', 'aminer_2002.txt', 'aminer_2003.txt', 'aminer_2004.txt', 'aminer_2005.txt']
aminer_2006.txt
['aminer_2002.txt', 'aminer_2003.txt', 'aminer_2004.txt', 'aminer_2005.txt', 'aminer_2006.txt']
aminer_2007.txt
['aminer_2003.txt', 'aminer_2004.txt', 'aminer_2005.txt', 'aminer_2006.txt', 'aminer_2007.txt']
aminer_2008.txt
['aminer_2004.txt', 'aminer_2005.txt', 'aminer_2006.txt', 'aminer_2007.txt', 'aminer_2008.txt']
aminer_2009.txt
['aminer_2005.txt', 'aminer_2006.txt', 'aminer_2007.txt', 'aminer_2008.txt', 'aminer_2009.txt']
aminer_2010.txt
['aminer_2006.txt', 'aminer_2007.txt', 'aminer_2008.txt', 'aminer_2009.txt', 'aminer_2010.txt']
aminer_2011.txt
['aminer_2007.txt', 'aminer_2008.txt', 'aminer_2009.txt', 'aminer_2010.txt', 'aminer_2011.txt']
aminer_2012.txt
['aminer_2008.txt', 'aminer_2009.txt', 'aminer_2010.txt', 'aminer_2011.txt', 'aminer_201

In [5]:
for file_i in range(15,26):
    # Collect ids and abstracts of English-language records from the
    # five-file window ending at file_names[file_i].
    paper_id = []
    abstract = []
    files = file_names[file_i-4:file_i+1]
    # A record is used only if it carries every one of these fields.
    required_keys = ('year', 'keywords', 'abstract', 'lang', 'references', 'issn')
    for file in files:
        # The context manager closes the handle even if a line fails to parse.
        with open('../data/by_year/'+file,'r',encoding = 'utf8') as f:
            f.readline()  # the first line of each file is skipped
            for line in f:
                json_line = json.loads(line)
                if all(key in json_line for key in required_keys) and json_line['lang'] == 'en':
                    ## store paper info, later use to get the subject of the paper
                    paper_id.append(json_line['id'])
                    abstract.append(json_line['abstract'])

    # One row per collected paper. The original also called df.set_index('id')
    # and discarded the result; that dead statement is removed.
    df = pd.DataFrame({'id': paper_id, 'abstract': abstract})

    # Attach the subject table by paper id, drop incomplete rows, and keep
    # only the abstract together with its integer subfield code.
    tm = pd.merge(df, subject, on = ['id'])
    tm = tm.dropna()
    tm = tm.drop(columns = ['id', 'year'])
    tm['subfield'] = tm['subfield'].apply(np.int64)

    # NOTE: from here on `df` is a dict mapping subfield -> list of abstracts.
    df = tm.groupby('subfield')['abstract'].apply(list).to_dict()

    ## Sample 2000 articles first
    # Each subfield keeps at most 2000 randomly chosen abstracts.
    for key in df:
        abstracts = df[key]
        df[key] = random.sample(abstracts, min(len(abstracts), 2000))

    # Tokenize and normalize every sampled abstract, grouped by subfield.
    processed_docs = {}
    for i in df.keys():
        print("For Year: "+str(file_i+1990)+"  For subfield: "+ str(i))
        for text in df[i]:
            processed_docs.setdefault(i, []).append(preprocess(text))

    # Re-join each token list into a single space-separated string per document.
    df_take = {}
    for i, docs in processed_docs.items():
        for tokens in docs:
            df_take.setdefault(i, []).append(' '.join(tokens))

    # new_result collects one keyword list per subfield in the loop below;
    # index_save is the list of subfield keys in df_take order.
    new_result = []
    index_save = list(df_take.keys())

    # For every subfield, build a small corpus holding one concatenated
    # document per subfield and keep the focal subfield's top tf-idf terms.
    for index, i in enumerate(df_take): 
        df_copy = df_take.copy()

        # Every other subfield contributes at most 10 randomly chosen documents.
        ex_index = [x for x in index_save if x !=i]
        for j in ex_index:
            if len(df_copy[j]) > 10:
                df_copy.update({j: random.sample(df_copy[j], 10)})

        # The focal subfield is resampled only when it has more than 50
        # documents, and then capped at 150.
        if len(df_copy[i]) > 50:
            df_copy.update({i: random.sample(df_copy[i], min(len(df_copy[i]),150))})

        # One concatenated document per subfield. df_copy keeps df_take's key
        # order, so corpus[index] is the focal subfield's document.
        corpus = [' '.join(df_copy[j]) for j in df_copy.keys()]

        # Newer scikit-learn releases validate stop_words as a list (a set is
        # rejected); a list is accepted by older releases as well.
        cv=CountVectorizer(max_df=0.8,stop_words=list(stop_words), max_features=10000)
        X=cv.fit_transform(corpus)

        tfidf_transformer=TfidfTransformer(smooth_idf=True,use_idf=True)
        tfidf_transformer.fit(X)

        # get_feature_names() has been removed from newer scikit-learn; use
        # its replacement when available and fall back otherwise.
        if hasattr(cv, "get_feature_names_out"):
            feature_names=cv.get_feature_names_out()
        else:
            feature_names=cv.get_feature_names()

        # fetch document for which keywords needs to be extracted
        doc=corpus[index]

        #generate tf-idf for the given document
        tf_idf_vector=tfidf_transformer.transform(cv.transform([doc]))

        #sort the tf-idf vectors by descending order of scores
        sorted_items=sort_coo(tf_idf_vector.tocoo())
        #extract only the top n; n here is 20
        keywords=extract_topn_from_vector(feature_names,sorted_items,20)

        new_result.append(list(keywords.keys()))

    #*********# SONG

    # Flatten the per-subfield keyword lists into one vocabulary list.
    all_word_list = [item for sublist in new_result for item in sublist]
    # Set view of the same vocabulary so the filter below does O(1) lookups
    # instead of scanning the list once per token.
    keyword_vocab = set(all_word_list)

    # Keep only subfields with at least 30 processed documents.
    proc_copy = {key: docs for key, docs in processed_docs.items() if len(docs) >= 30}

    # Draw 30 documents from each remaining subfield (fixed seed) as training data.
    train = []
    random.seed(2)
    for i in proc_copy.keys():
        picked = random.sample(range(len(proc_copy[i])), 30)
        for x in picked:
            train.append(proc_copy[i][x])

    # Restrict every training document to tokens in the keyword vocabulary.
    new_doc = [[k for k in tokens if k in keyword_vocab] for tokens in train]

    from gensim import corpora, models
    def LDA_TF_word(doc):
        """Fit a 300-topic LDA model on the tf-idf weighted bag-of-words of ``doc``.

        ``doc`` is a list of token lists. A gensim dictionary and bag-of-words
        corpus are built from it, weighted with a tf-idf model, and passed to
        ``LdaMulticore``. Returns the fitted model.
        """
        vocab = gensim.corpora.Dictionary(doc)
        bow_corpus = [vocab.doc2bow(tokens) for tokens in doc]
        weighted_corpus = models.TfidfModel(bow_corpus)[bow_corpus]
        return gensim.models.LdaMulticore(
            weighted_corpus,
            num_topics=300,
            id2word=vocab,
            passes=2,
            workers=4,
            minimum_probability=0,
        )

    # Fit the topic model on a shallow copy of the keyword-filtered documents.
    word_list = list(new_doc)
    rep_model = LDA_TF_word(word_list)

    #dropped if less than 30
    pd_copy = {key: docs for key, docs in processed_docs.items() if len(docs) >= 30}

    #sample 30-50 from each subfield
    sample_bysub = {}
    random.seed(2014)
    for i, docs in pd_copy.items():
        for x in random.sample(range(len(docs)), min(len(docs),50)):
            sample_bysub.setdefault(i, []).append(docs[x])

    print("number of valid subfields: "+str(len(sample_bysub.keys())))

    # Dictionary rebuilt from the same documents the model was trained on;
    # feed_models() uses it to convert documents to bag-of-words.
    dictionary_rep = gensim.corpora.Dictionary(word_list)

    def feed_models(text_str):
        """Return the scores ``rep_model`` assigns to one tokenized document.

        ``text_str`` is converted to bag-of-words with ``dictionary_rep``. The
        second element of every pair the model returns is collected, in the
        order the model returns them.
        """
        bow_vector = dictionary_rep.doc2bow(text_str)
        return [pair[1] for pair in rep_model[bow_vector]]

    # Score every sampled document with the topic model, grouped by subfield.
    store_score = {}
    for i, docs in sample_bysub.items():
        for tokens in docs:
            store_score.setdefault(i, []).append(feed_models(tokens))

    def geometric_median(points, method='auto', options={}):
        """
        Compute the geometric median of a 2D array of points.

        method selects the algorithm, looked up in ``_methods``:
            * 'auto' -- 'weiszfeld' when points have more than two columns,
              otherwise 'minimize'
            * 'minimize' -- scipy.optimize the sum of distances
            * 'weiszfeld' -- Weiszfeld's algorithm

        options is forwarded unchanged to the chosen algorithm.
        Raises ValueError when points is one-dimensional.
        """
        points = np.asarray(points)

        # A 1D input is ambiguous (one point, or several scalars?), so it is
        # rejected; np.median covers the scalar case.
        if points.ndim == 1:
            raise ValueError("Expected 2D array")

        if method == 'auto':
            # weiszfeld tends to converge faster in higher dimensions
            method = 'weiszfeld' if points.shape[1] > 2 else 'minimize'

        solver = _methods[method]
        return solver(points, options)


    def minimize_method(points, options={}):
        """
        Geometric median found by numerically minimizing the summed distance
        from a candidate point to all ``points``.

        The search starts from the centroid and uses scipy's COBYLA method.
        ``options`` is accepted for signature parity and is not used.
        """
        start = points.mean(axis=0)
        fit = minimize(lambda x: cdist([x], points).sum(), start, method='COBYLA')
        return fit.x


    def weiszfeld_method(points, options={}):
        """
        Geometric median via Weiszfeld's iterative re-weighting.

        ``options`` may override 'maxiter' (default 1000) and 'tol'
        (default 1e-7). Iteration starts at the centroid and stops once the
        estimate moves by at most 'tol' or 'maxiter' iterations have run.
        """
        settings = {'maxiter': 1000, 'tol': 1e-7, **options}

        # initial guess: centroid
        guess = points.mean(axis=0)

        completed = 0
        while completed < settings['maxiter']:
            distances = cdist([guess], points).T

            # Avoid dividing by zero when the estimate sits exactly on a data
            # point: such distances are replaced by 1.
            # TODO: Wikipedia cites how to deal with distance 0
            distances = np.where(distances == 0, 1, distances)

            inverse_sum = (1./distances).sum(axis=0)
            guess_next = (points/distances).sum(axis=0) / inverse_sum

            moved = np.sqrt(((guess - guess_next)**2).sum())
            guess = guess_next

            if moved <= settings['tol']:
                break

            completed += 1

        return guess


    # Dispatch table used by geometric_median(): method name -> implementation.
    _methods = dict(
        minimize=minimize_method,
        weiszfeld=weiszfeld_method,
    )

    # Geometric median of each subfield's document score vectors. Each value
    # is kept as a one-element list, as before.
    median_score = {}
    for i in store_score.keys():
        median = geometric_median(store_score[i])
        median_score.setdefault(i, []).append(median)

    # Euclidean distance between every ordered pair of subfield medians.
    from scipy.spatial import distance
    dist_store = {}
    for i in median_score.keys():
        sub_dist = {}
        for j in median_score.keys():
            # Unwrap the one-element lists: euclidean() expects 1-D vectors,
            # and newer SciPy releases raise on the wrapped (2-D) form.
            dist = distance.euclidean(median_score[i][0], median_score[j][0])
            sub_dist[j] = dist
        dist_store[i] = sub_dist

    # All subfield codes in the subject table, in ascending order.
    l_df = sorted(subject["subfield"].unique().tolist())
    # Size of the pair grid. This replaces a hard-coded 309, which only works
    # when the subject table has exactly that many distinct subfields.
    n_sub = len(l_df)

    # Every ordered (sub1, sub2) pair as two aligned columns; sub1 varies slowest.
    df1 = pd.DataFrame()
    df1["sub1"] = np.repeat(l_df,n_sub).tolist()
    df1["sub2"] = l_df*n_sub

    # Distance for each pair, in the same order as the columns above; None
    # when either subfield has no entry in dist_store for this iteration.
    dist = [dist_store.get(i, {}).get(j) for i in l_df for j in l_df]

    # Save file

    df1["distance"] = dist
    df1.to_csv("../data/datadist/datadist_"+str(file_i+1990)+".csv")

For Year: 2005  For subfield: 1000
For Year: 2005  For subfield: 1100
For Year: 2005  For subfield: 1101
For Year: 2005  For subfield: 1102
For Year: 2005  For subfield: 1103
For Year: 2005  For subfield: 1104
For Year: 2005  For subfield: 1105
For Year: 2005  For subfield: 1106
For Year: 2005  For subfield: 1107
For Year: 2005  For subfield: 1108
For Year: 2005  For subfield: 1109
For Year: 2005  For subfield: 1110
For Year: 2005  For subfield: 1111
For Year: 2005  For subfield: 1201
For Year: 2005  For subfield: 1202
For Year: 2005  For subfield: 1204
For Year: 2005  For subfield: 1207
For Year: 2005  For subfield: 1211
For Year: 2005  For subfield: 1212
For Year: 2005  For subfield: 1300
For Year: 2005  For subfield: 1302
For Year: 2005  For subfield: 1303
For Year: 2005  For subfield: 1304
For Year: 2005  For subfield: 1305
For Year: 2005  For subfield: 1306
For Year: 2005  For subfield: 1307
For Year: 2005  For subfield: 1308
For Year: 2005  For subfield: 1309
For Year: 2005  For 

For Year: 2005  For subfield: 3309
For Year: 2005  For subfield: 3310
For Year: 2005  For subfield: 3312
For Year: 2005  For subfield: 3314
For Year: 2005  For subfield: 3316
For Year: 2005  For subfield: 3317
For Year: 2005  For subfield: 3318
For Year: 2005  For subfield: 3319
For Year: 2005  For subfield: 3320
For Year: 2005  For subfield: 3322
For Year: 2005  For subfield: 3400
For Year: 2005  For subfield: 3402
For Year: 2005  For subfield: 3403
For Year: 2005  For subfield: 3404
For Year: 2005  For subfield: 3500
For Year: 2005  For subfield: 3501
For Year: 2005  For subfield: 3504
For Year: 2005  For subfield: 3505
For Year: 2005  For subfield: 3506
For Year: 2005  For subfield: 3600
For Year: 2005  For subfield: 3602
For Year: 2005  For subfield: 3605
For Year: 2005  For subfield: 3607
For Year: 2005  For subfield: 3609
For Year: 2005  For subfield: 3612
For Year: 2005  For subfield: 3614
For Year: 2005  For subfield: 3616


  diff = np.log(self.expElogbeta)


number of valid subfields: 218
For Year: 2006  For subfield: 1000
For Year: 2006  For subfield: 1100
For Year: 2006  For subfield: 1101
For Year: 2006  For subfield: 1102
For Year: 2006  For subfield: 1103
For Year: 2006  For subfield: 1104
For Year: 2006  For subfield: 1105
For Year: 2006  For subfield: 1106
For Year: 2006  For subfield: 1107
For Year: 2006  For subfield: 1108
For Year: 2006  For subfield: 1109
For Year: 2006  For subfield: 1110
For Year: 2006  For subfield: 1111
For Year: 2006  For subfield: 1201
For Year: 2006  For subfield: 1202
For Year: 2006  For subfield: 1203
For Year: 2006  For subfield: 1204
For Year: 2006  For subfield: 1207
For Year: 2006  For subfield: 1211
For Year: 2006  For subfield: 1212
For Year: 2006  For subfield: 1300
For Year: 2006  For subfield: 1302
For Year: 2006  For subfield: 1303
For Year: 2006  For subfield: 1304
For Year: 2006  For subfield: 1305
For Year: 2006  For subfield: 1306
For Year: 2006  For subfield: 1307
For Year: 2006  For subf

For Year: 2006  For subfield: 3206
For Year: 2006  For subfield: 3207
For Year: 2006  For subfield: 3300
For Year: 2006  For subfield: 3301
For Year: 2006  For subfield: 3302
For Year: 2006  For subfield: 3303
For Year: 2006  For subfield: 3304
For Year: 2006  For subfield: 3305
For Year: 2006  For subfield: 3306
For Year: 2006  For subfield: 3307
For Year: 2006  For subfield: 3308
For Year: 2006  For subfield: 3309
For Year: 2006  For subfield: 3310
For Year: 2006  For subfield: 3311
For Year: 2006  For subfield: 3312
For Year: 2006  For subfield: 3314
For Year: 2006  For subfield: 3316
For Year: 2006  For subfield: 3317
For Year: 2006  For subfield: 3318
For Year: 2006  For subfield: 3319
For Year: 2006  For subfield: 3322
For Year: 2006  For subfield: 3400
For Year: 2006  For subfield: 3401
For Year: 2006  For subfield: 3402
For Year: 2006  For subfield: 3403
For Year: 2006  For subfield: 3404
For Year: 2006  For subfield: 3500
For Year: 2006  For subfield: 3501
For Year: 2006  For 

For Year: 2007  For subfield: 2805
For Year: 2007  For subfield: 2806
For Year: 2007  For subfield: 2807
For Year: 2007  For subfield: 2808
For Year: 2007  For subfield: 2809
For Year: 2007  For subfield: 2900
For Year: 2007  For subfield: 2901
For Year: 2007  For subfield: 2902
For Year: 2007  For subfield: 2905
For Year: 2007  For subfield: 2906
For Year: 2007  For subfield: 2907
For Year: 2007  For subfield: 2908
For Year: 2007  For subfield: 2909
For Year: 2007  For subfield: 2910
For Year: 2007  For subfield: 2911
For Year: 2007  For subfield: 2912
For Year: 2007  For subfield: 2913
For Year: 2007  For subfield: 2914
For Year: 2007  For subfield: 2916
For Year: 2007  For subfield: 2917
For Year: 2007  For subfield: 2919
For Year: 2007  For subfield: 2921
For Year: 2007  For subfield: 2922
For Year: 2007  For subfield: 2923
For Year: 2007  For subfield: 3000
For Year: 2007  For subfield: 3002
For Year: 2007  For subfield: 3003
For Year: 2007  For subfield: 3004
For Year: 2007  For 

For Year: 2008  For subfield: 2614
For Year: 2008  For subfield: 2700
For Year: 2008  For subfield: 2701
For Year: 2008  For subfield: 2702
For Year: 2008  For subfield: 2703
For Year: 2008  For subfield: 2704
For Year: 2008  For subfield: 2705
For Year: 2008  For subfield: 2706
For Year: 2008  For subfield: 2707
For Year: 2008  For subfield: 2708
For Year: 2008  For subfield: 2710
For Year: 2008  For subfield: 2711
For Year: 2008  For subfield: 2712
For Year: 2008  For subfield: 2713
For Year: 2008  For subfield: 2714
For Year: 2008  For subfield: 2715
For Year: 2008  For subfield: 2716
For Year: 2008  For subfield: 2717
For Year: 2008  For subfield: 2718
For Year: 2008  For subfield: 2719
For Year: 2008  For subfield: 2720
For Year: 2008  For subfield: 2721
For Year: 2008  For subfield: 2722
For Year: 2008  For subfield: 2723
For Year: 2008  For subfield: 2724
For Year: 2008  For subfield: 2725
For Year: 2008  For subfield: 2726
For Year: 2008  For subfield: 2727
For Year: 2008  For 

For Year: 2009  For subfield: 2003
For Year: 2009  For subfield: 2100
For Year: 2009  For subfield: 2102
For Year: 2009  For subfield: 2103
For Year: 2009  For subfield: 2105
For Year: 2009  For subfield: 2200
For Year: 2009  For subfield: 2201
For Year: 2009  For subfield: 2202
For Year: 2009  For subfield: 2203
For Year: 2009  For subfield: 2204
For Year: 2009  For subfield: 2205
For Year: 2009  For subfield: 2207
For Year: 2009  For subfield: 2208
For Year: 2009  For subfield: 2209
For Year: 2009  For subfield: 2210
For Year: 2009  For subfield: 2211
For Year: 2009  For subfield: 2213
For Year: 2009  For subfield: 2215
For Year: 2009  For subfield: 2300
For Year: 2009  For subfield: 2302
For Year: 2009  For subfield: 2303
For Year: 2009  For subfield: 2304
For Year: 2009  For subfield: 2305
For Year: 2009  For subfield: 2306
For Year: 2009  For subfield: 2307
For Year: 2009  For subfield: 2308
For Year: 2009  For subfield: 2310
For Year: 2009  For subfield: 2311
For Year: 2009  For 

For Year: 2010  For subfield: 1310
For Year: 2010  For subfield: 1311
For Year: 2010  For subfield: 1312
For Year: 2010  For subfield: 1313
For Year: 2010  For subfield: 1314
For Year: 2010  For subfield: 1315
For Year: 2010  For subfield: 1400
For Year: 2010  For subfield: 1401
For Year: 2010  For subfield: 1402
For Year: 2010  For subfield: 1403
For Year: 2010  For subfield: 1405
For Year: 2010  For subfield: 1406
For Year: 2010  For subfield: 1408
For Year: 2010  For subfield: 1500
For Year: 2010  For subfield: 1501
For Year: 2010  For subfield: 1502
For Year: 2010  For subfield: 1503
For Year: 2010  For subfield: 1504
For Year: 2010  For subfield: 1505
For Year: 2010  For subfield: 1506
For Year: 2010  For subfield: 1600
For Year: 2010  For subfield: 1601
For Year: 2010  For subfield: 1602
For Year: 2010  For subfield: 1603
For Year: 2010  For subfield: 1604
For Year: 2010  For subfield: 1605
For Year: 2010  For subfield: 1606
For Year: 2010  For subfield: 1607
For Year: 2010  For 

For Year: 2010  For subfield: 3314
For Year: 2010  For subfield: 3315
For Year: 2010  For subfield: 3316
For Year: 2010  For subfield: 3317
For Year: 2010  For subfield: 3318
For Year: 2010  For subfield: 3319
For Year: 2010  For subfield: 3320
For Year: 2010  For subfield: 3321
For Year: 2010  For subfield: 3322
For Year: 2010  For subfield: 3400
For Year: 2010  For subfield: 3401
For Year: 2010  For subfield: 3402
For Year: 2010  For subfield: 3403
For Year: 2010  For subfield: 3404
For Year: 2010  For subfield: 3500
For Year: 2010  For subfield: 3501
For Year: 2010  For subfield: 3503
For Year: 2010  For subfield: 3504
For Year: 2010  For subfield: 3505
For Year: 2010  For subfield: 3506
For Year: 2010  For subfield: 3600
For Year: 2010  For subfield: 3601
For Year: 2010  For subfield: 3602
For Year: 2010  For subfield: 3603
For Year: 2010  For subfield: 3605
For Year: 2010  For subfield: 3607
For Year: 2010  For subfield: 3609
For Year: 2010  For subfield: 3612
For Year: 2010  For 

For Year: 2011  For subfield: 2800
For Year: 2011  For subfield: 2801
For Year: 2011  For subfield: 2802
For Year: 2011  For subfield: 2803
For Year: 2011  For subfield: 2804
For Year: 2011  For subfield: 2805
For Year: 2011  For subfield: 2806
For Year: 2011  For subfield: 2807
For Year: 2011  For subfield: 2808
For Year: 2011  For subfield: 2809
For Year: 2011  For subfield: 2900
For Year: 2011  For subfield: 2901
For Year: 2011  For subfield: 2902
For Year: 2011  For subfield: 2904
For Year: 2011  For subfield: 2905
For Year: 2011  For subfield: 2906
For Year: 2011  For subfield: 2907
For Year: 2011  For subfield: 2908
For Year: 2011  For subfield: 2909
For Year: 2011  For subfield: 2910
For Year: 2011  For subfield: 2911
For Year: 2011  For subfield: 2912
For Year: 2011  For subfield: 2913
For Year: 2011  For subfield: 2914
For Year: 2011  For subfield: 2915
For Year: 2011  For subfield: 2916
For Year: 2011  For subfield: 2917
For Year: 2011  For subfield: 2919
For Year: 2011  For 

For Year: 2012  For subfield: 2503
For Year: 2012  For subfield: 2504
For Year: 2012  For subfield: 2505
For Year: 2012  For subfield: 2506
For Year: 2012  For subfield: 2507
For Year: 2012  For subfield: 2508
For Year: 2012  For subfield: 2600
For Year: 2012  For subfield: 2601
For Year: 2012  For subfield: 2602
For Year: 2012  For subfield: 2603
For Year: 2012  For subfield: 2604
For Year: 2012  For subfield: 2605
For Year: 2012  For subfield: 2606
For Year: 2012  For subfield: 2607
For Year: 2012  For subfield: 2608
For Year: 2012  For subfield: 2609
For Year: 2012  For subfield: 2610
For Year: 2012  For subfield: 2611
For Year: 2012  For subfield: 2612
For Year: 2012  For subfield: 2613
For Year: 2012  For subfield: 2614
For Year: 2012  For subfield: 2700
For Year: 2012  For subfield: 2701
For Year: 2012  For subfield: 2702
For Year: 2012  For subfield: 2703
For Year: 2012  For subfield: 2704
For Year: 2012  For subfield: 2705
For Year: 2012  For subfield: 2706
For Year: 2012  For 

For Year: 2013  For subfield: 1705
For Year: 2013  For subfield: 1706
For Year: 2013  For subfield: 1707
For Year: 2013  For subfield: 1708
For Year: 2013  For subfield: 1709
For Year: 2013  For subfield: 1710
For Year: 2013  For subfield: 1711
For Year: 2013  For subfield: 1712
For Year: 2013  For subfield: 1800
For Year: 2013  For subfield: 1802
For Year: 2013  For subfield: 1803
For Year: 2013  For subfield: 1804
For Year: 2013  For subfield: 1900
For Year: 2013  For subfield: 1901
For Year: 2013  For subfield: 1902
For Year: 2013  For subfield: 1903
For Year: 2013  For subfield: 1904
For Year: 2013  For subfield: 1905
For Year: 2013  For subfield: 1906
For Year: 2013  For subfield: 1907
For Year: 2013  For subfield: 1908
For Year: 2013  For subfield: 1909
For Year: 2013  For subfield: 1910
For Year: 2013  For subfield: 1911
For Year: 2013  For subfield: 1912
For Year: 2013  For subfield: 2000
For Year: 2013  For subfield: 2001
For Year: 2013  For subfield: 2002
For Year: 2013  For 

For Year: 2013  For subfield: 3609
For Year: 2013  For subfield: 3610
For Year: 2013  For subfield: 3611
For Year: 2013  For subfield: 3612
For Year: 2013  For subfield: 3614
For Year: 2013  For subfield: 3616
number of valid subfields: 258
For Year: 2014  For subfield: 1000
For Year: 2014  For subfield: 1100
For Year: 2014  For subfield: 1101
For Year: 2014  For subfield: 1102
For Year: 2014  For subfield: 1103
For Year: 2014  For subfield: 1104
For Year: 2014  For subfield: 1105
For Year: 2014  For subfield: 1106
For Year: 2014  For subfield: 1107
For Year: 2014  For subfield: 1108
For Year: 2014  For subfield: 1109
For Year: 2014  For subfield: 1110
For Year: 2014  For subfield: 1111
For Year: 2014  For subfield: 1201
For Year: 2014  For subfield: 1202
For Year: 2014  For subfield: 1203
For Year: 2014  For subfield: 1204
For Year: 2014  For subfield: 1207
For Year: 2014  For subfield: 1208
For Year: 2014  For subfield: 1210
For Year: 2014  For subfield: 1211
For Year: 2014  For subf

For Year: 2014  For subfield: 2913
For Year: 2014  For subfield: 2914
For Year: 2014  For subfield: 2915
For Year: 2014  For subfield: 2916
For Year: 2014  For subfield: 2917
For Year: 2014  For subfield: 2919
For Year: 2014  For subfield: 2921
For Year: 2014  For subfield: 2922
For Year: 2014  For subfield: 2923
For Year: 2014  For subfield: 3000
For Year: 2014  For subfield: 3001
For Year: 2014  For subfield: 3002
For Year: 2014  For subfield: 3003
For Year: 2014  For subfield: 3004
For Year: 2014  For subfield: 3005
For Year: 2014  For subfield: 3100
For Year: 2014  For subfield: 3101
For Year: 2014  For subfield: 3102
For Year: 2014  For subfield: 3103
For Year: 2014  For subfield: 3104
For Year: 2014  For subfield: 3105
For Year: 2014  For subfield: 3106
For Year: 2014  For subfield: 3107
For Year: 2014  For subfield: 3108
For Year: 2014  For subfield: 3109
For Year: 2014  For subfield: 3110
For Year: 2014  For subfield: 3200
For Year: 2014  For subfield: 3201
For Year: 2014  For 

For Year: 2015  For subfield: 2614
For Year: 2015  For subfield: 2700
For Year: 2015  For subfield: 2701
For Year: 2015  For subfield: 2702
For Year: 2015  For subfield: 2703
For Year: 2015  For subfield: 2704
For Year: 2015  For subfield: 2705
For Year: 2015  For subfield: 2706
For Year: 2015  For subfield: 2707
For Year: 2015  For subfield: 2708
For Year: 2015  For subfield: 2710
For Year: 2015  For subfield: 2711
For Year: 2015  For subfield: 2712
For Year: 2015  For subfield: 2713
For Year: 2015  For subfield: 2714
For Year: 2015  For subfield: 2715
For Year: 2015  For subfield: 2716
For Year: 2015  For subfield: 2717
For Year: 2015  For subfield: 2718
For Year: 2015  For subfield: 2719
For Year: 2015  For subfield: 2720
For Year: 2015  For subfield: 2721
For Year: 2015  For subfield: 2722
For Year: 2015  For subfield: 2723
For Year: 2015  For subfield: 2724
For Year: 2015  For subfield: 2725
For Year: 2015  For subfield: 2726
For Year: 2015  For subfield: 2727
For Year: 2015  For 

In [617]:
# Rows of the distance table whose first subfield is 2003, ordered by distance.
mask_2003 = df1["sub1"] == 2003
econ_df = df1.loc[mask_2003]
econ_df.sort_values(by='distance')

Unnamed: 0,sub1,sub2,distance
29046,2003,1000,
29047,2003,1100,
29048,2003,1101,
29049,2003,1102,
29050,2003,1103,
29051,2003,1104,
29052,2003,1105,
29053,2003,1106,
29054,2003,1107,
29055,2003,1108,
