# Crunching the data

This code processes the raw data gathered with arxiv_tweets_get_data.ipynb. The result is a table indexed by the ids of the papers; for every paper it contains the count of tweets, a link to the paper, its title, its category and scientific area, and the times of the tweets about the paper.

Since the title and scientific area of each paper are obtained by accessing the Arxiv API, this can take some time.

The resulting data is saved as data/data.json.

In [1]:
import pandas as pd
import os.path as op

#to access the Arxiv API and to parse its reply
import urllib2 as ul
import xml.etree.ElementTree as et

#load data: one row per tweet, with the paper id, the version mentioned and the time
raw_data=pd.read_csv( op.join('data','raw_data.csv'),
                     names=['Arxiv_Id','Version','Time'],
                     dtype={'Arxiv_Id':str,'Version':int,'Time':pd.datetime})

d=pd.DataFrame(columns=['Title','Category','Area','Version','Link','Tweets','Times'])

#count tweets for every paper
#this has to come first: the steps below map over d.index and expect paper ids there
d.Tweets=raw_data.Arxiv_Id.value_counts()

#group the raw rows by paper once, instead of filtering the whole of raw_data
#again for every single paper
by_paper=raw_data.groupby('Arxiv_Id')

#find the highest version of the paper that occurs in raw_data
max_version=by_paper.Version.max()
d.Version=d.index.map(lambda x: max_version[x])

#construct links: the bare id where the version is 0, otherwise id + 'v' + version
d.Link='https://arxiv.org/abs/'+d.index.where(d.Version==0, d.index+'v'+d.Version.map(str))

#get the times of all rows of raw_data that belong to the paper
d.Times=d.index.map(lambda x: by_paper.get_group(x).Time.values)

In [2]:
#translation of category into area
#every area is written down once together with the category prefixes that
#belong to it; the pairs are then flipped into a lookup keyed by prefix
cat_dict=dict(
    (prefix,area)
    for area,prefixes in [
        ('Physics',['cond-mat','astro-ph','gr-qc','hep-ex','hep-lat',
                    'hep-ph','hep-th','math-ph','nucl-ex','nucl-th',
                    'physics','quant-ph','nlin']),
        ('Math',['math']),
        ('Computer Science',['cs']),
        ('Biology',['q-bio']),
        ('Statistics',['stat']),
        ('Quantitative Finance',['q-fin']),
    ]
    for prefix in prefixes
)

#connects to Arxiv API, returns paper's title, area and category
#WARNING: may take some time for big tables- a lot of requests to Arxiv

def get_arxiv_info(link):
    """Ask the Arxiv API about one paper.

    Parameters
    ----------
    link : str
        Link to the paper. Only the part after the last '/' is used; it is
        sent to the API as the ``id_list`` value.

    Returns
    -------
    pandas.Series
        Three positional values ``[title, area, category]``. ``area`` is
        looked up in ``cat_dict`` using the part of ``category`` before the
        first '.'. Any value that cannot be determined is None; when the
        request fails or the reply cannot be parsed, all three are None.
        A Series is returned on every path so that ``Series.apply`` always
        receives results of the same shape.
    """
    atom='{http://www.w3.org/2005/Atom}'
    nothing_found=pd.Series([None,None,None])

    querry='http://export.arxiv.org/api/query?id_list='+link.split('/')[-1]
    print('Accesing: '+querry)
    try:
        response=ul.urlopen(querry)    #get data via Arxiv API
        try:
            data=response.read()
        finally:
            response.close()    #do not leave the connection open
        root=et.fromstring(data)    #parse the data
    except Exception as e:
        #best effort: report the problem and carry on with the next paper
        print(repr(e))
        return nothing_found

    #only the first entry of the reply is used
    entryelem=next(root.iter(atom+'entry'),None)
    if entryelem is None:
        print('No entry in reply: '+querry)
        return nothing_found

    category=None
    title=None
    for elem in entryelem.iter():
        if elem.tag==atom+'title' and elem.text is not None:
            title=elem.text.replace('\n ','').replace('\n','')
        #the first element whose tag mentions 'category' and that carries a
        #'term' attribute wins
        if 'category' in elem.tag and category is None:
            category=elem.attrib.get('term')

    #translate the category; a missing or unknown category gives no area
    area=None
    if category:
        area=cat_dict.get(category.split('.')[0])
    return pd.Series([title,area,category])

#announce the workload, then ask Arxiv about every link (one request per paper)
n_querries=len(d.index)
print('There are '+str(n_querries)+' querries to be made.')
arxiv_info=d.Link.apply(get_arxiv_info)
d[['Title','Area','Category']]=arxiv_info

#store the finished table
out_path=op.join('data','data.json')
d.to_json(out_path)
print("\nFinished, final data saved as 'data/data.json' ")

There are 827 querries to be made.
Accesing: http://export.arxiv.org/api/query?id_list=1701.04383
Accesing: http://export.arxiv.org/api/query?id_list=1404.1100
Accesing: http://export.arxiv.org/api/query?id_list=1701.04968v1
Accesing: http://export.arxiv.org/api/query?id_list=1701.04928v1
Accesing: http://export.arxiv.org/api/query?id_list=1701.05130v1
Accesing: http://export.arxiv.org/api/query?id_list=1701.04862v1
Accesing: http://export.arxiv.org/api/query?id_list=1701.04831v1
Accesing: http://export.arxiv.org/api/query?id_list=1701.05004v1
Accesing: http://export.arxiv.org/api/query?id_list=1701.04851v1
Accesing: http://export.arxiv.org/api/query?id_list=1701.04949v1
Accesing: http://export.arxiv.org/api/query?id_list=1412.1897v4
Accesing: http://export.arxiv.org/api/query?id_list=1701.04923v1
Accesing: http://export.arxiv.org/api/query?id_list=1701.04944
Accesing: http://export.arxiv.org/api/query?id_list=1701.05105v1
Accesing: http://export.arxiv.org/api/query?id_list=1606.01305v