# Crunching the data

This code processes the raw data gathered with arxiv_tweets_get_data.ipynb. The result is a table indexed by the ids of the papers that contains, for every paper, the count of tweets, the link to the paper, its title, its scientific area and the times of the tweets about it.

Since the title and category of each paper are obtained by accessing the arXiv API, this can take some time.

The resulting data is saved as data/data.json.

In [43]:
import pandas as pd
import os.path as op

#to access the arXiv API and to parse its reply
import urllib2 as ul
import xml.etree.ElementTree as et

#to incorporate a 3 second time delay between arXiv queries
#(https://arxiv.org/help/api/user-manual#_calling_the_api)
import time

#load data
# The csv has no header row; its three columns are named here.
# 'Time' is read as plain text.  The previous dtype, pd.datetime, was only an
# alias of datetime.datetime (removed in pandas 2.0) and the `dtype` argument
# of read_csv does not parse dates.
# NOTE(review): presumably the old dtype resulted in unparsed strings as well;
# confirm that consumers of data.json expect the raw time strings.
raw_data = pd.read_csv(op.join('data', 'raw_data.csv'),
                       names=['Arxiv_Id', 'Version', 'Time'],
                       dtype={'Arxiv_Id': str, 'Version': int, 'Time': str})

d = pd.DataFrame(columns=['Title', 'Category', 'Area', 'Version', 'Link', 'Tweets', 'Times'])

#count tweets for every paper (this assignment also gives d its index: the paper ids)
d['Tweets'] = raw_data['Arxiv_Id'].value_counts()

# Group the tweets by paper once instead of re-filtering raw_data for every
# single paper, which was quadratic in the number of rows.
tweets_by_paper = raw_data.groupby('Arxiv_Id')

#find the highest version of the paper that occurs in raw_data
# (a Series indexed by paper id, so it is aligned with d's index on assignment)
d['Version'] = tweets_by_paper['Version'].max()

#construct links: version 0 yields the plain link without a 'v<N>' suffix
version_suffix = d['Version'].map(lambda v: '' if v == 0 else 'v' + str(v))
d['Link'] = 'https://arxiv.org/abs/' + d.index.to_series() + version_suffix

#get the times: one array per paper holding the times of all its tweets
times_by_paper = {arxiv_id: times.values
                  for arxiv_id, times in tweets_by_paper['Time']}
d['Times'] = [times_by_paper[arxiv_id] for arxiv_id in d.index]

In [44]:
#connects to Arxiv API, changes the dataframe in place
#WARNING: may take some time for big tables, it's a lot of requests to Arxiv
#pause between requests: 3 seconds, as specified in the manual

def get_arxiv_info(frame):
    """Look up the title and category of every paper in `frame`.

    One request is sent to the arXiv API for all papers in `frame`; the ids
    that are queried are the last path component of each 'Link' entry.  The
    answers are written to the 'Title' and 'Category' columns of `frame`,
    i.e. `frame` is modified in place and nothing is returned.

    Request errors are printed and otherwise ignored, so the affected rows
    keep their previous values.  Entries of the reply whose id is not part
    of `frame.index` (e.g. the error entries arXiv sends for ids it cannot
    handle) are reported and skipped instead of being appended as new rows.

    Every call that contacts arXiv ends with a 3 second pause, also after a
    failed request, so that consecutive calls respect the delay between
    requests asked for in the arXiv API manual.
    """
    import sys  # local import: only needed for the progress message below

    atom = '{http://www.w3.org/2005/Atom}'

    idlist = [link.split('/')[-1] for link in frame.Link]
    if not idlist:
        return  # nothing to look up, do not bother arXiv

    # max_results is set explicitly because the API otherwise returns only
    # the first 10 papers of the list.
    querry = ('http://export.arxiv.org/api/query?id_list=' + ','.join(idlist)
              + '&max_results=' + str(len(idlist)))

    print('Accesing: ' + querry)
    data = None
    try:
        response = ul.urlopen(querry)    #get data via Arxiv API
        try:
            data = response.read()
        finally:
            response.close()    #do not leak the connection
    except Exception as e:
        # best effort: report the problem and leave the rows untouched
        print(repr(e))

    if data is not None:
        root = et.fromstring(data)    #parse the data
        for entry in root.findall(atom + 'entry'):
            # the id is a URL ending in '<arxiv id>v<version>'; keep the bare id
            id_text = entry.findtext(atom + 'id') or ''
            arxiv_id = id_text.split('/')[-1].split('v')[0]

            # titles are wrapped over several lines; collapse all whitespace
            # runs into single spaces so that no words get glued together
            title = entry.findtext(atom + 'title')
            if title is not None:
                title = ' '.join(title.split())

            #there are two categories, the first one in the entry is used,
            #they should be the same
            category = None
            for node in entry.iter():
                if 'category' in node.tag:
                    category = node.attrib.get('term')
                    break

            # only fill in papers that were asked for: assigning to an unknown
            # label with .loc would silently append a new row
            if arxiv_id in frame.index:
                frame.loc[arxiv_id, 'Title'] = title
                frame.loc[arxiv_id, 'Category'] = category
            else:
                print('Nonexistent paper!')

    # pause between requests: 3 seconds, as specified in the manual
    sys.stdout.write('sleep... ')
    sys.stdout.flush()    #show the message before sleeping, not after
    time.sleep(3)
    print('wake up\n')
    
# !! WARNING !!
#chunk size must be 10 or less, rest of the papers will not be returned

chunk_size = 10
for start in range(0, len(d.index), chunk_size):
    chunk_ids = d.index[start:start + chunk_size]

    # get_arxiv_info fills in the frame it is given.  Whether a slice of d is
    # a view or a copy is not guaranteed by pandas, so the chunk is copied
    # explicitly and the looked-up columns are written back to d afterwards.
    chunk = d.iloc[start:start + chunk_size, :].copy()
    get_arxiv_info(chunk)

    # select by chunk_ids so that only the rows that belong to d are copied
    for column in ('Title', 'Category'):
        d.loc[chunk_ids, column] = chunk.loc[chunk_ids, column].values

if d['Title'].isnull().any():
    print('!!! WARNING !!!\nSome titles are empty!')

if d['Category'].isnull().any():
    print('!!! WARNING !!!\nSome categories are empty!')

print('Finished loading data')

Accesing: http://export.arxiv.org/api/query?id_list=1701.04928v1,1701.04383,1404.1100,1701.04862v1,1701.02461,1701.04968v1,1701.04831v1,1701.05130v1,1701.05004v1,1701.04923v1
sleep... wake up

Accesing: http://export.arxiv.org/api/query?id_list=1701.04949v1,1701.05105v1,1701.04851v1,1606.01305v3,1412.1897v4,1701.04944v1,1701.04858,1701.04739v1,1701.04925v1,1701.05179
sleep... wake up

Accesing: http://export.arxiv.org/api/query?id_list=1701.05039,1701.04844,1701.03980,1701.05122,1604.05417v3,1701.05159v1,1701.05013v1,1701.04972,1701.04836,1701.02389
sleep... wake up

Accesing: http://export.arxiv.org/api/query?id_list=1701.05121v1,1602.00970v3,1701.05088v1,1701.03757v1,1602.02334v3,1701.05056,1701.05126,1701.04926v1,1701.04568,1701.04895
sleep... wake up

Accesing: http://export.arxiv.org/api/query?id_list=1701.05036,1701.05178,1701.04782v1,1701.04889v1,1701.04940v1,1701.05060,1610.01945v3,1701.04832,1701.04921v1,1701.05094
sleep... wake up

Accesing: http://export.arxiv.org/api/query?

In [48]:
#translate the category into area

# Maps the archive part of an arXiv category (the text before the first '.',
# e.g. 'cs' in 'cs.LG') to the broad scientific area used in the final table.
cat_dict = {
    # physics archives
    'astro-ph': 'Physics',
    'cond-mat': 'Physics',
    'gr-qc': 'Physics',
    'hep-ex': 'Physics',
    'hep-lat': 'Physics',
    'hep-ph': 'Physics',
    'hep-th': 'Physics',
    'math-ph': 'Physics',
    'nlin': 'Physics',
    'nucl': 'Physics',
    'nucl-ex': 'Physics',
    'nucl-th': 'Physics',
    'physics': 'Physics',
    'quant-ph': 'Physics',
    # everything else
    'math': 'Math',
    'cs': 'Computer Science',
    'q-bio': 'Biology',
    'q-fin': 'Quantitative Finance',
    'stat': 'Statistics',
    # archives that were missing from the table, so their papers ended up
    # without an area
    'eess': 'Electrical Engineering and Systems Science',
    'econ': 'Economics',
}

def translate(cat):
    """Return the scientific area for the arXiv category `cat`.

    The part of `cat` before the first '.' is looked up in `cat_dict`.
    None is returned when that part is not listed there, and also when
    `cat` is not a string at all (None or NaN, i.e. a paper whose category
    is missing) instead of failing on the missing value.
    """
    try:
        archive = cat.split('.')[0]
    except AttributeError:
        # missing category: there is nothing to translate
        return None
    return cat_dict.get(archive)

# Derive the broad scientific area of every paper from its arXiv category.
d['Area'] = d['Category'].apply(translate)

# Papers whose category could not be translated are left without an area.
if d['Area'].isnull().any():
    print('!!! WARNING !!!\nSome Areas are empty!')

# Store the finished table.
output_path = op.join('data', 'data.json')
d.to_json(output_path)
print("\nFinished with translation of category to area, final data saved as 'data/data.json' ")


Finished with translation of category to area, final data saved as 'data/data.json' 
