In [1]:
import sqlite3
import os, sys
import pandas as pd
from datetime import datetime 
import numpy as np
import pickle
import warnings

#data_dir = '/Users/xlx/Downloads/graph-data'
data_dir = '/home/xlx/d2/MicrosoftAcademicGraph'
conf_list = ['MM', 'CVPR', 'NIPS', 'ICML', 'IJCAI', 'PLDI']

paper_db = os.path.join(data_dir, 'Paper.db')

conn = sqlite3.connect(paper_db)
cur = conn.cursor()

load_ref = lambda fn: pd.read_table(fn, header=None, names=['PaperID', 'RefID'])

conf_file = os.path.join(data_dir, 'data_txt', 'ConferenceSeries.txt')
conf_df = pd.read_table(conf_file, header=None, names=['ConfID', 'Abbrv', 'FullName'])


In [2]:
## create a new table with 6 columns
"""
paper (id, conf/jnl, year)  ref (id, conf/jnl, year)
"""
# for each conference, 
#for c in conf_list[:1]:
c = conf_list[0]
row = conf_df.loc[conf_df['Abbrv'] == c]
conf_id = list(row['ConfID'])[0]

conf_paper_file = os.path.join(data_dir, 'papers.'+ c +'.txt')
df_paper = pd.read_table(conf_paper_file, header=None, 
                         names=['PaperID', 'TitleOrig', 'TitleNorm', 'PubYear', 'PubDate', 
                               'DOI', 'VenueOrig', 'VenueNorm', 'JournalID', 'ConfID', 'PaperRank' ])
df_paper.head()
set_paper = set(df_paper['PaperID'])

citing_file = os.path.join(data_dir, 'citing.'+c+'.txt')
df_citing = load_ref(citing_file)
cited_file = os.path.join(data_dir, 'cited.'+c+'.txt')
df_cited = load_ref(cited_file)
print ("{} conference {}: {} papers ({}-{})".format(datetime.now(), c, df_paper['PaperID'].count(), 
                                               df_paper['PubYear'].min(), df_paper['PubYear'].max()))
print ("\t citing {} papers, cited by {}".format(df_citing['PaperID'].count(), df_cited['PaperID'].count()))

# left joins for both the citing and cited
dfx_citing = df_citing.merge(df_paper[['PaperID', 'PubYear', 'ConfID']], on='PaperID', how='left') 
dfx_citing = dfx_citing.rename(columns = {'PubYear':'PaperPubYear', 'ConfID':"PaperConfID"})
dfx_citing['RefPubYear'] = 1000
dfx_citing['RefVenueID'] = 'AAAAaaaa'

dfx_cited = df_cited.merge(df_paper[['PaperID', 'PubYear', 'ConfID']], 
                           left_on="RefID", right_on='PaperID', how='left') 
dfx_cited.drop('PaperID_y', axis=1, inplace=True)
dfx_cited = dfx_cited.rename(columns = {'PubYear':'RefPubYear', 'ConfID':"RefConfID", "PaperID_x":"PaperID"})
dfx_cited['PaperPubYear'] = 1000
dfx_cited['PaperVenueID'] = 'AAAAaaaa'


2016-01-29 17:14:33.327059 conference MM: 3389 papers (1992-2015)
	 citing 47990 papers, cited by 68226


In [6]:
# go over the citing db
cnt = 0

for idx, row in dfx_citing.iterrows():
    cur.execute('SELECT * FROM paper_pruned WHERE id=?', (row['RefID'], ) )
    s = cur.fetchone() 
    dfx_citing['RefPubYear'][idx] = s[1]
    dfx_citing['RefVenueID'][idx] = s[2]
    cnt += 1
    if cnt % 1000 == 0 : # 2000000
        print('{} {:6.0f} / {:6.0f} records'.format(
                datetime.now(), cnt, df_citing['PaperID'].count() ) )
    if cnt >= 1e9: #1e9:
        break

print('{} {:6.0f} / {:6.0f} records. Done.\n\n'.format(
                datetime.now(), cnt, df_citing['PaperID'].count() ) )     

2016-01-29 17:30:15.606812   1000 /  47990 records
2016-01-29 17:31:16.129990   2000 /  47990 records
2016-01-29 17:32:04.096792   3000 /  47990 records
2016-01-29 17:33:02.861884   4000 /  47990 records
2016-01-29 17:34:05.344698   5000 /  47990 records
2016-01-29 17:35:00.227603   6000 /  47990 records
2016-01-29 17:35:45.700487   7000 /  47990 records
2016-01-29 17:36:32.280588   8000 /  47990 records
2016-01-29 17:37:21.454516   9000 /  47990 records
2016-01-29 17:38:09.362160  10000 /  47990 records
2016-01-29 17:39:21.506497  11000 /  47990 records
2016-01-29 17:40:13.707829  12000 /  47990 records
2016-01-29 17:41:04.785871  13000 /  47990 records
2016-01-29 17:42:03.520394  14000 /  47990 records
2016-01-29 17:43:04.439099  15000 /  47990 records
2016-01-29 17:44:03.653242  16000 /  47990 records
2016-01-29 17:45:07.323353  17000 /  47990 records
2016-01-29 17:46:02.799611  18000 /  47990 records
2016-01-29 17:46:57.406620  19000 /  47990 records
2016-01-29 17:47:55.153754  200

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [None]:
# go over the cited db
cnt = 0

for idx, row in dfx_cited.iterrows():
    cur.execute('SELECT * FROM paper_pruned WHERE id=?', (row['PaperID'], ) )
    s = cur.fetchone() 
    dfx_cited['PaperPubYear'][idx] = s[1]
    dfx_cited['PaperVenueID'][idx] = s[2]
    cnt += 1
    if cnt % 1000 == 0 : # 2000000
        print('{} {:6.0f} / {:6.0f} records'.format(
                datetime.now(), cnt, dfx_cited['PaperID'].count() ) )
    if cnt >= 1e9: #1e9:
        break

print('{} {:6.0f} / {:6.0f} records. Done.\n\n'.format(
                datetime.now(), cnt, dfx_cited['PaperID'].count() ) )

In [8]:
pickle.dump({"name":c, 'citing':dfx_citing, "cited":dfx_cited, "paper":df_paper}, 
           open(os.path.join(data_dir, 'cite_records.'+c+".pkl"), 'wb') ) 

In [None]:
ptr = 0 
line_cnt = 0
citing_cnt = [0, 0]
cited_cnt = [0, 0]
while ptr < len(paper_buf):        

    eol = paper_buf.find('\n', ptr)
    row = paper_buf[ptr:eol].split('\t')
    line_cnt += 1
    ptr = eol + 1
    """ paper table columns
        'PaperID', 'TitleOrig', 'TitleNorm', 'PubYear', 'PubDate', 
        'DOI', 'VenueOrig', 'VenueNorm', 'JournalID', 'ConfID', 'PaperRank'
    """

    cur_pid = row[0]
    r_ref = list(np.nonzero(dfx_citing['RefID'] == cur_pid)[0])
    r_paper  = list(np.nonzero(dfx_cited['PaperID'] == cur_pid)[0])
    
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
    
        if len(r_ref) >0:
            citing_cnt[0] += 1
            for rid in r_ref:
                # for each paper being cited by any paper in Conf
                dfx_citing['RefPubYear'][rid] = row[3]
                citing_cnt[1] += 1
                if row[9]: # conference
                    dfx_citing['RefVenueID'][rid] = row[9]
                elif row[8]: # journal
                    dfx_citing['RefVenueID'][rid] = row[8]

        if len(r_paper) >0:
            cited_cnt[0] += 1
            for rid in r_paper:
                cited_cnt[1] += 1
                # for each paper citing by any paper in Conf
                dfx_cited['PaperPubYear'][rid] = row[3]
                if row[9]: # conference
                    dfx_cited['PaperVenueID'][rid] = row[9]
                elif row[8]:
                    dfx_cited['PaperVenueID'][rid] = row[8]
        

    if line_cnt % 5000 == 0 : # 2000000
        print('{} {:9.0f} lines; citing {:6.0f}, {:6.0f} unique; {:6.0f} cited, {:6.0f} unique'.format(
                datetime.now(), line_cnt, citing_cnt[1], citing_cnt[0], cited_cnt[1], cited_cnt[0]) )
    if line_cnt >= 1e9: #1e9:
        break


pickle.dump({"name":c, 'citing':dfx_citing, "cited":dfx_cited, "paper":df_paper}, 
           os.path.join(data_dir, 'cite_records.'+c+".pkl"))
print('{} {:9.0f} lines; citing {:6.0f}, {:6.0f} unique; {:6.0f} cited, {:6.0f} unique\n\n'.format(
                datetime.now(), line_cnt, citing_cnt[1], citing_cnt[0], cited_cnt[1], cited_cnt[0]) )
