In [468]:
%matplotlib inline 

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import re
import pickle

from matplotlib import rcParams

#these colors come from colorbrewer2.org. Each is an RGB triplet
dark2_colors = [(0.10588235294117647, 0.6196078431372549, 0.4666666666666667),
                (0.8509803921568627, 0.37254901960784315, 0.00784313725490196),
                (0.4588235294117647, 0.4392156862745098, 0.7019607843137254),
                (0.9058823529411765, 0.1607843137254902, 0.5411764705882353),
                (0.4, 0.6509803921568628, 0.11764705882352941),
                (0.9019607843137255, 0.6705882352941176, 0.00784313725490196),
                (0.6509803921568628, 0.4627450980392157, 0.11372549019607843),
                (0.4, 0.4, 0.4)]

# Global matplotlib defaults applied to every figure drawn after this cell.
rcParams['figure.figsize'] = (10, 6)
rcParams['figure.dpi'] = 150
# 'axes.color_cycle' was deprecated in matplotlib 1.5 in favour of
# 'axes.prop_cycle' and dropped from later releases. Use the new key whenever
# this matplotlib knows it, and only fall back to the old key otherwise.
if 'axes.prop_cycle' in rcParams:
    rcParams['axes.prop_cycle'] = plt.cycler('color', dark2_colors)
else:
    rcParams['axes.color_cycle'] = dark2_colors
rcParams['lines.linewidth'] = 2
rcParams['axes.grid'] = True
rcParams['axes.facecolor'] = '#eeeeee'
rcParams['font.size'] = 14
rcParams['patch.edgecolor'] = 'none'

def remove_border(axes=None, top=False, right=False, left=True, bottom=True):
    """
    Minimize chartjunk by stripping out unnecessary plot borders and axis ticks.

    axes is the axes object to modify; when it is omitted (or falsy) the
    current axes returned by plt.gca() is used instead.

    The top/right/left/bottom keywords toggle whether the corresponding plot
    border is drawn. Ticks are first switched off on both axes and then
    re-enabled only on the sides whose keyword is true.
    """
    target = axes or plt.gca()

    #show or hide each spine according to its keyword
    for side, shown in (('top', top), ('right', right),
                        ('left', left), ('bottom', bottom)):
        target.spines[side].set_visible(shown)

    #start from a state with no ticks at all
    target.yaxis.set_ticks_position('none')
    target.xaxis.set_ticks_position('none')

    #then switch ticks back on for every visible side
    for wanted, axis, enable in ((top, target.xaxis, 'tick_top'),
                                 (bottom, target.xaxis, 'tick_bottom'),
                                 (left, target.yaxis, 'tick_left'),
                                 (right, target.yaxis, 'tick_right')):
        if wanted:
            getattr(axis, enable)()
        
# Widen the text rendering of DataFrames: allow 500 characters per line and
# up to 100 columns before pandas starts truncating the display.
for option_name, option_value in (('display.width', 500),
                                  ('display.max_columns', 100)):
    pd.set_option(option_name, option_value)

In [377]:
# Parse the catalogue file "abstracts.cat" into one list per field and then
# into the DataFrame `df` (one row per proposal).
#
# Lines are classified by their prefix:
#   'Prop. Type:', 'Category:', 'ID:   ', 'Cycle:', 'PI:' -> value from column 12
#   'Title:'                                              -> value from column 11
#   a line starting with a tab                            -> continuation of the
#                                                            most recent title
#   the dashed separator line                             -> next abstract slot
#   anything else                                         -> text of the current
#                                                            abstract

# 0-based index of one input line that is deliberately not parsed.
# NOTE(review): the reason is not recorded in this notebook -- presumably a
# malformed line in the data file; confirm against abstracts.cat.
SKIPPED_LINE_INDEX = 32934
RECORD_SEPARATOR = '------------------------------------------------------------------------------\n'

index=0
prop,cat,ids,cycle,title,pi=[],[],[],[],[],[]
# One list of text lines per abstract slot. The list grows by one slot per
# separator instead of being pre-sized to a fixed 12000 entries, so larger
# catalogues no longer overflow it with an IndexError.
abst0 = [[]]
lines=[]
with open("abstracts.cat") as infile:
    for i, line in enumerate(infile):
        lines.append(line)  # raw copy of every line, including the skipped one
        if i==SKIPPED_LINE_INDEX:
            pass
        elif line.startswith('Prop. Type:'):
            prop.append(line[12:].rstrip())
        elif line.startswith('Category:'):
            cat.append(line[12:].rstrip())
        elif line.startswith('ID:   '):
            ids.append(line[12:].rstrip())
        elif line.startswith('Cycle:'):
            cycle.append(line[12:].rstrip())
        elif line.startswith('Title:'):
            title.append(line[11:].rstrip())
        elif line.startswith('\t'):
            # wrapped title: glue the continuation onto the latest title
            title[-1]=title[-1]+' '+line[5:].rstrip()
        elif line.startswith('PI:'):
            pi.append(line[12:].rstrip())
        elif line.startswith(RECORD_SEPARATOR):
            index+=1
            abst0.append([])
        else:
            abst0[index].append(line.rstrip())

# drop slots that never received any text, then flatten each abstract
abst1 = [x for x in abst0 if x != []]
abstr=[" ".join(ab) for ab in abst1]

# All fields must describe the same number of proposals; otherwise the rows of
# the DataFrame would not line up. Fail with the actual counts for diagnosis.
field_lengths = (len(prop),len(cat),len(ids),len(cycle),len(title),len(pi),len(abstr))
if len(set(field_lengths)) != 1:
    raise ValueError('abstracts.cat parsed into columns of unequal length '
                     '(prop, cat, ids, cycle, title, pi, abstr): %r' % (field_lengths,))

df=pd.DataFrame({'prop_type':prop,'category':cat,'id':ids,'cycle':cycle,'title':title,'pi':pi,'abstract':abstr})

In [380]:
# Spot-check the parse: show every field of the second-to-last proposal.
a = -2
tuple(field[a] for field in (prop, cat, ids, cycle, title, pi, abstr))

('CAL/NIC',
 '',
 '9998',
 '12',
 'NICMOS Cycle 12 Grism Calibration and Standard Stars to 2.5microns',
 'Ralph Bohlin',
 ' This is the grism calibration proposal.')

In [383]:
# List the distinct proposal types found in the catalogue.
df.prop_type.unique()

array(['ENG/STIS/P', 'GO', 'CAL/AST', 'CAL/OTA', 'CAL/STIS', 'ENG/STIS',
       'GO/DD', 'CAL/ACS', 'CAL/NIC', 'CAL/WF2', 'GO/PAR', 'SNAP',
       'GTO/AST', 'GTO/OS', 'AR', 'GTO/FOS', 'GTO/ACS', 'ENG/SC',
       'ENG/ACS', 'ENG/NIC', 'ENG/FGS', 'NASA', 'ENG/WF2', 'GTO/HSP',
       'GTO/WFC', 'SM4/SC', 'ENG/AST', 'SM4/STIS', 'SM4/COS', 'SM4/WFC3',
       'SM4/ACS', 'GTO/HRS', 'SM4/NIC', 'AUG/HRS', 'SM4/FGS', 'SM4/ERO',
       'ENG/COS', 'GTO/COS', 'CAL/COS', 'CAL/WFC3', 'ENG/WFC3', 'GTO/FOC',
       'SV/FOS', 'SV/WFC', 'SV/OLT', 'OV/HRS', 'SV/HRS', 'OV/HSP', 'OV',
       'SV/HSP', 'SV/AST', 'OV/WFC', 'OV/FOS', 'OV/FOC', 'SV/FOC',
       'OV/OTA', 'OV/OLT', 'GST-5', 'GST/FOS', 'GST/AST', 'GST/WFC',
       'OV/GHRS', 'GST7', 'SV/MSC', 'GO/AM', 'CAL/FOC', 'CAL/FOS',
       'CAL/HRS', 'CAL/WFC', 'EROS', 'CAL/HSP', 'SV/GHRS', 'SAT/FOC',
       'SAT/FOS', 'SAT/AST', 'SAT/HSP', 'SAT/WFC', 'SAT/HRS', 'SNAP/SAT',
       'ERO/FOC', 'SV/OTA', 'SAT/OS', 'ENG', 'OV/AST', 'ENG/FOS',
       'ENG/OTA'

In [398]:
# Count how many proposals fall under each proposal type.
df['prop_type'].value_counts()

GO            5263
AR             905
CAL/STIS       432
GO/DD          409
SNAP           339
CAL/WFC3       305
CAL/ACS        257
CAL/WF2        175
CAL/NIC        159
CAL/COS        127
GTO/STIS       106
CAL/AST        106
GTO/HRS        104
CAL/WFC         99
GTO/OS          90
GO/CAR          87
CAL/HRS         84
CAL/FOC         78
GTO/FOS         76
GO/PAR          71
GTO/AST         68
CAL/FOS         64
GTO/WF2         63
GTO/FOC         63
GTO/HSP         60
GTO/WFC         59
SM2/STIS        58
GTO/NIC         54
CAL/OTA         52
GTO/ACS         52
              ... 
SAT/HSP          2
ERO/FOC          2
ENG/WFC          2
                 2
CAL/PAR          1
RPT/GTO          1
SV/WFPC          1
OV/GHRS          1
ENG/MT           1
SV/GHRS          1
SM3/NIC/PA       1
SV               1
OV/OLT           1
GO/AST           1
SAT/AST          1
GST/AST          1
GST/WFC          1
GST-5            1
SV/OTA           1
CAL/FGS          1
CAL              1
RPT/OS      

In [474]:
# Keep only the proposals whose type begins with 'G' or 'A'.
keep_mask = [ptype.startswith(('G', 'A')) for ptype in df['prop_type'].values]
df2 = df.loc[keep_mask, :]
df2

Unnamed: 0,abstract,category,cycle,id,pi,prop_type,title
1,We propose to observe ultraluminous X-ray sou...,GALAXIES,12,10001,Philip Kaaret,GO,Locating Ultraluminous X-Ray Sources
2,We propose deep followup HST and Chandra obse...,AGN,12,10002,Eric Perlman,GO,Detailed Study of X-ray Jets from a Complete S...
3,We propose 4 new Chandra observations of NGC4...,GALAXIES,12,10003,Craig Sarazin,GO,Deep Chandra and Hubble Observations of NGC469...
4,Extended jets have been a key target for Chan...,AGN,12,10004,F. Tavecchio,GO,The Physics of Relativistic Jets: Chandra Imag...
5,We propose to extend our ongoing studies of t...,GALAXIES,12,10005,Walter Lewin,GO,A Uniform Study of Globular Cluster X-raySourc...
6,During A01-3 we found 22 Black Hole X-ray Nov...,GALAXIES,12,10006,Michael Garcia,GO,Black Hole X-ray Novae in M31
7,"We propose to obtain the first high-quality, ...",HOT STARS,12,10007,Michael Garcia,GO,The Spectral Energy Distribution of Cen X-4
8,"We propose two HST orbits, nearly simultaneou...",GALAXIES,12,10008,Jonathan Grindlay,GO,The Supersoft Source 1E1339.8+2837 and Globula...
9,We propose a deep survey in the second lowest...,GALAXIES,12,10009,Jonathan Grindlay,GO,Galactic Bulge Deep Survey
41,Our recent discovery of a planetary-sized bod...,SOLAR SYSTEM,12,10041,Michael Brown,GO/DD,Characterization of a planetary-sized body in ...


In [454]:
# Select the proposals whose PI name contains the substring 'Frebel'.
frebel_rows = ['Frebel' in pi_name for pi_name in df['pi'].values]
df.loc[frebel_rows, :]

Unnamed: 0,abstract,category,cycle,id,pi,prop_type,title
1745,We propose to obtain near-UV HST/STIS spectro...,COOL STARS,17,11668,Anna Frebel,GO,Cosmo-chronometry and Elemental Abundance Dist...
3399,Relatively little is known about the chemical...,COOL STARS,21,13246,Anna Frebel,AR,The nucleosynthetic origins and chemical evolu...
4345,The nucleosynthetic signatures of the first s...,COOL STARS,23,14151,Anna Frebel,GO,Constraining Pop III supernova energies and th...


In [409]:
# Show the abstract text of every proposal of type 'ENG/MT'.
# print(...) with a single argument produces the same output on Python 2 and
# Python 3, whereas the bare print statement is a SyntaxError on Python 3.
print(df.loc[df.prop_type=='ENG/MT','abstract'].values)

[ '  Simple guidestar handoff (SGSH) capability was delivered as part of SOGS build 32.0. This proposal will test SGSH on HST by making one PC iimage of a fast-moving asteroid, doing a handoff procedure, and repeating the image. Any drift in the asteroid position between the two images will be compared to the SOGS prediction. If the drift is greater, the SESD MOVT team will attempt to determine the reason and rectify the problem before operational use of this procedure.']


In [466]:
# Print the first ten distinct PI names, one per line.
# Single-argument print(...) runs unchanged on both Python 2 and Python 3.
for i in df.loc[:,'pi'].unique()[0:10]:
    print(i)

Paul Goudfrooij
Philip Kaaret
Eric Perlman
Craig Sarazin
F. Tavecchio
Walter Lewin
Michael Garcia
Jonathan Grindlay
Edmund Nelan
Stefano Casertano


In [478]:
# Distinct PI names among the filtered proposals in df2, plus how many there are.
names=df2.loc[:,'pi'].unique()
# Format into one string first: a single-argument print(...) emits the same
# "<array> <count>" text on Python 2 and Python 3, while the two-argument
# print statement only parses on Python 2.
print('{0} {1}'.format(names, len(names)))

['Philip Kaaret' 'Eric Perlman' 'Craig Sarazin' ..., 'Alister Graham'
 'Alex Lazarian' 'Andrew Phillips'] 2037


In [479]:
# Save the array of PI names to disk as a pickle and report the file name.
outname='author_list_r.pickle'
# The with-block closes the file as soon as the dump finishes, instead of
# leaving an anonymous open handle for the garbage collector to clean up.
with open(outname, "wb") as outfile:
    pickle.dump(names, outfile)
print(outname)

author_list_r.pickle
