## Analysis retrieval, no plotting

### This notebook provides an example of the new functions that separate the analyses from the plotting. The first-pass analysis results can now be saved directly and reused without rerunning the analyses.

In [44]:
import matplotlib.pyplot as plt
%matplotlib inline
import pickle
import numpy as np
import pandas as pd
import sys
import datetime
sys.path.append("../../code")
from __future__ import division

In [63]:
# This is the new Python module; it contains the analysis functions and does not do any plotting
from itc_cite_conditioning import *

In [5]:
# Load up the dataframes. Each file is opened in a `with` block so its handle is closed
# as soon as the object has been read.
# NOTE: pickle.load can execute arbitrary code; only load pickles that come from a trusted source.
with open('../../data/ref_dataframe_min.pkl', 'rb') as handle:
    ref_df_min = pickle.load(handle)
with open('../../data/plos_paper_dataframe.pkl', 'rb') as handle:
    plos_df = pickle.load(handle)
with open('../../data/citation_dataframe.pkl', 'rb') as handle:
    cite_df = pickle.load(handle)

In [6]:
# Left-join the citation data: each row's 'reference_UT' value is looked up in the index of cite_df
result = ref_df_min.join(other=cite_df, on='reference_UT', how='left')
# Then left-join the PLOS paper data the same way, matching 'paper_UT' against the index of plos_df
ref_df = result.join(other=plos_df, on='paper_UT', how='left')

In [7]:
# Remove the null reference rows, i.e. rows whose 'reference_UT' is the string '-1'
# (8907763 rows vs 10848620 rows)
ref_df = ref_df.loc[ref_df['reference_UT'].ne('-1')]

In [69]:
# As a test, keep only the rows whose reference journal ('ref_j1') is CELL
df_ex = ref_df.loc[ref_df['ref_j1'].eq('CELL')]

In [70]:
# retrieve_first_analysis is the aggregate function from itc_cite_conditioning that performs the
# histogram, error, chi2, etc. retrieval for a dataframe of references. Feel free to test.
# It returns seven values, in this order:
# hist_list_raw, count_totals, error_tracker, unique_papers_list, upper_bound, chisqs, p_vals
k = retrieve_first_analysis(df_ex)
# Bare expression so that the notebook displays the returned values
k

In [64]:
# Example of multilevel analysis retrieval. This function builds a nested dictionary keyed by the PLOS journal, the reference journal, and a delta t (in years)
# Depending on what your needs are for the time constraint (e.g. loose or strict time windows, 365 days +- some tolerance or just from the previous year), adjust the comparison on pair_gap where sub_df is defined
def mass_itc_retrieval_journal(df, min_freq, year_span):
    '''Run retrieve_first_analysis on every (PLOS journal, reference journal, delta t) slice of df.

    Args:
        df: dataframe with the columns 'ref_j1', 'plos_j1', 'plos_pub_year' and 'ref_pub_year'.
        min_freq: a reference journal is analysed only if it appears in strictly more than
            min_freq rows of df.
        year_span: iterable of year differences (plos_pub_year - ref_pub_year) to analyse.

    Returns:
        A nested dictionary itc_dict[plos_journal][ref_journal][delta_t]. Each leaf is a
        dictionary with the keys 'hist', 'occs', 'error', 'unique', 'max', 'chi2' and 'pvals',
        holding the seven values returned by retrieve_first_analysis, in that order.
        This can then be converted to a dataframe if you prefer to work with that.
    '''
    itc_dict = {}

    # Can change this constraint as well if you want to look at field or some other feature.
    # value_counts() already gives the number of rows per reference journal (most frequent first),
    # so the frequency threshold can be applied to it directly.
    ref_counts = df['ref_j1'].value_counts()
    journals = list(ref_counts[ref_counts > min_freq].index)

    # If you only want the most frequent PLOS journal (PLOS ONE in this data set), append [:1]
    # to the line below. Do not use [0]: that yields a string, and the loop further down would
    # then iterate over its characters.
    p_journals = list(df['plos_j1'].value_counts().index)
    print('Obtained journal list: ', len(journals))

    # The year difference does not depend on the journal pair, so compute it once for all rows.
    year_gap = df['plos_pub_year'] - df['ref_pub_year']

    for p_journ in p_journals:
        itc_dict[p_journ] = {}
        # Invariant over the reference-journal loop, so build it once per PLOS journal.
        plos_mask = df['plos_j1'] == p_journ
        count = 0
        for journ in journals:
            itc_dict[p_journ][journ] = {}
            # Rows for this (PLOS journal, reference journal) pair; the delta_t loop below only
            # has to scan this subset instead of the whole dataframe.
            pair_mask = plos_mask & (df['ref_j1'] == journ)
            pair_df = df[pair_mask]
            pair_gap = year_gap[pair_mask]
            for delta_t in year_span:
                sub_df = pair_df[pair_gap == delta_t]  # Change this query to what you need.
                hist_list_raw, count_totals, error_tracker, unique_papers_list, upper_bound, chisqs, p_vals = retrieve_first_analysis(sub_df)
                itc_dict[p_journ][journ][delta_t] = {'hist': hist_list_raw, 'occs': count_totals, 'error': error_tracker, 'unique': unique_papers_list, 'max': upper_bound, 'chi2': chisqs, 'pvals': p_vals}
                count += 1
            # Progress report: checked once per reference journal, printed when the running
            # number of analysed slices is a multiple of 50.
            if count % 50 == 0:
                print(count)
        print(p_journ)
        print('-----')
    return itc_dict


In [65]:
# Example run: reference journals with more than 5000 rows, year differences 1 through 10
journal_dict = mass_itc_retrieval_journal(ref_df, min_freq=5000, year_span=list(range(1, 11)))

('Obtained journal list: ', 318)
50
100
150
200
250
300
350
400
450
500
550
600
650
700
750
800
850
900
950
1000
1050
1100
1150
1200
1250
1300
1350
1400
1450
1500
1550
1600
1650
1700
1750
1800
1850
1900
1950
2000
2050
2100
2150
2200
2250
2300
2350
2400
2450
2500
2550
2600
2650
2700
2750
2800
2850
2900
2950
3000
3050
3100
3150
PLOS ONE
-----
50
100
150
200
250
300
350
400
450
500
550
600
650
700
750
800
850
900
950
1000
1050
1100
1150
1200
1250
1300
1350
1400
1450
1500
1550
1600
1650
1700
1750
1800
1850
1900
1950
2000
2050
2100
2150
2200
2250
2300
2350
2400
2450
2500
2550
2600
2650
2700
2750
2800
2850
2900
2950
3000
3050
3100
3150
PLOS GENET
-----
50
100
150
200
250
300
350
400
450
500
550
600
650
700
750
800
850
900
950
1000
1050
1100
1150
1200
1250
1300
1350
1400
1450
1500
1550
1600
1650
1700
1750
1800
1850
1900
1950
2000
2050
2100
2150
2200
2250
2300
2350
2400
2450
2500
2550
2600
2650
2700
2750
2800
2850
2900
2950
3000
3050
3100
3150
PLO NE TR D
-----
50
100
150
200
250
300
350
400
4

In [66]:
# ...and then save the nested results dictionary so it can be reloaded without rerunning the analysis.
# Pickle protocol 2 is the highest protocol that Python 2 can read and write.
with open('../../data/journal_1_to_10_at_least_5000.pkl', 'wb') as handle:
    pickle.dump(journal_dict, handle, protocol=2)