In [29]:
import pandas as pd
import locale
import numpy as np

In [30]:
# Load the tagged citation records; the first CSV column becomes the row index.
citations_df = pd.read_csv(
    "../data/citations_data/thinktank_citations_tagged.csv",
    index_col=0,
)
# Load the think tank feature table the same way.
thinktank_df = pd.read_csv(
    "../data/thinktank_data/thinktank_features.csv",
    index_col=0,
)

In [31]:
# shape of the array of distinct think tank names found in the citation data
citations_df.loc[:, 'thinktank'].unique().shape

(182,)

In [32]:
# Derive the publication year from each citation's date, parsing one value at a time.
citations_df['year'] = citations_df['date'].map(
    lambda raw_date: pd.to_datetime(raw_date).year
)

In [33]:
# Collapse the citation records into a think tank x year x publication panel.
# num_cites: count of non-null titles within each (thinktank, year, pub) group.
panel = (
    citations_df
    .groupby(['thinktank', 'year', 'pub'])[['title']]
    .count()
    .rename(columns={'title': 'num_cites'})
    .reset_index()
)

In [1]:
def add_budgets(tt, year):
    """Return the budget of think tank `tt` in `year`, divided by 1,000,000.

    The value is read from the `exp_<year>` column of the module-level
    `thinktank_df`, on the row whose `thinktank` equals `tt`.

    Parameters
    ----------
    tt : str
        Think tank name, matched against `thinktank_df['thinktank']`.
    year : int or integral float
        Year of publication. It is cast to int so that a float year such as
        2015.0 still resolves to the column `exp_2015`.

    Returns
    -------
    float
        The stored figure divided by 1,000,000, or 0.0 when the figure is missing.

    Raises
    ------
    KeyError
        If `thinktank_df` has no `exp_<year>` column.
    ValueError
        If `tt` does not match exactly one row of `thinktank_df`.
    """
    column = f'exp_{int(year)}'
    matches = thinktank_df.loc[thinktank_df['thinktank'] == tt, column]
    if len(matches) != 1:
        raise ValueError(
            f"expected exactly one row for think tank {tt!r} in thinktank_df, "
            f"found {len(matches)}"
        )

    raw = matches.iloc[0]
    if pd.isna(raw):
        # missing budget figures are reported as 0.0, as callers expect
        return 0.0

    if isinstance(raw, str):
        # strings look like "1,234,567.00": drop thousands separators and
        # keep only the part before the decimal point
        expenses = float(raw.replace(",", "").split(".")[0])
    else:
        # read_csv yields numbers when a column has no separators;
        # truncate the same way the string branch does
        expenses = float(int(raw))
    return expenses / 1000000

In [2]:
def add_age(tt, year):
    """Return `year` minus the `year_established` of think tank `tt`.

    The founding year is read from the single row of the module-level
    `thinktank_df` whose `thinktank` equals `tt`; `.item()` raises
    ValueError when the match is not exactly one row.
    """
    is_match = thinktank_df['thinktank'] == tt
    founded = thinktank_df.loc[is_match, 'year_established'].item()
    return year - founded

In [37]:
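# NOTE: earlier draft of add_lib_or_cons, kept commented out for reference only.
# It reads from `tt_df`, a name not defined in the visible cells of this notebook,
# so it would fail if uncommented as-is.
# def add_lib_or_cons(tt):
#     cons = tt_df[tt_df.thinktank==tt]['conservative'].item()
#     lib = tt_df[tt_df.thinktank==tt]['libertarian'].item()
#     if not lib:
#         return int(cons)
#     else:
#         return int(max(cons, lib))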

In [3]:
def add_lib_or_cons(tt):
    """Return the `(conservative, libertarian)` values for think tank `tt`.

    Both values come from the single row of the module-level `thinktank_df`
    whose `thinktank` equals `tt`; `.item()` raises ValueError when the
    match is not exactly one row.
    """
    row = thinktank_df[thinktank_df['thinktank'] == tt]
    return tuple(row[flag].item() for flag in ('conservative', 'libertarian'))

In [39]:
# expenses: the think tank's budget for the row's year, as returned by add_budgets
panel['expenses'] = panel.apply(
    lambda row: add_budgets(row['thinktank'], row['year']),
    axis=1,
)

# age: the think tank's age in the row's year, as returned by add_age
panel['age'] = panel.apply(
    lambda row: add_age(row['thinktank'], row['year']),
    axis=1,
)

In [40]:
# one (conservative, libertarian) tuple per panel row
con_lib_info = panel.apply(
    lambda row: add_lib_or_cons(row['thinktank']),
    axis=1,
)
# conservative: first element of the tuple (1 if the think tank is conservative, 0 else)
panel['conservative'] = con_lib_info.apply(lambda flags: flags[0])
# libertarian: second element of the tuple (1 if the think tank is libertarian, 0 else)
panel['libertarian'] = con_lib_info.apply(lambda flags: flags[1])
# cons_lib: 1 if the think tank is conservative or libertarian, 0 else
panel['cons_lib'] = con_lib_info.apply(lambda flags: max(flags))

In [42]:
# Write the master panel to disk; the row index is written as the first CSV column.
panel.to_csv("../data/master_panel.csv", index=True)

# Making Topic-Specific Panels

In [113]:
# Reload the master panel from the same relative path it was written to
# ("../data/master_panel.csv"). The first CSV column is the saved row index,
# so restore it instead of keeping it as an extra "Unnamed: 0" column.
panel = pd.read_csv("../data/master_panel.csv", index_col=0)

In [85]:
# Build one think tank x year x publication panel per topic area and save each to CSV.
for topic in ['econ', 'edu', 'health', 'pol']:
    # keep only the citations whose "<topic>_keyword" column equals 1
    filt_df = citations_df[citations_df[f"{topic}_keyword"] == 1]

    # num_cites: count of non-null titles within each (thinktank, year, pub) group
    topic_panel = (
        filt_df.groupby(['thinktank', 'year', 'pub'])[['title']]
        .count()
        .rename(columns={'title': 'num_cites'})
        .reset_index()
    )

    # adding expenses as feature to topic panel
    topic_panel['expenses'] = topic_panel.apply(lambda x: add_budgets(x.thinktank, x.year), axis=1)

    # adding age as feature to topic panel
    topic_panel['age'] = topic_panel.apply(lambda x: add_age(x.thinktank, x.year), axis=1)

    # adding indicator = 1 if think tank in topic panel is conservative or libertarian, 0 else.
    # add_lib_or_cons returns a (conservative, libertarian) tuple, so reduce it with max
    # (as the master panel's cons_lib column does) instead of storing the tuple itself.
    topic_panel['con_lib'] = topic_panel.apply(lambda x: max(add_lib_or_cons(x.thinktank)), axis=1)

    # saving topic panel
    topic_panel.to_csv(f"../data/panels/{topic}_panel.csv")

In [88]:
# Reload the four saved topic panels, restoring the first CSV column as the index.
econ_panel, edu_panel, health_panel, pol_panel = (
    pd.read_csv(csv_path, index_col=0)
    for csv_path in (
        "../data/panels/econ_panel.csv",
        "../data/panels/edu_panel.csv",
        "../data/panels/health_panel.csv",
        "../data/panels/pol_panel.csv",
    )
)