In [29]:
import pandas as pd
import locale
import numpy as np

In [30]:
# Citation-level records; the first CSV column is used as the row index.
df = pd.read_csv(
    "./data/citations_data/thinktank_citations_tdm.csv",
    index_col=0,
)
# Per-think-tank features (budget, founding year and ideology columns are
# looked up from this frame by the helper functions below).
tt_df = pd.read_csv(
    "./data/thinktank_data/thinktank_features.csv",
    index_col=0,
)

In [31]:
# Shape of the array of distinct values in the 'thinktank' column,
# i.e. a 1-tuple holding how many different think tanks appear in df.
df['thinktank'].unique().shape

(182,)

In [32]:
# Derive the calendar year of each citation from its 'date' string.
# The whole column is parsed in one vectorized call instead of calling
# pd.to_datetime once per row, which is far slower on large frames.
df['year'] = pd.to_datetime(df['date']).dt.year

In [33]:
# One row per (think tank, year, publication); num_cites is the number of
# non-null titles in that group.
panel = (
    df.groupby(['thinktank', 'year', 'pub'])
    .agg(num_cites=('title', 'count'))
    .reset_index()
)

In [34]:
# Relabel the '38 North' feature row as 'Henry L. Stimson Center'
# (presumably so it matches the name used in the citation data - TODO confirm).
tt_df.loc[tt_df['thinktank'] == '38 North', 'thinktank'] = 'Henry L. Stimson Center'

In [35]:
def add_budgets(tt, year):
    """Return a think tank's expenses for a year, in millions of dollars.

    Looks up column ``exp_<year>`` of the module-level ``tt_df`` for the single
    row whose ``thinktank`` equals ``tt``.

    Args:
        tt: Think tank name as it appears in ``tt_df.thinktank``.
        year: Year used to build the ``exp_<year>`` column name.

    Returns:
        float: Whole-dollar expenses divided by 1,000,000, or ``0.0`` when the
        cell is missing.

    Raises:
        ValueError: If ``tt`` does not match exactly one row (from ``.item()``).
        KeyError: If ``tt_df`` has no ``exp_<year>`` column.

    NOTE(review): a missing budget is reported as 0.0 rather than NaN, so
    downstream code cannot tell "unknown" from "zero" - confirm this is intended.
    """
    raw = tt_df.loc[tt_df.thinktank == tt, f'exp_{year}'].item()
    if pd.isna(raw):
        return 0.0
    if isinstance(raw, str):
        # Formatted strings such as "4,878,999.00": strip thousands separators
        # and drop the fractional part.
        whole_dollars = float(raw.replace(",", "").split(".")[0])
    else:
        # The column may have been parsed as numeric by read_csv; truncate
        # toward zero so the result matches the string branch.
        whole_dollars = float(int(raw))
    return whole_dollars / 1000000

In [36]:
def add_age(tt, year):
    """Return how many years separate ``year`` from the think tank's founding.

    The founding year is read from the ``year_established`` column of the
    module-level ``tt_df`` for the single row whose ``thinktank`` equals ``tt``;
    ``.item()`` raises ValueError if there is not exactly one such row.
    """
    is_match = tt_df.thinktank == tt
    founded = tt_df.loc[is_match, 'year_established'].item()
    age = year - founded
    return age

In [37]:
# def add_lib_or_cons(tt):
#     cons = tt_df[tt_df.thinktank==tt]['conservative'].item()
#     lib = tt_df[tt_df.thinktank==tt]['libertarian'].item()
#     if not lib:
#         return int(cons)
#     else:
#         return int(max(cons, lib))

In [38]:
def add_lib_or_cons(tt):
    """Return the ``(conservative, libertarian)`` values for a think tank.

    Both values come from the module-level ``tt_df`` row whose ``thinktank``
    equals ``tt``; ``.item()`` raises ValueError unless exactly one row matches.
    """
    record = tt_df.loc[tt_df.thinktank == tt]
    return tuple(record[flag].item() for flag in ('conservative', 'libertarian'))

In [39]:
# Attach per-row covariates by looking each (think tank, year) up in tt_df.
panel['expenses'] = panel.apply(
    lambda row: add_budgets(row['thinktank'], row['year']), axis=1
)
panel['age'] = panel.apply(
    lambda row: add_age(row['thinktank'], row['year']), axis=1
)
#panel['con_lib'] = panel.apply(lambda x: add_lib_or_cons(x.thinktank), axis=1)

In [40]:
# One (conservative, libertarian) tuple per panel row.
con_lib_info = panel.apply(lambda row: add_lib_or_cons(row['thinktank']), axis=1)
# Split the tuple into its two components.
panel['conservative'] = con_lib_info.map(lambda flags: flags[0])
panel['libertarian'] = con_lib_info.map(lambda flags: flags[1])
# Combined flag: the larger of the two values.
panel['cons_lib'] = con_lib_info.map(lambda flags: max(flags[0], flags[1]))

In [41]:
# for i in tt_df.columns:
#     if 'exp' in i:
#         print(i)
#         mean = tt_df[i].mean()
#         std = tt_df[i].std()
#         count = tt_df[i].notna().sum()
#         null = tt_df[i].isna().sum()
#         mini, maxi = tt_df[i].min(), tt_df[i].max()
#         print(mean, std, count, null, mini, maxi)

In [42]:
# Persist the master panel; the row index is written as the first CSV column.
panel.to_csv("./data/master_panel.csv", index=True)

# Making Topic-Specific Panels

In [113]:
# Reload the master panel from disk. The file was written with its row index
# as the first column, so read that column back as the index (as the topic
# panel reads do) instead of letting it appear as a spurious "Unnamed: 0" column.
panel = pd.read_csv("./data/master_panel.csv", index_col=0)
# 'expenses' is stored in millions of dollars (add_budgets divides by
# 1,000,000), so this keeps only rows with expenses below $1 billion.
f_p = panel[panel.expenses < 1000]

In [84]:
# Preview the first rows of the citation-level frame, including the derived
# 'year' column and the per-topic keyword indicator columns.
df.head()

Unnamed: 0,file_path,title,date,sentence,pub,thinktank,sentence_clean,year,international_mention,econ_keyword,edu_keyword,health_keyword,pol_keyword
0,1906329834.xml,Trump Treads Into Feud Between Qatar and Saudis,2017-06-07,"""</p><p>Others analysts were more critical, sa...",NYT,Brookings Institution,"""Others analysts were more critical, saying th...",2017,Saudi Arabia,0,0,0,0
1,1928961341.xml,"Hartford, With Finances in Disarray, Veers Tow...",2017-08-16,"""</p><p>Connecticut has the greatest degree of...",NYT,Lincoln Institute of Land Policy,"""Connecticut has the greatest degree of income...",2017,United Kingdom,1,0,0,0
2,1785134438.xml,Biden Makes a Surprise Visit to Iraq,2016-04-29,"""Iraqi leaders can't afford to lose that sense...",NYT,Atlantic Council,"""Iraqi leaders can't afford to lose that sense...",2016,Iraq,0,0,0,0
3,1875295284.xml,Journalism or Propaganda? A Russian TV Network...,2017-03-09,"""</p><p>RT and Sputnik propel those campaigns ...",NYT,Atlantic Council,"""RT and Sputnik propel those campaigns by help...",2017,,0,0,0,0
4,1875295284.xml,Journalism or Propaganda? A Russian TV Network...,2017-03-09,"""I strongly suspect that RT Deutsch has a triv...",NYT,Council on Foreign Relations,"""I strongly suspect that RT Deutsch has a triv...",2017,Russian Federation,0,0,0,0


In [85]:
# Build and save one (think tank, year, publication) citation panel per topic.
for topic in ['econ', 'edu', 'health', 'pol']:
    # Keep only citations whose indicator column for this topic equals 1.
    filt_df = df[df[f"{topic}_keyword"] == 1]
    topic_panel = (
        filt_df.groupby(['thinktank', 'year', 'pub'])[['title']]
        .count()
        .rename(columns={'title': 'num_cites'})
        .reset_index()
    )
    topic_panel['expenses'] = topic_panel.apply(lambda x: add_budgets(x.thinktank, x.year), axis=1)
    topic_panel['age'] = topic_panel.apply(lambda x: add_age(x.thinktank, x.year), axis=1)
    # add_lib_or_cons returns a (conservative, libertarian) tuple. Storing the
    # tuple directly would write strings like "(1, 0)" to the CSV, so collapse
    # it to a single integer flag: the larger of the two values.
    topic_panel['con_lib'] = topic_panel.apply(
        lambda x: int(max(add_lib_or_cons(x.thinktank))), axis=1
    )
    topic_panel.to_csv(f"./data/panels/{topic}_panel.csv")

In [88]:
# Reload the four topic panels, reading the first CSV column back as the index.
panel_paths = (
    "./data/panels/econ_panel.csv",
    "./data/panels/edu_panel.csv",
    "./data/panels/health_panel.csv",
    "./data/panels/pol_panel.csv",
)
econ_panel, edu_panel, health_panel, pol_panel = (
    pd.read_csv(path, index_col=0) for path in panel_paths
)

In [89]:
# Display the economics-topic panel. The previous text, `econ_panel.sha`, is
# not a DataFrame attribute and raises AttributeError on a fresh run.
econ_panel

Unnamed: 0,thinktank,year,pub,num_cites,expenses,age,con_lib
0,American Action Forum,2014,WSJ,3,4.878999,5,1
1,American Action Forum,2015,NYT,5,5.145811,6,1
2,American Action Forum,2015,WP,2,5.145811,6,1
3,American Action Forum,2015,WSJ,8,5.145811,6,1
4,American Action Forum,2016,NYT,6,5.351985,7,1
...,...,...,...,...,...,...,...
840,World Resources Institute,2016,NYT,1,89.823844,34,0
841,World Resources Institute,2016,WSJ,1,89.823844,34,0
842,World Resources Institute,2017,WP,1,94.564842,35,0
843,World Resources Institute,2018,NYT,2,104.038407,36,0


In [90]:
# Display the health-topic panel loaded from ./data/panels/health_panel.csv.
health_panel

Unnamed: 0,thinktank,year,pub,num_cites,expenses,age,con_lib
0,American Action Forum,2014,WSJ,1,4.878999,5,1
1,American Action Forum,2015,NYT,1,5.145811,6,1
2,American Action Forum,2015,WP,1,5.145811,6,1
3,American Action Forum,2017,WP,1,4.863097,8,1
4,American Action Forum,2018,NYT,1,3.942490,9,1
...,...,...,...,...,...,...,...
305,Woodrow Wilson International Center for Scholars,2018,WP,1,24.542658,50,0
306,World Resources Institute,2014,WSJ,1,65.763955,32,0
307,World Resources Institute,2015,NYT,1,77.539897,33,0
308,World Resources Institute,2016,NYT,1,89.823844,34,0


In [91]:
# Display the education-topic panel loaded from ./data/panels/edu_panel.csv.
edu_panel

Unnamed: 0,thinktank,year,pub,num_cites,expenses,age,con_lib
0,American Action Forum,2014,NYT,1,4.878999,5,1
1,American Action Forum,2015,NYT,1,5.145811,6,1
2,American Action Forum,2015,WSJ,2,5.145811,6,1
3,American Action Forum,2016,NYT,1,5.351985,7,1
4,American Action Forum,2016,WP,3,5.351985,7,1
...,...,...,...,...,...,...,...
739,World Resources Institute,2015,NYT,1,77.539897,33,0
740,World Resources Institute,2015,WP,1,77.539897,33,0
741,World Resources Institute,2017,WSJ,2,94.564842,35,0
742,World Resources Institute,2018,NYT,1,104.038407,36,0


In [92]:
# Display the 'pol'-topic panel loaded from ./data/panels/pol_panel.csv.
pol_panel

Unnamed: 0,thinktank,year,pub,num_cites,expenses,age,con_lib
0,American Action Forum,2014,NYT,1,4.878999,5,1
1,American Action Forum,2014,WP,2,4.878999,5,1
2,American Action Forum,2014,WSJ,7,4.878999,5,1
3,American Action Forum,2015,NYT,5,5.145811,6,1
4,American Action Forum,2015,WP,2,5.145811,6,1
...,...,...,...,...,...,...,...
858,World Resources Institute,2016,NYT,2,89.823844,34,0
859,World Resources Institute,2016,WP,1,89.823844,34,0
860,World Resources Institute,2017,NYT,2,94.564842,35,0
861,World Resources Institute,2017,WP,2,94.564842,35,0
