In [74]:
import pandas as pd
import locale
import numpy as np

In [75]:
# Citation records; the first CSV column is used as the index.
df = pd.read_csv(
    "./data/citations_data/thinktank_citations_tdm.csv",
    index_col=0,
)
# Per-think-tank features (expenses by year, founding year, ideology flags).
tt_df = pd.read_csv(
    "./data/thinktank_data/thinktank_features.csv",
    index_col=0,
)

In [76]:
# Parse the whole 'date' column once and take the calendar year.
# Vectorized parsing replaces a per-row pd.to_datetime call, which re-ran the
# parser for every cell. Note: recent pandas returns int32 for .dt.year.
df['year'] = pd.to_datetime(df['date']).dt.year

In [77]:
# One row per (thinktank, year, pub) with the number of non-null titles,
# stored as 'num_cites'. Combinations with no citations produce no row.
panel = (
    df.groupby(['thinktank', 'year', 'pub'])['title']
    .count()
    .rename('num_cites')
    .reset_index()
)

In [78]:
# Display the citation-level frame, now including the derived 'year' column
# (pandas truncates the rendering of a long frame to its first and last rows).
df

Unnamed: 0,file_path,title,date,sentence,pub,thinktank,year
0,1906329834.xml,Trump Treads Into Feud Between Qatar and Saudis,2017-06-07,"""</p><p>Others analysts were more critical, sa...",NYT,Brookings Institution,2017
1,1928961341.xml,"Hartford, With Finances in Disarray, Veers Tow...",2017-08-16,"""</p><p>Connecticut has the greatest degree of...",NYT,Lincoln Institute of Land Policy,2017
2,1785134438.xml,Biden Makes a Surprise Visit to Iraq,2016-04-29,"""Iraqi leaders can't afford to lose that sense...",NYT,Atlantic Council,2016
3,1875295284.xml,Journalism or Propaganda? A Russian TV Network...,2017-03-09,"""</p><p>RT and Sputnik propel those campaigns ...",NYT,Atlantic Council,2017
4,1875295284.xml,Journalism or Propaganda? A Russian TV Network...,2017-03-09,"""I strongly suspect that RT Deutsch has a triv...",NYT,Council on Foreign Relations,2017
...,...,...,...,...,...,...,...
22720,1609088128.xml,And you think your schedule's tight . . .,2014-10-08,A 2010 Pew Research Center poll showed that on...,WP,Pew Research Center,2014
22721,1902589409.xml,"In Europe, Trump continues tout Saudi trip'suc...",2017-05-27,<html><head><meta name='ValidationSchema' cont...,WP,RMI,2017
22722,1902589409.xml,"In Europe, Trump continues tout Saudi trip'suc...",2017-05-27,He noted in particular the most expensive item...,WP,Middle East Institute,2017
22723,1732902263.xml,IAAF bans Russiantrack team,2015-11-14,Among the report's recommendations was a suspe...,WP,American Action Forum,2015


In [79]:
def add_budgets(tt, year, features=None):
    """Return a think tank's expenses for ``year``, divided by one million.

    Parameters
    ----------
    tt : str
        Think tank name, matched exactly against the ``thinktank`` column.
    year : int
        Year whose ``exp_<year>`` column is read.
    features : pandas.DataFrame, optional
        Table holding ``thinktank`` and ``exp_<year>`` columns. Defaults to
        the notebook-level ``tt_df`` so existing two-argument calls still work.

    Returns
    -------
    float
        The expense string with thousands separators removed and any
        fractional part truncated, divided by 1,000,000 (presumably dollars
        to millions; confirm units against the source CSV). Returns 0.0
        when the value is missing.

    Raises
    ------
    ValueError
        If ``tt`` does not match exactly one row of ``features``.
    KeyError
        If there is no ``exp_<year>`` column.
    """
    if features is None:
        features = tt_df
    matches = features.loc[features['thinktank'] == tt, f'exp_{year}']
    if len(matches) != 1:
        # Series.item() would raise a ValueError that does not say which
        # think tank caused it; fail with the same type but a usable message.
        raise ValueError(
            f"expected exactly one features row for think tank {tt!r}, "
            f"found {len(matches)}"
        )
    expenses_str = matches.item()
    if pd.isna(expenses_str):
        # NOTE(review): a missing budget is reported as 0.0, which is
        # indistinguishable from a true zero; confirm this is intended.
        return 0.0
    whole_amount = expenses_str.replace(",", "").split(".")[0]
    return float(whole_amount) / 1_000_000

In [80]:
def add_age(tt, year, features=None):
    """Return ``year`` minus the think tank's ``year_established`` value.

    Parameters
    ----------
    tt : str
        Think tank name, matched exactly against the ``thinktank`` column.
    year : int
        Observation year.
    features : pandas.DataFrame, optional
        Table holding ``thinktank`` and ``year_established`` columns.
        Defaults to the notebook-level ``tt_df`` so existing two-argument
        calls still work.

    Raises
    ------
    ValueError
        If ``tt`` does not match exactly one row of ``features``.
    """
    if features is None:
        features = tt_df
    matches = features.loc[features['thinktank'] == tt, 'year_established']
    if len(matches) != 1:
        # Same exception type Series.item() would raise, but naming the
        # think tank that could not be resolved.
        raise ValueError(
            f"expected exactly one features row for think tank {tt!r}, "
            f"found {len(matches)}"
        )
    return year - matches.item()

In [81]:
def add_lib_or_cons(tt, features=None):
    """Return an int flag combining the ``conservative`` and ``libertarian`` columns.

    When the ``libertarian`` value is falsy the ``conservative`` value is
    returned; otherwise the larger of the two values is returned. The result
    is always cast to ``int``.

    Parameters
    ----------
    tt : str
        Think tank name, matched exactly against the ``thinktank`` column.
    features : pandas.DataFrame, optional
        Table holding ``thinktank``, ``conservative`` and ``libertarian``
        columns. Defaults to the notebook-level ``tt_df`` so existing
        one-argument calls still work.

    Raises
    ------
    ValueError
        If ``tt`` does not match exactly one row of ``features``.
    """
    if features is None:
        features = tt_df
    # Filter once and read both flags from the same row.
    row = features[features['thinktank'] == tt]
    if len(row) != 1:
        raise ValueError(
            f"expected exactly one features row for think tank {tt!r}, "
            f"found {len(row)}"
        )
    cons = row['conservative'].item()
    lib = row['libertarian'].item()
    if not lib:
        return int(cons)
    return int(max(cons, lib))

In [82]:
# Attach think-tank level covariates to every panel row.
# NOTE: add_budgets expects the raw string expense columns, so this cell must
# run before tt_df's exp_* columns are converted to floats.
panel['expenses'] = [
    add_budgets(tank, yr) for tank, yr in zip(panel['thinktank'], panel['year'])
]
panel['age'] = [
    add_age(tank, yr) for tank, yr in zip(panel['thinktank'], panel['year'])
]
panel['con_lib'] = panel['thinktank'].map(add_lib_or_cons)

In [83]:
def str_to_float(x):
    """Convert an expense string to a float divided by one million.

    Thousands separators are removed and anything after the first '.' is
    dropped. Missing values yield None.
    """
    if pd.isna(x):
        return None
    whole_amount = x.replace(",", "").split(".")[0]
    return float(whole_amount) / 1000000


# Convert the yearly expense columns of tt_df in place (string -> float).
# This is not idempotent: re-running it on already-converted floats fails.
for expense_col in ('exp_2014', 'exp_2015', 'exp_2016', 'exp_2017', 'exp_2018'):
    tt_df[expense_col] = tt_df[expense_col].apply(str_to_float)

In [84]:
# Print summary statistics for every tt_df column whose name contains 'exp':
# mean, std, non-null count, null count, min, max.
expense_columns = [name for name in tt_df.columns if 'exp' in name]
for name in expense_columns:
    values = tt_df[name]
    print(name)
    print(
        values.mean(),
        values.std(),
        values.notna().sum(),
        values.isna().sum(),
        values.min(),
        values.max(),
    )

exp_2014
53.3569416631579 360.87658626806603 190 38 0.0 4870.339928
exp_2015
54.25438273298428 356.08665419705795 191 37 0.0 4809.182027
exp_2016
55.49112346874998 362.6929277560591 192 36 0.0 4910.674794
exp_2017
56.93500014062499 367.1928235791833 192 36 0.0 4957.938735
exp_2018
58.196963567708345 373.6410025040496 192 36 0.0 5047.673413


In [22]:
# One 0/1 indicator column per panel year (is_2014 ... is_2018).
for dummy_year in (2014, 2015, 2016, 2017, 2018):
    panel[f'is_{dummy_year}'] = (panel['year'] == dummy_year).astype('int64')

In [24]:
# Interaction terms: year indicator multiplied by the con_lib flag
# (event_14 ... event_18).
for short_year in ('14', '15', '16', '17', '18'):
    panel[f'event_{short_year}'] = panel[f'is_20{short_year}'] * panel['con_lib']

In [63]:
# Row-wise total of the five interaction terms.
x = (
    panel['event_14']
    + panel['event_15']
    + panel['event_16']
    + panel['event_17']
    + panel['event_18']
)

In [61]:
# Stack the interaction terms as rows: shape (5, n_rows).
# A plain ndarray is used instead of np.matrix, which NumPy no longer
# recommends; np.corrcoef, np.linalg.matrix_rank and .shape work unchanged.
mat = panel[['event_14', 'event_15', 'event_16', 'event_17', 'event_18']].to_numpy().T

In [56]:
# Pairwise correlations between the rows of mat (each row is one variable).
np.corrcoef(mat, rowvar=True)

array([[ 1.        , -0.04743397, -0.04645496, -0.04579286, -0.04199878],
       [-0.04743397,  1.        , -0.04957973, -0.04887308, -0.04482381],
       [-0.04645496, -0.04957973,  1.        , -0.04786437, -0.04389867],
       [-0.04579286, -0.04887308, -0.04786437,  1.        , -0.043273  ],
       [-0.04199878, -0.04482381, -0.04389867, -0.043273  ,  1.        ]])

In [49]:
# True only when the rows of mat are linearly independent (full row rank).
mat.shape[0] == np.linalg.matrix_rank(mat)

False

In [39]:
# Correlation between the 2018 interaction term and the plain 2018 indicator.
np.corrcoef(x=panel['event_18'], y=panel['is_2018'])

array([[1.        , 0.41185236],
       [0.41185236, 1.        ]])

In [None]:
# The original line assigned panel['event'] to itself; that column is never
# created in the visible cells, so it raised KeyError on a fresh kernel.
# NOTE(review): assumed intent is the row-wise total of the yearly
# interaction terms (the same quantity as the sum computed for `x`) - confirm.
panel['event'] = panel[['event_14', 'event_15', 'event_16', 'event_17', 'event_18']].sum(axis=1)

In [21]:
#panel.to_csv("./data/master_panel.csv")