The purpose of this paper is to refine authors and sessions. 

Basically, i want to know how many papers, affs, etc are associated with each author. 

For sessions, i want to know how many papers are in that session. 

In [298]:
import pandas as pd 
import numpy as np 
from json import loads, dumps

## Papers

In [299]:
papers = pd.read_csv('../data/processed/papers.csv')
papers.columns = ['paper_id', 'title', 'type', 'abstract', 'number_of_authors', 'year', 'session', 'division', 'authors']
papers.drop(['authors'], inplace=True, axis = 1)
papers.head()

Unnamed: 0,paper_id,title,type,abstract,number_of_authors,year,session,division
0,2003-0001,Access to the Media Versus Access to Audiences...,Paper,When the issue of speakers' rights of access a...,1.0,2003,,
1,2003-0002,Accounting Episodes as Communicative Practice ...,Paper,In this paper I describe accounting episodes a...,1.0,2003,,
2,2003-0003,Accounts of Single-fatherhood: A case study,Paper,Abstract\nRelying on single-fathers accounts ...,4.0,2003,,
3,2003-0004,A Challenge to the Duel: Socializing Dedicated...,Paper,This paper explores the structural controls av...,1.0,2003,,
4,2003-0005,A chatroom ethnography: Evolution of community...,Paper,"In creating an ethnography about the City, Tex...",1.0,2003,,


In [300]:
paper_json = loads(papers.to_json(orient='records'))
paper_json[0]

{'paper_id': '2003-0001',
 'title': 'Access to the Media Versus Access to Audiences: The Distinction and its Implications for Media Regulation and Policy',
 'type': 'Paper',
 'abstract': 'When the issue of speakers\' rights of access arises in media regulation and policy contexts, the focus typically is on the concept of speakers\' rights of access "to the media," or "to the press." This right typically is premised on the audience\'s need for access to diverse sources and content. In contrast, in many non-mediated contexts, the concept of speakers\' rights of access frequently is defined in terms of the speaker\'s own First Amendment right of access to audiences. This paper explores the distinctions between these differing interpretations of a speaker\'s access rights and argues that the concept of a speaker\'s right of access to audiences merits a more prominent position in media regulation and policy. This paper then explores the implications of such a shift in perspective for media 

In [301]:
authors = pd.read_csv('../data/processed/authors.csv')
authors.head()

Unnamed: 0,Paper ID,Title,Number of Authors,Author Position,Author Name,Author Affiliation,Year
0,2003-0001,Access to the Media Versus Access to Audiences...,1,1,Philip Napoli,Fordham U,2003
1,2003-0002,Accounting Episodes as Communicative Practice ...,1,1,Mariko Kotani,Aoyama Gakuin University,2003
2,2003-0003,Accounts of Single-fatherhood: A case study,4,1,Tara M Emmers-Sommer,University of Arizona,2003
3,2003-0003,Accounts of Single-fatherhood: A case study,4,2,David Rhea,University of Arizona,2003
4,2003-0003,Accounts of Single-fatherhood: A case study,4,3,Laura Triplett,University of Arizona,2003


In [302]:
authorship_dict = {}
paperid_authors_dic = {}
# for every paper
for paper_id, group in authors.groupby('Paper ID'):
    paperid_authors_dic[paper_id] = list(group['Author Name'])
    authorships = []
    author_names = group['Author Name'].tolist()
    affs = group['Author Affiliation'].tolist()
    for i, author_name in enumerate(author_names):
        dic = {}
        dic['position'] = i
        dic['author_name'] = author_name 
        dic['author_affiliation'] = affs[i]
        authorships.append(dic)
    authorship_dict[paper_id] = authorships


In [303]:
authorship_dict['2003-0001']

[{'position': 0,
  'author_name': 'Philip Napoli',
  'author_affiliation': 'Fordham U'}]

In [304]:
for paper_dic in paper_json:
    try:
        paper_dic['authorships'] = authorship_dict[paper_dic['paper_id']]
        paper_dic['author_names'] = paperid_authors_dic[paper_dic['paper_id']]
    except:
        print(paper_dic)

{'paper_id': '2005-0806', 'title': 'Interpersonal and Intrapersonal Motives to Acquire Information from Mediated Messages', 'type': 'Paper', 'abstract': 'The present investigation explores the influences of interpersonal (intrinsic) and intrapersonal (extrinsic) motives on information acquisition from mediated messages, as well as the influences these motives may have on each other. Intrinsic and extrinsic motives were operationalized as personal interest and expectations of future relevant discussion, respectively. Respondents received a manipulation that elevated the expectation of discussing certain topics with unknown students and then viewed a newscast featuring these topics. Personal interest in and information acquisition of each message were assessed, along with anticipations of topical discussion with friends or family. Results showed that intrinsic and extrinsic interests related positively to information acquisition indicators for the relevant news stories. In addition, extr

In [305]:
paper_json[2000]

{'paper_id': '2004-1007',
 'title': 'Traveling Through Borders: SARS Coverage in a Chinese-Language Newspaper in the U.S.',
 'type': 'Paper',
 'abstract': 'A public crisis like SARS poses an immediate and serious threat to the lives and property and to the peace of mind of many. As the crisis unfolds, the mass media are called upon to keep both public officials and the general population abreast of the situation. This study focuses on news coverage of SARS, a public health crisis that had an enormous global impact. Drawing upon Media Dependency System theory, we examined a Chinese-language newspaper, a well read publication in the Chinese community in the U.S., with concerned readers about the happenings in their home countries where SARS originated. The findings suggested that the paper cited less health-related sources than health-related sources when reporting SARS stories. In addition, more government organizations were cited than regular individuals. There were only few stories th

## Author aggregated

In [306]:
dicts = []
for author_name, group in authors.groupby('Author Name'):
    # sort by year to make sure affs are in temporal order 
    group.sort_values('Year', ascending=True, inplace=True)
    paper_ids = group['Paper ID'].unique()
    affs = group['Author Affiliation'].unique()
    years = group['Year'].unique()
    dic = {
        'Author Name': author_name,
        'Year Count': len(years),
        'Affiliation Count': len(affs),
        'Paper Count': len(paper_ids),
        'Paper IDs': ", ".join(map(str, paper_ids)),
        'Years': ", ".join(map(str, years)),
        'Affiliations': " -> ".join(map(str, affs))
    }
    dicts.append(dic)

In [307]:
dicts[0:2]

[{'Author Name': '"Kathy" Feng Li',
  'Year Count': 1,
  'Affiliation Count': 1,
  'Paper Count': 1,
  'Paper IDs': '2005-1018',
  'Years': '2005',
  'Affiliations': 'U of Houston'},
 {'Author Name': '. Sunwolf',
  'Year Count': 1,
  'Affiliation Count': 1,
  'Paper Count': 1,
  'Paper IDs': '2011-0102',
  'Years': '2011',
  'Affiliations': 'Santa Clara U'}]

In [308]:
pd.DataFrame(dicts[0:100]).sort_values(by='Year Count', ascending=False)

Unnamed: 0,Author Name,Year Count,Affiliation Count,Paper Count,Paper IDs,Years,Affiliations
28,Aaron Shaw,7,5,12,"2011-0152, 2011-0393, 2012-1663, 2013-1676, 20...","2011, 2012, 2013, 2014, 2016, 2017, 2018","University of California, Berkeley -> U of Cal..."
88,Adam J. Saffer,6,5,11,"2012-0498, 2013-1492, 2015-0066, 2015-0067, 20...","2012, 2013, 2015, 2016, 2017, 2018",University of Oklahoma -> U of Oklahoma -> U o...
16,Aaron Castelan Cargile,5,1,8,"2007-0024, 2008-0887, 2008-1712, 2012-1021, 20...","2007, 2008, 2012, 2015, 2017",California State U
26,Aaron S. Veenstra,5,4,8,"2008-0380, 2010-0221, 2010-0993, 2013-0327, 20...","2008, 2010, 2013, 2015, 2018",U of Wisconsin - Madison -> Southern Illinois ...
6,A Susana Ramirez,4,3,4,"2008-0191, 2009-0744, 2011-0530, 2014-0930","2008, 2009, 2011, 2014",U of Pennsylvania -> Harvard University -> Uni...
...,...,...,...,...,...,...,...
39,Abby Leigh Prestin,1,1,1,2009-0491,2009,U of California - Santa Barbara
38,Abby Gail LeGrange,1,1,1,2007-0601,2007,U of Florida
36,Abbey Levenshus,1,1,1,2008-1301,2008,American U
34,Abbe S Depretis,1,1,1,2013-0333,2013,Christopher Newport U


## Work on sessions

In [309]:
# papers = pd.read_csv('../data/processed/papers.csv')
# papers.head()

In [310]:
# dic = {}
# for session, group in papers.dropna(subset=['Session']).groupby('Session'):
#     dic[session] = len(group)

In [311]:
# len(papers.Session.unique())

In [312]:
# dic['So Far, So Good, Part 2: Organizational Communication Research Escalator']

In [313]:
sessions = pd.read_csv('../data/processed/sessions.csv')
sessions.head()

Unnamed: 0,Year,Session Type,Session Title,Division/Unit,Chair Name,Chair Affiliation
0,2014,Paper Session,Meda Coverage of Health Issues,Health Communication,Xiaoli Nan,U of Maryland
1,2014,Paper Session,Cognition and Health,Health Communication,Seth M. Noar,U of North Carolina
2,2014,Paper Session,Changing the News 140 Characters At a Time: Tw...,Journalism Studies,Seth C. Lewis,U of Minnesota
3,2014,Paper Session,Media and Political Contestation in Greater China,Global Communication and Social Change,Guobin Yang,University of Pennsylvania
4,2014,Paper Session,Between Science and the Public: Studies in Sci...,Journalism Studies,Henrik Ornebring,Karlstad U


In [314]:
sessions.shape

(1859, 6)

So, clearly some sessions in papers are not present in the dataset of `sessions`. Let's address this. 

In [315]:
# sessions['Paper Count'] = sessions['Session Title'].apply(lambda x : dic[x])

## Add to papers

First, let's convert sessions to a dic and add info to papers. 

In [316]:
session_dic = {}
for session, group in sessions.groupby('Session Title'):
    dic = {}
    dic['session'] = session
    dic['session_type'] = group['Session Type'].tolist()[0]
    dic['chair_name'] = group['Chair Name'].tolist()[0]
    dic['chair_affiliation'] = group['Chair Affiliation'].tolist()[0]
    dic['division'] = group['Division/Unit'].tolist()[0]
    session_dic[session] = dic 

In [317]:
# session_dic

In [318]:
session_dic['Sports Communication Interactive Poster Session']

{'session': 'Sports Communication Interactive Poster Session',
 'session_type': 'Interactive Paper Session',
 'chair_name': nan,
 'chair_affiliation': nan,
 'division': 'In Event: ICA Plenary Interactive Paper/Poster Session II'}

In [319]:
for paper_dic in paper_json:
    try:
        paper_dic['session_info'] = session_dic[paper_dic['session']]
    except:
        pass

In [320]:
paper_json[-1]

{'paper_id': '2018-0255',
 'title': 'The Impact of Presenting Physiological Data During Sporting Events on Audiences Entertainment',
 'type': 'Poster',
 'abstract': 'Psychophysiological data has been useful in many domains and this study examines the use of such information in the domain of sports audiences. This study employs a four condition experiment in which participants watched a short sports clip displaying different physiological measures in the corner. The participants were then asked about their perceptions of the clip. Broadly, there was not much difference between groups based on the types of information presented, however, presenting blood pressure information proved to be the most entertaining for audiences. This provides early evidence that the presentation of physiological information during a sporting event can impact feelings of enjoyment, meaningfulness, and perceptions of knowledge of the sport. There is promise for these measures to be used in sports media provided

## Aggregated Session data

- division
- chair_name
- chair_affiliation
- type
- paper_count
- years

In [321]:
papers.head()

Unnamed: 0,paper_id,title,type,abstract,number_of_authors,year,session,division
0,2003-0001,Access to the Media Versus Access to Audiences...,Paper,When the issue of speakers' rights of access a...,1.0,2003,,
1,2003-0002,Accounting Episodes as Communicative Practice ...,Paper,In this paper I describe accounting episodes a...,1.0,2003,,
2,2003-0003,Accounts of Single-fatherhood: A case study,Paper,Abstract\nRelying on single-fathers accounts ...,4.0,2003,,
3,2003-0004,A Challenge to the Duel: Socializing Dedicated...,Paper,This paper explores the structural controls av...,1.0,2003,,
4,2003-0005,A chatroom ethnography: Evolution of community...,Paper,"In creating an ethnography about the City, Tex...",1.0,2003,,


In [322]:
papers[papers.session=='"...And Communications for All:" A Policy Agenda for the New Administration']

Unnamed: 0,paper_id,title,type,abstract,number_of_authors,year,session,division
8118,2009-0069,Americas Forgotten Challenge: Rural Access,Session Paper,Roughly 17 percent of the U.S. population live...,1.0,2009,"""...And Communications for All:"" A Policy Agen...",Communication Law & Policy
8368,2009-0319,Creating a Media Policy Agenda for the Digital...,Session Paper,The rapid growth of the Internet and prolifera...,1.0,2009,"""...And Communications for All:"" A Policy Agen...",Communication Law & Policy
8429,2009-0380,"Digital Media, Modern Democracy, and Our Trunc...",Session Paper,Policy makers and regulators in the new admini...,1.0,2009,"""...And Communications for All:"" A Policy Agen...",Communication Law & Policy
8814,2009-0765,International Benchmarks: The Crisis in U.S. C...,Session Paper,"This presentation, will set out by describing ...",1.0,2009,"""...And Communications for All:"" A Policy Agen...",Communication Law & Policy
9258,2009-1209,Public Scholarship and the Communications Poli...,Session Paper,Communications has been at the heart of our sp...,1.0,2009,"""...And Communications for All:"" A Policy Agen...",Communication Law & Policy
9320,2009-1271,Rethinking the Media Ownership Policy Agenda,Session Paper,Media ownership is a policy issue with a long ...,1.0,2009,"""...And Communications for All:"" A Policy Agen...",Communication Law & Policy


In [323]:
'"...And Communications for All:" A Policy Agenda for the New Administration' in session_dic

False

In [324]:
for session, group in papers.groupby('session'):
    # groupby excludes rows with nan values
    if session in session_dic:
        session_dic[session]['years'] = list(group.year.unique())
        session_dic[session]['paper_count'] = len(group)
    else:
        dic = {}
        dic['session'] = session
        dic['years'] = list(group.year.unique())
        dic['paper_count'] = len(group)
        dic['session_type'] = np.nan 
        dic['chair_name'] = np.nan 
        dic['chair_affiliation'] = np.nan
        try:
            dic['division'] = group.division
        except:
            dic['division'] = np.nan
        session_dic[session] = dic

In [327]:
list(session_dic.values())

[{'session': '"Let\'s Research It All!" New Approaches for Video Games and Their Effects',
  'session_type': 'Paper Session',
  'chair_name': 'Johannes Breuer',
  'chair_affiliation': 'GESIS – Leibniz-Institute for the Social Sciences',
  'division': 'Game Studies',
  'years': [2017],
  'paper_count': 5},
 {'session': '#SocialSports: Digital Media Technologies and Sports Communication',
  'session_type': 'Paper Session',
  'chair_name': nan,
  'chair_affiliation': nan,
  'division': 'Sports Communication',
  'years': [2017],
  'paper_count': 4},
 {'session': "(Don't) Be So Emotional: Athletes, Professors, and Other Publics",
  'session_type': 'Paper Session',
  'chair_name': 'Vilma L. Luoma-aho',
  'chair_affiliation': 'University of Jyvaskyla',
  'division': 'Public Relations',
  'years': [2017],
  'paper_count': 5},
 {'session': '@Journalists on #Twitter',
  'session_type': 'Paper Session',
  'chair_name': 'Shannon C McGregor',
  'chair_affiliation': 'University of Utah',
  'division