This notebook is a scratchpad for debugging `generate_json_and_aggreated_data.py`.

In [28]:
import pandas as pd 
import json 
import numpy as np 

def process_papers_df(PAPERS_DF):
    """Load the papers CSV and return it with normalised column names.

    The columns are renamed purely by position to a fixed list of nine
    names (pandas rejects the assignment if the CSV does not have exactly
    nine columns), and the last one, ``authors``, is then discarded.

    Args:
        PAPERS_DF: Path or file-like object accepted by ``pd.read_csv``.

    Returns:
        A DataFrame with the columns ``paper_id``, ``title``,
        ``paper_type``, ``abstract``, ``number_of_authors``, ``year``,
        ``session`` and ``division``.
    """
    column_names = [
        'paper_id', 'title', 'paper_type', 'abstract',
        'number_of_authors', 'year', 'session', 'division', 'authors',
    ]
    frame = pd.read_csv(PAPERS_DF)
    frame.columns = column_names
    return frame.drop(columns=['authors'])

def get_authorships_dic_and_paperid_authors_dic(AUTHORS_DF):
    """Build two per-paper author lookups from the authors CSV.

    The CSV is grouped by its ``Paper ID`` column; the ``Author Name`` and
    ``Author Affiliation`` columns are read for every group.

    Args:
        AUTHORS_DF: Path or file-like object accepted by ``pd.read_csv``.

    Returns:
        A tuple ``(authorships_dic, paperid_authors_dic)``:

        * ``authorships_dic`` maps a paper id to a list with one dict per
          author row, in row order, where ``position`` is the 0-based
          index of the row within the paper. Example value::

              [{'position': 0, 'author_name': 'Åsa Kroon',
                'author_affiliation': 'Örebro U'},
               {'position': 1, 'author_name': 'Mats Erik Ekstrom',
                'author_affiliation': 'Orebro U'}]

        * ``paperid_authors_dic`` maps a paper id to the plain list of
          author names for that paper.
    """
    author_rows = pd.read_csv(AUTHORS_DF)
    authorships_dic = {}
    paperid_authors_dic = {}
    for paper_id, rows in author_rows.groupby('Paper ID'):
        names = rows['Author Name'].tolist()
        affiliations = rows['Author Affiliation'].tolist()
        paperid_authors_dic[paper_id] = list(rows['Author Name'])
        authorships_dic[paper_id] = [
            {
                'position': position,
                'author_name': name,
                'author_affiliation': affiliation,
            }
            for position, (name, affiliation)
            in enumerate(zip(names, affiliations))
        ]
    return authorships_dic, paperid_authors_dic

def get_session_dic(SESSIONS_DF):
    """Build a lookup of session metadata keyed by session title.

    The sessions CSV is grouped by ``Session Title``. When a title occurs
    on several rows, only the values from the first row of the group are
    kept.

    Args:
        SESSIONS_DF: Path or file-like object accepted by ``pd.read_csv``.

    Returns:
        A dict mapping each session title to a dict such as::

            {'session': 'Sports Communication Interactive Poster Session',
             'session_type': 'Interactive Paper Session',
             'chair_name': nan,
             'chair_affiliation': nan,
             'division': 'In Event: ICA Plenary Interactive Paper/Poster Session II'}
    """
    # (output key, CSV column) pairs, in the order the keys are inserted.
    key_to_column = (
        ('session_type', 'Session Type'),
        ('chair_name', 'Chair Name'),
        ('chair_affiliation', 'Chair Affiliation'),
        ('division', 'Division/Unit'),
    )
    session_rows = pd.read_csv(SESSIONS_DF)
    session_dic = {}
    for title, rows in session_rows.groupby('Session Title'):
        record = {'session': title}
        for key, column in key_to_column:
            # First value of the group wins.
            record[key] = rows[column].tolist()[0]
        session_dic[title] = record
    return session_dic

def update_papers_json(
        papers_json_raw, authorships_dic, paperid_authors_dic, session_dic):
    """Attach author and session information to every paper record in place.

    Each dict in ``papers_json_raw`` gains three keys:

    * ``authorships``  - ``authorships_dic`` entry for the paper's ``paper_id``
    * ``author_names`` - ``paperid_authors_dic`` entry for the same id
    * ``session_info`` - ``session_dic`` entry for the paper's ``session``

    A key is set to ``None`` when the record lacks the source field, when
    the lookup has no entry for it, or when the field value is unhashable.
    Any other exception (including ``KeyboardInterrupt``) now propagates
    instead of being silently swallowed.

    Args:
        papers_json_raw: List of paper dicts; mutated in place.
        authorships_dic: Mapping of paper id -> list of authorship dicts.
        paperid_authors_dic: Mapping of paper id -> list of author names.
        session_dic: Mapping of session title -> session metadata dict.

    Returns:
        None. Note that the attached values are the very objects stored in
        the lookups (no copies are made), so later mutation of a lookup
        entry is visible through the paper record as well.
    """
    # (key written to the paper, field read from the paper, lookup table)
    enrichments = (
        ('authorships', 'paper_id', authorships_dic),
        ('author_names', 'paper_id', paperid_authors_dic),
        ('session_info', 'session', session_dic),
    )
    for paper_dic in papers_json_raw:
        for target_key, source_key, lookup in enrichments:
            try:
                paper_dic[target_key] = lookup[paper_dic[source_key]]
            except (KeyError, TypeError):
                # KeyError: field missing from the paper or absent from the
                # lookup. TypeError: field value cannot be used as a key.
                paper_dic[target_key] = None

def get_sessions_json(papers, session_dic):
    """Add per-session paper statistics to ``session_dic`` and list its values.

    ``papers`` is grouped by its ``session`` column (``groupby`` excludes
    rows whose session is NaN). For every session found:

    * if the session already has an entry in ``session_dic``, the keys
      ``years`` and ``paper_count`` are added to that entry;
    * otherwise a new entry is created with the session name, ``years``,
      ``paper_count``, ``None`` for the session type and chair fields, and
      the first ``division`` value seen among the session's papers
      (``None`` if the papers frame has no ``division`` column).

    Entries of ``session_dic`` whose session never occurs in ``papers`` are
    returned unchanged, i.e. without ``years`` / ``paper_count``.

    Args:
        papers: DataFrame with at least ``session`` and ``year`` columns.
        session_dic: Mapping of session title -> metadata dict. It is
            mutated in place.

    Returns:
        ``list(session_dic.values())`` after the update.
    """
    for session, group in papers.groupby('session'):
        # Plain ints rather than NumPy scalars (presumably so the result
        # can be passed to json.dump).
        years = [int(year) for year in group['year'].unique()]
        paper_count = len(group)

        if session in session_dic:
            session_dic[session]['years'] = years
            session_dic[session]['paper_count'] = paper_count
            continue

        try:
            division = group['division'].unique()[0]
        except (KeyError, IndexError):
            # No 'division' column (or nothing to pick from): leave unset.
            division = None
        session_dic[session] = {
            'session': session,
            'years': years,
            'paper_count': paper_count,
            'session_type': None,
            'chair_name': None,
            'chair_affiliation': None,
            'division': division,
        }
    return list(session_dic.values())

def get_authors_json(AUTHORS_DF):
    """Aggregate the authors CSV into one summary record per author name.

    Rows are grouped by ``Author Name``. Within each group the rows are
    ordered by ``Year`` (ascending) before anything is collected, so the
    affiliations come out in temporal order.

    Args:
        AUTHORS_DF: Path or file-like object accepted by ``pd.read_csv``.

    Returns:
        A list of dicts sorted by ``attend_count`` in descending order.
        Each dict holds:

        * ``author_name``    - the grouping key
        * ``attend_count``   - number of distinct years
        * ``paper_count``    - number of distinct paper ids
        * ``paper_ids``      - the distinct paper ids
        * ``affiliation``    - distinct non-null affiliations joined
          with ``" -> "``
        * ``years_attended`` - the distinct years as ints
    """
    author_rows = pd.read_csv(AUTHORS_DF)
    authors_json = []
    for author_name, rows in author_rows.groupby('Author Name'):
        by_year = rows.sort_values('Year', ascending=True)
        distinct_papers = list(by_year['Paper ID'].unique())
        distinct_years = [int(value) for value in by_year['Year'].unique()]
        distinct_affs = by_year['Author Affiliation'].dropna().unique()
        authors_json.append({
            'author_name': author_name,
            'attend_count': len(distinct_years),
            'paper_count': len(distinct_papers),
            'paper_ids': distinct_papers,
            'affiliation': " -> ".join(str(aff) for aff in distinct_affs),
            'years_attended': distinct_years,
        })
    # Most frequent attendees first; ties keep their grouping order.
    authors_json.sort(key=lambda record: record['attend_count'], reverse=True)
    return authors_json

In [29]:
# Relative paths of the three input CSV files. Despite the `_DF` suffix these
# are path strings, not DataFrames; each is handed to a function that calls
# `pd.read_csv` on it.
PAPERS_DF = '../data/processed/papers.csv'
AUTHORS_DF = '../data/processed/authors.csv'
SESSIONS_DF = '../data/processed/sessions.csv'

In [30]:
# Load the papers table and convert it to a list of plain dicts (one per row)
# by round-tripping through pandas' JSON serialisation.
papers = process_papers_df(PAPERS_DF)
paper_ids = papers.paper_id.unique()
papers_json_raw = json.loads(papers.to_json(orient='records'))
# Build the per-paper author lookups and the per-session metadata lookup.
authorships_dic, paperid_authors_dic = get_authorships_dic_and_paperid_authors_dic(
    AUTHORS_DF
)
session_dic = get_session_dic(SESSIONS_DF)
# Enrich every paper record in place with 'authorships', 'author_names' and
# 'session_info' (each is None when no match is found).
update_papers_json(
    papers_json_raw, 
    authorships_dic, 
    paperid_authors_dic, 
    session_dic
)

In [31]:
# Display the first enriched paper record as a sanity check.
papers_json_raw[0]

{'paper_id': '2003-0001',
 'title': 'Access to the Media Versus Access to Audiences: The Distinction and its Implications for Media Regulation and Policy',
 'paper_type': 'Paper',
 'abstract': 'When the issue of speakers\' rights of access arises in media regulation and policy contexts, the focus typically is on the concept of speakers\' rights of access "to the media," or "to the press." This right typically is premised on the audience\'s need for access to diverse sources and content. In contrast, in many non-mediated contexts, the concept of speakers\' rights of access frequently is defined in terms of the speaker\'s own First Amendment right of access to audiences. This paper explores the distinctions between these differing interpretations of a speaker\'s access rights and argues that the concept of a speaker\'s right of access to audiences merits a more prominent position in media regulation and policy. This paper then explores the implications of such a shift in perspective for 

In [32]:
# Write the enriched paper records to papers.json in the current directory.
with open('papers.json', 'w') as f:
    json.dump(papers_json_raw, f, indent=2)

In [33]:
# Build the session and author aggregates.
# NOTE: get_sessions_json mutates session_dic in place (adds 'years' and
# 'paper_count'). The 'session_info' values inside papers_json_raw are those
# same dict objects, so they gain the extra keys from this point on as well.
sessions_json_raw = get_sessions_json(papers, session_dic)
authors_json_raw = get_authors_json(AUTHORS_DF)

In [34]:
# Write the per-author summaries to authors.json in the current directory.
with open('authors.json', 'w') as f:
    json.dump(authors_json_raw, f, indent=2)

In [35]:
# Write the per-session summaries to sessions.json in the current directory.
with open('sessions.json', 'w') as f:
    json.dump(sessions_json_raw, f, indent=2)