In [1]:
import pandas as pd
import numpy as np 
import hashlib

In [14]:
paper_df = pd.read_csv('../data/processed/papers.csv')
paper_df.head()

Unnamed: 0,Paper ID,Title,Paper Type,Abstract,Number of Authors,Year,Session,Division/Unit,Authors
0,2005-f7d45923ae07535d9fad99cb83202ce8,Access to the Media Versus Access to Audiences...,Paper,When the issue of speakers' rights of access a...,1.0,2003,,,Philip Napoli
1,2005-240e64de7a1755e39383e4bcedf22f0f,Accounting Episodes as Communicative Practice ...,Paper,In this paper I describe accounting episodes a...,1.0,2003,,,Mariko Kotani
2,2005-b3e1ebea1caf5635bec027c5ecaf2593,Accounts of Single-fatherhood: A case study,Paper,Abstract\nRelying on single-fathers accounts ...,4.0,2003,,,"Tara M Emmers-Sommer, David Rhea, Laura Triple..."
3,2005-d03321a652e75b008b3ccadf1de2f1e3,A Challenge to the Duel: Socializing Dedicated...,Paper,This paper explores the structural controls av...,1.0,2003,,,Chad Tew
4,2005-04f89d86e5ad5696860f94337dcecd7c,A chatroom ethnography: Evolution of community...,Paper,"In creating an ethnography about the City, Tex...",1.0,2003,,,Audra Diers


In [17]:
duplicates = paper_df[paper_df.duplicated(subset='Paper ID', keep=False)].sort_values('Paper ID')
duplicates

Unnamed: 0,Paper ID,Title,Paper Type,Abstract,Number of Authors,Year,Session,Division/Unit,Authors
18688,2005-00473bbcecb25617bca35edec167f57a,Virtually Rich: Conspicuous Consumption in a V...,Paper Session,"In ""The Theory of the Leisure Class"" Thomas Ve...",1.0,2014,Visual Variables in Communication Technologies...,Communication and Technology,Joshua Andrew Clark
2753,2005-00473bbcecb25617bca35edec167f57a,Narratives of the SARS Epidemic and Ethical Im...,Paper,The SARS (Severe Acute Respiratory Syndrome) e...,4.0,2005,Public Health Crisis Response,Public Relations,"Robert L. Heath, Shannon A. Bowen, Jaesub Lee,..."
660,2005-00473bbcecb25617bca35edec167f57a,Resonance as Mediator of Prime-Time Messages a...,Paper,This study examined attitude change caused by ...,2.0,2003,,,"Martha M. Lauzen, David M. Dozier"
26464,2005-00473bbcecb25617bca35edec167f57a,Viewer Responses to YouTube Product Reviews: T...,Poster,This study examined whether sponsorship disclo...,3.0,2016,Important Media and Message Features,Information Systems,"Emese Domahidi, Johannes Breuer, Rachel Kowert..."
26077,2005-004858412a405106b71669e223a1a9b1,The Antisocial Net. Modeling the Influence of ...,Poster,Negative perceptions of media technology in a ...,2.0,2014,Online Media II: Analyses of Communication Net...,Information Systems,"Jennifer Petersen, Mathias Weber, Marc Ziegele"
...,...,...,...,...,...,...,...,...,...
8754,2009-ffb30524fb01534e9b6d2b6d9ab1594d,ICT Solutions for Increasing Social Capital Am...,Paper,The flight of rural youth to suburban and urba...,3.0,2009,Building Capital and Bridging Community With ICTs,Communication and Technology,"Pamela S. Whitten, Andrew Smock, Kurt DeMaagd"
9858,2009-ffbb471f8fc95cfb9d04ac0f5913d393,Urban Communication and the Transformation of ...,Session Paper,Vital social life once offered by an urban env...,2.0,2009,Keyword: Urban Communication,Theme Sessions,"Susan Drucker, Gary Gumpert"
25793,2009-ffbb471f8fc95cfb9d04ac0f5913d393,"The Internet of Things, Datafication and the F...",Paper Session,The year 2030 seems to be beckoning a fair amo...,4.0,2018,"Smart Stuff: Internet of Things, Smart Cities,...",Communication and Technology,"Lucia Vesnic-Alujevic, Gilda Seddighi, Ranjana..."
24093,2009-ffec33721a7857bf84fa0fd6ac9215d0,Reconsidering Partisanship as a Constraint on ...,Paper Session,Partisan motivated reasoning is often forwarde...,5.0,2018,Potpourri of Research on Campaign Communication,Political Communication,"Benjamin Ryan Warner, Mitchell S. McKinney, Fr..."


In [13]:
paper_df[paper_df['Paper ID'].isnull()]

Unnamed: 0,Paper ID,Title,Paper Type,Abstract,Number of Authors,Year,Session,Division/Unit,Authors


In [12]:
len(paper_df['Paper ID'].unique()), paper_df.shape[0]

(15935, 27466)

In [3]:
author_df = pd.read_csv('../data/processed/authors.csv')

In [4]:
paper_df_ids = set(paper_df['Paper ID'])
author_df_ids = set(author_df['Paper ID'])
[x for x in author_df_ids if x not in paper_df_ids]

[]

In [5]:
# generate new paper id based on title_paper_id
def generate_unique_id(title_paper_id):
	return hashlib.md5(str(title_paper_id).encode()).hexdigest()[:18]

def update_paper_df_and_get_id_dic(paper_df):
	"""
	The purpose of this function is to update Paper ID because
	The old paper_ids are not unique.

	We update the id and get a dict where key is old_paper_id-title and value is the
	new paper_id (str)
	"""
	# change col name
	paper_df.rename(columns={'Paper ID': 'Old Paper ID'}, inplace=True)

	# check whether title_paper_ids are truely unique
	paper_df['Title_Old_Paper_ID'] = paper_df.apply(
		lambda row: row['Title'] + "_" + str(row['Old Paper ID']), axis = 1)
	if len(paper_df['Title_Old_Paper_ID'].unique()) == paper_df.shape[0]:
		print("Title_Old_Paper_ID is truely unique")
	else:
		print("Even Title_Old_Paper_ID is not truely unique")

	paper_df['Paper ID'] = ''

	for year, group in paper_df.groupby('Year'):
		for idx, row in group.iterrows():
			title_paper_id = row['Title'] + "_" + str(row['Old Paper ID'])
			paper_df.at[idx, 'Paper ID'] = f"{year}-{generate_unique_id(title_paper_id)}"

	# check whether the new paper ids are unique or not
	if len(paper_df['Paper ID'].unique()) == paper_df.shape[0]:
		print("The new Paper IDs are truely unique")
	else:
		print("The new Paper IDs are not truely unique")
		print(len(paper_df['Paper ID'].unique()))
		print(paper_df.shape[0])

	# generate the dic 
	id_dic = dict(zip(paper_df['Title_Old_Paper_ID'], paper_df['Paper ID']))

	paper_df.drop(columns=['Title_Old_Paper_ID', 'Old Paper ID'], inplace=True)

	paper_df = paper_df[['Paper ID'] + [
		col for col in paper_df.columns if col != 'Paper ID']]
	
	return paper_df, id_dic

In [6]:
paper_df, id_dic = update_paper_df_and_get_id_dic(paper_df)

Title_Old_Paper_ID is truely unique
The new Paper IDs are truely unique


In [120]:
paper_df = pd.read_csv('../data/processed/papers.csv')

# change col name
paper_df.rename(columns={'Paper ID': 'Old Paper ID'}, inplace=True)

# check whether title_paper_ids are truely unique
paper_df['Title_Old_Paper_ID'] = paper_df.apply(
    lambda row: row['Title'] + "_" + str(row['Old Paper ID']), axis = 1)
if len(paper_df['Title_Old_Paper_ID'].unique()) == paper_df.shape[0]:
    print("Title_Old_Paper_ID is truely unique")
else:
    print("Even Title_Old_Paper_ID is not truely unique")

paper_df['Paper ID'] = ''

# generate new paper id based on title_paper_id
def generate_unique_id(title_paper_id):
    return hashlib.md5(str(title_paper_id).encode()).hexdigest()[:14]
for year, group in paper_df.groupby('Year'):
    for idx, row in group.iterrows():
        title_paper_id = row['Title'] + "_" + str(row['Old Paper ID'])
        paper_df.at[idx, 'Paper ID'] = f"{year}-{generate_unique_id(title_paper_id)}"

# check whether the new paper ids are unique or not
if len(paper_df['Paper ID'].unique()) == paper_df.shape[0]:
    print("The new Paper IDs are truely unique")
else:
    print("The new Paper IDs are not truely unique")

# generate the dic 
id_dic = dict(zip(paper_df['Title_Old_Paper_ID'], paper_df['Paper ID']))

paper_df.drop(columns=['Title_Old_Paper_ID', 'Old Paper ID'], inplace=True)

paper_df = paper_df[['Paper ID'] + [
    col for col in paper_df.columns if col != 'Paper ID']]

Title_Old_Paper_ID is truely unique
The new Paper IDs are truely unique


In [121]:
paper_df.head()

Unnamed: 0,Paper ID,Title,Paper Type,Abstract,Number of Authors,Year,Session,Division/Unit,Authors
0,2003-8726c2764aa326,Access to the Media Versus Access to Audiences...,Paper,When the issue of speakers' rights of access a...,1.0,2003,,,Philip Napoli
1,2003-db591dafbda67c,Accounting Episodes as Communicative Practice ...,Paper,In this paper I describe accounting episodes a...,1.0,2003,,,Mariko Kotani
2,2003-2bcf7a79a3b7a2,Accounts of Single-fatherhood: A case study,Paper,Abstract\nRelying on single-fathers accounts ...,4.0,2003,,,"Tara M Emmers-Sommer, David Rhea, Laura Triple..."
3,2003-37466036e4e533,A Challenge to the Duel: Socializing Dedicated...,Paper,This paper explores the structural controls av...,1.0,2003,,,Chad Tew
4,2003-187b065317e6cc,A chatroom ethnography: Evolution of community...,Paper,"In creating an ethnography about the City, Tex...",1.0,2003,,,Audra Diers


In [122]:
id_dic

{'Access to the Media Versus Access to Audiences: The Distinction and its Implications for Media Regulation and Policy_2003-0001': '2003-8726c2764aa326',
 'Accounting Episodes as Communicative Practice Affecting Cultural Knowledge_2003-0002': '2003-db591dafbda67c',
 'Accounts of Single-fatherhood: A case study_2003-0003': '2003-2bcf7a79a3b7a2',
 'A Challenge to the Duel: Socializing Dedicated Virtual Reality Fans to the Ideology of Textualism_2003-0004': '2003-37466036e4e533',
 'A chatroom ethnography: Evolution of community, norms, nonverbal communication_2003-0005': '2003-187b065317e6cc',
 'A Communicative Approach to Road Rage: Accounts of Driving and Retaliation_2003-0006': '2003-19db381d5398b8',
 'A Content Analysis of Direct Marketing Emails_2003-0007': '2003-f96e1ef0fc5b77',
 'A content analysis of news coverage of skin cancer prevention and detection, 1979-2002_2003-0008': '2003-469e2ec979ce4a',
 'A cross-cultural comparison of the relationship between ICA, ICMS and assertivene

In [123]:
author_df = pd.read_csv('../data/processed/authors.csv')
author_df.head()

Unnamed: 0,Paper ID,Title,Number of Authors,Author Position,Author Name,Author Affiliation,Year
0,2003-0001,Access to the Media Versus Access to Audiences...,1,1,Philip Napoli,Fordham U,2003
1,2003-0002,Accounting Episodes as Communicative Practice ...,1,1,Mariko Kotani,Aoyama Gakuin University,2003
2,2003-0003,Accounts of Single-fatherhood: A case study,4,1,Tara M Emmers-Sommer,University of Arizona,2003
3,2003-0003,Accounts of Single-fatherhood: A case study,4,2,David Rhea,University of Arizona,2003
4,2003-0003,Accounts of Single-fatherhood: A case study,4,3,Laura Triplett,University of Arizona,2003


In [140]:
author_df = pd.read_csv('../data/processed/authors.csv')
author_df.rename(columns={'Paper ID': 'Old Paper ID'}, inplace=True)

# Create 'Title_Old_Paper_ID' only if both columns exist
author_df['Title_Old_Paper_ID'] = author_df.apply(
    lambda row: row['Title'] + "_" + str(row['Old Paper ID']), axis=1
)

author_df['Paper ID'] = author_df['Title_Old_Paper_ID'].map(id_dic)

author_df.drop(columns=['Title_Old_Paper_ID', 'Old Paper ID'], inplace=True)

author_df = author_df[['Paper ID'] + [
    col for col in author_df.columns if col != 'Paper ID']]

In [141]:
author_df.head()

Unnamed: 0,Paper ID,Title,Number of Authors,Author Position,Author Name,Author Affiliation,Year
0,2003-8726c2764aa326,Access to the Media Versus Access to Audiences...,1,1,Philip Napoli,Fordham U,2003
1,2003-db591dafbda67c,Accounting Episodes as Communicative Practice ...,1,1,Mariko Kotani,Aoyama Gakuin University,2003
2,2003-2bcf7a79a3b7a2,Accounts of Single-fatherhood: A case study,4,1,Tara M Emmers-Sommer,University of Arizona,2003
3,2003-2bcf7a79a3b7a2,Accounts of Single-fatherhood: A case study,4,2,David Rhea,University of Arizona,2003
4,2003-2bcf7a79a3b7a2,Accounts of Single-fatherhood: A case study,4,3,Laura Triplett,University of Arizona,2003


In [126]:
author_df

Unnamed: 0,Old Paper ID,Title,Number of Authors,Author Position,Author Name,Author Affiliation,Year,Title_Old_Paper_ID
0,2003-0001,Access to the Media Versus Access to Audiences...,1,1,Philip Napoli,Fordham U,2003,Access to the Media Versus Access to Audiences...
1,2003-0002,Accounting Episodes as Communicative Practice ...,1,1,Mariko Kotani,Aoyama Gakuin University,2003,Accounting Episodes as Communicative Practice ...
2,2003-0003,Accounts of Single-fatherhood: A case study,4,1,Tara M Emmers-Sommer,University of Arizona,2003,Accounts of Single-fatherhood: A case study_20...
3,2003-0003,Accounts of Single-fatherhood: A case study,4,2,David Rhea,University of Arizona,2003,Accounts of Single-fatherhood: A case study_20...
4,2003-0003,Accounts of Single-fatherhood: A case study,4,3,Laura Triplett,University of Arizona,2003,Accounts of Single-fatherhood: A case study_20...
...,...,...,...,...,...,...,...,...
54311,2018-0252,Are Sports Themed Campaigns Constructing New F...,1,1,Kulveen Trehan,GGIP University,2018,Are Sports Themed Campaigns Constructing New F...
54312,2018-0253,Mediatization and Sport: A bottom-Up Perspective,1,1,m skey,Loughborough University,2018,Mediatization and Sport: A bottom-Up Perspecti...
54313,2018-0254,Monstrosities and Metaphors in Fantasy Football,2,1,Ailesha Ringer,University of New Mexico,2018,Monstrosities and Metaphors in Fantasy Footbal...
54314,2018-0254,Monstrosities and Metaphors in Fantasy Football,2,2,Arthur Aguirre,U of Texas at El Paso,2018,Monstrosities and Metaphors in Fantasy Footbal...


In [75]:
len(df['Paper ID'].unique()) == df.shape[0]

False

We can see that paper_id is not unique. 

In [76]:
years_with_duplicated_ids = set()
for paper_id, group in df.groupby('Paper ID'):
    if len(group) > 1:
        years_with_duplicated_ids.add(group.Year.tolist()[0])

In [77]:
years_with_duplicated_ids

{2014, 2015, 2016, 2017, 2018}

The above years contian duplicated ids. 

In [78]:
for paper_id, group in df.groupby('Paper ID'):
    if len(group) > 1:
        print(group)
        break

        Paper ID                                              Title  \
18028  2014-0000  Exploring Japanese Media’s Health Coverage in ...   
25804  2014-0000  Against Policy Failure: Designing a Media Syst...   

          Paper Type                                           Abstract  \
18028  Paper Session  Media system dependency theory suggests that i...   
25804         Poster  Any conception of the “good life” would likely...   

       Number of Authors  Year  \
18028                3.0  2014   
25804                1.0  2014   

                                                 Session  \
18028                     Meda Coverage of Health Issues   
25804  Extended Session: Communication Law, Policy, a...   

                    Division/Unit  \
18028        Health Communication   
25804  Communication Law & Policy   

                                                 Authors  
18028  Holley A. Wilkin, Michael Adam Tannebaum, Joo-...  
25804  Holley A. Wilkin, Michael Adam Tannebau

In [79]:
import hashlib

In [82]:
def generate_unique_id(title_paper_id):
    return hashlib.md5(str(title_paper_id).encode()).hexdigest()[:14]

In [83]:
for year, group in df.groupby('Year'):
    for idx, row in group.iterrows():
        title_paper_id = row['Title'] + "_" + str(row['Paper ID'])
        df.at[idx, 'Paper New ID'] = f"{year}-{generate_unique_id(title_paper_id)}"

In [84]:
df.head()

Unnamed: 0,Paper ID,Title,Paper Type,Abstract,Number of Authors,Year,Session,Division/Unit,Authors,Paper New ID
0,2003-0001,Access to the Media Versus Access to Audiences...,Paper,When the issue of speakers' rights of access a...,1.0,2003,,,Philip Napoli,2003-8726c2764aa326
1,2003-0002,Accounting Episodes as Communicative Practice ...,Paper,In this paper I describe accounting episodes a...,1.0,2003,,,Mariko Kotani,2003-db591dafbda67c
2,2003-0003,Accounts of Single-fatherhood: A case study,Paper,Abstract\nRelying on single-fathers accounts ...,4.0,2003,,,"Tara M Emmers-Sommer, David Rhea, Laura Triple...",2003-2bcf7a79a3b7a2
3,2003-0004,A Challenge to the Duel: Socializing Dedicated...,Paper,This paper explores the structural controls av...,1.0,2003,,,Chad Tew,2003-37466036e4e533
4,2003-0005,A chatroom ethnography: Evolution of community...,Paper,"In creating an ethnography about the City, Tex...",1.0,2003,,,Audra Diers,2003-187b065317e6cc


## Check whether Title is unique

In [64]:
df.shape[0]

27466

In [65]:
len(df.Title.unique())

27443

No. They are not. 

In [70]:
title_paper_ids = df.apply(lambda row: row['Title'] + "-" + str(row['Paper ID']), axis = 1)

In [71]:
len(title_paper_ids.unique()) == df.shape[0]

True

So we can generate a dic where key is 'title-paper-id' and value is the new paper_id. The purpose is to replace 