# Cleaning Data PD


This notebook cleans the article metadata retrieved with the Arcas software.

In [1]:
import glob
import pandas as pd


In [2]:
# Read every file matching ../data/PD_*.json into its own DataFrame.
dfs = [pd.read_json(path) for path in glob.glob("../data/PD_*.json")]


In [3]:
# Add the records of bibliography.json to the frames collected so far.
bibliography = pd.read_json("../data/bibliography.json")
dfs.append(bibliography)


In [4]:
# Stack all loaded frames into one table with a fresh 0..n-1 index.
# `sort` is passed explicitly: the recorded run emitted the pandas
# FutureWarning about the default changing, and sort=True pins the
# alphabetically ordered columns that run produced.
df = pd.concat(dfs, ignore_index=True, sort=True)


of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  """Entry point for launching an IPython kernel.


In [5]:
# Distinct values of the "provenance" column.
df["provenance"].unique()


array(['Springer', 'Nature', 'PLOS', 'IEEE', 'arXiv', 'Manual'],
      dtype=object)

In [6]:
# (number of distinct titles, number of distinct unique keys)
len(df["title"].unique()), len(df["unique_key"].unique())


(3096, 3193)

In [7]:
# One row per distinct (unique_key, provenance) pair ...
key_provenance_pairs = df.groupby(["unique_key", "provenance"]).size().reset_index()
# ... then the number of such pairs for each provenance.
provenance_size = key_provenance_pairs.groupby("provenance").size()
provenance_size


provenance
IEEE         295
Manual        79
Nature       687
PLOS         482
Springer     576
arXiv       1074
dtype: int64

In [8]:
# Drop rows dated before 1950 or after 2018. The masks are negated rather
# than written as a positive range so that rows with a missing date (for
# which both comparisons are False) are kept.
too_old = df["date"] < 1950
too_recent = df["date"] > 2018
df = df[~(too_old | too_recent)]


In [9]:
# Rewrite the year 2021 as 2015 in the "date" column only. Calling
# DataFrame.replace on the whole frame would also rewrite the value 2021
# wherever it occurs in any other column.
# NOTE(review): the preceding cell drops dates after 2018, so when the cells
# run in document order no 2021 date is left at this point -- confirm whether
# this correction was meant to run before that filter.
df = df.assign(date=df["date"].replace(to_replace=2021, value=2015))


In [10]:
# Persist the date-filtered table.
df.to_json(path_or_buf="../data/pd_November_2018.json")


Duplicate articles
------------------

In [2]:
# Load the cleaned metadata CSV (no index column is designated here, so any
# index column stored in the file is read as an ordinary column).
df = pd.read_csv(filepath_or_buffer='../data/prisoners_dilemma_articles_meta_data_clean.csv')

In [3]:
# Distinct (title, unique_key) pairs ...
title_key_pairs = df.groupby(["title", "unique_key"]).size().reset_index()
# ... counted per title: how many different unique keys share each title.
table = title_key_pairs.groupby("title").count()
# Titles that appear under more than one unique key.
duplicates = table[table["unique_key"] > 1]


In [4]:
# Titles flagged as duplicated, in order of first appearance in df.
has_duplicated_title = df["title"].isin(duplicates.index)
duplicates_title = df.loc[has_duplicated_title, "title"].unique()


In [5]:
# Display the duplicated titles.
duplicates_title

array(['Agent-Based Models', 'Coordination Problems and Communication',
       'Group Selection', 'Prisoner’s Dilemma', 'Game Theory',
       'Learning and Evolution in Games: ESS',
       'Some Economics of International Climate Policy',
       'Segregation and Strategic Neighborhood Interaction',
       'Economic Governance', 'The Prisoner’s Dilemma',
       'Biology and Evolutionary Games',
       'Strategic and Extensive Form Games',
       'Computer Science and Game Theory',
       'Nash  John Forbes (Born 1928)', 'Selten  Reinhard (Born 1930)',
       'Cooperation', 'Economy as a Complex System',
       'IQ and National Productivity', 'Social Capital',
       'Collective Rationality', 'Strategic Trade Policy',
       'Neuroeconomics', 'A Prisoner’s Dilemma',
       'Spatial Patterns of Prisoner’s Dilemma Game in Metapopulations',
       'Rationality  History of the Concept', 'Game Theory and Biology',
       'Public Goods Experiments', 'Altruism in Experiments',
       'Reputatio

In [6]:
# Duplicated titles that have at least one row with provenance "arXiv".
shares_title = df["title"].isin(duplicates.index)
from_arxiv = df["provenance"] == "arXiv"
duplicates_in_arxiv = df.loc[shares_title & from_arxiv, "title"].unique()


In [7]:
# Duplicated titles that never occur with provenance "arXiv".
diff = list(set(duplicates_title).difference(set(duplicates_in_arxiv)))


In [15]:
# Every row whose provenance is not "arXiv".
df_without_arxiv = df[df["provenance"] != "arXiv"]


In [16]:
# Keep only the first row seen for each title.
df_without_arxiv = df_without_arxiv.drop_duplicates(subset=["title"], keep="first")


In [17]:
# Persist the arXiv-free, title-deduplicated table.
df_without_arxiv.to_json(path_or_buf="../data/pd_November_2018_without_arxiv.json")


**Drop duplicates.**

In [18]:
# Unique keys of the arXiv rows whose title is flagged as duplicated.
shares_title = df["title"].isin(duplicates.index)
from_arxiv = df["provenance"] == "arXiv"
articles_to_drop = df.loc[shares_title & from_arxiv, "unique_key"].unique()


In [19]:
# Remove every row belonging to one of the keys selected for dropping.
is_dropped = df["unique_key"].isin(articles_to_drop)
df = df[~is_dropped]


In [20]:
# (number of distinct titles, number of distinct unique keys) after the drop
df["title"].unique().size, df["unique_key"].unique().size


(3077, 3155)

**Export clean json.**

In [21]:
# JSON export of the table after the duplicate arXiv entries were removed.
df.to_json(path_or_buf="../data/pd_November_2018_clean.json")


In [22]:
# CSV export of the same table (the index is written as the first column).
df.to_csv(path_or_buf='../data/prisoners_dilemma_articles_meta_data.csv')

In [4]:
# Load the cleaned metadata CSV, using its first column as the index.
df = pd.read_csv(
    filepath_or_buffer='../data/prisoners_dilemma_articles_meta_data_clean.csv',
    index_col=0,
)

In [76]:
# Number of distinct unique keys in the loaded table.
# (The stray trailing line-continuation backslash was removed: it only worked
# because the following line happened to be blank.)
len(df['unique_key'].unique())

2500

In [77]:
# Lower-case keywords searched for in each article's title, abstract and
# categories. The position of a keyword in this list determines the name of
# its indicator column (key_0, key_1, ...), so the order must not change.
words = [
    # spellings of "prisoner's dilemma"
    "prisoner's dilemma",
    "prisoner’s dilemma",
    "prisoners dilemma",
    "prisoners' dilemma",
    "prisoners evolution",
    "prisoner dilemma",
    "prisoner game theory",
    # related terms
    "equilibria",
    "cooperation",
    "reciprocity",
    "tit-for-tat",
    "tit for tat",
    "zero-determinant",
    "evolutionary",
    "altruism",
    "strategy",
    "strategies",
    "extortion",
    "cooperative",
    "rational",
    "evolutionarily stable",
]

In [78]:
# Columns used for the keyword search.
columns = [
    "title",
    "abstract",
    "unique_key",
    "primary_category",
    "category",
]

In [79]:
# Restrict to the search columns and collapse identical rows.
temp = df.loc[:, columns].drop_duplicates()

In [80]:
# Replace missing values with a placeholder string so that every searched
# field can be lower-cased.
temp = temp.fillna(value='Not given')

In [81]:
# Titles that are always treated as matching, whatever keyword is tested:
# word_in_text returns True for them before looking at any text.
# Titles are compared exactly, so trailing spaces inside the strings are
# significant and must be kept.
not_remove = [
'Rosalind’s Ghost: Biology, Collaboration, and the Female',
'How human location-specific contact patterns impact spatial transmission between populations?',
'A suicide-based typology of suicide terrorists: Conventional, coerced, escapist and indirect',
'Unauthorized Horizontal Spread in the Laboratory Environment: The Tactics of Lula, a Temperate Lambdoid Bacteriophage of <i>Escherichia coli</i>',
'Model-Selection-Based Approach for Calculating Cellular Multiplicity of Infection during Virus Colonization of Multi-Cellular Hosts',
'Experimental Evolution of <i>Trichoderma citrinoviride</i> for Faster Deconstruction of Cellulose',
'Interpopulational differences in the use of the Tit-For-Tat strategy during predator inspection in the guppy ',
'Cooperate to accumulate',
'A simulation of moral behavior within marketing exchange relationships',
'Truthful Channel Sharing for Self Coexistence of Overlapping Medical Body Area Networks',
'Finding overlapping communities in multilayer networks',
'Conditions for the Emergence of Shared Norms in Populations with Incompatible Preferences',
'The Impact of Payoff Function and Local Interaction on the ',
'Modelling and Computation in the Valuation of Carbon Derivatives with Stochastic Convenience Yields',
'Dissertation abstract: A status theory of collective action',
'Managing the climate commons at the nexus of ecology, behaviour and economics',
'The Emergence of Groups and Inequality through Co-Adaptation',
'Pursuing the big questions about interspecific mutualism: a review of theoretical approaches',
'Tunable Stochastic Pulsing in the <i>Escherichia coli</i> Multiple Antibiotic Resistance Network from Interlinked Positive and Negative Feedback Loops',
'The Migrations of Human Populations',
'Reputation',
'Public Goods Experiments',
'From the Editors: Student samples in international business research',
'The foreign language effect on the self-serving bias: A field experiment in the high school classroom',
'the logic of animal conflict',
'Dynamics of International Relations: Conflict and Mutual Gain in an Era of Global Interdependence',
'Less is more: rarity trumps quality in luxury markets',
'Risk Management and Business Ethics: Integrating the Human Factor',
'Delay and Probability Discounting of Sexual and Monetary Outcomes in Individuals with Cocaine Use Disorders and Matched Controls',
'Investigation on law and economics of listed companies’ financing preference based on complex network theory',
'Why we sometimes punish the innocent: The role of group entitativity in collective punishment',
'Early warning signs for saddle-escape transitions in complex networks',
'Self-disclosure at international cartels',
'Interrelations of Graph Distance Measures Based on Topological Indices',
'Development of shared information in communication despite hippocampal amnesia',
'Selfish and Altruistic Bacterial Populations Maximize Fitness Under Stress by Local Segregation',
'Modeling crowdsourcing as collective problem solving',
'Predators promote defence of rhizosphere bacterial populations by selective feeding on non-toxic cheaters',
'Ten Simple Rules for Protecting Research Integrity',
'Agent-Based Models',
'Good Fences: The Importance of Setting Boundaries for Peaceful Coexistence',
'Perverted Research and the Political Imagination – The Trial of the Good Scholar Švejk',
'Presenting an Approach for Conducting Knowledge Architecture within Large-Scale Organizations',
'Multi-agent modelling for revenue management',
'Dynamics of a producer-freeloader ecosystem on the brink of collapse',
'Leading from the Centre: A Comprehensive Examination of the Relationship between Central Playing Positions and Leadership in Sport',
'Impact of Degree Heterogeneity on Attack Vulnerability of Interdependent Networks',
'Time Preferences and Natural Resource Extraction Behavior: An Experimental Study from Artisanal Fisheries in Zanzibar',
'Molecular Biopolitics, Somatic Ethics and the Spirit of Biocapital',
'Specific and Individuated Death Reflection Fosters Identity Integration',
'Differences in Collaboration Patterns across Discipline, Career Stage, and Gender',
'Search, Memory, and Choice Error: An Experiment',
'Coping with Bullying in the Classroom Through Agent-Based Modeling']

In [82]:
def word_in_text(row, word, not_remove):
    """Tell whether `word` occurs in any searched text field of `row`.

    Parameters
    ----------
    row : object exposing string attributes `title`, `abstract`,
        `primary_category` and `category`.
    word : str
        Keyword to look for; it is compared against the lower-cased fields.
    not_remove : container of str
        Titles that count as a match unconditionally (exact comparison
        against `row.title`).

    Returns
    -------
    bool
        True when the title is listed in `not_remove`, or when `word` is a
        substring of the lower-cased title, abstract, primary category or
        category.
    """
    if row.title in not_remove:
        return True
    searched_fields = ("title", "abstract", "primary_category", "category")
    return any(word in getattr(row, field).lower() for field in searched_fields)

In [83]:
# Add one boolean column per keyword: key_<position of the keyword in words>.
for position, keyword in enumerate(words):
    temp[f'key_{position}'] = [
        word_in_text(record, keyword, not_remove) for _, record in temp.iterrows()
    ]

In [84]:
# Display the column labels to confirm that the key_<i> indicator columns exist.
temp.columns

Index(['title', 'abstract', 'unique_key', 'primary_category', 'category',
       'key_0', 'key_1', 'key_2', 'key_3', 'key_4', 'key_5', 'key_6', 'key_7',
       'key_8', 'key_9', 'key_10', 'key_11', 'key_12', 'key_13', 'key_14',
       'key_15', 'key_16', 'key_17', 'key_18', 'key_19', 'key_20'],
      dtype='object')

In [85]:
# Row-wise number of keywords that matched (booleans summed as integers).
key_columns = [f'key_{i}' for i in range(len(words))]
check = sum(temp[name].values for name in key_columns)
temp['check'] = check

In [86]:
# Largest match count among the rows of each unique key ...
keys = temp.groupby('unique_key')['check'].max()
# ... and the keys for which no row matched any keyword (count of zero).
list_of_keys = keys[keys.values == 0].index

In [87]:
# Number of distinct keys without any keyword match.
list_of_keys.unique().size

256

In [88]:
# Rows of temp that belong to a key without any keyword match.
is_unmatched = temp['unique_key'].isin(list_of_keys.unique())
to_clean = temp[is_unmatched]

In [89]:
# Write the distinct (title, abstract, unique_key) rows, renumbered from 0,
# to to_clean.csv.
review_rows = to_clean[['title', 'abstract', 'unique_key']].drop_duplicates()
review_rows = review_rows.reset_index(drop=True)
review_rows.to_csv('../data/to_clean.csv')

In [7]:
# Load to_delete.csv; the file is decoded as ISO-8859-1 (Latin-1).
keys_to_delete = pd.read_csv(
    filepath_or_buffer='../data/to_delete.csv',
    encoding="ISO-8859-1",
)

In [9]:
# Number of rows in to_delete.csv.
keys_to_delete.shape[0]

30

In [37]:
# Unmatched keys that are not listed in to_delete.csv.
unmatched_keys = set(list_of_keys.unique())
deleted_keys = set(keys_to_delete['unique_key'].unique())
keys_to_clean = list(unmatched_keys - deleted_keys)

In [8]:
# Number of keys kept for cleaning. Requires `keys_to_clean` to be defined
# first: the recorded NameError comes from this cell having been executed
# (In [8]) before the cell that builds `keys_to_clean` (In [37]).
len(keys_to_clean)

NameError: name 'keys_to_clean' is not defined

In [30]:
# Rows of df whose unique key is in keys_to_clean.
is_kept_key = df['unique_key'].isin(keys_to_clean)
df_with_keys = df[is_kept_key]

In [31]:
# Collapse rows that are identical in every column, keeping the first one.
df_with_keys = df_with_keys.drop_duplicates(keep='first')

In [32]:
# Restrict to the search columns and collapse rows identical on them.
df_with_keys = df_with_keys.loc[:, columns].drop_duplicates()

In [33]:
# Number of remaining rows.
df_with_keys.shape[0]

412

**After cleaning**

In [10]:
# Reload the cleaned metadata CSV (no index column is designated, so an
# index column stored in the file is read as an ordinary column).
df = pd.read_csv(filepath_or_buffer='../data/prisoners_dilemma_articles_meta_data_clean.csv')

In [11]:
# Reload to_delete.csv, decoded as ISO-8859-1 (Latin-1).
keys_to_delete = pd.read_csv(
    filepath_or_buffer='../data/to_delete.csv',
    encoding="ISO-8859-1",
)

In [12]:
# Distinct unique keys listed in to_delete.csv.
list_keys_to_delete = keys_to_delete.loc[:, 'unique_key'].unique()

In [13]:
# Number of distinct keys to delete.
list_keys_to_delete.shape[0]

30

In [14]:
# Drop every row whose unique key is listed for deletion.
is_deleted = df['unique_key'].isin(list(list_keys_to_delete))
df_without_keys = df[~is_deleted]

In [15]:
# Number of distinct unique keys left after the deletion.
df_without_keys['unique_key'].unique().size

2470

In [16]:
# Overwrite the cleaned metadata CSV (the index is written as the first
# column of the file).
df_without_keys.to_csv(path_or_buf='../data/prisoners_dilemma_articles_meta_data_clean.csv')

**Fill in missing dates.**

In [6]:
import pandas as pd
import numpy as np

In [3]:
# Reload the cleaned metadata CSV (no index column is designated, so an
# index column stored in the file is read as an ordinary column).
df = pd.read_csv(filepath_or_buffer='../data/prisoners_dilemma_articles_meta_data_clean.csv')

In [21]:
# Titles of the rows whose date is NaN.
date_is_missing = np.isnan(df['date'])
df[date_is_missing]['title'].unique()

array(["The Iterated Prisoner's Dilemma",
       'Game Theory: A Classical Introduction, Mathematical Games, and the Tournament',
       "Solving a Complex Prisoner's Dilemma with Self-Modifying Policies",
       'Essentials of Game Theory: A Concise Multidisciplinary Introduction',
       "Quasi-Stable States in the Iterated-Prisoner's Dilemma"],
      dtype=object)

In [22]:
# (title, year) pairs used to fill in dates that are missing from the data.
missing_dates = [
    (
        'Game Theory: A Classical Introduction, Mathematical Games, and the Tournament',
        2017,
    ),
    (
        "Solving a Complex Prisoner's Dilemma with Self-Modifying Policies",
        1999,
    ),
    (
        'Essentials of Game Theory: A Concise Multidisciplinary Introduction',
        2008,
    ),
    (
        "Quasi-Stable States in the Iterated-Prisoner's Dilemma",
        2004,
    ),
]

In [38]:
# Date of the first row, selected by column label. The positional form
# df.iloc[0, 4] only points at "date" while the frame carries a stray
# "Unnamed: 0" column in front of the alphabetically ordered columns.
df['date'].iloc[0]

1994.0

In [39]:
# Write each looked-up year into the "date" column of every row carrying the
# corresponding title. Label-based .loc replaces the earlier pattern of
# passing index labels to .iloc (correct only for a default 0..n-1 index)
# with a hard-coded column position of 4.
for title, year in missing_dates:
    df.loc[df['title'] == title, 'date'] = year

In [40]:
# Titles that still have no date after the manual fill.
date_is_missing = np.isnan(df['date'])
df[date_is_missing]['title'].unique()

array(["The Iterated Prisoner's Dilemma"], dtype=object)

In [41]:
# Overwrite the cleaned metadata CSV with the filled-in dates (the index is
# written as the first column of the file).
df.to_csv(path_or_buf='../data/prisoners_dilemma_articles_meta_data_clean.csv')

**Drop title.**

In [46]:
# Exact title of the article to remove.
title = (
    'Historical development and current status of organ procurement '
    'from death-row prisoners in China'
)

In [47]:
# Keep every row whose title differs from the one selected for removal.
has_other_title = df['title'] != title
df = df.loc[has_other_title]

In [48]:
# Overwrite the cleaned metadata CSV without the removed title (the index is
# written as the first column of the file).
df.to_csv(path_or_buf='../data/prisoners_dilemma_articles_meta_data_clean.csv')

**Checking manual papers.**

In [5]:
# Reload the cleaned metadata CSV, using its first column as the index.
df = pd.read_csv(
    filepath_or_buffer='../data/prisoners_dilemma_articles_meta_data_clean.csv',
    index_col=0,
)

In [8]:
# Rows whose provenance is "Manual".
is_manual = df['provenance'] == 'Manual'
manuals = df.loc[is_manual]

In [15]:
# Titles of the manual entries attributed to either spelling of the author name.
author_spellings = ['Axelrod Robert', 'Robert Axelrod']
manuals.loc[manuals['author'].isin(author_spellings), 'title'].unique()

array(['evolution of cooperation without reciprocity',
       "the evolution of strategies in the iterated prisoner's dilemma",
       'launching “the evolution of cooperation”',
       'the further evolution of cooperation',
       "effective choice in the prisoner's dilemma",
       "more effective choice in the prisoner's dilemma",
       "how to cope with noise in the iterated prisoner's dilemma",
       'the emergence of cooperation among egoists'], dtype=object)

In [19]:
# Manual entries whose author is exactly 'Joshua Plotkin'.
manuals[manuals['author'].eq('Joshua Plotkin')]

Unnamed: 0,abstract,author,category,date,doi,journal,key,open_access,primary_category,provenance,score,title,unique_key,url,name_check
19080,Not given,Joshua Plotkin,Not given,2012.0,10.1073/pnas.1208087109,proceedings of the national academy of sciences,Not given,Not given,Not given,Manual,Not given,extortion and cooperation in the prisoner’s di...,b9017e30774b4a1b764c12156a7570e7,http://www.pnas.org/content/109/26/10134.short,Joshua Plotkin
