In [1]:
import pandas as pd
import re

In [2]:
# Pandas would not load file with defaults, needed to specifiy encoding
trust = pd.read_csv("WELLCOME_APCspend2013_forThinkful.csv", encoding='latin1')

trust.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2127 entries, 0 to 2126
Data columns (total 5 columns):
PMID/PMCID                                             1928 non-null object
Publisher                                              2127 non-null object
Journal title                                          2126 non-null object
Article title                                          2127 non-null object
COST (£) charged to Wellcome (inc VAT when charged)    2127 non-null object
dtypes: object(5)
memory usage: 83.2+ KB


In [3]:
trust.sample(n=10)

Unnamed: 0,PMID/PMCID,Publisher,Journal title,Article title,COST (£) charged to Wellcome (inc VAT when charged)
2037,3654571,Wiley Subscription Services Inc.,Dev. World Bioeth,Engaging communities to strengthen research et...,£1834.77
2091,3764630,Wiley-Blackwell,Journal of Physiology,Burst activity and ultrafast activation kineti...,£2023.54
61,23239883 PMC3561570,American Society for Biochemistry and Molecula...,Journal of Biological Chemistry,Molecular architecture and functional analysis...,£2259.64
1373,PMCID: PMC3715432,Public Library of Science,PLoS Genetics,FGF Signalling Regulates Chromatin Organisatio...,£1456.19
2057,PMC3582024,Wiley-Blackwell,Annals of Neurology,Ischaemic stroke is associated with the ABO lo...,£2276.48
1737,"Epub ahead of print pub Feb 2013, print in press",Springer,Cellular and Molecular Life Sciences,Developmental mechanisms directing early anter...,£2226.74
1865,22962270 PMC3491872,The Sheridan Press,Cancer Research,T cell trafficking facilitated by high endothe...,£799.44
749,PMC2779337,Elsevier,Nitric Oxide,What is the real physiological NO concentratio...,£2220.99
2009,PMC3853540,Wiley,Brt Jnl Clinical Pharmacology,The association between Parkinson's disease an...,£2009.65
1256,23981980 PMC3809720,Oxford University Press,Oxford University Press,Physical activity levels across adult life and...,£999999.00


In [4]:
counts = trust['Journal title'].value_counts()
counts.head(n=20)

PLoS One                                           92
PLoS ONE                                           62
Journal of Biological Chemistry                    48
Nucleic Acids Research                             21
Proceedings of the National Academy of Sciences    19
PLoS Neglected Tropical Diseases                   18
Human Molecular Genetics                           18
Nature Communications                              17
Neuroimage                                         15
PLoS Genetics                                      15
PLoS Pathogens                                     15
PLOS ONE                                           14
Brain                                              14
NeuroImage                                         14
BMC Public Health                                  14
Movement Disorders                                 13
Biochemical Journal                                12
Journal of Neuroscience                            12
Developmental Cell          

In [5]:
trust['Journal title'] = trust['Journal title'].astype(str)
journals = list(trust['Journal title'].unique())
journals.sort()
# len(journals)
len(journals)

985

In [6]:
# Create a dict to hold the Journal title and regex for matching and correcting
journal_titles = {'Plus one':r'(PLo[Ss]ONE|(PLoS.1)|(PL[oO]S).O[Nn][eE])', 
'Journal of Biologial Chemistry':r'.*(B[Ii][Oo][Ll])\w*.(C[Hh][Ee][Mm]).*', 
'Nucleic Acids Research':r'(N[Uu][Cc][Ll][Ee][Ii][Cc]).(A[Cc][Ii][Dd]|[Ss]).*(R[Ee][Ss][Ee][Aa][Rr][Cc][Hh]).*', 
'Proceedings of the National Academy of Sciences':r'(PNAS).*|(Proc).*(the [Nn]ational Academy of Sciences).*|(Proc).*(U\sS\sA.)', 
'PLoS Neglected Tropical Diseases':r'(P[Ll][Oo]S).(Neglected).*|(P[Ll][Oo]S).(Negected).*', 
'Human Molecular Genetics':r'(Human)\s*([Mm]ol).*', 
'Nature Communications':r'(N[Aa][Tt][Uu][Rr][Ee]).([Cc]ommunications)|(N[Aa][Tt][Uu][Rr][Ee]).([Cc]OMMUNICATIONS)', 
'Plos Pathogens':r'(P[Ll][Oo][Ss]).(Pathogens)', 
'Plos Genetics':r'(P[Ll][Oo][Ss]).(Genetics)|(P[Ll][Oo][Ss]).(GENETICS)',
'Neuro Image':r'(Neuro).(mage)'}

In [7]:
for item in journal_titles.items():
    for i in range(0,2126):
        if re.match(item[1],str(trust.loc[i,'Journal title'])):
            trust.loc[i,'Journal title'] = item[0]

In [8]:
counts = trust['Journal title'].value_counts()
counts.head(n=20)

Plus one                                           194
Journal of Biologial Chemistry                      71
Proceedings of the National Academy of Sciences     37
Neuro Image                                         36
Nucleic Acids Research                              29
Plos Genetics                                       24
Plos Pathogens                                      24
PLoS Neglected Tropical Diseases                    21
Human Molecular Genetics                            21
Nature Communications                               19
Brain                                               14
BMC Public Health                                   14
Movement Disorders                                  13
Developmental Cell                                  12
Journal of Neuroscience                             12
Biochemical Journal                                 12
Journal of General Virology                         11
Current Biology                                     10
BMJ       

In [9]:
# Top Journals with article counts
top_journals = counts.head(n=5)
top_journals

Plus one                                           194
Journal of Biologial Chemistry                      71
Proceedings of the National Academy of Sciences     37
Neuro Image                                         36
Nucleic Acids Research                              29
Name: Journal title, dtype: int64

In [10]:
top_five = [index[0] for index in top_journals.items()]
top_five

['Plus one',
 'Journal of Biologial Chemistry',
 'Proceedings of the National Academy of Sciences',
 'Neuro Image',
 'Nucleic Acids Research']

In [31]:
for e in top_five:
    e_df = trust[trust["Journal title"] == e]

    for i in range(0,len(e_df)):
        x = e_df['COST (£) charged to Wellcome (inc VAT when charged)'].iloc[i]
        e_df['COST (£) charged to Wellcome (inc VAT when charged)'].iloc[i] = float(x[1:])

    print(f"""
Publication: {e}: 
    articles: {len(e_df)}
    cost mean: {e_df['COST (£) charged to Wellcome (inc VAT when charged)'].mean():0.2f}
    cost median: {e_df['COST (£) charged to Wellcome (inc VAT when charged)'].median():0.2f}
    cost standard deviation: {e_df['COST (£) charged to Wellcome (inc VAT when charged)'].std():0.2f}
""")


Publication: Plus one: 
    articles: 194
    cost mean: 43112.52
    cost median: 897.40
    cost standard deviation: 199438.00


Publication: Journal of Biologial Chemistry: 
    articles: 71
    cost mean: 29513.70
    cost median: 1314.53
    cost standard deviation: 166402.84


Publication: Proceedings of the National Academy of Sciences: 
    articles: 37
    cost mean: 27779.03
    cost median: 732.25
    cost standard deviation: 164272.16


Publication: Neuro Image: 
    articles: 36
    cost mean: 2057.32
    cost median: 2289.24
    cost standard deviation: 466.87


Publication: Nucleic Acids Research: 
    articles: 29
    cost mean: 1162.34
    cost median: 852.00
    cost standard deviation: 442.15

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
