In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
# Read the Wellcome article-cost CSV into a DataFrame and preview the first five rows.
df = pd.read_csv(filepath_or_buffer='wellcome_thinkful_u.csv')
df.head(n=5)

Unnamed: 0,PMID/PMCID,Publisher,Journal title,Article title,COST (£) charged to Wellcome (inc VAT when charged)
0,,CUP,Psychological Medicine,Reduced parahippocampal cortical thickness in ...,£0.00
1,PMC3679557,ACS,Biomacromolecules,Structural characterization of a Model Gram-ne...,£2381.04
2,23043264 PMC3506128,ACS,J Med Chem,"Fumaroylamino-4,5-epoxymorphinans and related ...",£642.56
3,23438330 PMC3646402,ACS,J Med Chem,Orvinols with mixed kappa/mu opioid receptor a...,£669.64
4,23438216 PMC3601604,ACS,J Org Chem,Regioselective opening of myo-inositol orthoes...,£685.88


In [6]:
# Inspect column dtypes and non-null counts of the freshly loaded frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2127 entries, 0 to 2126
Data columns (total 5 columns):
PMID/PMCID                                             1928 non-null object
Publisher                                              2127 non-null object
Journal title                                          2126 non-null object
Article title                                          2127 non-null object
COST (£) charged to Wellcome (inc VAT when charged)    2127 non-null object
dtypes: object(5)
memory usage: 83.2+ KB


In [7]:
# Give every column a short snake_case name so later cells are easier to write.
# Note: 'title' holds the journal title and 'art_title' the article title.
df = df.rename(
    columns={
        'PMID/PMCID': 'pmid_pmcid',
        'Publisher': 'publisher',
        'Journal title': 'title',
        'Article title': 'art_title',
        'COST (£) charged to Wellcome (inc VAT when charged)': 'cost',
    }
)
df.head()

Unnamed: 0,pmid_pmcid,publisher,title,art_title,cost
0,,CUP,Psychological Medicine,Reduced parahippocampal cortical thickness in ...,£0.00
1,PMC3679557,ACS,Biomacromolecules,Structural characterization of a Model Gram-ne...,£2381.04
2,23043264 PMC3506128,ACS,J Med Chem,"Fumaroylamino-4,5-epoxymorphinans and related ...",£642.56
3,23438330 PMC3646402,ACS,J Med Chem,Orvinols with mixed kappa/mu opioid receptor a...,£669.64
4,23438216 PMC3601604,ACS,J Org Chem,Regioselective opening of myo-inositol orthoes...,£685.88


In [11]:
# Strip the currency symbols so the cost column can be parsed as a float.
# regex=False forces both replacements to be literal: "$" is a regex
# end-of-string anchor, and the default value of `regex` in Series.str.replace
# differs between pandas versions, so relying on the default is fragile.
# NOTE(review): any "$"-denominated value is parsed at face value alongside the
# "£" ones (no currency conversion is applied) -- confirm this is acceptable.
df["cost"] = (
    df["cost"]
    .str.replace("£", "", regex=False)
    .str.replace("$", "", regex=False)
    .astype(float)
)
df.head()


Unnamed: 0,pmid_pmcid,publisher,title,art_title,cost
0,,CUP,Psychological Medicine,Reduced parahippocampal cortical thickness in ...,0.0
1,PMC3679557,ACS,Biomacromolecules,Structural characterization of a Model Gram-ne...,2381.04
2,23043264 PMC3506128,ACS,J Med Chem,"Fumaroylamino-4,5-epoxymorphinans and related ...",642.56
3,23438330 PMC3646402,ACS,J Med Chem,Orvinols with mixed kappa/mu opioid receptor a...,669.64
4,23438216 PMC3601604,ACS,J Org Chem,Regioselective opening of myo-inositol orthoes...,685.88


In [12]:
# Re-check dtypes after the cleaning step: cost should now be float64.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2127 entries, 0 to 2126
Data columns (total 5 columns):
pmid_pmcid    1928 non-null object
publisher     2127 non-null object
title         2126 non-null object
art_title     2127 non-null object
cost          2127 non-null float64
dtypes: float64(1), object(4)
memory usage: 83.2+ KB


In [14]:
# Discard rows that exactly duplicate an earlier row, then re-inspect the frame.
df = df.drop_duplicates()
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2127 entries, 0 to 2126
Data columns (total 5 columns):
pmid_pmcid    1928 non-null object
publisher     2127 non-null object
title         2126 non-null object
art_title     2127 non-null object
cost          2127 non-null float64
dtypes: float64(1), object(4)
memory usage: 99.7+ KB


In [16]:
# Print the frame as it would look with every row containing a null removed.
# NOTE: dropna() returns a new DataFrame and its result is not assigned here,
# so df itself is left unchanged by this cell. Assign the result
# (df = df.dropna()) if the rows are really meant to be dropped.
print(df.dropna())


                            pmid_pmcid  \
1                           PMC3679557   
2                23043264  PMC3506128    
3                  23438330 PMC3646402   
4                 23438216 PMC3601604    
5                           PMC3579457   
6                           PMC3709265   
7                 23057412 PMC3495574    
8                    PMCID: PMC3780468   
9                    PMCID: PMC3621575   
10                   PMCID: PMC3739413   
11                   PMCID: PMC3530961   
12                   PMCID: PMC3624797   
13                          PMC3413243   
14                          PMC3694353   
15                          PMC3572711   
16                            22610094   
17                   PMCID: PMC3586974   
18        23455506  PMCID: PMC3607399    
19          PMID: 24015914 PMC3833349    
20                       : PMC3805332    
22            PMCID:\n    PMC3656742\n   
23                      PMCID: 3584654   
24                 22971149 PMC346

In [22]:
# Normalise the text columns (positions 1-3: publisher, title, art_title):
# trim surrounding whitespace and apply str.capitalize (first character
# upper-case, the rest lower-case) so entries that differ only by padding or
# letter case become identical.
for text_col in df.columns[1:4]:
    print(text_col)
    df[text_col] = df[text_col].str.strip().str.capitalize()

publisher
title
art_title


In [23]:
# Spot-check the first 30 rows after the whitespace/case normalisation.
df.head(30)

Unnamed: 0,pmid_pmcid,publisher,title,art_title,cost
0,,Cup,Psychological medicine,Reduced parahippocampal cortical thickness in ...,0.0
1,PMC3679557,Acs,Biomacromolecules,Structural characterization of a model gram-ne...,2381.04
2,23043264 PMC3506128,Acs,J med chem,"Fumaroylamino-4,5-epoxymorphinans and related ...",642.56
3,23438330 PMC3646402,Acs,J med chem,Orvinols with mixed kappa/mu opioid receptor a...,669.64
4,23438216 PMC3601604,Acs,J org chem,Regioselective opening of myo-inositol orthoes...,685.88
5,PMC3579457,Acs,Journal of medicinal chemistry,Comparative structural and functional studies ...,2392.2
6,PMC3709265,Acs,Journal of proteome research,Mapping proteolytic processing in the secretom...,2367.95
7,23057412 PMC3495574,Acs,Mol pharm,Quantitative silencing of egfp reporter gene b...,649.33
8,PMCID: PMC3780468,Acs (amercian chemical society) publications,Acs chemical biology,A novel allosteric inhibitor of the uridine di...,1294.59
9,PMCID: PMC3621575,Acs (amercian chemical society) publications,Acs chemical biology,Chemical proteomic analysis reveals the drugab...,1294.78


In [35]:
# Central tendency and spread of the cost column.
# The recorded output shows a mean far above the median, i.e. the
# distribution is dominated by a few very large values.
mean, median, std_dev = df.cost.mean(), df.cost.median(), df.cost.std()
print("Cost mean, median and std_dev", "{} {} {}".format(mean, median, std_dev), sep="\n")

Cost mean, median and std_dev
24067.33997179131 1884.01 146860.6655590476


In [42]:
# Count articles per journal title (most frequent first) and show the top five.
pub_counts = df['title'].value_counts()
pub_counts.head(5)

Plos one                           190
Journal of biological chemistry     53
Neuroimage                          29
Nucleic acids research              26
Plos pathogens                      24
Name: title, dtype: int64

In [43]:
# List every distinct journal title to look for near-duplicate spellings
# that would split one journal's count across several labels.
print(df['title'].unique())

['Psychological medicine' 'Biomacromolecules' 'J med chem' 'J org chem'
 'Journal of medicinal chemistry' 'Journal of proteome research'
 'Mol pharm' 'Acs chemical biology'
 'Journal of chemical information and modeling' 'Biochemistry'
 'Gastroenterology' 'Journal of biological chemistry'
 'Journal of immunology' 'Acs chemical neuroscience' 'Acs nano'
 'American chemical society' 'Analytical chemistry'
 'Bioconjugate chemistry' 'Journal of the american chemical society'
 'Chest' 'Journal of neurophysiology' 'Journal of physiology'
 'The journal of neurophysiology' 'American journal of psychiatry'
 'Americal journal of psychiatry' 'Behavioral neuroscience' 'Emotion'
 'Health psychology' 'Journal of abnormal psychology'
 'Journal of consulting and clinical psychology'
 'Journal of experimental psychology:  animal behaviour process'
 'Journal of experimental psychology: human perception and performance'
 'Journal of family psychology' 'Psychological assessment'
 'Psychological review'
 'P