In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [2]:
import os
os.path.isfile('/home/kaneeshadawood/Challenge.csv')

True

In [3]:
# Loads data from the file
data = pd.read_csv('/home/kaneeshadawood/Challenge.csv', encoding='latin-1')
data.head() 

Unnamed: 0,PMID/PMCID,Publisher,Journal title,Article title,COST (£) charged to Wellcome (inc VAT when charged)
0,,CUP,Psychological Medicine,Reduced parahippocampal cortical thickness in ...,£0.00
1,PMC3679557,ACS,Biomacromolecules,Structural characterization of a Model Gram-ne...,£2381.04
2,23043264 PMC3506128,ACS,J Med Chem,"Fumaroylamino-4,5-epoxymorphinans and related ...",£642.56
3,23438330 PMC3646402,ACS,J Med Chem,Orvinols with mixed kappa/mu opioid receptor a...,£669.64
4,23438216 PMC3601604,ACS,J Org Chem,Regioselective opening of myo-inositol orthoes...,£685.88


In [4]:
data.info() 

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2127 entries, 0 to 2126
Data columns (total 5 columns):
PMID/PMCID                                             1928 non-null object
Publisher                                              2127 non-null object
Journal title                                          2126 non-null object
Article title                                          2127 non-null object
COST (£) charged to Wellcome (inc VAT when charged)    2127 non-null object
dtypes: object(5)
memory usage: 83.2+ KB


In [5]:
data.dtypes

PMID/PMCID                                             object
Publisher                                              object
Journal title                                          object
Article title                                          object
COST (£) charged to Wellcome (inc VAT when charged)    object
dtype: object

# Data Cleaning

In [6]:
#1 Renaming columns
data.rename(columns={'COST (£) charged to Wellcome (inc VAT when charged)':'Cost', 'PMID/PMCID':'Index'}, inplace=True)

In [7]:
#2 Removing duplicates.
data.drop_duplicates(keep = False, inplace = True) 

In [8]:
#3 Remove missing values
data.dropna(subset=['Index','Publisher','Journal title','Article title'])
data['Index'] = pd.to_numeric(data['Index'], errors='coerce') 

In [10]:
#4 Removing symbols in Cost
data['Cost'] = data['Cost'].astype(str).str.replace('£', '')
data['Cost'] = data['Cost'].astype(str).str.replace('$', '') 

In [23]:
#5 Converting to Object datatype to Float 
data.Cost = data.Cost.map(lambda x: float(x))

In [24]:
#6 Replace NaN values with 0
data['Index'].fillna(0, inplace=True)

In [25]:
data.head() 

Unnamed: 0,Index,Publisher,Journal title,Article title,Cost
0,0.0,CUP,Psychological Medicine,Reduced parahippocampal cortical thickness in ...,0.0
1,0.0,ACS,Biomacromolecules,Structural characterization of a Model Gram-ne...,2381.04
2,0.0,ACS,J Med Chem,"Fumaroylamino-4,5-epoxymorphinans and related ...",642.56
3,0.0,ACS,J Med Chem,Orvinols with mixed kappa/mu opioid receptor a...,669.64
4,0.0,ACS,J Org Chem,Regioselective opening of myo-inositol orthoes...,685.88


In [26]:
data.info() 

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2127 entries, 0 to 2126
Data columns (total 5 columns):
Index            2127 non-null float64
Publisher        2127 non-null object
Journal title    2126 non-null object
Article title    2127 non-null object
Cost             2127 non-null float64
dtypes: float64(2), object(3)
memory usage: 99.7+ KB


# Exploration

# 1. Five most common journals and The total articles for each

In [17]:
journals = data.groupby('Journal title')[['Article title']].count().sort_values(by=['Article title'],ascending=False)
journals.columns = ['Article Count']
journals.head(5)

Unnamed: 0_level_0,Article Count
Journal title,Unnamed: 1_level_1
PLoS One,92
PLoS ONE,62
Journal of Biological Chemistry,48
Nucleic Acids Research,21
Proceedings of the National Academy of Sciences,19


# 2. Mean, Median, & Standard Deviation of Open-Access Cost Per Article for Each Journal

In [28]:
data['Cost'].describe() 

count      2127.000000
mean      24067.339972
std      146860.665559
min           0.000000
25%        1280.000000
50%        1884.010000
75%        2321.305000
max      999999.000000
Name: Cost, dtype: float64