In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

df = pd.read_csv('CleaningChallenge.csv')
df.head()

Unnamed: 0,PMID/PMCID,Publisher,Journal title,Article title,COST (£) charged to Wellcome (inc VAT when charged)
0,,CUP,Psychological Medicine,Reduced parahippocampal cortical thickness in ...,£0.00
1,PMC3679557,ACS,Biomacromolecules,Structural characterization of a Model Gram-ne...,£2381.04
2,23043264 PMC3506128,ACS,J Med Chem,"Fumaroylamino-4,5-epoxymorphinans and related ...",£642.56
3,23438330 PMC3646402,ACS,J Med Chem,Orvinols with mixed kappa/mu opioid receptor a...,£669.64
4,23438216 PMC3601604,ACS,J Org Chem,Regioselective opening of myo-inositol orthoes...,£685.88


In [2]:
grouped_by_journals = df.groupby(['Journal title']).count()
grouped_by_journals

Unnamed: 0_level_0,PMID/PMCID,Publisher,Article title,COST (£) charged to Wellcome (inc VAT when charged)
Journal title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
ACS Chemical Biology,4,5,5,5
ACS Chemical Neuroscience,1,1,1,1
ACS NANO,1,1,1,1
ACS Nano,1,1,1,1
ACTA F,1,1,1,1
AGE,1,1,1,1
AIDS,3,3,3,3
AIDS Behav,1,1,1,1
AIDS Care,2,2,2,2
AIDS Journal,1,1,1,1


In [3]:
grouped_by_journals.sort_values('COST (£) charged to Wellcome (inc VAT when charged)', ascending = False)

Unnamed: 0_level_0,PMID/PMCID,Publisher,Article title,COST (£) charged to Wellcome (inc VAT when charged)
Journal title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
PLoS One,91,92,92,92
PLoS ONE,62,62,62,62
Journal of Biological Chemistry,47,48,48,48
Nucleic Acids Research,20,21,21,21
Proceedings of the National Academy of Sciences,19,19,19,19
PLoS Neglected Tropical Diseases,18,18,18,18
Human Molecular Genetics,16,18,18,18
Nature Communications,17,17,17,17
PLoS Pathogens,15,15,15,15
PLoS Genetics,14,15,15,15


In [4]:
#This modifies the titles so that different ways of typing the journal titles will be counted as the same
def fix_titles(x):
    return str(x).upper().replace(',' , '').replace(':' , '').replace('SECTION' , '').replace('ICA ', 'Y ').replace('&', '').replace('AND','').replace('ONLINE', '').replace('1' , 'ONE').replace(' ','')
df['JOURNAL'] = df['Journal title'].apply(fix_titles)
df['count'] = 1
grouped_by_journals = pd.DataFrame(df.groupby(['JOURNAL'])['count'].sum())
grouped_by_journals.sort_values('count', ascending = False)

Unnamed: 0_level_0,count
JOURNAL,Unnamed: 1_level_1
PLOSONE,207
JOURNALOFBIOLOGICALCHEMISTRY,53
NEUROIMAGE,29
NUCLEICACIDSRESEARCH,25
PLOSPATHOGENS,24
PLOSGENETICS,24
PROCEEDINGSOFTHENATIONALACADEMYOFSCIENCES,22
HUMANMOLECULARGENETICS,20
PLOSNEGLECTEDTROPICALDISEASES,20
NATURECOMMUNICATIONS,19


In [11]:
#Makes the cost column usable as a number, allowing for the costs which have been entered in USD to be handled as such
def get_cost(x):
    if(str(x)[:1] == '£'):
        return float(x.replace('£', ''))
    else:
        return float(x.replace('$', ''))/.79
df['cost'] = df['COST (£) charged to Wellcome (inc VAT when charged)'].apply(get_cost)
#gets rid of all values that are more than four standard deviations away from the mean
def remove_outliers(x, mean, st_dev):
    if(x > mean + 4*st_dev):
        return mean
    elif(x < mean - 4*st_dev):
         return mean
    else:
        return x
#Cycles through all unique journals and removes outliers
for journal in df['JOURNAL'].unique():
    costs = df['cost'].loc[df['JOURNAL'] == journal]
    for x in range(4):
        st_dev = np.std(costs)
        mean = np.mean(costs)
        costs = costs.apply(remove_outliers, args = (mean, st_dev))
    print(journal + ", mean: " + str(np.mean(costs)) + ', standard deviation: ' + str(np.std(costs)) + ', median: ' + str(np.median(costs)))

PSYCHOLOGICALMEDICINE, mean: 1580.4, standard deviation: 795.3800600970583, median: 2034.0
BIOMACROMOLECULES, mean: 2381.04, standard deviation: 0.0, median: 2381.04
JMEDCHEM, mean: 656.0999999999999, standard deviation: 13.54000000000002, median: 656.0999999999999
JORGCHEM, mean: 685.88, standard deviation: 0.0, median: 685.88
JOURNALOFMEDICINALCHEMISTRY, mean: 1093.6142857142856, standard deviation: 629.1045915961406, median: 1006.72
JOURNALOFPROTEOMERESEARCH, mean: 1844.915, standard deviation: 523.0349999999999, median: 1844.915
MOLPHARM, mean: 649.33, standard deviation: 0.0, median: 649.33
ACSCHEMICALBIOLOGY, mean: 1418.1860000000001, standard deviation: 453.75146477339337, median: 1294.59
JOURNALOFCHEMICALINFORMATIONMODELING, mean: 1329.69, standard deviation: 0.0, median: 1329.69
BIOCHEMISTRY, mean: 665.64, standard deviation: 0.0, median: 665.64
GASTROENTEROLOGY, mean: 1233.04, standard deviation: 994.96, median: 1233.04
JOURNALOFBIOLOGICALCHEMISTRY, mean: 1430.2958469071782, 

MEDICALHUMANITIES, mean: 2340.0, standard deviation: 0.0, median: 2340.0
THORAX, mean: 2040.0, standard deviation: 0.0, median: 2040.0
SEXTRANSMINFECT., mean: 1700.0, standard deviation: 0.0, median: 1700.0
POSTGRADUATEMEDICALJOURNAL, mean: 2040.0, standard deviation: 0.0, median: 2040.0
VETERINARYRECORD, mean: 2040.0, standard deviation: 0.0, median: 2040.0
THORAXANINTERNATIONALJOURNALFORRESPIRATORYMEDICINE, mean: 2340.0, standard deviation: 0.0, median: 2340.0
EUROPEANJOURNALOFHEALTHLAW, mean: 779.72, standard deviation: 0.0, median: 779.72
JOURNALOFNEUROSCIENCE, mean: 1844.8646153846153, standard deviation: 594.2118532543161, median: 2029.47
HEARTCIRCULATORYPHYSIOLOGY, mean: 1349.18, standard deviation: 0.0, median: 1349.18
ANTIMICOBIALAGENTSCHEMOTHERAPY, mean: 1831.18, standard deviation: 0.0, median: 1831.18
JCLINMICROBIOL, mean: 1250.66, standard deviation: 0.0, median: 1250.66
INTERNATIONALPSYCHOGERIATRICS, mean: 2034.0, standard deviation: 0.0, median: 2034.0
PARASITOLOGY, mean

JOURNALOFHOSPITALINFECTIONS, mean: 2293.37, standard deviation: 0.0, median: 2293.37
JOURNALOFINFECTION, mean: 2286.09, standard deviation: 0.0, median: 2286.09
JOURNALOFMECHANISMSOFAGEINGDEVELOPMENT, mean: 2696.34, standard deviation: 0.0, median: 2696.34
JOURNALOFMOLECULARBIOLOGY, mean: 2370.0099999999998, standard deviation: 108.06841189727933, median: 2413.915
JOURNALOFNEUROSCIENCEMETHODS, mean: 1931.32, standard deviation: 488.8399999999999, median: 1931.32
JOURNALOFNUTRITIONEDUCATIONBEHAVIOUR, mean: 2321.23, standard deviation: 0.0, median: 2321.23
JOURNALOFPAEDIATRICUROLOGY, mean: 999999.0, standard deviation: 0.0, median: 999999.0
JOURNALOFPROTEOMICS, mean: 2725.885, standard deviation: 254.47500000000014, median: 2725.885
JOURNALOFSTRUCTURALBIOLOGY, mean: 2031.54, standard deviation: 159.0236079957941, median: 1939.95
JOURNALOFTHEAMERICANACADEMYOFCHILDADOLESCENTPSYCHIATRY, mean: 2409.8, standard deviation: 0.0, median: 2409.8
JOURNALOFTHEAMERICANCOLLEGEOFCARDIOLOGY, mean: 1528

THERANOSTICS, mean: 543.29, standard deviation: 0.0, median: 543.29
JMEDINTERNETRESEARCH, mean: 1303.21, standard deviation: 0.0, median: 1303.21
AMERICANJOURNALOFMEDICALGENETICS, mean: 2307.01, standard deviation: 0.0, median: 2307.01
GENETICEPIDEMIOLOGY, mean: 2335.16, standard deviation: 35.99885831522997, median: 2345.75
THEJOURNALOFPATHOLOGY, mean: 2248.94, standard deviation: 0.0, median: 2248.94
EUROPEANJOURNALOFIMMUNOLOGY, mean: 2134.93375, standard deviation: 261.86973015879005, median: 2262.825
JOURNALOFDEPRESSIONANXIETY, mean: 2321.38, standard deviation: 0.0, median: 2321.38
JOURNALOFPATHOLOGY, mean: 1891.2866666666669, standard deviation: 317.0458448798148, median: 1897.33
CLINICALEXPERIMENTALALLERGY, mean: 2329.11, standard deviation: 0.0, median: 2329.11
EMBOMOLECULARMEDICINE, mean: 2733.146666666667, standard deviation: 350.6399396658756, median: 2868.08
BIOPOLYMERS, mean: 1981.745, standard deviation: 0.024999999999977263, median: 1981.745
DEVELOPMENTSCIENCE, mean: 235

ASNNEURO, mean: 1453.42, standard deviation: 0.0, median: 1453.42
BIOCHEMICALJOURNAL, mean: 1920.8333333333333, standard deviation: 143.55360980793512, median: 1800.0
BIOCHEMICALJOURNALS, mean: 1800.0, standard deviation: 0.0, median: 1800.0
BIOCHEMICALSOCIETYTRANSACTIONS, mean: 1600.0, standard deviation: 509.9019513592785, median: 1800.0
ESSAYSINBIOCHEMISTRY, mean: 270.0, standard deviation: 0.0, median: 270.0
THEBIOCHEMICALJOURNAL, mean: 1800.0, standard deviation: 0.0, median: 1800.0
BIOCHEMSOCTRANS, mean: 1800.0, standard deviation: 0.0, median: 1800.0
BIOSCIENCEREPORTS, mean: 834.0, standard deviation: 0.0, median: 834.0
PROCEEDINGSOFTHENATIONALACADEMYOFSCIENCES(PNAS), mean: 831.09, standard deviation: 0.0, median: 831.09
BIOLOGY, mean: 1836.91, standard deviation: 0.0, median: 1836.91
COMPUTATIONALBIOLOGY, mean: 1679.71, standard deviation: 0.0, median: 1679.71
NEGLECTEDTROPICALDISEASE, mean: 3600.0, standard deviation: 0.0, median: 3600.0
PLOS, mean: 1037.25, standard deviation

ETHNHEALTH, mean: 1987.56, standard deviation: 0.0, median: 1987.56
AMERICANJOURNALOFBIOETHICS--NEUROSCIENCE, mean: 2465.77, standard deviation: 0.0, median: 2465.77
SOUTHASIANHISTORYCULTURE, mean: 2145.6, standard deviation: 0.0, median: 2145.6
THEHISTORYOFTHEFAMILY, mean: 2316.79, standard deviation: 0.0, median: 2316.79
JOURNALOFEXPERIMENTALNANOSCIENCE, mean: 2474.16, standard deviation: 0.0, median: 2474.16
JMEDCHEM., mean: 1178.72, standard deviation: 336.3663898935207, median: 1276.8600000000001
JOURNALOFAPPLIEDPHYSIOLOGY, mean: 1288.0, standard deviation: 0.0, median: 1288.0
PEDIATRICS, mean: 2645.85, standard deviation: 0.0, median: 2645.85
BIOLOPEN, mean: 1014.74, standard deviation: 0.0, median: 1014.74
JOURNALOFCLINICALENDOCRINOLOGY, mean: 1970.57, standard deviation: 0.0, median: 1970.57
JOURNALODCLINICALENDOCRINOLOGY, mean: 3602.41, standard deviation: 0.0, median: 3602.41
THEJOURNALOFVISUALIZEDEXPERIMENTS, mean: 1424.71, standard deviation: 0.0, median: 1424.71
BJP, mean: