In [1]:
import os
import re
import string

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
%matplotlib inline

In [2]:
# Render every float in DataFrame/Series output with two decimal places.
pd.set_option('display.float_format', '{:.2f}'.format)

# A function to calculate and pull together the information for the challenge.
def summarycreator(df, varname, colname):
    """Summarise ``varname`` for the seven most frequent values of ``colname``.

    Parameters
    ----------
    df : pandas.DataFrame
        Table holding both columns.
    varname : str
        Name of the numeric column to summarise.
    colname : str
        Name of the column whose seven most common values define the groups.

    Returns
    -------
    pandas.DataFrame
        One row per retained ``colname`` value, with the columns
        ``meancost``, ``mediancost``, ``stdcost`` and ``count_articles``.
    """
    # value_counts() sorts by descending frequency (missing values are not
    # counted), so the first seven index entries are the most common values.
    top_values = df[colname].value_counts().index[:7]

    # Named aggregation replaces the dict-of-functions renaming form, which
    # pandas removed in 1.0 (it now raises SpecificationError).
    # 'std' is the sample standard deviation, and 'size' counts every row in
    # the group, including rows where ``varname`` is missing.
    summary = (
        df[df[colname].isin(top_values)]
        .groupby(colname)[varname]
        .agg(
            meancost='mean',
            mediancost='median',
            stdcost='std',
            count_articles='size',
        )
    )
    return summary


# Location of the Wellcome APC spend CSV.  Set the WELLCOME_APC_CSV
# environment variable to point at your own copy of the file; when it is not
# set, the original author's local path is used.
DATA_PATH = os.environ.get(
    'WELLCOME_APC_CSV',
    'C:/Users/Bethany/Documents/Home/Thinkful/Data cleaning example/WELLCOME_APCspend2013_forThinkful.csv',
)

# The file is read as ISO-8859-1 (Latin-1) rather than the default UTF-8.
masterDF = pd.read_csv(DATA_PATH, encoding='ISO-8859-1')

# Renaming the columns because their original names are hideously unworkable.
masterDF.columns = ['ID', 'Publisher', 'Journal', 'Article', 'Cost']

FileNotFoundError: File b'C:/Users/Bethany/Documents/Home/Thinkful/Data cleaning example/WELLCOME_APCspend2013_forThinkful.csv' does not exist

Let's clean up the cost variable.

In [None]:
# Helper for cost entries that may or may not carry a currency symbol.
def extractnum(item):
    """Return the first currency symbol (£, $ or €) found in ``item``.

    ``item`` is passed through ``str()`` before searching, so non-string
    values are accepted.  Returns ``None`` when no symbol is present.
    """
    found = re.search(r'[£\$€]', str(item))
    return found.group(0) if found else None


# Separating the Cost variable into a currency indicator and two float value
# variables, one to keep the raw value and another to hold the processed.
masterDF['Currency'] = masterDF['Cost'].apply(extractnum)

# str.strip() takes a plain set of characters, not a regular expression, so
# the currency symbols are listed literally (the previous regex-style argument
# also stripped brackets and backslashes by accident).  Surrounding whitespace
# is removed first so that a padded value such as ' £12.50' still parses.
# The conversion is done once and shared by both columns.
cost_as_float = masterDF['Cost'].apply(
    lambda x: float(str(x).strip().strip('£$€'))
)
masterDF['CostNum'] = cost_as_float
# Independent copy: CostRaw must keep the untouched values while CostNum is
# modified by the cleaning steps that follow.
masterDF['CostRaw'] = cost_as_float.copy()

#print(masterDF.head())
#Looking at the raw uncleaned data.  Clearly there are some things to fix.
#masterDF['CostNum'].hist()
#plt.show()
#print(masterDF.describe())


# Each mask below is computed from the current state of CostNum, so the steps
# must run in this order.

# Entries recorded as 999999 are treated as missing.
is_placeholder = masterDF['CostNum'] == 999999
masterDF.loc[is_placeholder, 'CostNum'] = None

# Values above 8000 are assumed to have lost their decimal point, so they are
# scaled down by a factor of 100.
too_large = masterDF['CostNum'] > 8000
masterDF.loc[too_large, 'CostNum'] /= 100

# A cost of 0 is treated as missing, because all open access journals charge
# something.
is_zero = masterDF['CostNum'] == 0
masterDF.loc[is_zero, 'CostNum'] = None

# Values below 80 are assumed to have a misplaced decimal point, so they are
# scaled up by a factor of 100.
too_small = masterDF['CostNum'] < 80
masterDF.loc[too_small, 'CostNum'] *= 100

# Dollar amounts are converted to pounds at the 2.13.17 conversion rate of 0.80.
# NOTE(review): rows whose Currency is '€' are not converted here - confirm
# whether that is intended.
in_dollars = masterDF['Currency'] == '$'
masterDF.loc[in_dollars, 'CostNum'] *= 0.8



#Looking again.  Much nicer.
#print(masterDF.describe())

#masterDF['CostNum'].hist()
#plt.show()

On to tidying up journal names.

In [None]:
# Creating a list of journal names in lowercase, to identify entries with the
# same words but different caps values.
# The column names are pinned explicitly ('index' for the lowercased name,
# 'count' for its frequency) because the default names produced by
# value_counts().reset_index() differ between pandas versions; on pandas 2.x
# there is no 'index' column by default and the sort below would raise
# KeyError.
journal_counts = (
    masterDF['Journal']
    .apply(lambda x: str(x).lower())
    .value_counts()
    .rename_axis('index')
    .reset_index(name='count')
    .sort_values(by='index')
)

# When the same journal is named inconsistently, we will assign it a
# 'TrueName'. However, we assume most journals are named correctly, so the
# TrueName starts out as a copy of the lowercased name.
journal_counts['TrueName'] = journal_counts['index']
uniquejournals_raw = len(journal_counts['TrueName'].unique())

# Translation table that deletes every ASCII punctuation character; visual
# inspection suggests punctuation is used inconsistently in journal names.
translator = str.maketrans('', '', string.punctuation)

# Ordered (regex, replacement) rules applied after punctuation is removed.
# Every rule is anchored on word boundaries so that it only fires on whole
# words: plain substring replacement would turn 'bmj open' into
# 'bmjournal open', 'joint' into 'jointernational' and ' 10' into ' one0'.
_JOURNAL_NAME_RULES = [
    # Expand common abbreviations.
    (r'\bj\b', 'journal'),
    (r'^am\b', 'american'),          # only at the start of the name
    (r'\bbmj\b', 'british medical journal'),
    (r'\bint\b', 'international'),
    (r'\beur\b', 'european'),
    (r'\bjnl\b', 'journal'),
    (r'(?<=\s)1\b', 'one'),          # a standalone '1' that follows a space
    # Fixing one specific thing I noticed.
    (r'\bplosone\b', 'plos one'),
    # Remove sometimes-omitted words 'of' and 'the'.
    (r'\b(?:of|the)\b', ''),
]


def _normalize_journal_name(name):
    """Return a canonical form of an already-lowercased journal name.

    Steps, in order: replace ' & ' with ' and ', delete punctuation, apply
    ``_JOURNAL_NAME_RULES``, collapse runs of whitespace to a single space and
    trim the ends.
    """
    # Begin by replacing & with 'and' (before '&' is deleted as punctuation).
    name = name.replace(' & ', ' and ')
    name = name.translate(translator)
    for pattern, replacement in _JOURNAL_NAME_RULES:
        name = re.sub(pattern, replacement, name)
    # Cleaning up extra whitespace left behind by the removals above.
    return re.sub(r'\s+', ' ', name).strip()


journal_counts['TrueName'] = journal_counts['TrueName'].apply(_normalize_journal_name)

# Lookup table from each lowercased original journal name to its cleaned name.
journal_dict = dict(zip(journal_counts['index'], journal_counts['TrueName']))

# Lowercase the Journal column the same way the lookup keys were built, then
# translate each entry through the table.
journal_lowercase = masterDF['Journal'].map(lambda name: str(name).lower())
masterDF['JournalClean'] = journal_lowercase.map(journal_dict)

# Number of distinct journal names left after cleaning.
uniquejournals_clean = journal_counts['TrueName'].nunique(dropna=False)

### Number of unique journals before and after cleaning

In [None]:
# Distinct journal names before cleaning (first line) and after (second line).
print(uniquejournals_raw, uniquejournals_clean, sep='\n')

## Raw uncleaned data

In [None]:
# Baseline: untouched cost values grouped by the original journal names.
summary_raw = summarycreator(masterDF, 'CostRaw', 'Journal')
print(summary_raw)

## Numeric variable cleaned

In [None]:
# Cleaned cost values, still grouped by the original journal names.
summary_costclean = summarycreator(masterDF, 'CostNum', 'Journal')
print(summary_costclean)

## Final version with numeric and string variables cleaned

In [None]:
# Cleaned cost values grouped by the cleaned journal names.
summary_final = summarycreator(masterDF, 'CostNum', 'JournalClean')
print(summary_final)


Note that this is not meant to be the only possible acceptable cleaning procedure!  Some variance around these values is to be expected.  