Data Source: https://www.dropbox.com/s/pl5kcrhs2lyj90m/WELLCOME.zip?dl=0

<ol><li>Determine the five most common journals and the total number of articles for each.</li><li>Calculate the mean, median, and standard deviation of the open-access cost per article for each journal.</li><li>Identify the open-access prices paid by subject area.</li></ol>

In [2]:
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
from collections import Counter
import re

In [3]:
# Matches (from the start) a string that ends in one or two whitespace
# characters; `.` does not cross newlines, so multi-line text may not match.
endspaces_find = re.compile(r'.*?(\s{1,2})$')
# A single whitespace character sitting at the end of the string.
endspaces_remove = re.compile(r'\s$')
# Any character that is not an ASCII letter, digit, underscore or whitespace.
symbols_remove = re.compile(r'[^a-zA-Z0-9_\s]')
# A literal ampersand.
ampar = re.compile(r'[\&]')

In [4]:
def words(text):
    """Return every run of word characters in ``text``, lower-cased, in order."""
    lowered = text.lower()
    return re.findall(r'\w+', lowered)


def loadchecker():
    """Rebuild the global ``WORDS`` frequency table from the dictionary files.

    Reads ``../dic/big.txt`` (the main corpus) and ``../dic/newwords.txt``
    (words saved by the user), tokenises both with ``words`` and stores the
    combined occurrence counts in the module-level ``WORDS`` Counter.

    Raises:
        OSError: if either dictionary file cannot be opened.
    """
    global WORDS

    # Context managers make sure each file handle is closed as soon as its
    # contents have been read, instead of leaking until garbage collection.
    with open('../dic/big.txt') as corpus:
        biglist = words(corpus.read())

    with open('../dic/newwords.txt') as extra:
        newwordslist = words(extra.read())

    # Count how many times each word occurs across both sources.
    WORDS = Counter(biglist + newwordslist)

# Build the global WORDS counter once up front so the cells below can use it.
loadchecker()

In [5]:
def P(word, N=None):
    """Return the relative frequency of ``word`` in the global ``WORDS`` counter.

    Args:
        word: the word to look up.
        N: total number of word occurrences to divide by.  When omitted it is
            computed from the current ``WORDS`` on every call, so the result
            stays correct after ``WORDS`` is rebuilt (a default evaluated in
            the signature would be frozen at definition time).

    Returns:
        ``WORDS[word] / N``, or ``0.0`` when the total is zero.
    """
    if N is None:
        N = sum(WORDS.values())

    # An empty corpus has no meaningful frequencies; avoid dividing by zero.
    if not N:
        return 0.0

    return WORDS[word] / N

In [6]:
def correction(word):
    """Return the most probable spelling of ``word`` plus the alternatives.

    If ``word`` is not in ``WORDS`` the user is asked whether to save it.  On
    ``'y'`` the word is appended 1000 times to ``../dic/newwords.txt``, the
    dictionary is reloaded and the word is returned unchanged.

    Args:
        word: a single word to check.

    Returns:
        ``(word, 1)`` when the word was just saved as new; otherwise
        ``(best, choices)`` where ``best`` is the candidate with the highest
        ``P`` and ``choices`` maps every candidate to its ``P`` value.
    """
    if not known([word]):
        ans = input('Save new word? {} \n y/n'.format(word))
        if ans == 'y':
            # The word is written 1000 times, so it gets a count of 1000 the
            # next time loadchecker() tallies this file.
            # The with-block closes the file; no explicit close() is needed.
            with open("../dic/newwords.txt", "a") as addword:
                addword.write(1000 * (word + ' '))
            loadchecker()
            return word, 1

    # Candidates are restricted to known words (or the word itself when
    # nothing is found); pick the one with the highest probability.
    wordlist = candidates(word)
    finalcorrection = max(wordlist, key=P)

    possiblechoices = {w: P(w) for w in wordlist}

    return finalcorrection, possiblechoices

def candidates(word):
    """Return the closest non-empty group of spelling candidates for ``word``.

    Groups are tried in order and the first non-empty one wins:
      1. the word itself, if it is in WORDS (already correct);
      2. known words one edit away (single typo);
      3. known words two edits away (two typos);
      4. ``[word]`` as a last resort when nothing known was found.
    """
    exact = known([word])
    if exact:
        return exact

    one_edit = known(edits1(word))
    if one_edit:
        return one_edit

    two_edits = known(edits2(word))
    if two_edits:
        return two_edits

    return [word]

In [7]:
def known(words):
    """Return, as a set, the entries of ``words`` that are present in WORDS."""
    recognised = set()
    for entry in words:
        if entry in WORDS:
            recognised.add(entry)
    return recognised

def edits1(word):
    """Return the set of all strings exactly one simple edit away from ``word``.

    For every split position the word is cut into a head and a tail, and four
    kinds of edit are generated around the cut: inserting a letter, deleting
    the first tail character, replacing it with each letter, and swapping the
    first two tail characters.  The result mixes real and non-real words.
    """
    alphabet = 'abcdefghijklmnopqrstuvwxyz'
    variants = set()

    for cut in range(len(word) + 1):
        head, tail = word[:cut], word[cut:]

        # Insert: possible at every position, including the very end.
        for letter in alphabet:
            variants.add(head + letter + tail)

        if not tail:
            continue
        rest = tail[1:]

        # Delete the character right after the cut.
        variants.add(head + rest)

        # Replace that character with each letter of the alphabet.
        for letter in alphabet:
            variants.add(head + letter + rest)

        # Transpose the two characters right after the cut.
        if len(tail) > 1:
            variants.add(head + tail[1] + tail[0] + tail[2:])

    return variants

def edits2(word):
    """Return a generator over every string two simple edits away from ``word``.

    Each string produced by ``edits1(word)`` is edited once more with
    ``edits1``; duplicates are not removed.
    """
    first_pass = edits1(word)
    return (twice for once in first_pass for twice in edits1(once))

In [8]:
def stringcorrection(strvalue):
    """Spell-correct ``strvalue`` word by word and return the rebuilt string.

    The value is converted to a lower-cased string and split on single spaces.
    In each piece ampersands become ``and`` and every non-word character is
    dropped; pieces that end up empty are skipped.  The remaining words are
    passed through ``correction`` and joined with single spaces.
    """
    text = str(strvalue).lower()

    if endspaces_find.match(text) is not None:
        text = endspaces_remove.sub('', text)

    corrected = []
    for token in text.split(" "):
        cleaned = re.sub(r'[^\w]', '', ampar.sub('and', token))
        if cleaned == '':
            continue
        # correction() returns (best word, extra info); only the word is kept.
        fixed, _ = correction(cleaned)
        corrected.append(fixed)

    return " ".join(corrected)

In [9]:
# Load the Wellcome APC-spend CSV; the file is decoded as ISO-8859-1.
welcomedf = pd.read_csv('../data/WELLCOME_APCspend2013_forThinkful.csv',encoding = "ISO-8859-1")

In [None]:
# Spell-correct every journal title word by word (this may prompt for unknown
# words), then tally how often each cleaned title occurs.
welcomedf['Journalfix1'] = welcomedf['Journal title'].apply(stringcorrection)
welcomedf['Journalfix1'].value_counts()

In [None]:
# Whole-word replacements used by journal_fix (looked up through `pattern`).
# NOTE(review): the table mixes typo fixes ('jounral' -> 'journal'),
# abbreviations ('molecular' -> 'mol') and expansions ('biol' -> 'biology').
# Each word is replaced in a single pass, so 'jounral' becomes 'journal' while
# a correctly spelled 'journal' becomes 'j' -- confirm this is intended.
spelling_errors = {'jounral':'journal', 'heath':'health','molecular':'mol', 'society':'soc','human':'hum','humanan':'human',
                   'biol':'biology', 'chemical':'chem','biolgy':'biology', 'service':'services',
                   'organic':'org','americal':'american','bioohysica':'biophysica','journal':'j'}

In [None]:
# Matches any key of spelling_errors as a whole word.  Keys are escaped so a
# future entry containing regex metacharacters is still matched literally.
pattern = re.compile(r'\b(' + '|'.join(map(re.escape, spelling_errors)) + r')\b')

# (prefix, canonical name) pairs: a publisher whose lower-cased name starts
# with the prefix is collapsed to the canonical name.
_PUBLISHER_PREFIXES = (
    ('national academy of sciences', 'national academy of sciences'),
    ('acs', 'acs'),
    ('nature', 'nature'),
    ('cell press', 'cell press'),
    ('bmj', 'bmj'),
    ('taylor', 't&f'),
    ('wolters', 'wolters kluwer'),
    ('wiley', 'wiley'),
)


def publisher_fix(seriesvalue):
    """Normalise a publisher name to a lower-cased canonical form.

    Args:
        seriesvalue: one cell of the publisher column.

    Returns:
        The lower-cased name with all trailing whitespace removed, collapsed
        to a canonical label when it starts with a known publisher prefix.
        Non-string values (for example NaN for a missing cell) are returned
        unchanged instead of raising ``AttributeError``.
    """
    if not isinstance(seriesvalue, str):
        return seriesvalue

    # rstrip() removes every trailing whitespace character, so names that
    # differ only in trailing blanks end up identical.
    cleaned = seriesvalue.lower().rstrip()

    for prefix, canonical in _PUBLISHER_PREFIXES:
        if cleaned.startswith(prefix):
            return canonical

    return cleaned


def journal_fix(seriesvalue):
    """Normalise a journal title and apply the ``spelling_errors`` replacements.

    Args:
        seriesvalue: one cell of the journal-title column; it is converted
            with ``str`` first, so non-string values are accepted.

    Returns:
        The title with ``&`` spelled out as ``and``, lower-cased, stripped of
        all trailing whitespace, and with every whole word that is a key of
        ``spelling_errors`` replaced by its mapped value.
    """
    cleaned = str(seriesvalue).replace("&", "and").lower().rstrip()

    # Substitute unconditionally: guarding with pattern.match() would only
    # look at the start of the title and skip misspellings further along.
    return pattern.sub(lambda found: spelling_errors[found.group()], cleaned)

In [None]:
# Clean every journal title with the rule-based journal_fix, then tally how
# often each cleaned title occurs.
welcomedf['Journalfix2'] = welcomedf['Journal title'].apply(journal_fix)
welcomedf['Journalfix2'].value_counts()

In [None]:


# Titles contain several words, so they go through stringcorrection, which
# splits them and calls correction() per word.  Applying correction() directly
# would treat the whole title as one word and store tuples in the column.
welcomedf['Journalfix1'] = welcomedf['Journal title'].apply(stringcorrection)
welcomedf['Journalfix1'].value_counts()

In [None]:
# Replace the Publisher column in place with its normalised names.
welcomedf.Publisher = welcomedf.Publisher.apply(publisher_fix)