# data.py

A series of functions to manage, manipulate, format and clean data.

## Dependencies

Import and initialise dependencies

In [10]:
from unidecode import unidecode
from spellchecker import SpellChecker
import re
spell = SpellChecker()

## Normalize

A class for normalising data, to allow for consistency and accuracy in data processing, manipulation and output.

The desire is for this class to hold at least three functions for typical functioning:

1. Normalize.string(*string*)
2. Normalize.num(*num*)
3. Normalize.flt(*flt*)

### Normalize.string(*string*)

This function is to normalise string inputs. It does this by carrying out 4 actions on the input:
1. Putting all characters in the string into lower case. This allows Python to compare strings accurately.
2. Removing leading and trailing white space from the string input. This ensures strings are accurate and truly representative of the information they should hold.
3. Uni-decoding the string, to ensure the characters are all ASCII. This increases compatibility, reduces file size and increases readability.
4. Removing punctuation, so that only letters, digits, underscores and white space remain.

### Normalize.num(*num*)

This function is to normalise numbers.

### Normalize.flt(*flt*)

This function is to normalise floats.

In [12]:
class Normalize:
    """Namespace of helpers that normalise raw values for consistent processing.

    Every helper is a static method, so it can be called on the class
    (``Normalize.string(x)``) or on an instance (``Normalize().string(x)``).
    """

    @staticmethod
    def string(string):
        """Return *string* as lower-case ASCII without punctuation or edge whitespace.

        Args:
            string: The text to normalise.

        Returns:
            The normalised text. Whitespace between words is left as it is,
            and underscores are kept because the pattern treats them as
            word characters.
        """
        #Unidecode string first: the transliteration can itself produce
        #capital letters and trailing spaces, so casing and trimming
        #have to happen after it, not before.
        string = unidecode(string)

        #Remove punctuation (anything that is neither a word character
        #nor whitespace)
        string = re.sub(r"[^\w\s]", "", string)

        #Put all letters to lower case and trim the ends last, so that no
        #earlier step can reintroduce capitals or edge whitespace
        return string.lower().strip()

    @staticmethod
    def num(num):
        """Return *num* unchanged (number normalisation is not implemented yet)."""
        #CUSTOM NUMBER NORMALISATION FUNCTION HERE
        return num

    @staticmethod
    def flt(flt):
        """Return *flt* unchanged (float normalisation is not implemented yet)."""
        #CUSTOM FLOAT NUMBER NORMALISATION FUNCTION HERE
        return flt
    
def test_normalize():
    """Check Normalize.string on a sample containing a non-ASCII letter and brackets."""
    expected = 'o phase'
    actual = Normalize.string('Ø [Phase]')
    assert actual == expected
    

# Run the check as soon as the cell executes; a failing comparison inside
# test_normalize raises AssertionError here.
test_normalize()

o phase
None


## Present

Present is a class used to format output data, ready for reading. This allows for increased legibility, and more accurate and easy use beyond the applications.

### Present.title(*string*)

This function extends Python's built-in `.title()` method to increase readability and to ensure a more grammatically correct output. To do this, it performs several different actions:

1. It does not transform articles, conjunctions and prepositions into title case.
2. It checks the title for any misspellings.
3. It changes 'and' to '&' *only* if there is not an ampersand already in the title.

### Present.job_title(*string*)

This function is specifically designed to format an individual's job title. It achieves this through a variety of means:

1. Unabbreviating commonly-used title abbreviations. *eg, turning 'CEO' into 'Chief Executive Officer'.*
2. 

### Present.name(*string*)

This function is specifically designed to format an individual's name. This is achieved by:

In [None]:
class Present:
    """Format output data so that it is ready for reading."""

    def __init__(self):

        #List of articles, conjunctions and prepositions
        #not to be put into title case:
        self.title_exceptions = [
            'of',
            'and',
            'but',
            'or',
            'for',
            'yet',
            'so',
            'a',
            'an',
            'the',
        ]

    def title(self, title):
        """Return *title* normalised, spell-corrected and put into title case.

        Words listed in ``self.title_exceptions`` stay lower case. The first
        'and' becomes '&', but only when the title holds no ampersand
        already; an ampersand present in the input is kept as its own word.

        Args:
            title: The raw title text.

        Returns:
            The formatted title, with words separated by single spaces.
        """
        #Normalize.string strips punctuation, which would silently drop any
        #'&' in the input. Normalise the text on either side of each
        #ampersand separately and keep the ampersand as its own token.
        words = []
        for position, segment in enumerate(title.split('&')):
            #Skip the separator for the first segment and for repeated '&'
            if position and words[-1:] != ['&']:
                words.append('&')
            words.extend(Normalize.string(segment).split())

        #Spell check (the ampersand token is not a word, so leave it out)
        misspelled = spell.unknown([word for word in words if word != '&'])

        #Assume closest spelling is correct. The checker may return None
        #when it has no candidate; keep the original word in that case.
        corrections = {word: spell.correction(word) or word
                       for word in misspelled}

        #Apply the corrections to every occurrence, not just the first one
        words = [corrections.get(word, word) for word in words]

        has_ampersand = '&' in words

        #Turn non-exception words in title case
        for i, word in enumerate(words):

            #If the word is 'and' and there are no ampersands in the words
            #list, change the word to ampersand
            if word == 'and' and not has_ampersand:
                words[i] = '&'
                has_ampersand = True
                continue

            #If the word is an exception, leave as lower case.
            if word in self.title_exceptions:
                continue

            #Otherwise, change to title case.
            words[i] = word.title()

        return ' '.join(words)

    def job_title(self, job_title):
        """Return *job_title* unchanged (job-title formatting is not implemented yet)."""
        return job_title

    def name(self, name):
        """Return *name* unchanged (name formatting is not implemented yet)."""
        return name

## *IN PROGRESS*

The following classes, methods and functions are not yet useable.

## Clean

In [None]:
#Class for cleaning data
class Clean:
    """Planned collection of data-cleaning steps; every method is still a stub."""

    def clean():
        """Stub; does nothing and returns None."""
        return None

    def remove_bad_rows():
        """Stub; planned to remove duplicate, empty and missing rows."""
        return None

    def fill_missing():
        """Stub; planned to fill missing values where possible, using the column average."""
        return None

    def dedupe_cols():
        """Stub; planned to remove or merge duplicate columns."""
        return None

    def remove_outliers():
        """Stub; planned to remove outliers."""
        return None

    def format():
        """Stub; planned to convert data to the same format and change column headers."""
        return None

    def validate():
        """Stub; planned to check for errors and check ranges."""
        return None

        
#SubClass for cleaning strings
class CleanString(Clean):
    """Planned string-cleaning extension of Clean; every method is still a stub."""

    def clean_string():
        """Stub; does nothing and returns None."""
        return None

    def remove_html():
        """Stub; does nothing and returns None."""
        return None

    def remove_punctuation():
        """Stub; does nothing and returns None."""
        return None

    def string():
        """Stub; does nothing and returns None."""
        return None

#SubClass for cleaning strings for Natural Language Processing
class CleanNLP(CleanString):
    """Planned NLP-oriented extension of CleanString; every method is still a stub."""

    def clean_nlp():
        """Stub; does nothing and returns None."""
        return None

    def lemmatize():
        """Stub; does nothing and returns None."""
        return None

    def stem():
        """Stub; does nothing and returns None."""
        return None

    def tokenize():
        """Stub; does nothing and returns None."""
        return None

    def remove_dupes():
        """Stub; does nothing and returns None."""
        return None

    def sort_tokens():
        """Stub; does nothing and returns None."""
        return None

    def filter_tokens():
        """Stub; does nothing and returns None."""
        return None