### Import Dependencies

In [None]:
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

# Import component classes (TagPredictor and ManualTagger)
from TagPredictor import TagPredictor
from ManualTagger import ManualTagger

### Class Definition

In [None]:
'''
@file       Annotator.ipynb
@date       2020/08/03
@brief      Top level class that defines the annotation tool and active learning algorithm
'''


'''
@brief  NLP classification annotation tool
'''
class Annotator:
    '''
    Holds a labeled and an unlabeled database (Pandas dataframes loaded from
    CSV files) and provides the text cleaning applied to their contents.
    '''

    # Dataframes produced by createDatabases()
    labeledDB = None
    unlabeledDB = None

    # Stop words removed by cleanText(). Left as None until first use, when the
    # NLTK English list is loaded once and shared by all instances. Assigning a
    # set to this attribute beforehand overrides the NLTK list.
    _stopWords = None

    # Characters stripped from both ends of every word. '$' is deliberately
    # absent so that costs such as "$4.50" can still be recognized afterwards.
    _PUNCTUATION = '!"#%&\'()*+,-./:;<=>?@[\\]^_`{|}~'


    '''
    @brief      Stores the data file locations and loads both databases
    @param      labeledDatafile     Labeled data CSV file
    @param      unlabeledDatafile   Unlabeled data CSV file
    '''
    def __init__(self, labeledDatafile, unlabeledDatafile):
        # Remember the data files so runApplication() can rebuild the databases
        self.labeledDatafile = labeledDatafile
        self.unlabeledDatafile = unlabeledDatafile

        # Create labeled and unlabeled databases
        self.labeledDB, self.unlabeledDB = self.createDatabases(labeledDatafile, unlabeledDatafile)


    '''
    @brief      Returns the stop-word set, loading the NLTK English list on first use
    @return     Set-like collection of lowercase stop words
    '''
    def _getStopWords(self):
        if self._stopWords is None:
            # Cache on the class: reading the corpus for every sentence is wasteful
            type(self)._stopWords = frozenset(stopwords.words('english'))
        return self._stopWords


    '''
    @brief      Replaces a numeric word with a fixed identifier
    @param      word    Single word with surrounding punctuation already stripped
    @return     '########' for a number, '$$$$$$$$' for a cost, otherwise the word itself
    '''
    @staticmethod
    def _maskNumeric(word):
        # Decimal points are ignored so that "4.50" counts as a number
        digits = word.replace('.', '')
        if digits.isdigit():
            return '########'
        if digits.replace('$', '').isdigit():
            return '$$$$$$$$'
        return word


    '''
    @brief      Performs preprocessing and cleaning on a sentence
    @param      text    String that contains the raw sentence
    @return     text    String that contains the cleaned sentence (may be empty)
    '''
    def cleanText(self, text):
        # Convert all letters to lowercase and split into words. split() treats
        # newlines and tabs as separators, so they need no separate replacement.
        wordList = text.lower().split()

        # Remove all stop words
        stopWords = self._getStopWords()
        wordList = [word for word in wordList if word not in stopWords]

        # Remove all words that contain non-ASCII characters
        wordList = [word for word in wordList if all(ord(char) < 128 for char in word)]

        # Remove all leading/trailing punctuation, except for '$'
        wordList = [word.strip(self._PUNCTUATION) for word in wordList]

        # Words made only of punctuation are now empty; drop them so the result
        # does not contain doubled spaces
        wordList = [word for word in wordList if word]

        # Replace all numbers with ######## identifier
        # Replace all costs with $$$$$$$$ identifier
        wordList = [self._maskNumeric(word) for word in wordList]

        # Reconstruct the cleaned sentence
        return ' '.join(wordList)


    '''
    @brief      Loads data from CSV files into Pandas dataframes and performs cleanText() on all columns
    @param      labeledDatafile     Labeled data CSV file
    @param      unlabeledDatafile   Unlabeled data CSV file
    @return     labeledDB           Pandas dataframe of the labeled data
    @return     unlabeledDB         Pandas dataframe of the unlabeled data
    '''
    def createDatabases(self, labeledDatafile, unlabeledDatafile):
        # Imported locally because the notebook's visible import cell does not load pandas
        import pandas as pd

        def cleanCell(value):
            # Only strings can be cleaned; numbers and missing values (NaN) pass through
            return self.cleanText(value) if isinstance(value, str) else value

        databases = []
        for datafile in (labeledDatafile, unlabeledDatafile):
            # Load CSV file as a Pandas dataframe
            dataframe = pd.read_csv(datafile)

            # TODO: Combine topic title and leading comment columns
            #       (column names are not defined anywhere in this file yet)

            # Apply cleanText() to all columns
            # NOTE(review): this follows the documented contract and therefore also
            #               cleans any tag column in the labeled data - confirm intended
            for column in dataframe.columns:
                dataframe[column] = dataframe[column].apply(cleanCell)

            databases.append(dataframe)

        labeledDB, unlabeledDB = databases
        return labeledDB, unlabeledDB


    '''
    @brief      Demonstration function to run the entire annotator application
    @param      classifier  Currently unused; presumably selects the classifier for
                            the tag predictor - TODO confirm expected type
    @return     None
    '''
    def runApplication(self, classifier):
        # Create labeled and unlabeled databases from the files given to __init__
        self.labeledDB, self.unlabeledDB = self.createDatabases(self.labeledDatafile, self.unlabeledDatafile)

        # TODO: the steps below are an outline only and are not implemented yet

        # Train tagPredictor

        # Predict tags for all unlabeled topics

        # Continue running the active learning loop as long as there are still low-confidence topics
            # Log tagging statistics

            # Pass low-confidence topics to the manual tagger

            # Add manually tagged topics to the labeled database

            # Train tagPredictor

            # Predict tags for all unlabeled topics




# Script entry point; currently a placeholder that performs no action.
if __name__ == '__main__':
    pass
