### Mount Google Drive and install import_ipynb

In [1]:
!pip install import_ipynb

from google.colab import drive
from os.path import join

# Mounting location on runtime for GDrive
ROOT = '/content/drive'

# Mount GDrive on the runtime
drive.mount(ROOT)

# Create and change directory to workspace folder
WORKING_PATH = '/content/drive/My Drive/Github/ml-team1-july2020'
%cd {WORKING_PATH}

Collecting import_ipynb
  Downloading https://files.pythonhosted.org/packages/63/35/495e0021bfdcc924c7cdec4e9fbb87c88dd03b9b9b22419444dc370c8a45/import-ipynb-0.1.3.tar.gz
Building wheels for collected packages: import-ipynb
  Building wheel for import-ipynb (setup.py) ... [?25l[?25hdone
  Created wheel for import-ipynb: filename=import_ipynb-0.1.3-cp36-none-any.whl size=2976 sha256=39d22cb39d27995e1e8afb6fabfc67fdd1e9a7d5483d0267db5afa3fa84bf652
  Stored in directory: /root/.cache/pip/wheels/b4/7b/e9/a3a6e496115dffdb4e3085d0ae39ffe8a814eacc44bbf494b5
Successfully built import-ipynb
Installing collected packages: import-ipynb
Successfully installed import-ipynb-0.1.3
Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%

### Import Dependencies

In [4]:
import sys
sys.path.append('/content/drive/My Drive/Github/ml-team1-july2020/sandbox/TagPredictor')
sys.path.append('/content/drive/My Drive/Github/ml-team1-july2020/sandbox/ManualTagger')

# Import component notebooks in other folders
import import_ipynb
from sandbox.TagPredictor.TagPredictor import TagPredictor
from sandbox.ManualTagger.ManualTagger import ManualTagger

import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

import pandas as pd

'''
# Import component classes
from TagPredictor import TagPredictor
from ManualTagger import ManualTagger
from classifier_NB import Classifier_NB
from classifier_SVM import Classifier_SVM
'''

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


'\n# Import component classes\nfrom TagPredictor import TagPredictor\nfrom ManualTagger import ManualTagger\nfrom classifier_NB import Classifier_NB\nfrom classifier_SVM import Classifier_SVM\n'

### Class Definition

In [None]:
'''
@file       Annotator.ipynb
@date       2020/08/03
@brief      Top level class that defines the annotation tool and active learning algorithm
'''


class Annotator:
    '''
    @brief  NLP classification annotation tool

    Loads a labeled and an unlabeled CSV database, trains a TagPredictor on the
    labeled data and runs an active learning loop: every topic predicted with a
    confidence below confidenceThreshold is passed to the manual tagger, appended
    to the labeled database and removed from the unlabeled one.
    '''
    labeledDB = None                # Pandas dataframe of labeled data
    unlabeledDB = None              # Pandas dataframe of unlabeled data

    tagPredictor = None             # TagPredictor object (set by runApplication)
    manualTagger = None             # ManualTagger object

    confidenceThreshold = 0.8       # Prediction confidence threshold to determine if a topic should be passed to ManualTagger

    # Characters stripped from both ends of every word ('$' is kept so that costs can be detected)
    _PUNCTUATION = '!"#%&\'()*+,-./:;<=>?@[\\]^_`{|}~'
    _NUMBER_TOKEN = '########'      # Replacement for plain numbers
    _COST_TOKEN = '$$$$$$$$'        # Replacement for numbers that carry a '$'

    _stopWords = None               # Cached set of English stop words, loaded on first use


    def __init__(self, labeledDatafile, unlabeledDatafile, manualTagger):
        '''
        @brief      Loads both databases and stores the manual tagger
        @param      labeledDatafile     Labeled data CSV file
        @param      unlabeledDatafile   Unlabeled data CSV file
        @param      manualTagger        Object whose run() method labels low-confidence topics
        '''
        # Create labeled and unlabeled databases
        self.labeledDB, self.unlabeledDB = self.createDatabases(labeledDatafile, unlabeledDatafile)

        # Set up ManualTagger (must be stored on the instance, runApplication() uses it)
        self.manualTagger = manualTagger


    @staticmethod
    def _isAscii(word):
        '''
        @brief      Checks whether a word consists of ASCII characters only
        @param      word    String to check
        @return     True if every character is ASCII, False otherwise
        '''
        # str.isascii() only exists from Python 3.7 on; this also works on 3.6
        return all(ord(char) < 128 for char in word)


    @classmethod
    def _getStopWords(cls):
        '''
        @brief      Returns the English stop word set, loading it only once
        @return     Set of stop word strings
        '''
        if cls._stopWords is None:
            cls._stopWords = set(stopwords.words('english'))
        return cls._stopWords


    @classmethod
    def _maskNumber(cls, word):
        '''
        @brief      Replaces numbers and costs with fixed identifiers
        @param      word    Single word, already stripped of surrounding punctuation
        @return     '########' for a number, '$$$$$$$$' for a number containing '$', else the word itself
        '''
        digits = word.replace('.', '')
        if digits.isdigit():
            return cls._NUMBER_TOKEN
        if digits.replace('$', '').isdigit():
            return cls._COST_TOKEN
        return word


    def cleanText(self, text):
        '''
        @brief      Performs preprocessing and cleaning on a sentence
        @param      text    String that contains the raw sentence
        @return     text    String that contains the cleaned sentence ('' if text is not a string)
        '''
        # Missing CSV cells arrive as NaN/None: there is nothing to clean
        if not isinstance(text, str):
            return ''

        # Convert all letters to lowercase and split into words
        # (split() already treats newlines and tabs as separators)
        wordList = text.lower().split()

        # Remove all words that contain non-ASCII characters
        wordList = [word for word in wordList if self._isAscii(word)]

        # Remove all leading/trailing punctuation, except for '$',
        # and drop words that consisted of punctuation only
        wordList = [word.strip(self._PUNCTUATION) for word in wordList]
        wordList = [word for word in wordList if word]

        # Remove all stop words. This is done after stripping the punctuation,
        # otherwise a stop word followed by punctuation (e.g. "the,") is kept.
        stopWords = self._getStopWords()
        wordList = [word for word in wordList if word not in stopWords]

        # Replace all numbers with ######## identifier
        # Replace all costs with $$$$$$$$ identifier
        wordList = [self._maskNumber(word) for word in wordList]

        return ' '.join(wordList)


    def _addBagOfWords(self, database):
        '''
        @brief      Adds the cleaned 'Bag_of_words' column to a database
        @param      database    Pandas dataframe with 'Topic Title' and 'Leading Comment' columns
        @return     database    The same dataframe, with the 'Bag_of_words' column added
        '''
        title = database['Topic Title'].fillna('').astype(str)
        comment = database['Leading Comment'].fillna('').astype(str)

        # Join with a space so that the last word of the title and the first word
        # of the comment stay separate; cleanText() normalizes the whitespace.
        database['Bag_of_words'] = (title + ' ' + comment).apply(self.cleanText)
        return database


    def createDatabases(self, labeledDatafile, unlabeledDatafile):
        '''
        @brief      Loads data from CSV files into Pandas dataframes and stores the cleaned
                    combination of topic title and leading comment in a 'Bag_of_words' column
        @param      labeledDatafile     Labeled data CSV file
        @param      unlabeledDatafile   Unlabeled data CSV file
        @return     labeledDB           Pandas dataframe of the labeled data
        @return     unlabeledDB         Pandas dataframe of the unlabeled data
        '''
        labeledDB = self._addBagOfWords(pd.read_csv(labeledDatafile))
        unlabeledDB = self._addBagOfWords(pd.read_csv(unlabeledDatafile))

        return labeledDB, unlabeledDB


    def runApplication(self, classifier):
        '''
        @brief      Demonstration function to run the entire annotator application
        @param      classifier  Classifier object that is handed to TagPredictor
        @return     None
        '''
        # The databases were already created by __init__()

        # Set up and train the TagPredictor object
        self.tagPredictor = TagPredictor(classifier, self.labeledDB)
        self.tagPredictor.train()

        # Predict tags for all unlabeled topics
        # NOTE(review): assumes confidenceList has one entry per row of unlabeledDB, in row order -- confirm against TagPredictor
        _, confidenceList = self.tagPredictor.predict(self.unlabeledDB)

        # Continue running the active learning loop as long as there are still low-confidence topics
        while any(confidence < self.confidenceThreshold for confidence in confidenceList):
            # Get low-confidence topic positions
            lowConfIndices = [position for position, confidence in enumerate(confidenceList)
                              if confidence < self.confidenceThreshold]

            # Pass low-confidence topics to the manual tagger
            lowConfTopics = self.unlabeledDB.iloc[lowConfIndices]
            labeledTopics = self.manualTagger.run(lowConfTopics)

            # Add manually tagged topics to the labeled database
            self.labeledDB = pd.concat([self.labeledDB, labeledTopics], join='inner', ignore_index=True)

            # Remove tagged topics from unlabeled database (drop() expects index labels)
            self.unlabeledDB = self.unlabeledDB.drop(lowConfTopics.index)

            # Train tagPredictor with updated database
            self.tagPredictor = TagPredictor(classifier, self.labeledDB)
            self.tagPredictor.train()

            # Nothing left to predict once every topic has been labeled
            if self.unlabeledDB.empty:
                break

            # Predict tags for all remaining unlabeled topics
            _, confidenceList = self.tagPredictor.predict(self.unlabeledDB)




# Script entry point: this condition is also true when the cell is run inside the notebook itself.
if __name__ == '__main__':
    # Instantiate both classifiers.
    # NOTE(review): Classifier_NB and Classifier_SVM are only imported inside the
    # disabled triple-quoted block of the "Import Dependencies" cell, so these two
    # lines raise NameError unless the classes are brought into scope elsewhere -- confirm.
    # Presumably one of them is meant to be passed to Annotator.runApplication() -- TODO confirm.
    nb = Classifier_NB()
    svm = Classifier_SVM()

    # Set up Manual Tagger with ground truth database
    #...

    
