### Mount Google Drive and install import_ipynb

In [2]:
!pip install import_ipynb

from google.colab import drive
from os.path import join

# Mounting location on runtime for GDrive
ROOT = '/content/drive'

# Mount GDrive on the runtime
drive.mount(ROOT)

# Create and change directory to workspace folder
WORKING_PATH = '/content/drive/My Drive/Github/ml-team1-july2020'
%cd {WORKING_PATH}

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly&response_type=code

Enter your authorization code:
··········
Mounted at /content/drive
/content/drive/My Drive/Github/ml-team1-july2020


### Import Dependencies

In [13]:
import sys
# Add the component notebook folders to the module search path so that modules
# located inside those folders can be resolved by name.
sys.path.append('/content/drive/My Drive/Github/ml-team1-july2020/sandbox/TagPredictor')
sys.path.append('/content/drive/My Drive/Github/ml-team1-july2020/sandbox/ManualTagger')

# Import component notebooks in other folders
import import_ipynb
from sandbox.TagPredictor.TagPredictor import TagPredictor
from sandbox.ManualTagger.ManualTagger import ManualTagger

import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

import numpy as np
import pandas as pd

# Configure how Pandas renders dataframes (None means "no limit")
displayOptions = {
    'display.max_rows': None,
    'display.max_columns': None,
    'display.max_colwidth': 20,
    'display.width': None,
    'display.expand_frame_repr': False,     # Disable wrapping
}
for optionName, optionValue in displayOptions.items():
    pd.set_option(optionName, optionValue)


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


### Class Definition

In [15]:
'''
@file       Annotator.ipynb
@date       2020/08/03
@brief      Top level class that defines the annotation tool and active learning algorithm
'''


class Annotator:
    '''
    @brief  NLP classification annotation tool

    Loads a CSV of topics into ground truth / labeled / unlabeled Pandas
    databases and runs an active learning loop in which topics predicted with
    low confidence are handed to a manual tagger and then moved into the
    labeled database.
    '''
    groundTruthDB = None            # Pandas dataframe of all data with ground truth labels
    labeledDB = None                # Pandas dataframe of labeled data
    unlabeledDB = None              # Pandas series of unlabeled data ('Bag_of_Words' column)

    tagPredictor = None             # TagPredictor object (set by runApplication)
    manualTagger = None             # ManualTagger object (supplied to __init__)

    confidenceThreshold = 0.8       # Prediction confidence threshold to determine if a topic should be passed to ManualTagger
    unlabeledFraction = 0.8         # Expected fraction of rows that start in the unlabeled database

    _stopWords = None               # Cached set of English stop words (loaded on first use)


    def __init__(self, datafile, manualTagger=None, seed=None):
        '''
        @brief      Builds the databases from a CSV file
        @param      datafile        CSV file with all data
        @param      manualTagger    Object exposing run(topics); required by runApplication()
                                    as soon as a low-confidence topic has to be tagged
        @param      seed            Optional integer seed that makes the labeled/unlabeled
                                    split reproducible; None uses NumPy's global random state
        '''
        # Create databases
        self.groundTruthDB, self.labeledDB, self.unlabeledDB = self.createDatabases(datafile, seed=seed)

        # Set up ManualTagger
        self.manualTagger = manualTagger


    @staticmethod
    def _isAscii(word):
        '''
        @brief      Checks whether a string consists of ASCII characters only
        @param      word    String to check
        @return     True if every character is ASCII, False otherwise
        '''
        # str.isascii() only exists from Python 3.7 on, so test the code points directly
        return all(ord(character) < 128 for character in word)


    @classmethod
    def _getStopWords(cls):
        '''
        @brief      Returns the English stop words, loading them only once
        @return     frozenset of stop word strings
        '''
        if cls._stopWords is None:
            cls._stopWords = frozenset(stopwords.words('english'))
        return cls._stopWords


    def cleanText(self, text):
        '''
        @brief      Performs preprocessing and cleaning on a sentence
        @param      text    String that contains the raw sentence
        @return     text    String that contains the cleaned sentence
                            (empty string if no word survives the cleaning)
        '''
        # Replace newline and tab characters with spaces and convert to lowercase
        text = text.replace('\n', ' ').replace('\t', ' ').lower()

        # Split feature string into a list to perform processing on each word
        wordList = text.split()

        # Remove all stop words
        # NOTE(review): this runs before punctuation is stripped, so a stop word
        # with attached punctuation (e.g. "the,") is not removed -- confirm intent
        stopWords = self._getStopWords()
        wordList = [word for word in wordList if word not in stopWords]

        # Remove all words that contain non-ASCII characters
        wordList = [word for word in wordList if self._isAscii(word)]

        # Remove all leading/trailing punctuation, except for '$'
        punctuation = '!"#%&\'()*+,-./:;<=>?@[\\]^_`{|}~'
        wordList = [word.strip(punctuation) for word in wordList]

        # Drop tokens that consisted of punctuation only; they are empty now and
        # would otherwise leave double spaces in the reconstructed text
        wordList = [word for word in wordList if word]

        # Replace all numbers with ######## identifier
        # Replace all costs with $$$$$$$$ identifier
        wordList = ['########' if word.replace('.', '').isdigit()
                    else '$$$$$$$$' if word.replace('.', '').replace('$', '').isdigit()
                    else word
                    for word in wordList]

        # Reconstruct text
        return ' '.join(wordList)


    def createDatabases(self, datafile, seed=None):
        '''
        @brief      Loads data from a CSV file into Pandas dataframes
        @param      datafile        CSV file with all data; must contain the columns
                                    'Topic Title' and 'Leading Comment'
        @param      seed            Optional integer seed for a reproducible split;
                                    None uses NumPy's global random state
        @return     groundTruthDB   Pandas dataframe of all data with ground truth labels
        @return     labeledDB       Pandas dataframe of the labeled data
        @return     unlabeledDB     Pandas series ('Bag_of_Words') of the unlabeled data
        '''
        # Load CSV file as ground truth database
        groundTruthDB = pd.read_csv(datafile)

        # Combine topic title and leading comment columns.
        # Missing cells become empty strings so that one missing column does not
        # turn the whole entry into NaN, and a space separator keeps the last word
        # of the title from fusing with the first word of the comment.
        title = groundTruthDB['Topic Title'].fillna('')
        comment = groundTruthDB['Leading Comment'].fillna('')
        bagOfWords = (title + ' ' + comment).str.strip()

        # Collapse every run of spaces into a single space
        groundTruthDB['Bag_of_Words'] = bagOfWords.str.replace(r' {2,}', ' ', regex=True)

        groundTruthDB = groundTruthDB.drop(columns=['Topic Title', 'Leading Comment'])

        # TODO: cleanText() is not applied yet. To enable it:
        #groundTruthDB['Bag_of_Words'] = groundTruthDB['Bag_of_Words'].apply(self.cleanText)

        # Split ground truth database into labeled and unlabeled databases.
        # Each row independently lands in the unlabeled database with probability
        # unlabeledFraction, so the split sizes vary from run to run unless seeded.
        randomSource = np.random if seed is None else np.random.RandomState(seed)
        mask = randomSource.rand(len(groundTruthDB)) < self.unlabeledFraction

        labeledDB = groundTruthDB[~mask]
        unlabeledDB = groundTruthDB[mask]['Bag_of_Words']

        return groundTruthDB, labeledDB, unlabeledDB


    def runApplication(self, classifier):
        '''
        @brief      Demonstration function to run the entire annotator application
        @param      classifier      Classifier handed to the TagPredictor constructor
        @return     None
        @raises     RuntimeError    If a topic needs manual tagging but no manualTagger was set

        Assumes TagPredictor.predict() returns (tagList, confidenceList) with one
        confidence per unlabeled topic in positional order, and that
        manualTagger.run() returns a dataframe that keeps the index of the topics
        it was given -- TODO confirm against both components.
        '''
        # Set up and train TagPredictor object
        self.tagPredictor = TagPredictor(classifier, self.labeledDB)
        self.tagPredictor.train()

        # Predict tags for all unlabeled topics
        tagList, confidenceList = self.tagPredictor.predict(self.unlabeledDB)

        # Continue running the active learning loop as long as there are still low-confidence topics
        while True:
            # Get low-confidence topic positions
            lowConfIndices = [i for i, confidence in enumerate(confidenceList)
                              if confidence < self.confidenceThreshold]
            if not lowConfIndices:
                break

            if self.manualTagger is None:
                raise RuntimeError('Low-confidence topics need manual tagging, '
                                   'but no manualTagger was provided to Annotator')

            # Pass low-confidence topics to the manual tagger (positional selection)
            lowConfTopics = self.unlabeledDB.iloc[lowConfIndices]
            labeledTopics = self.manualTagger.run(lowConfTopics)

            # Add manually tagged topics to the labeled database
            self.labeledDB = pd.concat([self.labeledDB, labeledTopics], join='inner')

            # Remove tagged topics from unlabeled database (drop works on index labels)
            self.unlabeledDB = self.unlabeledDB.drop(lowConfTopics.index)

            # Train tagPredictor with updated database
            self.tagPredictor = TagPredictor(classifier, self.labeledDB)
            self.tagPredictor.train()

            # Nothing left to predict once every topic has been labeled
            if len(self.unlabeledDB) == 0:
                break

            # Predict tags for all unlabeled topics
            tagList, confidenceList = self.tagPredictor.predict(self.unlabeledDB)




if __name__ == '__main__':
    # Path to CSV datafile
    datafile = '/content/drive/My Drive/Github/ml-team1-july2020/sandbox/Webscraper/StackOverflow_new_tags.csv'

    annotator = Annotator(datafile)

    # Preview only the first rows: display.max_rows is set to None in this
    # notebook, so printing the whole dataframe would dump every row.
    print(annotator.groundTruthDB.head(20))

    


      Unnamed: 0                 Tags         Bag_of_Words
0              0              ['nlp']  spaCy strange be...
1              1      ['nlp', 'nltk']  number of tokeni...
2              2              ['nlp']  what is the mean...
3              3      ['nlp', 'nltk']  How to validate ...
4              4              ['nlp']  Create a referen...
5              5              ['nlp']  The size of tens...
6              6              ['nlp']  Explicit likelih...
7              7              ['nlp']  Clean corpus usi...
8              8              ['nlp']  Can't find model...
9              9              ['nlp']  Provide a step-b...
10            10              ['nlp']  NLP: Compare the...
11            11      ['nlp', 'nltk']  How to extract o...
12            12      ['nlp', 'nltk']  Best way to dete...
13            13              ['nlp']  How can I includ...
14            14              ['nlp']  Loading pretrain...
15            15  ['web-scraping',...  Machine Learning.