### Mount Google Drive and install import_ipynb

In [1]:
#!pip install import_ipynb

from google.colab import drive
from os.path import join

# Mounting location on runtime for GDrive
ROOT = '/content/drive'

# Mount GDrive on the runtime
drive.mount(ROOT)

# Create and change directory to workspace folder
WORKING_PATH = '/content/drive/My Drive/Github/ml-team1-july2020'
%cd {WORKING_PATH}

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly&response_type=code

Enter your authorization code:
··········
Mounted at /content/drive
/content/drive/My Drive/Github/ml-team1-july2020


### Import Dependencies

In [2]:
import sys
sys.path.append('/content/drive/My Drive/Github/ml-team1-july2020/sandbox/TagPredictor')
sys.path.append('/content/drive/My Drive/Github/ml-team1-july2020/sandbox/ManualTagger')

# Import component notebooks in other folders
#import import_ipynb

from sandbox.TagPredictor.classifier import Classifier
from sandbox.TagPredictor.classifier_SVM import Classifier_SVM
from sandbox.TagPredictor.multilabelclassifier_SVM import MultilabelClassifier_SVM
from sandbox.TagPredictor.TagPredictor import TagPredictor
from sandbox.ManualTagger.ManualTagger import ManualTagger

import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
import re
import ast

import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split

# Set Pandas display options
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', 20)
pd.set_option('display.width', None)
pd.set_option('display.expand_frame_repr', False)   # Disable wrapping


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


### Class Definition

In [3]:
'''
@file       Annotator.ipynb
@date       2020/08/03
@brief      Top level class that defines the annotation tool and active learning algorithm
'''


'''
@brief  NLP classification annotation tool
'''
class Annotator:
    groundTruthDB = None            # Pandas dataframe of all data with ground truth labels
    labeledDB = None                # Pandas dataframe of labeled data
    unlabeledDB = None              # Pandas dataframe of unlabeled data

    tagPredictor = None             # TagPredictor object
    manualTagger = None             # ManualTagger object

    confidenceThreshold = 0.8       # Prediction confidence threshold to determine if a topic should be passed to ManualTagger


    def __init__(self, datafile):
        # Create databases
        self.groundTruthDB, self.labeledDB, self.unlabeledDB = self.createDatabases(datafile)

        # Set up ManualTagger
        #manualTagger = manualTagger
    

    '''
    @brief      Performs preprocessing and cleaning on a sentence
    @param      text    String that contains the raw sentence
    @return     text    String that contains the cleaned sentence
    '''
    def cleanText(self, text):
        def is_ascii(s):
            return all(ord(c) < 128 for c in s)
        
        # Remove URLs
        text = re.sub(r'http\S+', '', text, flags=re.MULTILINE)

        # Replace newline and tab characters with spaces
        text = text.replace('\n', ' ')
        text = text.replace('\t', ' ')

        # Convert all letters to lowercase
        text = text.lower()
        
        # Strip all punctuation
        #table = str.maketrans('', '', string.punctuation)
        #text = text.translate(table)

        # Remove all non-ASCII characters
        #text = text.encode(encoding='ascii', errors='ignore').decode('ascii')

        # Split feature string into a list to perform processing on each word
        wordList = text.split()

        # Remove all stop words
        #stop_words = set(stopwords.words('english'))
        #wordList = [word for word in wordList if not word in stop_words]

        # Remove all words to contain non-ASCII characters
        wordList = [word for word in wordList if is_ascii(word)]

        # Remove all leading/training punctuation, except for '$'
        punctuation = '!"#%&\'()*+,-./:;<=>?@[\\]^_`{|}~'
        wordList = [word.strip(punctuation) for word in wordList]

        # Replace all numbers with ######## identifier
        # Replace all costs with $$$$$$$$ identifier
        wordList = ['########' if (word.replace('.','').isdigit()) \
                    else '$$$$$$$$' if (word.replace('.','').replace('$','').isdigit()) \
                    else word \
                    for word in wordList]
        #wordList = ['########' if (word.replace('.','').isdigit()) else word for word in wordList]
        #wordList = ['########' if (word.translate(table).isdigit()) else word for word in wordList]

        # Reconstruct text
        # If it is empty, do not add this sample to the final output
        text = ' '.join(wordList)

        return text


    '''
    @brief      Loads data from CSV files into Pandas dataframes and performs cleanText() on all columns
    @param      datafile        CSV file with all data
    @return     groundTruthDB   Pandas dataframe of all data with ground truth labels
    @return     labeledDB       Pandas dataframe of the labeled data
    @return     unlabeledDB     Pandas dataframe of the unlabeled data
    '''
    def createDatabases(self, datafile):
        # Load CSV file as ground truth database
        groundTruthDB = pd.read_csv(datafile)

        # Combine topic title and leading comment columns
        groundTruthDB['Bag_of_Words'] = groundTruthDB['Topic Title'] + groundTruthDB['Leading Comment']
        groundTruthDB['Bag_of_Words'] = groundTruthDB['Bag_of_Words'].str.strip().str.replace('   ', ' ').str.replace('  ', ' ')

        groundTruthDB = groundTruthDB.drop(columns=['Topic Title', 'Leading Comment', 'Unnamed: 0'])
        
        # Apply cleanText() to all columns with this:
        groundTruthDB['Bag_of_Words'] = groundTruthDB['Bag_of_Words'].apply(lambda x: self.cleanText(x))

        '''
        #create an offset value
        offset = 0
        #the total number of unique comments
        total = len(groundTruthDB)
        for index, entry in enumerate(groundTruthDB['Bag_of_Words']):
            #create a duplicate if post has multiple tags
            tag_list = ast.literal_eval(groundTruthDB.loc[index, 'Tags'])
            text = groundTruthDB.loc[index,'Bag_of_Words']
            while (isinstance(tag_list, list) and len(tag_list) > 1):
                #print(index)
                #sets the tag for the duplicate to a string
                groundTruthDB.loc[total+offset, 'Tags'] = tag_list.pop()
                #Adds the duplicate to the end of the pandas dataframe
                groundTruthDB.loc[total+offset, 'Bag_of_Words'] = text
                offset = offset + 1
            #Changes the first tag to a string
            if (len(tag_list) == 1):
                groundTruthDB.loc[index, 'Tags'] = tag_list.pop()
            #Changes empty tags from lists to strings
            if (isinstance(groundTruthDB.loc[index, 'Tags'], list)):
                groundTruthDB.loc[index, 'Tags'] = ''
                # Not sure why this element is stored as '[]' instead of ''
        '''

        # Filter out topics with no tags
        groundTruthDB = groundTruthDB[groundTruthDB['Tags'].map(len) > 2]

        # Convert Tag column elements from strings to lists
        groundTruthDB['Tags'] = groundTruthDB.Tags.apply(lambda x: x[1:-1].split(','))

        # Split ground truth database into labeled and unlabelled databases
        #mask = np.random.rand(len(groundTruthDB)) < 0.8
        #labeledDB = groundTruthDB[~mask]
        #unlabeledDB = groundTruthDB[mask]['Bag_of_Words']

        unlabeledDB, labeledDB = train_test_split(groundTruthDB, test_size=0.2)
        unlabeledDB = unlabeledDB['Bag_of_Words']

        return groundTruthDB, labeledDB, unlabeledDB


    '''
    @brief      Demonstration function to run the entire annotator application
    @param      
    @return     None
    '''
    def runApplication(self, classifier):
        # Set up TagPredictor object
        tagPredictor = TagPredictor(classifier, self.labeledDB)

        # Train tagPredictor
        tagPredictor.train()

        # Predict tags for all unlabeled topics
        tagList, confidenceList = tagPredictor.predict(self.unlabeledDB)

        # Continue running the active learning loop as long as there are still low-confidence topics
        while (any(p < self.confidenceThreshold for p in confidenceList) == True):
            # Log tagging statistics
            
            # Get low-confidence topic indices
            lowConfIndices = [i for i in range(len(L)) if confidenceList[i] < self.confidenceThreshold]

            # Pass low-confidence topics to the manual tagger
            lowConfTopics = self.unlabelDB.iloc(lowConfIndices)
            labeledTopics = self.manualTagger.run(lowConfTopics)

            # Add manually tagged topics to the labeled database
            self.labeledDB = pd.concat([self.labeledDB, labeledTopics], join='inner')

            # Remove tagged topics from unlabeled database
            self.unlabeledDB = self.unlabeledDB.drop(lowConfTopics)

            # Train tagPredictor with updated database
            tagPredictor = TagPredictor(classifier, self.labeledDB)
            tagPredictor.train()

            # Predict tags for all unlabeled topics
            tagList, confidenceList = tagPredictor.predict(self.unlabeledDB)




if __name__ == '__main__':
    # Path to CSV datafile
    datafile = '/content/drive/My Drive/Github/ml-team1-july2020/sandbox/Webscraper/StackOverflow_new_tags.csv'

    annotator = Annotator(datafile)

    #print(annotator.groundTruthDB)

    #text = annotator.groundTruthDB.iloc[96]['Bag_of_Words']
    #print(text)
    #print(annotator.cleanText(text))


    


In [4]:
# Set up TagPredictor object
tagPredictor = TagPredictor(MultilabelClassifier_SVM, annotator.labeledDB)

# Train tagPredictor
tagPredictor.train()



Initialized TagPredictor
Started training
[" 'beautifulsoup'" " 'nlp'" " 'nltk'" " 'scrapy'" " 'selenium-webdriver'"
 " 'sentiment-analysis'" " 'splinter'" " 'text-classification'"
 " 'text-mining'" " 'tf-idf'" " 'web-scraping'" " 'word-embedding'"
 "'beautifulsoup'" "'nlp'" "'nltk'" "'scikit-learn'" "'scrapy'"
 "'selenium'" "'selenium-webdriver'" "'sentiment-analysis'" "'splinter'"
 "'text-classification'" "'text-mining'" "'tf-idf'" "'web-scraping'"
 "'word-embedding'"]
Running SVM Classifier
Finished training


In [5]:
# Predict tags for all unlabeled topics
tagList, confidenceList = tagPredictor.predict(annotator.unlabeledDB)
print(tagList)
print(confidenceList)

[(), (), (), (), (), (), (), (), (), (), (), (), ("'nlp'",), (), ("'web-scraping'",), (), (), ("'scrapy'",), (), ("'nlp'",), (), (), (), (), (), ("'selenium'",), ("'nlp'",), (), (), (), (), (), ("'selenium'",), ("'scikit-learn'",), (), (), (), (), (), (), ("'scikit-learn'",), (), (), ("'nlp'",), ("'selenium'",), ("'scrapy'", "'web-scraping'"), (), ("'web-scraping'",), (), (), (), (), ("'scikit-learn'",), ("'nlp'",), (), (), ("'selenium'",), (), (), (), (), (), (), (), (), ("'selenium'",), ("'scikit-learn'", "'tf-idf'"), ("'selenium'",), (), (), ("'selenium'",), (), (), (), (), (), ("'nlp'",), (), (), (), (), (), (), ("'scikit-learn'",), ("'nlp'",), (), (), (), (), ("'web-scraping'",), ("'selenium'",), (), (), (), (), ("'web-scraping'",), (), (), (), (), (), (), (" 'word-embedding'", "'nlp'"), (), ("'scikit-learn'",), (), (), (), (), ("'selenium'",), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), ("'nlp'",), (), (), ("'scikit-learn'",), (), (), (), (), (), ("'nlp'",), ("'nl

In [7]:
print(np.array(confidenceList).shape)

(26, 5804, 2)
