### Mount Google Drive and install import_ipynb

In [1]:
#!pip install import_ipynb

from google.colab import drive
from os.path import join

# Mounting location on runtime for GDrive
ROOT = '/content/drive'

# Mount GDrive on the runtime
drive.mount(ROOT)

# Create and change directory to workspace folder
WORKING_PATH = '/content/drive/My Drive/Github/ml-team1-july2020'
%cd {WORKING_PATH}

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly&response_type=code

Enter your authorization code:
··········
Mounted at /content/drive
/content/drive/My Drive/Github/ml-team1-july2020


### Import Dependencies

In [2]:
import sys
sys.path.append('/content/drive/My Drive/Github/ml-team1-july2020/sandbox/TagPredictor')
sys.path.append('/content/drive/My Drive/Github/ml-team1-july2020/sandbox/ManualTagger')

# Import component notebooks in other folders
#import import_ipynb

from sandbox.TagPredictor.classifier import Classifier
from sandbox.TagPredictor.classifier_SVM import Classifier_SVM
from sandbox.TagPredictor.multilabelclassifier_SVM import MultilabelClassifier_SVM
from sandbox.TagPredictor.TagPredictor import TagPredictor
from sandbox.ManualTagger.ManualTagger import ManualTagger

import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
import re
import ast

import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.metrics import hamming_loss
from sklearn.metrics import accuracy_score

# Set Pandas display options
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', 20)
pd.set_option('display.width', None)
pd.set_option('display.expand_frame_repr', False)   # Disable wrapping


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


### Class Definition

In [23]:
'''
@file       Annotator.ipynb
@date       2020/08/03
@brief      Top level class that defines the annotation tool and active learning algorithm
'''


'''
@brief  NLP classification annotation tool
'''
class Annotator:
    groundTruthDB = None            # Pandas dataframe of all data with ground truth labels
    labeledDB = None                # Pandas dataframe of labeled data
    unlabeledDB = None              # Pandas dataframe of unlabeled data

    tagPredictor = None             # TagPredictor object
    manualTagger = None             # ManualTagger object

    confidenceThreshold = 0.95      # Prediction confidence threshold to determine if a topic should be passed to ManualTagger


    def __init__(self, datafile):
        # Create databases
        self.groundTruthDB, self.labeledDB, self.unlabeledDB = self.createDatabases(datafile)

        # Set up ManualTagger
        self.manualTagger = ManualTagger(self.groundTruthDB)
    

    '''
    @brief      Performs preprocessing and cleaning on a sentence
    @param      text    String that contains the raw sentence
    @return     text    String that contains the cleaned sentence
    '''
    def cleanText(self, text):
        def is_ascii(s):
            return all(ord(c) < 128 for c in s)
        
        # Remove URLs
        text = re.sub(r'http\S+', '', text, flags=re.MULTILINE)

        # Replace newline and tab characters with spaces
        text = text.replace('\n', ' ')
        text = text.replace('\t', ' ')

        # Convert all letters to lowercase
        text = text.lower()
        
        # Strip all punctuation
        #table = str.maketrans('', '', string.punctuation)
        #text = text.translate(table)

        # Remove all non-ASCII characters
        #text = text.encode(encoding='ascii', errors='ignore').decode('ascii')

        # Split feature string into a list to perform processing on each word
        wordList = text.split()

        # Remove all stop words
        #stop_words = set(stopwords.words('english'))
        #wordList = [word for word in wordList if not word in stop_words]

        # Remove all words to contain non-ASCII characters
        wordList = [word for word in wordList if is_ascii(word)]

        # Remove all leading/training punctuation, except for '$'
        punctuation = '!"#%&\'()*+,-./:;<=>?@[\\]^_`{|}~'
        wordList = [word.strip(punctuation) for word in wordList]

        # Replace all numbers with ######## identifier
        # Replace all costs with $$$$$$$$ identifier
        wordList = ['########' if (word.replace('.','').isdigit()) \
                    else '$$$$$$$$' if (word.replace('.','').replace('$','').isdigit()) \
                    else word \
                    for word in wordList]
        #wordList = ['########' if (word.replace('.','').isdigit()) else word for word in wordList]
        #wordList = ['########' if (word.translate(table).isdigit()) else word for word in wordList]

        # Reconstruct text
        # If it is empty, do not add this sample to the final output
        text = ' '.join(wordList)

        return text


    '''
    @brief      Loads data from CSV files into Pandas dataframes and performs cleanText() on all columns
    @param      datafile        CSV file with all data
    @return     groundTruthDB   Pandas dataframe of all data with ground truth labels
    @return     labeledDB       Pandas dataframe of the labeled data
    @return     unlabeledDB     Pandas dataframe of the unlabeled data
    '''
    def createDatabases(self, datafile):
        # Load CSV file as ground truth database
        groundTruthDB = pd.read_csv(datafile)

        # Combine topic title and leading comment columns
        groundTruthDB['Bag_of_Words'] = groundTruthDB['Topic Title'] + groundTruthDB['Leading Comment']
        groundTruthDB['Bag_of_Words'] = groundTruthDB['Bag_of_Words'].str.strip().str.replace('   ', ' ').str.replace('  ', ' ')

        groundTruthDB = groundTruthDB.drop(columns=['Topic Title', 'Leading Comment', 'Unnamed: 0'])
        
        # Apply cleanText() to all columns with this:
        groundTruthDB['Bag_of_Words'] = groundTruthDB['Bag_of_Words'].apply(lambda x: self.cleanText(x))

        '''
        #create an offset value
        offset = 0
        #the total number of unique comments
        total = len(groundTruthDB)
        for index, entry in enumerate(groundTruthDB['Bag_of_Words']):
            #create a duplicate if post has multiple tags
            tag_list = ast.literal_eval(groundTruthDB.loc[index, 'Tags'])
            text = groundTruthDB.loc[index,'Bag_of_Words']
            while (isinstance(tag_list, list) and len(tag_list) > 1):
                #print(index)
                #sets the tag for the duplicate to a string
                groundTruthDB.loc[total+offset, 'Tags'] = tag_list.pop()
                #Adds the duplicate to the end of the pandas dataframe
                groundTruthDB.loc[total+offset, 'Bag_of_Words'] = text
                offset = offset + 1
            #Changes the first tag to a string
            if (len(tag_list) == 1):
                groundTruthDB.loc[index, 'Tags'] = tag_list.pop()
            #Changes empty tags from lists to strings
            if (isinstance(groundTruthDB.loc[index, 'Tags'], list)):
                groundTruthDB.loc[index, 'Tags'] = ''
                # Not sure why this element is stored as '[]' instead of ''
        '''

        # Filter out topics with no tags
        groundTruthDB = groundTruthDB[groundTruthDB['Tags'].map(len) > 2]

        # Convert Tag column elements from strings to lists
        groundTruthDB['Tags'] = groundTruthDB.Tags.apply(lambda x: x[1:-1].split(','))

        # Split ground truth database into labeled and unlabelled databases
        #mask = np.random.rand(len(groundTruthDB)) < 0.8
        #labeledDB = groundTruthDB[~mask]
        #unlabeledDB = groundTruthDB[mask]['Bag_of_Words']

        groundTruthDB = groundTruthDB.sample(2000)

        unlabeledDB, labeledDB = train_test_split(groundTruthDB, test_size=0.2)
        #unlabeledDB = unlabeledDB['Bag_of_Words']

        return groundTruthDB, labeledDB, unlabeledDB


    '''
    @brief      Demonstration function to run the entire annotator application
    @param      
    @return     None
    '''
    def runApplication(self, classifier):
        # Create multilabel binarizer for metric calculations
        mlb = MultiLabelBinarizer()

        # Set up TagPredictor object
        tagPredictor = TagPredictor(classifier, self.labeledDB)

        # Train tagPredictor
        tagPredictor.train()

        # Predict tags for all unlabeled topics
        tagList, confidenceList = tagPredictor.predict(self.unlabeledDB['Bag_of_Words'])

        # Continue running the active learning loop as long as there are still low-confidence topics
        counter = 1
        print(min(confidenceList))
        print(max(confidenceList))
        while (any(p < self.confidenceThreshold for p in confidenceList) == True):
            # Log tagging statistics
            print('Active Learning Iteration ', counter)
            print('Labeled Database Size: ', len(self.labeledDB))
            print('Unlabeled Database Size: ', len(self.unlabeledDB))
            trueLabelIndicatorMatrix = mlb.fit_transform(self.unlabeledDB['Tags'])
            predictedLabelIndicatorMatrix = mlb.transform(tagList)
            print('Hamming Loss: ', hamming_loss(trueLabelIndicatorMatrix, predictedLabelIndicatorMatrix))
            print('Accuracy: ', accuracy_score(trueLabelIndicatorMatrix, predictedLabelIndicatorMatrix))
            
            # Get low-confidence topic indices
            lowConfIndices = [i for i in range(len(confidenceList)) if confidenceList[i] < self.confidenceThreshold]

            # Pass low-confidence topics to the manual tagger
            lowConfTopics = self.unlabeledDB.iloc[lowConfIndices]
            #print(lowConfIndices)
            #print(lowConfTopics)
            labeledTopics = self.manualTagger.tagTopics(lowConfTopics)

            # Add manually tagged topics to the labeled database
            self.labeledDB = pd.concat([self.labeledDB, labeledTopics], join='inner')

            # Remove tagged topics from unlabeled database
            #self.unlabeledDB = self.unlabeledDB.drop(lowConfTopics)

            cond = self.unlabeledDB['Bag_of_Words'].isin(lowConfTopics['Bag_of_Words'])
            print(len(self.unlabeledDB))
            print(len(lowConfTopics))
            self.unlabeledDB.drop(self.unlabeledDB[cond].index, inplace=True)

            if (len(self.unlabeledDB) == 0):
                break

            # Train tagPredictor with updated database
            tagPredictor = TagPredictor(classifier, self.labeledDB)
            tagPredictor.train()

            # Predict tags for all unlabeled topics
            tagList, confidenceList = tagPredictor.predict(self.unlabeledDB['Bag_of_Words'])

            counter += 1


### Main

In [24]:
# Path to CSV datafile
datafile = '/content/drive/My Drive/Github/ml-team1-july2020/sandbox/Webscraper/StackOverflow_new_tags.csv'

annotator = Annotator(datafile)

annotator.runApplication(MultilabelClassifier_SVM)

Initialized TagPredictor
Started training
[" 'beautifulsoup'" " 'nlp'" " 'nltk'" " 'scrapy'" " 'selenium-webdriver'"
 " 'sentiment-analysis'" " 'splinter'" " 'text-classification'"
 " 'text-mining'" " 'tf-idf'" " 'web-scraping'" " 'word-embedding'"
 "'beautifulsoup'" "'nlp'" "'nltk'" "'scikit-learn'" "'scrapy'"
 "'selenium'" "'selenium-webdriver'" "'sentiment-analysis'" "'splinter'"
 "'text-classification'" "'text-mining'" "'tf-idf'" "'web-scraping'"
 "'word-embedding'"]
Running SVM Classifier
Finished training
0.9079388586269761
0.9768445579706911
Active Learning Iteration  1
Labeled Database Size:  400
Unlabeled Database Size:  1600
Hamming Loss:  0.04971153846153846
Accuracy:  0.165
1600
1019
Initialized TagPredictor
Started training
[" 'beautifulsoup'" " 'nlp'" " 'nltk'" " 'scrapy'" " 'selenium-webdriver'"
 " 'sentiment-analysis'" " 'splinter'" " 'text-classification'"
 " 'text-mining'" " 'tf-idf'" " 'web-scraping'" " 'word-embedding'"
 "'beautifulsoup'" "'nlp'" "'nltk'" "'scikit-l

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


Running SVM Classifier
Finished training
Active Learning Iteration  2
Labeled Database Size:  1419
Unlabeled Database Size:  581
Hamming Loss:  0.04355885078776645
Accuracy:  0.23235800344234078
581
101
Initialized TagPredictor
Started training
[" 'beautifulsoup'" " 'nlp'" " 'nltk'" " 'scrapy'" " 'selenium-webdriver'"
 " 'sentiment-analysis'" " 'splinter'" " 'text-classification'"
 " 'text-mining'" " 'tf-idf'" " 'web-scraping'" " 'word-embedding'"
 "'beautifulsoup'" "'nlp'" "'nltk'" "'scikit-learn'" "'scrapy'"
 "'selenium'" "'selenium-webdriver'" "'sentiment-analysis'" "'splinter'"
 "'text-classification'" "'text-mining'" "'tf-idf'" "'web-scraping'"
 "'word-embedding'"]


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


Running SVM Classifier
Finished training
Active Learning Iteration  3
Labeled Database Size:  1520
Unlabeled Database Size:  480
Hamming Loss:  0.040705128205128206
Accuracy:  0.2625
480
55
Initialized TagPredictor
Started training
[" 'beautifulsoup'" " 'nlp'" " 'nltk'" " 'scrapy'" " 'selenium-webdriver'"
 " 'sentiment-analysis'" " 'splinter'" " 'text-classification'"
 " 'text-mining'" " 'tf-idf'" " 'web-scraping'" " 'word-embedding'"
 "'beautifulsoup'" "'nlp'" "'nltk'" "'scikit-learn'" "'scrapy'"
 "'selenium'" "'selenium-webdriver'" "'sentiment-analysis'" "'splinter'"
 "'text-classification'" "'text-mining'" "'tf-idf'" "'web-scraping'"
 "'word-embedding'"]


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


Running SVM Classifier
Finished training
Active Learning Iteration  4
Labeled Database Size:  1575
Unlabeled Database Size:  425
Hamming Loss:  0.039095022624434386
Accuracy:  0.26823529411764707
425
26
Initialized TagPredictor
Started training
[" 'beautifulsoup'" " 'nlp'" " 'nltk'" " 'scrapy'" " 'selenium-webdriver'"
 " 'sentiment-analysis'" " 'splinter'" " 'text-classification'"
 " 'text-mining'" " 'tf-idf'" " 'web-scraping'" " 'word-embedding'"
 "'beautifulsoup'" "'nlp'" "'nltk'" "'scikit-learn'" "'scrapy'"
 "'selenium'" "'selenium-webdriver'" "'sentiment-analysis'" "'splinter'"
 "'text-classification'" "'text-mining'" "'tf-idf'" "'web-scraping'"
 "'word-embedding'"]


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


Running SVM Classifier
Finished training
Active Learning Iteration  5
Labeled Database Size:  1601
Unlabeled Database Size:  399
Hamming Loss:  0.038172353961827644
Accuracy:  0.2656641604010025
399
27
Initialized TagPredictor
Started training
[" 'beautifulsoup'" " 'nlp'" " 'nltk'" " 'scrapy'" " 'selenium-webdriver'"
 " 'sentiment-analysis'" " 'splinter'" " 'text-classification'"
 " 'text-mining'" " 'tf-idf'" " 'web-scraping'" " 'word-embedding'"
 "'beautifulsoup'" "'nlp'" "'nltk'" "'scikit-learn'" "'scrapy'"
 "'selenium'" "'selenium-webdriver'" "'sentiment-analysis'" "'splinter'"
 "'text-classification'" "'text-mining'" "'tf-idf'" "'web-scraping'"
 "'word-embedding'"]


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


Running SVM Classifier
Finished training
Active Learning Iteration  6
Labeled Database Size:  1628
Unlabeled Database Size:  372
Hamming Loss:  0.037944582299421006
Accuracy:  0.2903225806451613
372
10
Initialized TagPredictor
Started training
[" 'beautifulsoup'" " 'nlp'" " 'nltk'" " 'scrapy'" " 'selenium-webdriver'"
 " 'sentiment-analysis'" " 'splinter'" " 'text-classification'"
 " 'text-mining'" " 'tf-idf'" " 'web-scraping'" " 'word-embedding'"
 "'beautifulsoup'" "'nlp'" "'nltk'" "'scikit-learn'" "'scrapy'"
 "'selenium'" "'selenium-webdriver'" "'sentiment-analysis'" "'splinter'"
 "'text-classification'" "'text-mining'" "'tf-idf'" "'web-scraping'"
 "'word-embedding'"]


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


Running SVM Classifier
Finished training
Active Learning Iteration  7
Labeled Database Size:  1638
Unlabeled Database Size:  362
Hamming Loss:  0.038567785805354866
Accuracy:  0.27624309392265195
362
9
Initialized TagPredictor
Started training
[" 'beautifulsoup'" " 'nlp'" " 'nltk'" " 'scrapy'" " 'selenium-webdriver'"
 " 'sentiment-analysis'" " 'splinter'" " 'text-classification'"
 " 'text-mining'" " 'tf-idf'" " 'web-scraping'" " 'word-embedding'"
 "'beautifulsoup'" "'nlp'" "'nltk'" "'scikit-learn'" "'scrapy'"
 "'selenium'" "'selenium-webdriver'" "'sentiment-analysis'" "'splinter'"
 "'text-classification'" "'text-mining'" "'tf-idf'" "'web-scraping'"
 "'word-embedding'"]


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


Running SVM Classifier
Finished training
Active Learning Iteration  8
Labeled Database Size:  1647
Unlabeled Database Size:  353
Hamming Loss:  0.03802571366310743
Accuracy:  0.2776203966005666
353
3
Initialized TagPredictor
Started training
[" 'beautifulsoup'" " 'nlp'" " 'nltk'" " 'scrapy'" " 'selenium-webdriver'"
 " 'sentiment-analysis'" " 'splinter'" " 'text-classification'"
 " 'text-mining'" " 'tf-idf'" " 'web-scraping'" " 'word-embedding'"
 "'beautifulsoup'" "'nlp'" "'nltk'" "'scikit-learn'" "'scrapy'"
 "'selenium'" "'selenium-webdriver'" "'sentiment-analysis'" "'splinter'"
 "'text-classification'" "'text-mining'" "'tf-idf'" "'web-scraping'"
 "'word-embedding'"]


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


Running SVM Classifier


KeyboardInterrupt: ignored

### All test code below here

In [25]:
# Set up TagPredictor object
tagPredictor = TagPredictor(MultilabelClassifier_SVM, annotator.labeledDB)

# Train tagPredictor
tagPredictor.train()



Initialized TagPredictor
Started training
[" 'beautifulsoup'" " 'nlp'" " 'nltk'" " 'scrapy'" " 'selenium-webdriver'"
 " 'sentiment-analysis'" " 'splinter'" " 'text-classification'"
 " 'text-mining'" " 'tf-idf'" " 'web-scraping'" " 'word-embedding'"
 "'beautifulsoup'" "'nlp'" "'nltk'" "'scikit-learn'" "'scrapy'"
 "'selenium'" "'selenium-webdriver'" "'sentiment-analysis'" "'splinter'"
 "'text-classification'" "'text-mining'" "'tf-idf'" "'web-scraping'"
 "'word-embedding'"]
Running SVM Classifier
Finished training


In [26]:
# Predict tags for all unlabeled topics
tagList, confidenceList = tagPredictor.predict(annotator.unlabeledDB['Bag_of_Words'])
print(tagList)
print(confidenceList)

[("'text-classification'",), (), (), (), (), ("'selenium'",), (" 'selenium-webdriver'", "'selenium'"), (), (), ("'scikit-learn'",), (), ("'sentiment-analysis'",), (), ("'sentiment-analysis'",), (), (), ("'scikit-learn'",), (" 'selenium-webdriver'", "'selenium'"), (" 'selenium-webdriver'", "'selenium'"), (), (), (" 'selenium-webdriver'", "'selenium'"), (), ("'scikit-learn'",), (), (), ("'text-classification'",), (), (), (), (), (), (), ("'scrapy'",), (), (), (), (), (" 'splinter'", "'splinter'"), ("'nlp'",), (), ("'sentiment-analysis'",), ("'sentiment-analysis'",), (), (), (" 'selenium-webdriver'", "'selenium'"), ("'scrapy'",), ("'nltk'",), ("'tf-idf'",), (), (" 'selenium-webdriver'", "'selenium'"), (), (" 'sentiment-analysis'", "'sentiment-analysis'"), (), (), (), (), (), (), (), ("'nltk'",), (), (), (), (), (" 'selenium-webdriver'", "'selenium'"), (), (), (), ("'scrapy'",), (), ("'scikit-learn'",), (), ("'scikit-learn'",), ("'scikit-learn'",), (), (), (" 'word-embedding'", "'nlp'", "'

In [27]:
from sklearn.preprocessing import MultiLabelBinarizer
mlb = MultiLabelBinarizer()

trueLabelIndicatorMatrix = mlb.fit_transform(annotator.unlabeledDB['Tags'])
predictedLabelIndicatorMatrix = mlb.transform(tagList)

from sklearn.metrics import hamming_loss
from sklearn.metrics import accuracy_score
print(hamming_loss(trueLabelIndicatorMatrix, predictedLabelIndicatorMatrix))
print(accuracy_score(trueLabelIndicatorMatrix, predictedLabelIndicatorMatrix))

0.03802197802197802
0.28


In [None]:
print(trueLabelIndicatorMatrix.shape)
print(predictedLabelIndicatorMatrix.shape)

print(max(confidenceList))
print(min(confidenceList))
print(len(tagList))
print(len(annotator.groundTruthDB))

In [None]:
from operator import itemgetter 
import numpy as np
a = ['abc', 'def', 'ghi', 'this', 'is', 'great']
b = np.array([0,1,0,1,0,1])
c = [i for i in range(len(b)) if b[i] == 1]
print([a[i] for i in range(len(b)) if b[i] == 1])
print(itemgetter(*c)(a))