In [1]:
import pandas as pd
import numpy as np
import copy
import ast
from nltk.tokenize import word_tokenize
from nltk import pos_tag
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.preprocessing import LabelEncoder
from collections import defaultdict
from nltk.corpus import wordnet as wn
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer

%run classifier_NB.ipynb
%run classifier_SVM.ipynb

# code taken from https://medium.com/@bedigunjit/simple-guide-to-text-classification-nlp-using-svm-and-naive-bayes-with-python-421db3a72d34

class TagPredictor:
    def __init__(self, classifier, corpus, categories):
        self.classifier = classifier
        self.corpus = corpus
        self.categories = categories

        np.random.seed(500)

        print("Initialized TagPredictor")
        
        
    def train(self):
        print("Started training")
        
        # Filter out unused categories
        self.corpus['Tags'] = self.corpus['Tags'].map(lambda tags : list(filter(lambda tag : tag in categories, ast.literal_eval(tags))))
        
        # Transform tags to multilabel format
        self.mlb = MultiLabelBinarizer()
        Y_matrix = self.mlb.fit_transform(self.corpus['Tags'])
        #np.set_printoptions(threshold=np.inf)
        #print(matrix[0])
        
        train, test, Train_Y, Test_Y = train_test_split(self.corpus, Y_matrix, test_size=0.3, shuffle=True)
        Train_X = train['text_final']
        Test_X = test['text_final']
        #print(Train_X)
        #print(Train_Y)
        

        self.Tfidf_vect = TfidfVectorizer(max_features=5000)
        self.Tfidf_vect.fit(self.corpus['text_final'])
        Train_X_Tfidf = self.Tfidf_vect.transform(Train_X)
        Test_X_Tfidf = self.Tfidf_vect.transform(Test_X)

        self.model = self.classifier()
        self.model.train(Train_X_Tfidf, Train_Y, Test_X_Tfidf, Test_Y)
        
        print("Finished training")
        
        
    def predict(self, df):
        # return predictions_df, confidence_level
        X = df['text_final']
        X_Tfidf = self.Tfidf_vect.transform(X)
        matrix = self.model.predict(X_Tfidf)
        labels = self.mlb.inverse_transform(matrix)
        return labels
        
"""
def main():
    nb = Classifier_NB()
    svm = Classifier_SVM()

    # read the data frame from csv (change the path for your local machine)
    corpus = pd.read_csv(r"/Users/maxim/dev/STEM-Away/ml-team1-july2020/sandbox/webscraper/StackOverflow.csv", engine='python')
    print(corpus.shape)
    print(corpus.columns)
    
    # tagPredictor = TagPredictor(nb, corpus)
    #tagPredictor = TagPredictor(svm, corpus)

    #tagPredictor.preprocess()
    #tagPredictor.train()

if __name__ == '__main__':
    main()
"""

'\ndef main():\n    nb = Classifier_NB()\n    svm = Classifier_SVM()\n\n    # read the data frame from csv (change the path for your local machine)\n    corpus = pd.read_csv(r"/Users/maxim/dev/STEM-Away/ml-team1-july2020/sandbox/webscraper/StackOverflow.csv", engine=\'python\')\n    print(corpus.shape)\n    print(corpus.columns)\n    \n    # tagPredictor = TagPredictor(nb, corpus)\n    #tagPredictor = TagPredictor(svm, corpus)\n\n    #tagPredictor.preprocess()\n    #tagPredictor.train()\n\nif __name__ == \'__main__\':\n    main()\n'

In [2]:
# Preprocess

def preprocess(df):
        print("Started preprocessing")

        # Step - a : Remove blank rows if any.
        df['Topic Title'].dropna(inplace=True)
        # Step - b : Change all the text to lower case. This is required as python interprets 'dog' and 'DOG' differently
        df['Topic Title'] = [entry.lower() for entry in df['Topic Title']]
        # Step - c : Tokenization : In this each entry in the corpus will be broken into set of words
        df['Topic Title'] = [word_tokenize(entry) for entry in df['Topic Title']]
        # Step - d : Remove Stop words, Non-Numeric and perfom Word Stemming/Lemmenting.
        # WordNetLemmatizer requires Pos tags to understand if the word is noun or verb or adjective etc. By default it is set to Noun
        tag_map = defaultdict(lambda : wn.NOUN)
        tag_map['J'] = wn.ADJ
        tag_map['V'] = wn.VERB
        tag_map['R'] = wn.ADV
        for index,entry in enumerate(df['Topic Title']):
            # Declaring Empty List to store the words that follow the rules for this step
            Final_words = []
            # Initializing WordNetLemmatizer()
            word_Lemmatized = WordNetLemmatizer()
            # pos_tag function below will provide the 'tag' i.e if the word is Noun(N) or Verb(V) or something else.
            for word, tag in pos_tag(entry):
                # Below condition is to check for Stop words and consider only alphabets
                if word not in stopwords.words('english') and word.isalpha():
                    word_Final = word_Lemmatized.lemmatize(word,tag_map[tag[0]])
                    Final_words.append(word_Final)
            # The final processed set of words for each iteration will be stored in 'text_final'
            df.loc[index,'text_final'] = str(Final_words)

        print("Finished preprocessing")
        
        return df

In [3]:
# Main

# read the data frame from csv (change the path for your local machine)
corpus = pd.read_csv(r"/Users/maxim/dev/STEM-Away/ml-team1-july2020/sandbox/webscraper/StackOverflow.csv", engine='python')
corpus = preprocess(corpus)

Started preprocessing
Finished preprocessing


In [4]:
categories = ['nlp', 'nltk', 'bert', 'word-embedding','text-classification','data-augmentation', 'sentiment-analysis', 'tf-idf', 'scikit-learn', 'feature-extraction', 'text-mining']
    
# tagPredictor = TagPredictor(Classifier_NB, corpus, categories)
tagPredictor = TagPredictor(Classifier_SVM, corpus, categories)

Initialized TagPredictor


In [5]:
tagPredictor.train()

Started training
Running SVM Classifier
SVM Accuracy Score ->  28.42548076923077
Finished training


In [7]:
corpus = pd.read_csv(r"/Users/maxim/dev/STEM-Away/ml-team1-july2020/sandbox/webscraper/StackOverflow.csv", engine='python')
test_df = preprocess(corpus)
tagPredictor.predict(test_df)

Started preprocessing
Finished preprocessing


      Unnamed: 0                                        Topic Title  \
0              0  [valueerror, using, fasttext, trained, bin, mo...   
1              1  [rnn, language, model, in, pytorch, predicting...   
2              2  [add, additional, layers, to, the, huggingface...   
3              3  [topic, score/weight, varies, for, seen, text,...   
4              4  [how, should, i, load, a, large, nlp, file, in...   
...          ...                                                ...   
5541        5541  [extract, variations, of, a, string, from, r, ...   
5542        5542  [how, to, build, a, subject-verb-object, extra...   
5543        5543  [naive, bayes, model, not, predicting, anythin...   
5544        5544      [how, to, read, csv, file, for, text, mining]   
5545        5545  [keep, only, sentences, in, corpus, that, cont...   

                                                   Tags  \
0     ['machine-learning', 'nlp', 'unsupervised-lear...   
1     ['machine-learning', 'n

[(),
 (),
 ('nlp',),
 ('nlp',),
 ('nlp',),
 ('scikit-learn',),
 ('nlp',),
 ('nlp',),
 (),
 ('nlp',),
 (),
 (),
 (),
 (),
 (),
 (),
 ('nlp',),
 (),
 ('nlp',),
 (),
 (),
 ('nlp',),
 (),
 ('nlp',),
 (),
 ('nlp',),
 (),
 ('nlp',),
 (),
 ('nlp',),
 ('nlp',),
 ('nlp',),
 ('nlp',),
 ('nlp',),
 (),
 ('nlp',),
 ('nlp',),
 ('nlp',),
 (),
 (),
 ('nlp',),
 ('nlp',),
 ('nlp',),
 ('nlp',),
 (),
 (),
 ('nlp',),
 ('nlp',),
 (),
 ('text-mining',),
 (),
 ('nlp',),
 (),
 ('sentiment-analysis',),
 ('nlp',),
 ('nlp',),
 (),
 ('nlp',),
 (),
 (),
 ('bert',),
 (),
 ('nlp',),
 ('bert', 'nlp'),
 (),
 (),
 (),
 (),
 (),
 ('nlp',),
 ('nltk',),
 ('nlp',),
 (),
 ('nlp',),
 ('nlp',),
 ('feature-extraction',),
 (),
 (),
 ('bert',),
 ('nlp',),
 ('bert', 'nlp'),
 ('nlp',),
 (),
 (),
 (),
 ('text-classification',),
 (),
 (),
 (),
 ('nlp',),
 ('nlp',),
 ('nlp',),
 ('nlp',),
 (),
 (),
 (),
 ('nlp', 'nltk'),
 ('nlp',),
 (),
 (),
 (),
 (),
 (),
 (),
 ('nlp', 'word-embedding'),
 ('nlp',),
 (),
 (),
 (),
 (),
 ('nlp',),
 ('nl