In [None]:
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup as bs

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.neural_network import MLPClassifier

#locations of the training and validation splits, kept in named constants so the read calls stay short
TRAIN_CSV_URL = 'https://media.githubusercontent.com/media/jasonmoxley/StackOverflowPredictor/main/train.csv'
VALID_CSV_URL = 'https://media.githubusercontent.com/media/jasonmoxley/StackOverflowPredictor/main/valid.csv'

df_train = pd.read_csv(TRAIN_CSV_URL)
df_valid = pd.read_csv(VALID_CSV_URL)

In [None]:
#shows the first 5 rows of the raw training data frame
df_train.head(n=5)

Unnamed: 0,Id,Title,Body,Tags,CreationDate,Y
0,34552656,Java: Repeat Task Every Random Seconds,<p>I'm already familiar with repeating tasks e...,<java><repeat>,2016-01-01 00:21:59,LQ_CLOSE
1,34553034,Why are Java Optionals immutable?,<p>I'd like to understand why Java 8 Optionals...,<java><optional>,2016-01-01 02:03:20,HQ
2,34553174,Text Overlay Image with Darkened Opacity React...,<p>I am attempting to overlay a title over an ...,<javascript><image><overlay><react-native><opa...,2016-01-01 02:48:24,HQ
3,34553318,Why ternary operator in swift is so picky?,"<p>The question is very simple, but I just cou...",<swift><operators><whitespace><ternary-operato...,2016-01-01 03:30:17,HQ
4,34553755,hide/show fab with scale animation,<p>I'm using custom floatingactionmenu. I need...,<android><material-design><floating-action-but...,2016-01-01 05:21:48,HQ


In [None]:
#shows the first 5 rows of the raw validation data frame
df_valid.head(n=5)

Unnamed: 0,Id,Title,Body,Tags,CreationDate,Y
0,34552974,How to get all the child records from differen...,I am having 4 different tables like \r\nselect...,<sql><sql-server>,2016-01-01 01:44:52,LQ_EDIT
1,34554721,Retrieve all except some data of the another t...,I have two table m_master and tbl_appointment\...,<php><mysql><sql><codeigniter><mysqli>,2016-01-01 08:43:50,LQ_EDIT
2,34555135,Pandas: read_html,<p>I'm trying to extract US states from wiki U...,<python><pandas>,2016-01-01 09:55:22,HQ
3,34555448,Reader Always gimme NULL,"I'm so new to C#, I wanna make an application ...",<sql-server><c#-4.0>,2016-01-01 10:43:45,LQ_EDIT
4,34555752,php rearrange array elements based on condition,basically i have this array:\r\n\r\n array(...,<php>,2016-01-01 11:34:09,LQ_EDIT


In [None]:
#pad every tag with a space on both sides so that neighbouring words stay separated once beautifulsoup removes the tags
def add_spacing(data):
  """Return data with a space inserted before each '<' and after each '>'."""
  #one translate pass performs both substitutions character by character
  padded_brackets = {ord("<"): " <", ord(">"): "> "}
  return data.translate(padded_brackets)

#remove excess spacing after beautifulsoup removes tags
def remove_spacing(data):
  """Collapse every run of consecutive spaces into one space and trim the ends.

  Args:
    data: the string to clean.

  Returns:
    The string with no two adjacent space characters and with leading and
    trailing whitespace stripped. Newlines and tabs inside the text are
    left untouched.
  """
  #each replace() pass roughly halves every run of spaces, so repeat until no double space is left;
  #a fixed chain of 4/3/2-space replacements is not enough (11 spaces would come out as 2)
  while "  " in data:
    data = data.replace("  ", " ")
  return data.strip()

In [None]:
#this function uses the BeautifulSoup library to convert html to text
def htmlToText(html_string):
  """Strip the markup from an html string and return the remaining text.

  Every <code> element, including its contents, is replaced by the literal
  text " code " before the text is extracted.

  Args:
    html_string: the html markup to convert.

  Returns:
    The text content of the parsed document as a single string.
  """
  #name the parser explicitly: without it BeautifulSoup picks whichever parser happens to be
  #installed (and warns about it), so the extracted text could differ between machines
  soup = bs(html_string, "html.parser")
  #replacing all the code tags with the word code
  for codeTag in soup.find_all('code'):
    codeTag.replace_with(" code ")
  return soup.get_text()

In [None]:
def tagsToText(tags):
  """Turn a tag string such as '<a><b>' into the space separated form 'a b'."""
  #the boundary between two adjacent tags becomes the separating space
  spaced = tags.replace("><", " ")
  #every bracket that is still left is dropped
  without_brackets = spaced.translate({ord("<"): None, ord(">"): None})
  return without_brackets.strip()

In [None]:
#sanity check of the html clean-up: tags are stripped and the code element becomes the word "code"
test = remove_spacing(htmlToText(add_spacing(
  '''<p>this is a test</p><code>this should be replaced with "code"</code><p>this is a test</p>'''
)))
print("Two outputs below should be the same")
print("this is a test code this is a test")
print(test)

Two outputs below should be the same
this is a test code this is a test
this is a test code this is a test


In [None]:
#sanity check of the tag clean-up: brackets are removed and tags end up separated by a space
test = tagsToText('''<machine><learning>''')
print("Two outputs below should be the same")
print("machine learning")
print(test)

Two outputs below should be the same
machine learning
machine learning


In [None]:
#cleans data using the above functions into a more usable form
def process_data(data):
  """Turn a raw posts frame into a frame of post text and numeric label.

  Args:
    data: pandas DataFrame with the columns 'Id', 'CreationDate', 'Title',
      'Body', 'Tags' and 'Y'. The frame that is passed in is not modified.

  Returns:
    A new DataFrame in which those columns are replaced by 'Post' (title,
    cleaned body and cleaned tags joined by single spaces) and 'Label'
    (1 for 'HQ', 2 for 'LQ_EDIT', 3 for 'LQ_CLOSE'). 'Y' values outside
    that mapping end up as NaN in 'Label'.
  """
  #data is a pandas data frame; drop() returns a new frame, so the caller's frame stays intact
  data = data.drop(columns=['Id', 'CreationDate'])

  #these lines convert the body column from html to text
  #Series.apply works element by element and has no axis parameter, so only the function is passed
  data['Body'] = (
    data['Body']
    .apply(add_spacing)
    .apply(htmlToText)
    .apply(remove_spacing)
  )

  data['Tags'] = data['Tags'].apply(tagsToText)

  #combining the title, body and tags columns
  data['Post'] = data['Title'] + ' ' + data['Body'] + ' ' + data['Tags']
  #removing any possible excess spacing
  data['Post'] = data['Post'].apply(remove_spacing)
  #removing the title, body and tags columns
  data = data.drop(columns=['Title', 'Body', 'Tags'])

  #converting the labels to numbers
  data['Label'] = data['Y'].map({'HQ':1, 'LQ_EDIT':2, 'LQ_CLOSE':3})
  #removing the old labels column
  data = data.drop(columns=['Y'])

  return data

In [None]:
#run the cleaning pipeline over the training data and preview the result
df_train = df_train.pipe(process_data)
df_train.head(n=5)

Unnamed: 0,Post,Label
0,Java: Repeat Task Every Random Seconds I'm alr...,3
1,Why are Java Optionals immutable? I'd like to ...,1
2,Text Overlay Image with Darkened Opacity React...,1
3,Why ternary operator in swift is so picky? The...,1
4,hide/show fab with scale animation I'm using c...,1


In [None]:
#run the cleaning pipeline over the validation data and preview the result
df_valid = df_valid.pipe(process_data)
df_valid.head(n=5)

Unnamed: 0,Post,Label
0,How to get all the child records from differen...,2
1,Retrieve all except some data of the another t...,2
2,Pandas: read_html I'm trying to extract US sta...,1
3,"Reader Always gimme NULL I'm so new to C#, I w...",2
4,php rearrange array elements based on conditio...,2


In [None]:
#number of most frequent words reported for each label
TOP_N_WORDS = 100

#word -> number of occurrences, one dictionary per label
hqWords = {}
lqWords = {}
lqCloseWords = {}
#numeric labels as assigned by process_data: 1 = HQ, 2 = LQ_EDIT, 3 = LQ_CLOSE
countsByLabel = {1: hqWords, 2: lqWords, 3: lqCloseWords}

#walk over the two columns directly instead of building a Series per row with iterrows
for post, label in zip(df_train['Post'], df_train['Label']):
  counts = countsByLabel.get(label)
  if counts is None:
    continue
  #NOTE(review): splitting on a single space keeps tokens such as '\n' and '{\r\n' as "words" - confirm that is intended
  for word in post.split(' '):
    counts[word] = counts.get(word, 0) + 1

#slice the sorted words so that each list holds at most TOP_N_WORDS entries
#(appending first and then checking len(...) > 100 stops one word too late, at 101)
hqWordsList = sorted(hqWords, key=hqWords.get, reverse=True)[:TOP_N_WORDS]
lqWordsList = sorted(lqWords, key=lqWords.get, reverse=True)[:TOP_N_WORDS]
lqCloseWordsList = sorted(lqCloseWords, key=lqCloseWords.get, reverse=True)[:TOP_N_WORDS]

print(f"these are the {TOP_N_WORDS} most common words for each label")
print("label: HQ")
print(hqWordsList)
print("label: LQ_Edit")
print(lqWordsList)
print("label: LQ_Close")
print(lqCloseWordsList)

these are the 100 most common words for each label
label: HQ
['\n', 'the', 'to', 'I', 'code', 'a', 'in', 'is', 'and', 'of', 'it', 'that', 'this', 'with', 'for', 'have', 'my', 'on', 'but', 'not', 'be', 'can', 'an', 'using', 'as', 'from', 'How', 'am', '\n\n', "I'm", 'like', 'do', 'use', 'when', 'there', 'or', 'The', 'get', 'are', 'if', 'how', 'want', 'which', 'way', 'would', 'Is', 'any', '.', 'all', 'some', 'what', 'by', 'error', 'following', 'file', 'so', 'trying', ',', 'at', 'you', 'What', 'does', 'one', 'tried', 'has', 'i', 'This', 'will', 'only', 'need', 'new', "I've", 'just', 'But', 'run', '-', 'was', 'know', 'app', 'same', 'me', 'python', 'In', 'find', 'should', 'no', 'into', 'set', 'then', 'data', "don't", 'other', 'function', 'android', 'When', ':', 'create', 'javascript', 'make', 'also', 'we']
label: LQ_Edit
['the', 'to', '=', '\r\n', 'I', 'in', 'a', 'is', 'and', '{\r\n', 'of', 'i', 'for', 'it', '}\r\n', 'this', 'have', 'my', 'with', 'if', 'that', 'not', 'on', 'new', 'from', 'bu

In [None]:
#the most common words are largely the same filler words ('the', 'to', 'I', ...) for every label, which demonstrates the importance of the tfidf vectorizer
#we are still going to try both the count vectorizer and the tfidf vectorizer for logistic regression and naive bayes
#the tfidf vectorizer is a combination of the count vectorizer and tf-idf term weighting

In [None]:
#separating the posts and labels of the train data
train_posts, train_labels = df_train['Post'], df_train['Label']

#separating the posts and labels of the validation data
valid_posts, valid_labels = df_valid['Post'], df_valid['Label']

In [None]:
#tf-idf features: the vectorizer is fitted on the training posts and then applied to the validation posts
vectorizor = TfidfVectorizer()
train_posts_tfidf = vectorizor.fit_transform(raw_documents=train_posts)
valid_posts_tfidf = vectorizor.transform(raw_documents=valid_posts)

In [None]:
#raw count features: the vectorizer is fitted on the training posts and then applied to the validation posts
vectorizor = CountVectorizer()
train_posts_count = vectorizor.fit_transform(raw_documents=train_posts)
valid_posts_count = vectorizor.transform(raw_documents=valid_posts)

In [None]:
#logistic regression trained on the count vectors, scored on the validation set
lr_result = LogisticRegression(max_iter=4000)
lr_result.fit(train_posts_count, train_labels)
lr_classification_accuracy_count = 100 * lr_result.score(valid_posts_count, valid_labels)
print(f'logistic regression classification accuracy: {lr_classification_accuracy_count}%')

logistic regression classification accuracy: 77.0%


In [None]:
#logistic regression trained on the tf-idf vectors, scored on the validation set
lr_result = LogisticRegression(max_iter=200)
lr_result.fit(train_posts_tfidf, train_labels)
lr_classification_accuracy_tfidf = 100 * lr_result.score(valid_posts_tfidf, valid_labels)
print(f'logistic regression classification accuracy using tf-idf vectors: {lr_classification_accuracy_tfidf}%')

logistic regression classification accuracy using tf-idf vectors: 79.24%


In [None]:
#multinomial naive bayes trained on the count vectors, scored on the validation set
nb_result = MultinomialNB()
nb_result.fit(train_posts_count, train_labels)
nb_classification_accuracy_count = 100 * nb_result.score(valid_posts_count, valid_labels)
print(f'naive bayes classification accuracy: {nb_classification_accuracy_count}%')

naive bayes classification accuracy: 73.7%


In [None]:
#multinomial naive bayes trained on the tf-idf vectors, scored on the validation set
nb_result_tfidf = MultinomialNB()
nb_result_tfidf.fit(train_posts_tfidf, train_labels)
nb_classification_accuracy_tfidf = 100 * nb_result_tfidf.score(valid_posts_tfidf, valid_labels)
print(f'naive bayes classification accuracy using tf-idf vectors: {nb_classification_accuracy_tfidf}%')

naive bayes classification accuracy using tf-idf vectors: 73.33333333333333%


In [None]:
#multi-layer perceptron trained on the tf-idf vectors, scored on the validation set
#MLPClassifier initialises its weights and samples its batches at random, so the seed is pinned
#to make the reported accuracy reproducible from one run to the next
neural_net = MLPClassifier(random_state=42).fit(train_posts_tfidf, train_labels)
neural_net_accuracy = neural_net.score(valid_posts_tfidf, valid_labels) * 100
print('neural net classification accuracy: ' + str(neural_net_accuracy) + '%')

neural net classification accuracy: 73.03333333333333%
