In [2]:
import requests
from bs4 import BeautifulSoup
page = requests.get("https://stackoverflow.com/questions")
soup = BeautifulSoup(page.text, 'html.parser')

In [3]:
p=soup.select(".question-summary")
for i in p:
    a=i.select_one('.question-hyperlink').getText()

In [4]:
import nltk
import ast
nltk.download('stopwords')
from nltk.corpus import stopwords
from ast import literal_eval
import pandas as pd
import numpy as np
import os

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\harsh\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [5]:
def literal_return(val):
    try:
        return ast.literal_eval(val)
    except ValueError:
        return (val)

In [6]:
train = pd.read_csv('train.tsv',sep="\t")
train['tags'] = train['tags'].apply(literal_return)
validation=pd.read_csv('validation.tsv',sep="\t")
validation['tags']=validation['tags'].apply(literal_return)
test=pd.read_csv('test.tsv',sep="\t")
print(train.head())

                                               title                  tags
0                How to draw a stacked dotplot in R?                   [r]
1  mysql select all records where a datetime fiel...          [php, mysql]
2             How to terminate windows phone 8.1 app                  [c#]
3  get current time in a specific country via jquery  [javascript, jquery]
4                      Configuring Tomcat to Use SSL                [java]


In [7]:
X_train, y_train = train['title'].values, train['tags'].values
X_val, y_val = validation['title'].values, validation['tags'].values
X_test = test['title'].values

In [8]:
import re
REPLACE_BY_SPACE_RE = re.compile('[/(){}\[\]\|@,;]')
BAD_SYMBOLS_RE = re.compile('[^0-9a-z #+_]')
STOPWORDS = set(stopwords.words('english'))
def text_prepare(text):

    text = text.lower()
    text = re.sub(REPLACE_BY_SPACE_RE," ",text)
    text = re.sub(BAD_SYMBOLS_RE,"",text)
    text = " ".join([s for s in text.split(" ") if s not in STOPWORDS])
    return text

In [9]:
X_train = [text_prepare(x) for x in X_train]
X_val = [text_prepare(x) for x in X_val]
X_test = [text_prepare(x) for x in X_test]

In [10]:
y_train

array([list(['r']), list(['php', 'mysql']), list(['c#']), ...,
       list(['c#', 'asp.net-mvc']), list(['javascript', 'jquery']),
       list(['python'])], dtype=object)

In [11]:
from nltk.tokenize.treebank import TreebankWordTokenizer
twd = TreebankWordTokenizer()
twd.tokenize(X_train[0])

['draw', 'stacked', 'dotplot', 'r']

In [12]:
tags_counts = {}
words_counts = {}
all_tags = []
vocabulary=[]
from collections import Counter
from nltk.tokenize.treebank import TreebankWordTokenizer
twd = TreebankWordTokenizer()

for tag in y_train:
    all_tags.extend(tag)

for txt in X_train:
    vocabulary.extend(twd.tokenize(txt))

tags_counts = Counter(all_tags)
words_counts = Counter(vocabulary)

In [13]:
import operator

TOP_WORDS=sorted_d = sorted(words_counts.items(), key=operator.itemgetter(1),reverse=True)[:5000]

In [14]:
TOP_WORDS

[('using', 4395),
 ('c', 3206),
 ('php', 3004),
 ('java', 2922),
 ('file', 2697),
 ('#', 2487),
 ('javascript', 2382),
 ('error', 2333),
 ('get', 2214),
 ('python', 2210),
 ('string', 1960),
 ('array', 1822),
 ('data', 1738),
 ('jquery', 1709),
 ('value', 1600),
 ('function', 1531),
 ('class', 1440),
 ('object', 1390),
 ('use', 1353),
 ('c++', 1217),
 ('multiple', 1152),
 ('method', 1123),
 ('list', 1093),
 ('text', 1073),
 ('page', 1025),
 ('rails', 992),
 ('image', 928),
 ('html', 916),
 ('form', 904),
 ('type', 891),
 ('code', 888),
 ('create', 881),
 ('set', 875),
 ('values', 863),
 ('working', 847),
 ('json', 835),
 ('mysql', 833),
 ('convert', 833),
 ('add', 825),
 ('server', 820),
 ('database', 816),
 ('android', 816),
 ('without', 791),
 ('aspnet', 790),
 ('variable', 773),
 ('way', 769),
 ('django', 765),
 ('change', 763),
 ('button', 746),
 ('one', 743),
 ('find', 737),
 ('application', 729),
 ('windows', 727),
 ('table', 692),
 ('ajax', 691),
 ('can', 690),
 ('not', 690),
 (

In [15]:
import operator
DICT_SIZE = 5000
ALL_WORDS =[i for i,j in sorted(words_counts.items(), key=operator.itemgetter(1),reverse=True)[:5000]]

WORDS_TO_INDEX = {}
for count, word in enumerate(ALL_WORDS):
    WORDS_TO_INDEX.update({word:count})

In [16]:
def my_bag_of_words(text, words_to_index, dict_size):

    result_vector = np.zeros(dict_size)
    
    for word in text.split():
        if word in words_to_index.keys():
            x= words_to_index[word]
            result_vector[x]=1
    return result_vector

In [17]:
from scipy import sparse as sp_sparse

In [18]:
X_train_mybag = sp_sparse.vstack([sp_sparse.csr_matrix(my_bag_of_words(text, WORDS_TO_INDEX, DICT_SIZE)) for text in X_train])
X_val_mybag = sp_sparse.vstack([sp_sparse.csr_matrix(my_bag_of_words(text, WORDS_TO_INDEX, DICT_SIZE)) for text in X_val])
X_test_mybag = sp_sparse.vstack([sp_sparse.csr_matrix(my_bag_of_words(text, WORDS_TO_INDEX, DICT_SIZE)) for text in X_test])

In [19]:
row = X_train_mybag[10].toarray()[0]
row

array([0., 0., 0., ..., 0., 0., 0.])

In [20]:
row = X_train_mybag[10].toarray()[0]
non_zero_elements_count = len(row[row != 0])

In [21]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [22]:
def tfidf_features(X_train, X_val, X_test):
    tfidf_vectorizer = TfidfVectorizer(token_pattern = '(\S+)')
    X_train = tfidf_vectorizer.fit_transform(X_train)
    X_val = tfidf_vectorizer.transform(X_val)
    X_test = tfidf_vectorizer.transform(X_test)

    return X_train, X_val, X_test, tfidf_vectorizer.vocabulary_

In [23]:
X_train_tfidf, X_val_tfidf, X_test_tfidf, tfidf_vocab = tfidf_features(X_train, X_val, X_test)
tfidf_reversed_vocab = {i:word for word,i in tfidf_vocab.items()}

In [24]:
from sklearn.preprocessing import MultiLabelBinarizer
mlb = MultiLabelBinarizer(classes=sorted(tags_counts.keys()))
y_train = mlb.fit_transform(y_train)
y_val = mlb.fit_transform(y_val)

In [25]:
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import LogisticRegression, RidgeClassifier

In [26]:
def train_classifier(X_train, y_train):
    
    model = OneVsRestClassifier(LogisticRegression(random_state=0)).fit(X_train, y_train)
    
    return model

In [27]:
classifier_mybag = train_classifier(X_train_mybag, y_train)
classifier_tfidf = train_classifier(X_train_tfidf, y_train)



In [28]:
y_val_predicted_labels_mybag = classifier_mybag.predict(X_val_mybag)
y_val_predicted_scores_mybag = classifier_mybag.decision_function(X_val_mybag)

y_val_predicted_labels_tfidf = classifier_tfidf.predict(X_val_tfidf)
y_val_predicted_scores_tfidf = classifier_tfidf.decision_function(X_val_tfidf)

In [29]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import roc_auc_score 
from sklearn.metrics import average_precision_score
from sklearn.metrics import recall_score

In [32]:
def print_evaluation_scores(y_val, predicted):
    
    print ("Accuracy",accuracy_score(y_val, predicted))
    print ("F1 Score",f1_score(y_val, predicted, average='macro'))
    print ("Average Precision",average_precision_score(y_val, predicted, average='macro')) 

In [33]:
print('Bag-of-words')
print_evaluation_scores(y_val, y_val_predicted_labels_mybag)
print('Tfidf')
print_evaluation_scores(y_val, y_val_predicted_labels_tfidf)

Bag-of-words
Accuracy 0.4140333333333333


  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)
  recall = tps / tps[-1]


F1 Score 0.5667977545606032
Average Precision nan
Tfidf
Accuracy 0.3365666666666667


  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)
  recall = tps / tps[-1]


F1 Score 0.42895305088277075
Average Precision nan
