The goal of this project is to build a model that predicts the tags of a question asked on Stack Overflow, based on the text content of that question.

# 1. Import packages and collect data

In [43]:
import pandas as pd
import numpy as np

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import MultiLabelBinarizer

from sklearn.multiclass import OneVsRestClassifier

import ast

In [44]:
# Load the Stack Overflow questions from the remote CSV; the first CSV column
# is used as the row index rather than as a data column.
df = pd.read_csv(
    'https://raw.githubusercontent.com/laxmimerit/All-CSV-ML-Data-Files-Download/master/stackoverflow.csv',
    index_col=0,
)

In [45]:
# Preview the first five rows: the question text and its tag string.
df.head()

Unnamed: 0,Text,Tags
2,aspnet site maps has anyone got experience cre...,"['sql', 'asp.net']"
4,adding scripting functionality to net applicat...,"['c#', '.net']"
5,should i use nested classes in this case i am ...,['c++']
6,homegrown consumption of web services i have b...,['.net']
8,automatically update version number i would li...,['c#']


In [46]:
# (number of rows, number of columns) of the loaded frame.
df.shape

(48976, 2)

Our dataset contains more than 48 thousand questions that have been posted on Stack Overflow, along with their tags.

In [47]:
# Check how a single Tags value is stored right after loading from CSV.
type(df.iloc[0]['Tags'])

str

In [48]:
# Convert the Tags column from its string form (e.g. "['sql', 'asp.net']")
# to real Python lists. Only string values are parsed, so re-running this
# cell after the column has already been converted is a no-op instead of
# raising inside ast.literal_eval on a list.
df['Tags'] = df['Tags'].apply(
    lambda tags: ast.literal_eval(tags) if isinstance(tags, str) else tags
)

In [49]:
# Re-check the type of a Tags value now that the column has been parsed.
type(df.iloc[0]['Tags'])

list

# 2. Prepare the data

## 2.1. Multi-label binarization for tags

In [50]:
# Binarizer that turns each question's list of tags into a 0/1 indicator row.
multilabel = MultiLabelBinarizer()

# Learn the set of tags from the data and encode every question against it;
# the fitted binarizer is kept so predictions can be decoded back to tag names.
y = multilabel.fit_transform(df['Tags'])

In [51]:
# Inspect the binary tag-indicator matrix.
y

array([[0, 0, 1, ..., 0, 0, 1],
       [1, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [52]:
# (number of questions, number of distinct tags).
y.shape

(48976, 20)

Our data has 20 unique tag names in total.

In [53]:
# Tag names learned by the binarizer, one per column of y.
classes = multilabel.classes_

In [54]:
# Display the tag names.
classes

array(['.net', 'android', 'asp.net', 'c', 'c#', 'c++', 'css', 'html',
       'ios', 'iphone', 'java', 'javascript', 'jquery', 'mysql',
       'objective-c', 'php', 'python', 'ruby', 'ruby-on-rails', 'sql'],
      dtype=object)

## 2.2. Tokenizing the text data

In [55]:
# TF-IDF over word uni-, bi- and tri-grams with English stop words removed,
# keeping only the 1000 highest-ranked terms as features.
# NOTE(review): the vectorizer is fitted on every row before the train/test
# split, so its statistics also see the questions later used for testing.
tfidf = TfidfVectorizer(
    analyzer='word',
    stop_words='english',
    ngram_range=(1, 3),
    max_features=1000,
)

# Fit on the question text and materialise the result as a dense array.
X = tfidf.fit_transform(df['Text']).toarray()

In [56]:
# Inspect the dense TF-IDF feature matrix.
X

array([[0.        , 0.        , 0.        , ..., 0.        , 0.18908578,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.56036175],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ]])

In [57]:
# Mapping from each retained term to its column index in X.
# NOTE(review): this displays the entire dictionary; consider showing only a slice.
tfidf.vocabulary_

{'aspnet': 75,
 'site': 801,
 'got': 392,
 'creating': 221,
 'default': 239,
 'xml': 998,
 'file': 347,
 'working': 988,
 'properly': 675,
 'menu': 554,
 'need': 576,
 'way': 975,
 'users': 942,
 'create': 217,
 'modify': 563,
 'pages': 618,
 'page': 617,
 'standard': 827,
 'adding': 34,
 'functionality': 374,
 'net': 580,
 'applications': 61,
 'little': 513,
 'game': 376,
 'written': 995,
 'uses': 943,
 'database': 229,
 'wanted': 973,
 'implement': 429,
 'function': 373,
 'mean': 550,
 'interface': 458,
 'class': 160,
 'implements': 433,
 'public': 683,
 'contains': 200,
 'called': 131,
 'make': 538,
 'thing': 886,
 'like': 500,
 'source': 812,
 'code': 166,
 'compile': 179,
 'use': 937,
 'just': 477,
 'add': 32,
 'tell': 878,
 'application': 60,
 'assembly': 76,
 'means': 551,
 'possible': 649,
 'new': 582,
 'language': 485,
 'extra': 330,
 'write': 993,
 'script': 763,
 'public class': 684,
 'source code': 813,
 'nested': 579,
 'classes': 161,
 'case': 137,
 'collection': 167,
 'us

## 2.3. Prepare training and test sets

In [58]:
# Hold out 20% of the questions for evaluation; the fixed seed keeps the
# split reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=1,
)

In [59]:
# Confirm the sizes of the training and test feature matrices.
X_train.shape,X_test.shape

((39180, 1000), (9796, 1000))

# 3. Build and train the model

In [60]:
from sklearn.multiclass import OneVsRestClassifier

In [61]:
def j_score(y_true, y_pred, zero_division=0.0):
  """Mean per-sample Jaccard similarity between two label matrices, in percent.

  For every row, the score is |intersection| / |union| of the true and
  predicted label sets, computed as sum(min) / sum(max) over the columns.
  The row scores are averaged and multiplied by 100.

  Parameters
  ----------
  y_true : array-like of shape (n_samples, n_labels)
      Ground-truth 0/1 indicator matrix.
  y_pred : array-like of shape (n_samples, n_labels)
      Predicted 0/1 indicator matrix.
  zero_division : float, default=0.0
      Score assigned to a row whose union is empty (no label in either
      y_true or y_pred). Without this guard such a row evaluates 0/0 and
      turns the whole mean into NaN.

  Returns
  -------
  float
      Average Jaccard score over all rows, scaled to the range 0-100.
  """
  y_true = np.asarray(y_true)
  y_pred = np.asarray(y_pred)

  intersection = np.minimum(y_true, y_pred).sum(axis=1)
  union = np.maximum(y_true, y_pred).sum(axis=1)

  # Pre-fill with the fallback value and divide only where the union is
  # non-empty, so empty rows never trigger a 0/0 division.
  jaccard = np.full(union.shape, zero_division, dtype=float)
  np.divide(intersection, union, out=jaccard, where=union != 0)
  return jaccard.mean() * 100


In [62]:
# Base binary classifier that is trained once per tag.
lr = LogisticRegression(solver='lbfgs')

# One-vs-rest wrapper: an independent copy of `lr` per tag column, fitted
# in parallel on all available cores (n_jobs=-1).
clf = OneVsRestClassifier(
    estimator=lr,
    n_jobs=-1,
)

In [63]:
# Train the one-vs-rest model on the training split.
clf.fit(X_train,y_train)

OneVsRestClassifier(estimator=LogisticRegression(C=1.0, class_weight=None,
                                                 dual=False, fit_intercept=True,
                                                 intercept_scaling=1,
                                                 l1_ratio=None, max_iter=100,
                                                 multi_class='auto',
                                                 n_jobs=None, penalty='l2',
                                                 random_state=None,
                                                 solver='lbfgs', tol=0.0001,
                                                 verbose=0, warm_start=False),
                    n_jobs=-1)

In [64]:
# Predict the 0/1 tag indicators for the held-out questions.
y_pred = clf.predict(X_test)

In [66]:
# Mean per-question Jaccard score (in percent) on the test split.
j_score(y_test,y_pred)

48.06247447937933

In [69]:
# Two hand-written questions used as a quick check of the trained model.
x = ['how can i write function in python?','why is it so hard to extract data from database by sql?']

In [70]:
# Vectorize the sample questions with the already-fitted TF-IDF vectorizer.
# NOTE(review): this rebinds `x` from the raw strings to the TF-IDF matrix,
# so re-running this cell on its own feeds the matrix back into transform();
# re-run the cell that defines `x` first.
x = tfidf.transform(x)

# Predicted 0/1 tag-indicator row for each sample question.
clf.predict(x)

array([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1]])

In [71]:
# Decode the predicted indicator rows back into tuples of tag names.
multilabel.inverse_transform((clf.predict(x)))

[('python',), ('sql',)]

In [73]:
import pickle as pkl

# Persist the fitted classifier and the fitted TF-IDF vectorizer.
# The files are opened in `with` blocks so each handle is flushed and closed
# as soon as its dump finishes, instead of being left open.
# NOTE(review): `multilabel` is not saved here; decoding predictions back to
# tag names in another process would need it as well.
with open('lr_multilabel.pkl', 'wb') as model_file:
  pkl.dump(clf, model_file)

with open('tfidf-multilabel.pkl', 'wb') as vectorizer_file:
  pkl.dump(tfidf, vectorizer_file)