In [1]:
import numpy as np
import pandas as pd

##### Loading the dataset(StackSample: 10% of Stack Overdlow Q&A) <https://www.kaggle.com/datasets/stackoverflow/stacksample>

In [2]:
df_Ques = pd.read_csv('Questions.csv', encoding = 'latin')
df_Tags = pd.read_csv('Tags.csv', encoding = 'latin')

In [3]:
df_Ques.head()

Unnamed: 0,Id,OwnerUserId,CreationDate,ClosedDate,Score,Title,Body
0,80,26.0,2008-08-01T13:57:07Z,,26,SQLStatement.execute() - multiple queries in o...,<p>I've written a database generation script i...
1,90,58.0,2008-08-01T14:41:24Z,2012-12-26T03:45:49Z,144,Good branching and merging tutorials for Torto...,<p>Are there any really good tutorials explain...
2,120,83.0,2008-08-01T15:50:08Z,,21,ASP.NET Site Maps,<p>Has anyone got experience creating <strong>...
3,180,2089740.0,2008-08-01T18:42:19Z,,53,Function for creating color wheels,<p>This is something I've pseudo-solved many t...
4,260,91.0,2008-08-01T23:22:08Z,,49,Adding scripting functionality to .NET applica...,<p>I have a little game written in C#. It uses...


In [4]:
df_Tags.head()

Unnamed: 0,Id,Tag
0,80,flex
1,80,actionscript-3
2,80,air
3,90,svn
4,90,tortoisesvn


## Preprocessing on tags

In [5]:
df_Tags['Tag'] = df_Tags['Tag'].astype(str)
grouped_tags = df_Tags.groupby('Id')['Tag'].apply(lambda tags: ' '.join(tags))

In [6]:
grouped_tags.sample(5)

Id
1608370               asp.net xss validation
11330160              cordova sencha-touch-2
3445420                  html redirect owasp
15914570                  java methods input
30690270    c# .net web-services wcf service
Name: Tag, dtype: object

In [7]:
grouped_t = grouped_tags

In [8]:
grouped_tags = grouped_t

In [9]:
grouped_tags.head()

Id
80                            flex actionscript-3 air
90       svn tortoisesvn branch branching-and-merging
120                               sql asp.net sitemap
180    algorithm language-agnostic colors color-space
260           c# .net scripting compiler-construction
Name: Tag, dtype: object

In [10]:
grouped_tags.reset_index()

Unnamed: 0,Id,Tag
0,80,flex actionscript-3 air
1,90,svn tortoisesvn branch branching-and-merging
2,120,sql asp.net sitemap
3,180,algorithm language-agnostic colors color-space
4,260,c# .net scripting compiler-construction
...,...,...
1264211,40143210,php .htaccess
1264212,40143300,google-bigquery
1264213,40143340,android android-studio
1264214,40143360,javascript vue.js


In [11]:
grp_tags = pd.DataFrame({'Id': grouped_tags.index, 'Tags': grouped_tags.values})
grp_tags.head(5)

Unnamed: 0,Id,Tags
0,80,flex actionscript-3 air
1,90,svn tortoisesvn branch branching-and-merging
2,120,sql asp.net sitemap
3,180,algorithm language-agnostic colors color-space
4,260,c# .net scripting compiler-construction


In [12]:
df_Ques.drop(columns=['OwnerUserId', 'CreationDate', 'ClosedDate'], inplace = True)

df = df_Ques.merge(grp_tags, on = 'Id')
df.head()

Unnamed: 0,Id,Score,Title,Body,Tags
0,80,26,SQLStatement.execute() - multiple queries in o...,<p>I've written a database generation script i...,flex actionscript-3 air
1,90,144,Good branching and merging tutorials for Torto...,<p>Are there any really good tutorials explain...,svn tortoisesvn branch branching-and-merging
2,120,21,ASP.NET Site Maps,<p>Has anyone got experience creating <strong>...,sql asp.net sitemap
3,180,53,Function for creating color wheels,<p>This is something I've pseudo-solved many t...,algorithm language-agnostic colors color-space
4,260,49,Adding scripting functionality to .NET applica...,<p>I have a little game written in C#. It uses...,c# .net scripting compiler-construction


In [13]:
import nltk

df_new = df[df['Score']>5]

df_new['Tags'] = df_new['Tags'].apply(lambda x: x.split())
all_tags = [item for sublist in df_new['Tags'] for item in sublist]
tags_flat = all_tags

keywords = nltk.FreqDist(tags_flat)
# keywords = nltk.FreqDist(keywords)

word_freq = keywords.most_common(25)
tags_features = [word[0] for word in word_freq]

df_new.drop(columns = ['Id', 'Score'], inplace = True)
print(tags_features)

['c#', 'java', 'javascript', 'android', 'python', 'c++', 'php', 'jquery', '.net', 'ios', 'html', 'css', 'c', 'iphone', 'objective-c', 'ruby-on-rails', 'sql', 'asp.net', 'mysql', 'ruby', 'r', 'git', 'asp.net-mvc', 'linux', 'sql-server']


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_new['Tags'] = df_new['Tags'].apply(lambda x: x.split())
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_new.drop(columns = ['Id', 'Score'], inplace = True)


In [14]:
def most_common_tags(tags):
    tags_filtered = []
    for i in range(0, len(tags)):
        if tags[i] in tags_features:
            tags_filtered.append(tags[i])
    return tags_filtered

df_new['Tags'] = df_new['Tags'].apply(lambda x: most_common_tags(x))
df_new['Tags'] = df_new['Tags'].apply(lambda x: x if len(x)>0 else None)

df_new.dropna(subset = 'Tags', inplace = True)
df_new.shape

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_new['Tags'] = df_new['Tags'].apply(lambda x: most_common_tags(x))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_new['Tags'] = df_new['Tags'].apply(lambda x: x if len(x)>0 else None)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_new.dropna(subset = 'Tags', inplace = True)


(52418, 3)

## Preprocessing on Body and Title

In [15]:
# Removing html tags
from bs4 import BeautifulSoup
import lxml

df_new['Body'] = df_new['Body'].apply(lambda x: BeautifulSoup(x, 'html').get_text())

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_new['Body'] = df_new['Body'].apply(lambda x: BeautifulSoup(x, 'html').get_text())


In [16]:
import re
import string
from nltk.tokenize.toktok import ToktokTokenizer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from string import punctuation

token = ToktokTokenizer()
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

def clean_text(text):
    text = text.lower()
    text = text.strip(' ')
    return text

def clean_punct(text):
    for punctuation in string.punctuation:
        text = text.replace(punctuation, '')
    return text

def lemmatizeWords(text):
    words = token.tokenize(text)
    listLemma = []
    for w in words:
        x = lemmatizer.lemmatize(w, pos = 'v')
        listLemma.append(x)
    return ' '.join(map(str, listLemma))

def remove_stopwords(text):
    words = token.tokenize(text)
    filtered = [w for w in words if not w in stop_words]
    return ' '.join(map(str, filtered))

In [17]:
text_len = np.vectorize(len)
temp_df = df_new.copy()
text_lengths = text_len(df_new['Body'])

In [18]:
temp_df['BodyLen'] = text_lengths

In [19]:
mean_bodylen = int(text_lengths.mean())
print('Mean body length: ', mean_bodylen)
print('Min body length : ', text_lengths.min())
print('Max body length : ', text_lengths.max())

Mean body length:  1061
Min body length :  18
Max body length :  29408


In [20]:
temp_df

Unnamed: 0,Title,Body,Tags,BodyLen
2,ASP.NET Site Maps,Has anyone got experience creating SQL-based A...,"[sql, asp.net]",348
4,Adding scripting functionality to .NET applica...,I have a little game written in C#. It uses a ...,"[c#, .net]",1035
5,Should I use nested classes in this case?,I am working on a collection of classes used f...,[c++],947
6,Homegrown consumption of web services,I've been writing a few web services for a .ne...,[.net],302
7,Deploying SQL Server Databases from Test to Live,I wonder how you guys manage deployment of a d...,[sql-server],1452
...,...,...,...,...
1262668,Using lambda in default initializer gcc vs clang,#include <cassert>\n\nint main()\n{\n struc...,[c++],579
1262834,STL list very bad performance,"It's supposed that ""push_back"" and ""pop_front""...",[c++],3575
1262915,How to use a dict to subset a DataFrame?,"Say, I have given a DataFrame with most of the...",[python],1155
1263065,Is there a way to use itertools in python to c...,"Let's say I have the following code:\na = [1,2...",[python],225


In [21]:
#Remove stopwords, punctuations and lemmatize text in body
df_new['Body'] = df_new['Body'].apply(lambda x: clean_text(x))
df_new['Body'] = df_new['Body'].apply(lambda x: clean_punct(x))
df_new['Body'] = df_new['Body'].apply(lambda x: lemmatizeWords(x))
df_new['Body'] = df_new['Body'].apply(lambda x: remove_stopwords(x))

# df_new['Body'] = df_new['Body'].apply(lambda x: x[: mean_bodylen])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_new['Body'] = df_new['Body'].apply(lambda x: clean_text(x))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_new['Body'] = df_new['Body'].apply(lambda x: clean_punct(x))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_new['Body'] = df_new['Body'].apply(lambda x: lemmatizeWords(x))
A value is 

In [22]:
#Remove stopwords, punctuations and lemmatize text in body
df_new['Title'] = df_new['Title'].apply(lambda x: str(x))
df_new['Title'] = df_new['Title'].apply(lambda x: clean_text(x))
df_new['Title'] = df_new['Title'].apply(lambda x: clean_punct(x))
df_new['Title'] = df_new['Title'].apply(lambda x: lemmatizeWords(x))
df_new['Title'] = df_new['Title'].apply(lambda x: remove_stopwords(x))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_new['Title'] = df_new['Title'].apply(lambda x: str(x))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_new['Title'] = df_new['Title'].apply(lambda x: clean_text(x))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_new['Title'] = df_new['Title'].apply(lambda x: clean_punct(x))
A value is tryin

In [23]:
#Data is finally cleaned and preprocessed
df_new.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 52418 entries, 2 to 1263454
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Title   52418 non-null  object
 1   Body    52418 non-null  object
 2   Tags    52418 non-null  object
dtypes: object(3)
memory usage: 1.6+ MB


In [24]:
df_new.head()

Unnamed: 0,Title,Body,Tags
2,aspnet site map,anyone get experience create sqlbased aspnet s...,"[sql, asp.net]"
4,add script functionality net applications,little game write c use database backend trade...,"[c#, .net]"
5,use nest class case,work collection class use video playback recor...,[c++]
6,homegrown consumption web service,ive write web service net app im ready consume...,[.net]
7,deploy sql server databases test live,wonder guy manage deployment database 2 sql se...,[sql-server]


In [25]:
df_new['Combined_text'] = df_new['Title'] + ' ' + df_new['Body']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_new['Combined_text'] = df_new['Title'] + ' ' + df_new['Body']


In [26]:
df_new

Unnamed: 0,Title,Body,Tags,Combined_text
2,aspnet site map,anyone get experience create sqlbased aspnet s...,"[sql, asp.net]",aspnet site map anyone get experience create s...
4,add script functionality net applications,little game write c use database backend trade...,"[c#, .net]",add script functionality net applications litt...
5,use nest class case,work collection class use video playback recor...,[c++],use nest class case work collection class use ...
6,homegrown consumption web service,ive write web service net app im ready consume...,[.net],homegrown consumption web service ive write we...
7,deploy sql server databases test live,wonder guy manage deployment database 2 sql se...,[sql-server],deploy sql server databases test live wonder g...
...,...,...,...,...
1262668,use lambda default initializer gcc vs clang,include cassert int main struct pointofparabal...,[c++],use lambda default initializer gcc vs clang in...
1262834,stl list bad performance,suppose pushback popfront methods stl list imp...,[c++],stl list bad performance suppose pushback popf...
1262915,use dict subset dataframe,say give dataframe columns categorical data da...,[python],use dict subset dataframe say give dataframe c...
1263065,way use itertools python clean nest iterations,let say follow code 123 b 246 c 357 j b k c pr...,[python],way use itertools python clean nest iterations...


## Data Preparation

In [27]:
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.feature_extraction.text import TfidfVectorizer
multilabel = MultiLabelBinarizer()
tfidf = TfidfVectorizer()

X1 = df_new['Body']
X2 = df_new['Title']
y = df_new['Tags']
XC3 = df_new['Combined_text']

y_ml = multilabel.fit_transform(y)

In [28]:
y_ml.shape

(52418, 25)

In [29]:
multilabel.classes_

array(['.net', 'android', 'asp.net', 'asp.net-mvc', 'c', 'c#', 'c++',
       'css', 'git', 'html', 'ios', 'iphone', 'java', 'javascript',
       'jquery', 'linux', 'mysql', 'objective-c', 'php', 'python', 'r',
       'ruby', 'ruby-on-rails', 'sql', 'sql-server'], dtype=object)

In [30]:
X1_tfidf = tfidf.fit_transform(X1)
X2_tfidf = tfidf.fit_transform(X2)
XC3_tfidf = tfidf.fit_transform(XC3)
X_tfidf = np.hstack([X1_tfidf, X2_tfidf])

In [31]:
XC3_tfidf.shape

(52418, 430819)

In [32]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(XC3_tfidf, y_ml, test_size = 0.2, random_state = 0)

In [33]:
X_train.shape

(41934, 430819)

In [34]:
y_train.shape

(41934, 25)

In [35]:
X_test.shape

(10484, 430819)

In [36]:
y_test.shape

(10484, 25)

## Classification and Prediction

In [37]:
from sklearn.linear_model import SGDClassifier, LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import hamming_loss, precision_score, recall_score, f1_score

dt = DecisionTreeClassifier()
sgd = SGDClassifier()
lr = LogisticRegression()
svc = LinearSVC()
mn = MultinomialNB()
prec_dict = {}
hamloss_dict = {}

In [39]:
for classifier in [sgd, lr, mn, svc]:
    clf = OneVsRestClassifier(classifier)
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)

    ham = hamming_loss(y_test, y_pred)
    prec = precision_score(y_test, y_pred, average = 'weighted')
    
    clsname = classifier.__class__.__name__
    prec_dict[clsname] = prec
    hamloss_dict[clsname] = ham

    print('Classifier     : ', clsname)
    print('hamming_loss   : ', ham)
    print('Precision score: ', prec)
    print('Recall         : ', recall_score(y_test, y_pred, average = 'weighted'))
    print('f1-score       : ', f1_score(y_test, y_pred, average = 'weighted'))
    print('\n\n')

Classifier     :  SGDClassifier
hamming_loss   :  0.03074017550553224
Precision score:  0.8946268953218229
Recall         :  0.427877315517765
f1-score       :  0.5520549404865779



Classifier     :  LogisticRegression
hamming_loss   :  0.031362075543685614
Precision score:  0.8586037971357732
Recall         :  0.43448223504403277
f1-score       :  0.564536353179406





  _warn_prf(average, modifier, msg_start, len(result))


Classifier     :  MultinomialNB
hamming_loss   :  0.050125906142693626
Precision score:  0.29092013361676283
Recall         :  0.00258123291831157
f1-score       :  0.0050246595093471225



Classifier     :  LinearSVC
hamming_loss   :  0.02555513162914918
Precision score:  0.8426545410965445
Recall         :  0.589356210142727
f1-score       :  0.6885049773502129





In [44]:
x = [ 'how to write ml code in python and java i have data but do not know how to do it']
xt = tfidf.fit_transform(x)
for classifier in [sgd, lr]:
    clf = OneVsRestClassifier(classifier)
    clf.predict(xt)
    multilabel.inverse_transform(clf.predict(xt))
    print('\n\n')

NotFittedError: This OneVsRestClassifier instance is not fitted yet. Call 'fit' with appropriate arguments before using this estimator.