In [1]:
import pandas as pd
import warnings
import numpy as np
import seaborn as sns
from wordcloud import WordCloud
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from textblob import TextBlob, Word
from nltk.stem.snowball import SnowballStemmer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from bs4 import BeautifulSoup
import lxml
import re
from nltk.corpus import stopwords
from nltk.tokenize import ToktokTokenizer
from nltk.stem.wordnet import WordNetLemmatizer
from sklearn.metrics import f1_score,precision_score,recall_score
from sklearn.model_selection import train_test_split

warnings.filterwarnings('ignore')

In [2]:
questions = pd.read_csv("Questions.csv", encoding="ISO-8859-1")
tags = pd.read_csv("Tags.csv", encoding="ISO-8859-1", dtype={'Tag': str})

In [3]:
questions.head()

Unnamed: 0,Id,OwnerUserId,CreationDate,ClosedDate,Score,Title,Body
0,80,26.0,2008-08-01T13:57:07Z,,26,SQLStatement.execute() - multiple queries in o...,<p>I've written a database generation script i...
1,90,58.0,2008-08-01T14:41:24Z,2012-12-26T03:45:49Z,144,Good branching and merging tutorials for Torto...,<p>Are there any really good tutorials explain...
2,120,83.0,2008-08-01T15:50:08Z,,21,ASP.NET Site Maps,<p>Has anyone got experience creating <strong>...
3,180,2089740.0,2008-08-01T18:42:19Z,,53,Function for creating color wheels,<p>This is something I've pseudo-solved many t...
4,260,91.0,2008-08-01T23:22:08Z,,49,Adding scripting functionality to .NET applica...,<p>I have a little game written in C#. It uses...


In [4]:
questions.shape, tags.shape

((1264216, 7), (3750994, 2))

In [5]:
tags.head()

Unnamed: 0,Id,Tag
0,80,flex
1,80,actionscript-3
2,80,air
3,90,svn
4,90,tortoisesvn


In [6]:
tags['Tag'] = tags['Tag'].astype('str')

In [7]:
group_tags = tags.groupby('Id')['Tag'].apply(lambda x: ' '.join(x))

In [8]:
group_tags.head()

Id
80                            flex actionscript-3 air
90       svn tortoisesvn branch branching-and-merging
120                               sql asp.net sitemap
180    algorithm language-agnostic colors color-space
260           c# .net scripting compiler-construction
Name: Tag, dtype: object

In [9]:
group_tags.reset_index()
group_tags = pd.DataFrame({'Id':group_tags.index, 'Tag':group_tags.values})
group_tags.head()

Unnamed: 0,Id,Tag
0,80,flex actionscript-3 air
1,90,svn tortoisesvn branch branching-and-merging
2,120,sql asp.net sitemap
3,180,algorithm language-agnostic colors color-space
4,260,c# .net scripting compiler-construction


In [10]:
group_tags.shape

(1264216, 2)

In [11]:
questions.drop(['OwnerUserId','CreationDate', 'ClosedDate'], axis = 1, inplace =True)

In [12]:
data = questions.merge(group_tags, on = 'Id')

In [13]:
data.head()

Unnamed: 0,Id,Score,Title,Body,Tag
0,80,26,SQLStatement.execute() - multiple queries in o...,<p>I've written a database generation script i...,flex actionscript-3 air
1,90,144,Good branching and merging tutorials for Torto...,<p>Are there any really good tutorials explain...,svn tortoisesvn branch branching-and-merging
2,120,21,ASP.NET Site Maps,<p>Has anyone got experience creating <strong>...,sql asp.net sitemap
3,180,53,Function for creating color wheels,<p>This is something I've pseudo-solved many t...,algorithm language-agnostic colors color-space
4,260,49,Adding scripting functionality to .NET applica...,<p>I have a little game written in C#. It uses...,c# .net scripting compiler-construction


In [14]:
new_data = data[data['Score']>10]

In [15]:
new_data.head()

Unnamed: 0,Id,Score,Title,Body,Tag
0,80,26,SQLStatement.execute() - multiple queries in o...,<p>I've written a database generation script i...,flex actionscript-3 air
1,90,144,Good branching and merging tutorials for Torto...,<p>Are there any really good tutorials explain...,svn tortoisesvn branch branching-and-merging
2,120,21,ASP.NET Site Maps,<p>Has anyone got experience creating <strong>...,sql asp.net sitemap
3,180,53,Function for creating color wheels,<p>This is something I've pseudo-solved many t...,algorithm language-agnostic colors color-space
4,260,49,Adding scripting functionality to .NET applica...,<p>I have a little game written in C#. It uses...,c# .net scripting compiler-construction


In [16]:
data.shape, new_data.shape

((1264216, 5), (32604, 5))

In [17]:
new_data['Tag'] = new_data['Tag'].apply(lambda x: x.split())

In [18]:
new_data.head()

Unnamed: 0,Id,Score,Title,Body,Tag
0,80,26,SQLStatement.execute() - multiple queries in o...,<p>I've written a database generation script i...,"[flex, actionscript-3, air]"
1,90,144,Good branching and merging tutorials for Torto...,<p>Are there any really good tutorials explain...,"[svn, tortoisesvn, branch, branching-and-merging]"
2,120,21,ASP.NET Site Maps,<p>Has anyone got experience creating <strong>...,"[sql, asp.net, sitemap]"
3,180,53,Function for creating color wheels,<p>This is something I've pseudo-solved many t...,"[algorithm, language-agnostic, colors, color-s..."
4,260,49,Adding scripting functionality to .NET applica...,<p>I have a little game written in C#. It uses...,"[c#, .net, scripting, compiler-construction]"


In [19]:
# all_tags = [x for i in new_data['Tag'].values for x in i]

In [20]:
# all_tags

In [19]:
vectorizer = CountVectorizer( tokenizer=lambda x: x.split())

In [20]:
tags_counter = vectorizer.fit_transform(data['Tag'])

In [21]:
tags_counter.shape[0], tags_counter.shape[1]

(1264216, 37035)

In [22]:
tags_counter

<1264216x37035 sparse matrix of type '<class 'numpy.int64'>'
	with 3750993 stored elements in Compressed Sparse Row format>

In [23]:
new_data['counts'] = [len(count) for count in new_data['Tag']]

In [24]:
new_data.head()

Unnamed: 0,Id,Score,Title,Body,Tag,counts
0,80,26,SQLStatement.execute() - multiple queries in o...,<p>I've written a database generation script i...,"[flex, actionscript-3, air]",3
1,90,144,Good branching and merging tutorials for Torto...,<p>Are there any really good tutorials explain...,"[svn, tortoisesvn, branch, branching-and-merging]",4
2,120,21,ASP.NET Site Maps,<p>Has anyone got experience creating <strong>...,"[sql, asp.net, sitemap]",3
3,180,53,Function for creating color wheels,<p>This is something I've pseudo-solved many t...,"[algorithm, language-agnostic, colors, color-s...",4
4,260,49,Adding scripting functionality to .NET applica...,<p>I have a little game written in C#. It uses...,"[c#, .net, scripting, compiler-construction]",4


In [25]:
flat_tags = vectorizer.get_feature_names()
flat_tags[:10]

['.a',
 '.app',
 '.aspxauth',
 '.bash-profile',
 '.class-file',
 '.doc',
 '.emf',
 '.git-info-grafts',
 '.hgtags',
 '.htaccess']

In [26]:
freqs = tags_counter.sum(axis=0).A1
result = list(zip(flat_tags, freqs))
columns = ['Tags', 'Counts']
tag_df = pd.DataFrame(result, columns=columns)
tag_df.head()

Unnamed: 0,Tags,Counts
0,.a,13
1,.app,14
2,.aspxauth,6
3,.bash-profile,46
4,.class-file,19


In [27]:
# Sorting tag data frame with Counts to know most repeating tags

tag_df.sort_values(['Counts'], ascending=False, inplace=True)

In [28]:
freq_tags = tag_df[tag_df['Counts']>47000]

In [29]:
freq_tags.head(20)

Unnamed: 0,Tags,Counts
16121,javascript,124155
16020,java,115212
4301,c#,101186
24212,php,98808
979,android,90659
16611,jquery,78542
25713,python,64601
14180,html,58976
4311,c++,47591
15565,ios,47009


In [30]:
most_common_words = freq_tags['Tags'].values

In [31]:
frequencies_words = most_common_words
tags_features = [word for word in frequencies_words]

In [32]:
len(tags_features)

10

In [33]:
# new_data.drop(columns=['Id', 'Score'], inplace=True)
print(tags_features)

['javascript', 'java', 'c#', 'php', 'android', 'jquery', 'python', 'html', 'c++', 'ios']


In [34]:
new_df = new_data

In [35]:
def most_common(tags):
    """Function to check if tag is in most common tag list"""
    tags_filtered = []
    for i in range(0, len(tags)):
        if tags[i] in tags_features:
            tags_filtered.append(tags[i])
    return tags_filtered

In [36]:
new_df.head()

Unnamed: 0,Id,Score,Title,Body,Tag,counts
0,80,26,SQLStatement.execute() - multiple queries in o...,<p>I've written a database generation script i...,"[flex, actionscript-3, air]",3
1,90,144,Good branching and merging tutorials for Torto...,<p>Are there any really good tutorials explain...,"[svn, tortoisesvn, branch, branching-and-merging]",4
2,120,21,ASP.NET Site Maps,<p>Has anyone got experience creating <strong>...,"[sql, asp.net, sitemap]",3
3,180,53,Function for creating color wheels,<p>This is something I've pseudo-solved many t...,"[algorithm, language-agnostic, colors, color-s...",4
4,260,49,Adding scripting functionality to .NET applica...,<p>I have a little game written in C#. It uses...,"[c#, .net, scripting, compiler-construction]",4


In [37]:
# Change Tags column into None for questions that don't have a most common tag
new_df['Tag'] = new_df['Tag'].apply(lambda x: most_common(x))
new_df['Tag'] = new_df['Tag'].apply(lambda x: x if len(x)>0 else None)

In [38]:
new_df.head()

Unnamed: 0,Id,Score,Title,Body,Tag,counts
0,80,26,SQLStatement.execute() - multiple queries in o...,<p>I've written a database generation script i...,,3
1,90,144,Good branching and merging tutorials for Torto...,<p>Are there any really good tutorials explain...,,4
2,120,21,ASP.NET Site Maps,<p>Has anyone got experience creating <strong>...,,3
3,180,53,Function for creating color wheels,<p>This is something I've pseudo-solved many t...,,4
4,260,49,Adding scripting functionality to .NET applica...,<p>I have a little game written in C#. It uses...,[c#],4


In [39]:
new_df.dropna(subset=['Tag'], inplace=True)
new_df.drop(['counts'], axis = 1, inplace=True)
new_df.shape

(17897, 5)

In [40]:
new_df.head()

Unnamed: 0,Id,Score,Title,Body,Tag
4,260,49,Adding scripting functionality to .NET applica...,<p>I have a little game written in C#. It uses...,[c#]
5,330,29,Should I use nested classes in this case?,<p>I am working on a collection of classes use...,[c++]
8,650,79,Automatically update version number,<p>I would like the version property of my app...,[c#]
10,930,28,How do I connect to a database and loop over a...,<p>What's the simplest way to connect and quer...,[c#]
11,1010,14,"How to get the value of built, encoded ViewState?",<p>I need to grab the base64-encoded represent...,[c#]


## Preprocess Data ##

#### Remove special characters from title and body##
#### Remove stop words###
#### Remove HTML tags###
#### Convert characters to lowercase###
#### Lemmatize the words###

In [41]:
new_df['Body'] = new_df['Body'].apply(lambda x: BeautifulSoup(x, "lxml").get_text()) 

In [42]:
new_df.head()

Unnamed: 0,Id,Score,Title,Body,Tag
4,260,49,Adding scripting functionality to .NET applica...,I have a little game written in C#. It uses a ...,[c#]
5,330,29,Should I use nested classes in this case?,I am working on a collection of classes used f...,[c++]
8,650,79,Automatically update version number,I would like the version property of my applic...,[c#]
10,930,28,How do I connect to a database and loop over a...,What's the simplest way to connect and query a...,[c#]
11,1010,14,"How to get the value of built, encoded ViewState?",I need to grab the base64-encoded representati...,[c#]


In [43]:
token = ToktokTokenizer()
lemma = WordNetLemmatizer()
stop = set(stopwords.words("english"))

In [44]:
new_df['Body'] = new_df['Body'].apply(lambda x: ' '.join(x.lower() for x in x.split()))
new_df['Body'] = new_df['Body'].apply(lambda x: re.sub('[^\w\s][!"$%&\'()*,./:;<=>?@[\\]^_`{|}~]', '', x))
new_df['Body'] = new_df['Body'].apply(lambda x: ' '.join(x for x in x.split() if x not in stop))
new_df['Body'] = new_df['Body'].apply(lambda x: ' '.join(lemma.lemmatize(word) for word in x.split()))

In [45]:
new_df.head()

Unnamed: 0,Id,Score,Title,Body,Tag
4,260,49,Adding scripting functionality to .NET applica...,little game written c us database back-end. tr...,[c#]
5,330,29,Should I use nested classes in this case?,working collection class used video playback r...,[c++]
8,650,79,Automatically update version number,would like version property application increm...,[c#]
10,930,28,How do I connect to a database and loop over a...,what's simplest way connect query database set...,[c#]
11,1010,14,"How to get the value of built, encoded ViewState?",need grab base64-encoded representation viewst...,[c#]


In [46]:
new_df['Title'] = new_df['Title'].apply(lambda x: ' '.join(x.lower() for x in x.split()))
new_df['Title'] = new_df['Title'].apply(lambda x: re.sub('[^\w\s][!"$%&\'()*,./:;<=>?@[\\]^_`{|}~]', '', x))
new_df['Title'] = new_df['Title'].apply(lambda x: ' '.join(x for x in x.split() if x not in stop))
new_df['Title'] = new_df['Title'].apply(lambda x: ' '.join(lemma.lemmatize(word) for word in x.split()))
new_df['Title'] = new_df['Title'].apply(lambda x: ' '.join(x.split()*3))

In [47]:
new_df.head()

Unnamed: 0,Id,Score,Title,Body,Tag
4,260,49,adding scripting functionality .net applicatio...,little game written c us database back-end. tr...,[c#]
5,330,29,use nested class case? use nested class case? ...,working collection class used video playback r...,[c++]
8,650,79,automatically update version number automatica...,would like version property application increm...,[c#]
10,930,28,connect database loop recordset c connect data...,what's simplest way connect query database set...,[c#]
11,1010,14,"get value built, encoded viewstate? get value ...",need grab base64-encoded representation viewst...,[c#]


In [48]:
new_df['modified_Body'] = new_df["Body"].map(str) + ' ' + new_df["Title"]

In [49]:
length = [len(word_tokenize(x)) for x in new_df['modified_Body']]

length

[135,
 125,
 93,
 25,
 83,
 56,
 73,
 49,
 40,
 104,
 161,
 42,
 52,
 96,
 42,
 53,
 63,
 99,
 131,
 140,
 43,
 179,
 127,
 39,
 84,
 158,
 50,
 71,
 73,
 140,
 52,
 59,
 162,
 260,
 98,
 42,
 86,
 28,
 57,
 76,
 348,
 80,
 90,
 34,
 31,
 41,
 28,
 61,
 31,
 77,
 77,
 138,
 47,
 91,
 60,
 69,
 92,
 38,
 35,
 177,
 176,
 41,
 83,
 53,
 32,
 41,
 57,
 110,
 94,
 111,
 235,
 43,
 65,
 48,
 39,
 98,
 130,
 62,
 84,
 45,
 81,
 54,
 56,
 108,
 86,
 147,
 121,
 414,
 59,
 111,
 93,
 70,
 88,
 38,
 90,
 66,
 39,
 755,
 115,
 88,
 82,
 106,
 93,
 97,
 45,
 91,
 79,
 49,
 47,
 39,
 104,
 117,
 116,
 42,
 160,
 69,
 95,
 190,
 201,
 92,
 36,
 182,
 162,
 58,
 143,
 294,
 35,
 58,
 29,
 49,
 70,
 24,
 30,
 72,
 77,
 67,
 100,
 68,
 30,
 22,
 57,
 66,
 24,
 56,
 33,
 41,
 83,
 86,
 56,
 125,
 350,
 57,
 38,
 124,
 223,
 62,
 52,
 97,
 56,
 45,
 27,
 128,
 104,
 45,
 116,
 62,
 72,
 50,
 248,
 44,
 228,
 71,
 49,
 41,
 48,
 178,
 44,
 189,
 40,
 29,
 288,
 32,
 122,
 321,
 30,
 87,
 53,
 45,
 34,
 1

In [50]:
new_df.head()

Unnamed: 0,Id,Score,Title,Body,Tag,modified_Body
4,260,49,adding scripting functionality .net applicatio...,little game written c us database back-end. tr...,[c#],little game written c us database back-end. tr...
5,330,29,use nested class case? use nested class case? ...,working collection class used video playback r...,[c++],working collection class used video playback r...
8,650,79,automatically update version number automatica...,would like version property application increm...,[c#],would like version property application increm...
10,930,28,connect database loop recordset c connect data...,what's simplest way connect query database set...,[c#],what's simplest way connect query database set...
11,1010,14,"get value built, encoded viewstate? get value ...",need grab base64-encoded representation viewst...,[c#],need grab base64-encoded representation viewst...


In [51]:
max(length)

7683

In [52]:
np.quantile(length,0.9)

266.0

In [53]:
new_df.to_csv('preprocessed_data.csv',index=False)