### Imports

In [82]:
import pandas as pd
import numpy as np

import re

from tqdm import tqdm
tqdm.pandas()

from collections import Counter

# nltk library imports
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem.porter import PorterStemmer

In [83]:
### Run nltk download command once to get required stopwords
# nltk.download('punkt')
# nltk.download('stopwords')

stop = stopwords.words('english')
ps=PorterStemmer()

### Functions

In [84]:
def topn_tags(dataFrame ,n):
    """
    This function returns a list of top n tags 
    :param dataFrame: pandas dataFrame
    :param n: integer
    :returns: list of strings
    """
    dataFrame['Tags'] = dataFrame['Tags'].apply(lambda x : x.split(' '))
    
    counter = Counter()
    _ = df['Tags'].apply(counter.update)
    
    unique_tags = counter.most_common(n)       # Top n tags
    tags_df = pd.DataFrame(unique_tags, columns=['Tags','Freq'])
    unique_tags = tags_df.Tags.tolist()
    
    return unique_tags
    

def separate_code_and_body(body):
    """
    This function returns a list of two strings - Code and Description from the given Body content
    :param string: text string of body
    :returns: list of strings containing Code and Body
    """
    code_snippets = re.finditer("<code.*?>(.*?)</code>", body, re.DOTALL)
    code = []
    description = body
    for snip in code_snippets:
        code.append(snip.group())
        body = body.replace(snip.group(),"")
    return [code, body]  


def convert_to_lower(text):
    """
    This function returns a string with lowercase characters
    :param string: text string
    :returns: text string
    """
    return text.lower()

def remove_special_chars(text):
    """
    This function returns a string with filtered special case characters except # and ++ for C# and C++
    :param string: text string
    :returns: text string
    """
    return re.sub(r"[^A-Za-z #++]+",'', text)

def get_list_intersection(x, y):
    """
    This function returns a list which is intersection of two input lists
    :param list: list of strings
    :param list: list of strings
    :returns: Intersection list of strings
    """
    return list(set(x) & set(y))


def remove_html_tags(text):
    """
    This function returns a string after removing html tags from a string
    :param string: text string
    :returns: text string
    """
    clean = re.compile('<.*?>')
    return re.sub(clean,'', text)

def remove_n(text):
    """
    This function returns a string after removing new line char(\n) from a string
    :param string: text string
    :returns: text string
    """
    clean1 = re.compile('\n')
    return re.sub(clean1,'', text)

def get_processed_tags(tags, tag_ids, n):
    """
    This function returns a list of binary values of tags size length for tags represention
    :param list: list of tags
    :param Dictionary: Dict of tags and their ids
    :param int: Number of frequent tags
    :returns: text string
    """
    label = [0]*n
    for tag in tags:
        label[tag_ids[tag]] = 1
    return label

### Read Data

In [85]:
df = pd.read_csv('./../data/processed/train.csv', nrows=15000)

### Number of Most frequent tags taken into consideration
n = 100

### Data Pre - Processing

#### Tags Filtering

In [86]:
# Converting Tags to lower case
df['Tags'] = df['Tags'].progress_apply(lambda x: convert_to_lower(x))

# Drop nan Tag values
df.dropna(subset=['Tags'],inplace=True)

# Filtering to Top 100 tags on Frequency
unique_tags = topn_tags(df,n)

# Filtering the Dataset for top 100 tags
df['Tags'] = df['Tags'].apply(lambda x : get_list_intersection(x,unique_tags))
df = df[df['Tags'].map(lambda d: len(d)) > 0]
df.reset_index(drop=True, inplace=True)

100%|█████████████████████████████████| 15000/15000 [00:00<00:00, 411305.74it/s]


#### Separate Code from Body

In [87]:
df['Code'] = df['Body'].progress_apply(lambda x : separate_code_and_body(x))
df[['Code','Description']] = pd.DataFrame(df.Code.tolist(), index= df.index)

100%|██████████████████████████████████| 11310/11310 [00:00<00:00, 47736.41it/s]


#### Convert to Lower

In [88]:
df['Title'] = df['Title'].progress_apply(lambda x:  convert_to_lower(x))
df['Description'] = df['Description'].progress_apply(lambda x:  convert_to_lower(x))

100%|█████████████████████████████████| 11310/11310 [00:00<00:00, 364218.04it/s]
100%|█████████████████████████████████| 11310/11310 [00:00<00:00, 320333.71it/s]


#### Description Processing

In [89]:
df['Description'] = df['Description'].progress_apply(lambda x: remove_html_tags(x))
df['Description'] = df['Description'].progress_apply(lambda x: remove_n(x))
df['Description'] = df['Description'].progress_apply(lambda x: remove_special_chars(x))
df['Description'] = df['Description'].progress_apply(lambda x: word_tokenize(x))
df['Description'] = df['Description'].progress_apply(lambda x: [item for item in x if item not in stop])
df['Description'] = df['Description'].progress_apply(lambda x: [ps.stem(word) for word in x])

100%|██████████████████████████████████| 11310/11310 [00:00<00:00, 99229.96it/s]
100%|█████████████████████████████████| 11310/11310 [00:00<00:00, 175738.34it/s]
100%|██████████████████████████████████| 11310/11310 [00:00<00:00, 55979.03it/s]
100%|███████████████████████████████████| 11310/11310 [00:03<00:00, 2938.68it/s]
100%|███████████████████████████████████| 11310/11310 [00:02<00:00, 5480.99it/s]
100%|███████████████████████████████████| 11310/11310 [00:09<00:00, 1235.01it/s]


#### Title Processing

In [90]:
df['Title'] = df['Title'].progress_apply(lambda x: remove_n(x))
df['Title'] = df['Title'].progress_apply(lambda x: remove_special_chars(x))
df['Title'] = df['Title'].progress_apply(lambda x: word_tokenize(x))
df['Title'] = df['Title'].progress_apply(lambda x: [item for item in x if item not in stop])
df['Title'] = df['Title'].progress_apply(lambda x: [ps.stem(word) for word in x])

100%|█████████████████████████████████| 11310/11310 [00:00<00:00, 241657.34it/s]
100%|█████████████████████████████████| 11310/11310 [00:00<00:00, 232249.11it/s]
100%|██████████████████████████████████| 11310/11310 [00:01<00:00, 10488.74it/s]
100%|██████████████████████████████████| 11310/11310 [00:00<00:00, 25281.54it/s]
100%|██████████████████████████████████| 11310/11310 [00:01<00:00, 10048.10it/s]


#### Merge Title and Body

In [91]:
df['Text'] = df['Title'] + df['Description']
df['Text'] = df['Text'].progress_apply(lambda x: list(set(x)))
df = df[['Text','Tags']]

100%|██████████████████████████████████| 11310/11310 [00:00<00:00, 86308.52it/s]


In [92]:
df

Unnamed: 0,Text,Tags
0,"[type, png, matter, php, id, jpg, uploadi, eg,...",[php]
1,"[type, fileaft, frame, load, code, variabl, li...",[r]
2,"[sure, charact, buri, there, find, #, objectsi...",[c#]
3,"[display, warn, help, use, contact, mesageplea...","[api, php]"
4,"[figur, sinc, matter, work, window, createhttp...","[c#, asp.net, windows-phone-7]"
...,...,...
11305,"[solutionsmi, drawrect, good, devic, howev, wo...",[iphone]
11306,"[via, u, android, php, task, guy, databas, loc...","[android, php]"
11307,"[handler, current, display, fire, label, howev...","[wpf, c#]"
11308,"[server, sql, see, condit, statement, like, pr...","[sql, sql-server]"


In [93]:
df.to_csv('./../data/processed/data.csv',index=False)

In [64]:
tag_ids = {}

for i in range(len(unique_tags)):
    tag_ids[unique_tags[i]] = i

# df['Tags'] = df['Tags'].progress_apply(lambda x : get_processed_tags(x, tag_ids, n))

In [65]:
tag_ids

{'c#': 0,
 'java': 1,
 'php': 2,
 'javascript': 3,
 'android': 4,
 'jquery': 5,
 'c++': 6,
 'asp.net': 7,
 '.net': 8,
 'python': 9,
 'iphone': 10,
 'html': 11,
 'mysql': 12,
 'sql': 13,
 'ios': 14,
 'css': 15,
 'linux': 16,
 'ruby-on-rails': 17,
 'objective-c': 18,
 'c': 19,
 'windows': 20,
 'ruby': 21,
 'sql-server': 22,
 'xml': 23,
 'wpf': 24,
 'database': 25,
 'ajax': 26,
 'asp.net-mvc': 27,
 'arrays': 28,
 'regex': 29,
 'xcode': 30,
 'windows-7': 31,
 'facebook': 32,
 'osx': 33,
 'performance': 34,
 'multithreading': 35,
 'networking': 36,
 'vb.net': 37,
 'eclipse': 38,
 'ruby-on-rails-3': 39,
 'linq': 40,
 'actionscript-3': 41,
 'html5': 42,
 'django': 43,
 'algorithm': 44,
 'json': 45,
 'visual-studio-2010': 46,
 'flash': 47,
 'string': 48,
 'wcf': 49,
 'oracle': 50,
 'winforms': 51,
 'entity-framework': 52,
 'bash': 53,
 'sql-server-2008': 54,
 'asp.net-mvc-3': 55,
 'ubuntu': 56,
 'silverlight': 57,
 'ipad': 58,
 'query': 59,
 'email': 60,
 'wordpress': 61,
 'hibernate': 62,
 'i

In [36]:
counter = Counter()
_ = df['Text'].apply(counter.update)

In [37]:
vocab = [item for item in counter]

In [38]:
len(vocab)

35802

In [39]:
df

Unnamed: 0,Text,Tags
0,"[type, png, matter, php, id, jpg, uploadi, eg,...","[0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
1,"[close, window, still, vim, press, certain, st...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
2,"[type, fileaft, frame, load, code, variabl, li...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
3,"[sure, charact, buri, there, find, #, objectsi...","[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
4,"[display, warn, help, use, contact, mesageplea...","[0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
...,...,...
7621,"[sure, jsp, byt, figur, especi, display, know,...","[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
7622,"[test, deadlock, sqlcommandexecutenonqueri, ca...","[0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, ..."
7623,"[view, visibl, click, work, id, last, process,...","[0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, ..."
7624,"[map, android, code, within, suppos, debug, ap...","[0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."


In [40]:
df.to_csv('./../data/processed/data.csv',index=False)