### Imports

In [1]:
import pandas as pd
import numpy as np

import re
import time

from tqdm import tqdm
tqdm.pandas()

import matplotlib.pyplot as plt
from collections import Counter

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

In [None]:
# Fetch the NLTK resources this notebook relies on: the English stop-word list
# and the Punkt tokenizer models used by word_tokenize.
nltk.download('stopwords')
nltk.download('punkt')
# Newer NLTK releases load the tokenizer models from 'punkt_tab' instead of
# 'punkt'; on older releases this request only reports an unknown package.
nltk.download('punkt_tab')
# Kept as a set so the membership tests in filter_tokens are O(1).
stop_words = set(stopwords.words('english'))

### Functions

In [2]:
def drop_nan_tags(dataframe):
    """
    Arguments
        dataframe : Pandas DataFrame with a 'Tags' column
    Function:
        Removes every row whose 'Tags' value is not a string (e.g. nan
        values present as float in the dataset) and renumbers the index
        from 0.
        Does an inplace operation on the given dataframe.
    Returns
        Nothing
    """
    # Start from a fresh 0..n-1 index so the rows below are dropped through
    # unique labels even if the incoming index has gaps or duplicates.
    dataframe.reset_index(drop=True, inplace=True)

    # True for every row whose tag is anything other than a string.
    not_a_string = (
        dataframe['Tags']
        .map(lambda tag: not isinstance(tag, str))
        .to_numpy(dtype=bool)
    )

    # Dropping an empty selection is a no-op, so a frame without bad rows
    # passes through unchanged.
    dataframe.drop(index=dataframe.index[not_a_string], inplace=True)
    dataframe.reset_index(drop=True, inplace=True)
   

def topn_tags(dataFrame ,n):
    """
    This function returns a list of the top n tags by frequency.

    Side effect: the 'Tags' column of dataFrame is replaced in place by the
    list obtained from splitting each tag string on single spaces. Values
    that are already split (not strings) are left untouched, so calling the
    function a second time is safe.

    :param dataFrame: pandas dataFrame with a 'Tags' column
    :param n: integer, number of tags to return
    :returns: list of strings, most frequent tag first
    """
    dataFrame['Tags'] = dataFrame['Tags'].apply(
        lambda x : x.split(' ') if isinstance(x, str) else x
    )

    # Count tags from the frame that was passed in, not from a global.
    counter = Counter()
    for tags in dataFrame['Tags']:
        counter.update(tags)

    # most_common yields (tag, frequency) pairs; only the tag names are needed.
    return [tag for tag, _ in counter.most_common(n)]
    

def separate_code_and_body(body):
    """
    Split an HTML body into its <code> snippets and the remaining text.

    :param body: string containing the HTML of a post
    :returns: two-element list [snippets, remainder] where snippets is the
              list of full <code ...>...</code> matches (tags included) and
              remainder is body with every occurrence of those snippets
              removed
    """
    snippets = [
        match.group()
        for match in re.finditer("<code.*?>(.*?)</code>", body, re.DOTALL)
    ]

    remainder = body
    for snippet in snippets:
        remainder = remainder.replace(snippet, "")

    return [snippets, remainder]


def convert_to_lower(text):
    """Return the lower-cased form of text."""
    lowered = text.lower()
    return lowered

def remove_special_chars(text):
    """Strip every character that is not an ASCII letter or a space."""
    unwanted_runs = re.compile(r"[^A-Za-z ]+")
    return unwanted_runs.sub('', text)

def get_intersection(x, y):
    """Return the elements present in both x and y as a list (order not guaranteed)."""
    shared = set(x) & set(y)
    return list(shared)


def remove_html_tags(text):
    """Delete every <...> span (matched non-greedily, within a single line) from text."""
    return re.sub('<.*?>', '', text)

def remove_n(text):
    """Delete every newline character from a string."""
    newline = re.compile('\n')
    return newline.sub('', text)

def string_to_tokens(text):
    """Tokenize text with NLTK's word_tokenize and return the tokens."""
    tokens = word_tokenize(text)
    return tokens

def filter_tokens(text):
    """
    Tokenize text and drop the stop words.

    A token is kept when it is not in the module-level stop_words set; the
    token 'c' is always kept.

    :param text: string to tokenize
    :returns: list of the surviving tokens, in their original order
    """
    return [
        token
        for token in string_to_tokens(text)
        if token == 'c' or token not in stop_words
    ]

### Read Data

In [213]:
# Load the training CSV into a DataFrame. The path is relative to the
# notebook's working directory. Later cells read the 'Tags', 'Body' and
# 'Title' columns of this frame.
df = pd.read_csv('./../data/processed/train.csv')

### Data Pre-Processing

#### Tags Filtering

In [None]:
# Drop nan Tag values (drop_nan_tags works in place on df)
drop_nan_tags(df)

# Filtering to Top 100 tags on Frequency
# (topn_tags also turns each 'Tags' string into a list of tags)
unique_tags = topn_tags(df,100)

# Filtering the Dataset for top 100 tags
df['Tags'] = df['Tags'].apply(lambda x : get_intersection(x,unique_tags))
# Keep only the rows that still carry at least one of the top tags
df = df[df['Tags'].map(len) > 0]
# reset_index without inplace returns the new frame; with inplace=True it
# returns None, which would leave df unusable for the cells below.
df = df.reset_index(drop=True)

#### Separate Code from Body

In [214]:
# Each Body yields a [code snippets, remaining text] pair ...
df['Code'] = df['Body'].progress_apply(separate_code_and_body)
# ... which is then unpacked into the 'Code' and 'Description' columns.
df[['Code','Description']] = pd.DataFrame(df['Code'].tolist(), index=df.index)

100%|██████████████████████████████████| 10000/10000 [00:00<00:00, 45250.78it/s]


#### Convert to Lower

In [215]:
df['Title'] = df['Title'].progress_apply(convert_to_lower)
df['Description'] = df['Description'].progress_apply(convert_to_lower)
# 'Tags' holds a plain string when the tag-filtering cell was skipped and a
# list of tag strings once it has run, so lower-case either form.
df['Tags'] = df['Tags'].progress_apply(
    lambda tags: convert_to_lower(tags)
    if isinstance(tags, str)
    else [convert_to_lower(tag) for tag in tags]
)

100%|█████████████████████████████████| 10000/10000 [00:00<00:00, 365131.67it/s]
100%|█████████████████████████████████| 10000/10000 [00:00<00:00, 313363.22it/s]
100%|█████████████████████████████████| 10000/10000 [00:00<00:00, 501543.02it/s]


#### Description Processing

In [None]:
# Clean the description text: strip HTML tags, then newlines, then every
# character that is not a letter or a space.
df['Description'] = (
    df['Description']
    .progress_apply(remove_html_tags)
    .progress_apply(remove_n)
    .progress_apply(remove_special_chars)
)
# Tokenize the cleaned text and drop the stop words.
df['Description_Tokens'] = df['Description'].progress_apply(filter_tokens)