In [15]:
import pandas as pd
import numpy as np 
from nltk.corpus import stopwords
import re
from bs4 import BeautifulSoup
# English stop words held in a set; nlp_preprocess drops any token found here.
# NOTE(review): presumably needs the NLTK "stopwords" corpus to be downloaded beforehand — confirm.
stop_words = set(stopwords.words('english'))

In [16]:
# Load only the first 1000 rows of sof.csv (path is relative to the working directory).
tp = pd.read_csv("sof.csv", nrows=1000)

In [17]:
# Work on a copy of the three columns used below, so `tp` itself is never modified.
df = tp[['Body', 'Title', 'Tags']].copy()
# Array of post bodies for the text-cleaning step further down.
# NOTE(review): .values may expose the frame's own buffer rather than a copy, so
# in-place edits to this array could also change df["Body"] — confirm.
body_text = df["Body"].values   
# Lower-case the titles; astype(str) runs first, so a missing title becomes the string "nan".
df["Title"] = df["Title"].astype(str).apply(lambda x: x.lower())


Extract the code part of the body text

In [18]:
def clean_code_text(t):
    """Strip markup from an HTML snippet and return it lower-cased.

    Intended for the string form of a ``<code>`` element: the tags are
    dropped but the text between them is kept.

    Parameters
    ----------
    t : str
        HTML fragment.

    Returns
    -------
    str
        ``t`` with every tag, HTML entity and line break replaced by a single
        space, converted to lower case.
    """
    # Tags first. A '<code>.*?</code>' pass used to run after this one, but by
    # then no tags were left for it to match, so it has been dropped.
    text = re.sub(r'<.*?>', ' ', t)
    # Match one entity at a time (&amp;, &#39;, &#x27;). A greedy '&.*;' wipes
    # everything between the first '&' and the last ';' on a line, which for
    # source code means most of the statement.
    text = re.sub(r'&#?\w+;', ' ', text)
    # A space rather than '' so tokens on adjacent lines are not glued together.
    text = re.sub(r'\n', ' ', text)
    return text.lower()

In [19]:
def extract_code(text):
    """Collect the cleaned ``<code>`` snippets of every post body.

    Parameters
    ----------
    text : sequence of str
        Post bodies as HTML (for example ``df["Body"].values``).

    Returns
    -------
    list of list of str
        Exactly one entry per element of ``text``, in the same order. Each
        entry holds that post's snippets after ``clean_code_text``; a body
        without a ``<code>`` tag, or one that is not a string, yields ``[]``.
    """
    all_code = []
    for body in text:
        # Always append one entry per body. Skipping code-free posts makes the
        # result shorter than the input, so snippets end up attached to the
        # wrong rows once the list is stored as a column.
        if not isinstance(body, str) or "<code>" not in body:
            all_code.append([])
            continue
        # Name the parser explicitly so the output does not depend on which
        # optional parser libraries happen to be installed.
        soup = BeautifulSoup(body, "html.parser")
        all_code.append(
            [clean_code_text(str(tag)) for tag in soup.find_all("code")]
        )
    return all_code

In [20]:
# Make a new column 'Code' for the extracted code part.
# The list is wrapped in a Series with a default 0..n-1 index and matched to df by index label.
# NOTE(review): rows only line up if extract_code returns exactly one entry per row of df — confirm.
df["Code"] = pd.Series(extract_code(df["Body"].values))

Clean the body text

In [21]:
def remove_htmltags(t):
    """Remove code blocks, tags and HTML entities from a post body.

    Parameters
    ----------
    t : str
        Post body as HTML.

    Returns
    -------
    str
        Lower-cased text in which every ``<code>...</code>`` section is
        deleted together with its contents, all remaining tags are deleted,
        and each HTML entity is replaced by a space.
    """
    # DOTALL lets '.' cross line breaks, so multi-line code blocks are removed
    # as well. The pattern must not require a space right before '</code>',
    # otherwise almost no real block matches.
    text = re.sub(r'<code>.*?</code>', '', t, flags=re.DOTALL)
    text = re.sub(r'<.*?>', '', text)
    # One entity at a time; a greedy '&.*;' would swallow ordinary text lying
    # between the first '&' and the last ';' on a line.
    text = re.sub(r'&#?\w+;', ' ', text)
    return text.lower()

In [22]:
# Every character deleted by remove_chars. The apostrophe is not in the list,
# so contractions such as "can't" pass through unchanged.
_STRIPPED_CHARS = '?!"#:=+_{}[]-$%^&|.,)(\\/~`><*@;\u2192'
_STRIP_TABLE = str.maketrans('', '', _STRIPPED_CHARS)
# U+2192 (rightwards arrow) as it appears when its UTF-8 bytes are mis-decoded
# as cp1252; removed as a whole sequence, never character by character.
_MOJIBAKE_ARROW = '\u00e2\u2020\u2019'


def remove_chars(t):
    """Delete punctuation and special characters from a string.

    The removed characters are ``? ! " # : = + _ { } [ ] - $ % ^ & | . , ) (
    \\ / ~ ` > < * @ ;`` and the arrow U+2192 (also in its mis-decoded
    three-character form). Everything else, including apostrophes, letters
    with accents and whitespace, is kept.

    Parameters
    ----------
    t : str
        Text to clean.

    Returns
    -------
    str
        ``t`` without the characters listed above.
    """
    # An explicit translation table avoids the pitfalls of a hand-written
    # character class: inside [...] a '|' is a literal, an unescaped ']' ends
    # the class early, and '^' / '$' outside a class are anchors, so they
    # never match the actual characters.
    return t.replace(_MOJIBAKE_ARROW, '').translate(_STRIP_TABLE)

In [23]:
# Pattern for http/https URLs, used to strip links from the post text.
# Written as a raw string: '\(' and '\)' are regex escapes, not Python string
# escapes, and a plain literal triggers an invalid-escape warning on recent
# Python versions. The pattern itself is unchanged.
url_regex = r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'

In [24]:
def de_contract(phrase):
    """Expand common English contractions and flatten line breaks.

    Parameters
    ----------
    phrase : str
        Text that may contain contractions written with a straight
        apostrophe (``'``).

    Returns
    -------
    str
        ``phrase`` with the contractions expanded ("won't" -> "will not",
        "it's" -> "it is", ...) and every line break replaced by a space.
    """
    # Order matters: the two irregular forms come first, and "n't" has to be
    # handled before the bare "'t" rule.
    expansions = (
        (r"won't", "will not"),
        (r"can\'t", "can not"),
        (r"n\'t", " not"),
        (r"\'re", " are"),
        (r"\'s", " is"),
        (r"\'d", " would"),
        (r"\'ll", " will"),
        (r"\'t", " not"),
        (r"\'ve", " have"),
        (r"\'m", " am"),
    )
    for pattern, replacement in expansions:
        phrase = re.sub(pattern, replacement, phrase)
    # Replace line breaks with a space. Deleting them outright fuses the last
    # word of one line with the first word of the next.
    return re.sub(r"\n", " ", phrase)

In [25]:
def nlp_preprocess(tt, index, stop_word_set=None):
    """Keep only the alphanumeric, non-stop-word tokens of a text.

    Parameters
    ----------
    tt : str
        Text to process. Any non-string value (int, float NaN, None, ...)
        is not processed and yields ``None``.
    index : int
        Position of the text in the dataset. Not used by the function; kept
        so existing calls keep working.
    stop_word_set : collection of str, optional
        Words to drop. Defaults to the module-level ``stop_words``.

    Returns
    -------
    str or None
        The surviving words, each followed by a single space (so a non-empty
        result ends with a space), or ``None`` for non-string input.
    """
    if not isinstance(tt, str):
        return None
    if stop_word_set is None:
        stop_word_set = stop_words

    kept = []
    for token in tt.split():
        # Drop every character that is not a letter or a digit.
        word = "".join(ch for ch in token if ch.isalnum())
        # Tokens made only of punctuation collapse to '' and are skipped, so
        # they no longer leave stray blanks in the output.
        if word and word not in stop_word_set:
            kept.append(word)
    return "".join(word + " " for word in kept)

In [26]:
def preprocess_dataset(posts_text):
    """Clean every post in ``posts_text`` in place.

    Each element is passed, in this order, through ``remove_htmltags``, URL
    removal with ``url_regex``, ``remove_chars``, ``de_contract`` and
    ``nlp_preprocess``; the result of every stage is written straight back to
    the same slot. Nothing is returned.
    """
    def strip_urls(s):
        return re.sub(url_regex, '', s)

    for idx in range(len(posts_text)):
        # String-to-string stages, applied one after another.
        for stage in (remove_htmltags, strip_urls, remove_chars, de_contract):
            posts_text[idx] = stage(posts_text[idx])
        # The last stage also receives the element's position.
        posts_text[idx] = nlp_preprocess(posts_text[idx], idx)

In [27]:
# Clean a private, writable copy of the bodies and store the result in the
# frame explicitly. Relying on in-place writes to the array behind df["Body"]
# is fragile: whether they reach the frame depends on that array being a
# writable view, and under pandas Copy-on-Write it is read-only.
body_text = df["Body"].to_numpy(copy=True)
preprocess_dataset(body_text)
df["Body"] = body_text

Clean the tags

In [28]:
def find_nan(x):
    """Replace every literal string ``"nan"`` with a real missing value.

    Parameters
    ----------
    x : pandas.Series or mutable sequence
        Values to scan, e.g. the ``Tags`` column.

    Returns
    -------
    pandas.Series or the input sequence
        For a Series, a new Series with the same index in which each
        ``"nan"`` string is replaced by ``np.nan``; the input Series is left
        untouched. Any other sequence (list, ndarray) is edited in place and
        returned.
    """
    if isinstance(x, pd.Series):
        # Vectorised and independent of the index: x[i] would look i up as a
        # label and fail on any index other than 0..n-1.
        return x.mask(x == "nan", np.nan)
    for position, value in enumerate(x):
        if isinstance(value, str) and value == "nan":
            x[position] = np.nan
    return x
        

In [29]:
def clean_tags(t):
    """Turn the ``<`` and ``>`` delimiters of tag strings into spaces.

    Parameters
    ----------
    t : pandas.Series
        Tag strings such as ``"<python><pandas>"``.

    Returns
    -------
    pandas.Series
        Same index as ``t``. Every non-missing value is converted to ``str``
        with ``<`` and ``>`` replaced by a space; missing values are returned
        unchanged.
    """
    def _strip_brackets(value):
        # Leave missing values alone: str(nan) is the text "nan", which would
        # later be counted as if it were a real tag.
        if pd.api.types.is_scalar(value) and pd.isna(value):
            return value
        return str(value).replace('<', ' ').replace('>', ' ')

    return t.apply(_strip_brackets)

In [30]:
def clean_all_tags(text):
    """Apply ``find_nan`` and then ``clean_tags`` to ``text``; return the result."""
    return clean_tags(find_nan(text))

In [31]:
# Overwrite the Tags column with the output of clean_all_tags.
df["Tags"] = clean_all_tags(df.Tags)

Get the list of tags, ordered from most to least frequent in the dataset

In [32]:
from collections import Counter

In [33]:
#Frequency of tags 

def count_tags(x):
    """Split a whitespace-separated tag string into a list of tags.

    Returns ``None`` when ``x`` is not a string.
    """
    return x.split() if isinstance(x, str) else None
    
def get_frequent_tags(df):
    """List every tag once, ordered from most to least frequent.

    Parameters
    ----------
    df : pandas.Series or iterable
        One whitespace-separated tag string per post. Entries that are not
        strings (e.g. NaN) are ignored.

    Returns
    -------
    list of str
        The distinct tags sorted by descending number of occurrences; tags
        with the same count keep the order in which they were first seen.
    """
    tally = Counter()
    for entry in df:
        # Same rule as count_tags: only strings carry tags.
        if isinstance(entry, str):
            tally.update(entry.split())
    # most_common() returns each tag exactly once. Matching tags to a sorted
    # list of counts, as done previously, repeated every tag that shared its
    # count with another one.
    return [tag for tag, _ in tally.most_common()]

In [34]:
# Tag list produced by get_frequent_tags from the cleaned Tags column.
tags = get_frequent_tags(df.Tags)

In [35]:
# Save the processed frame next to the notebook; by default to_csv also writes
# the index as an unnamed first column.
df.to_csv('out.csv')