### Data cleaning

In [1]:
import pandas as pd
import numpy as np 
from nltk.corpus import stopwords
import re
from bs4 import BeautifulSoup
# English stop words, held in a set for the per-word membership test done during cleaning.
english_stop_words = stopwords.words('english')
stop_words = set(english_stop_words)

In [2]:
# Load only the first N_ROWS rows of the posts file.
POSTS_CSV = "postsonly.csv"
N_ROWS = 10000
tp = pd.read_csv(POSTS_CSV, nrows=N_ROWS)

FileNotFoundError: [Errno 2] File b'postsonly.csv' does not exist: b'postsonly.csv'

In [None]:
# Show the loaded posts frame as the cell output.
tp

In [None]:
# Preview the first rows of the raw posts frame.
tp.head()

In [None]:
# Work on an independent copy restricted to the columns used below.
KEPT_COLUMNS = ['ID', 'Body', 'Title', 'Tags', 'Score']
df = tp[KEPT_COLUMNS].copy()
body_text = df["Body"].values
# Titles are cast to str and lower-cased.
df["Title"] = df["Title"].astype(str).str.lower()

Extract the code part of the body text

In [None]:
def clean_code_text(t):
    """Strip HTML markup from a code snippet and return it lower-cased.

    Every tag (including the surrounding ``<code>``/``</code>``) and every
    HTML entity such as ``&lt;`` or ``&#39;`` is replaced by a single space;
    newlines are removed without a replacement.

    Parameters
    ----------
    t : str
        Raw HTML of one snippet, e.g. ``"<code>a &lt; b</code>"``.

    Returns
    -------
    str
        The lower-cased snippet text with markup removed.
    """
    # Tags first: the snippet itself must survive, only the markup goes away.
    text = re.sub(r'<.*?>', ' ', t)
    # Match one entity at a time. A greedy '&.*;' would swallow everything
    # between the first '&' and the last ';' on a line, including real code.
    text = re.sub(r'&#?\w+;', ' ', text)
    text = re.sub(r"\n", "", text)
    return text.lower()

In [None]:
def extract_code(text):
    """Collect the cleaned ``<code>`` snippets of every post body.

    Parameters
    ----------
    text : iterable of str
        Post bodies as raw HTML (array, list or Series values). Entries that
        are not strings (e.g. NaN for a missing body) are treated as having
        no code.

    Returns
    -------
    list of list of str
        Exactly one entry per input body, in input order. Each entry holds
        the snippets of that body passed through ``clean_code_text``; bodies
        without any ``<code>`` tag get an empty list. Keeping one entry per
        body is what lets the result be assigned as a column aligned with the
        rows the bodies came from.
    """
    all_code = []
    for body in text:
        if not isinstance(body, str) or "<code>" not in body:
            all_code.append([])
            continue
        # Name the parser explicitly so the result does not depend on which
        # optional parser libraries happen to be installed.
        soup = BeautifulSoup(body, "html.parser")
        all_code.append([clean_code_text(str(tag)) for tag in soup.find_all("code")])
    return all_code

In [None]:
# Attach the extracted code snippets as a new 'Code' column.
code_snippets = extract_code(df["Body"].values)
df["Code"] = pd.Series(code_snippets)

Clean the body text

In [None]:
def remove_htmltags(t):
    """Remove code blocks, HTML tags and HTML entities from a post body.

    Parameters
    ----------
    t : str
        Raw HTML of one post body.

    Returns
    -------
    str
        Lower-cased text in which every ``<code>...</code>`` block (with its
        content) and every remaining tag is deleted, and every HTML entity
        such as ``&amp;`` is replaced by a single space.
    """
    # DOTALL lets '.' cross newlines so multi-line code blocks are removed too.
    text = re.sub(r'<code\b[^>]*>.*?</code>', '', t, flags=re.DOTALL)
    text = re.sub(r'<.*?>', '', text)
    # One entity per match. A greedy '&.*;' would delete all the text between
    # the first '&' and the last ';' on a line.
    text = re.sub(r'&#?\w+;', ' ', text)
    return text.lower()

In [None]:
# One well-formed character class for everything remove_chars deletes.
# '[', ']', '-' and '\' are escaped so they are literal members of the class;
# '^' and '$' are literal inside a class as long as '^' is not first.
# \u00e2 \u2020 \u2019 are kept from the original pattern (presumably the
# mis-decoded bytes of the arrow U+2192, which is now listed as well).
_REMOVE_CHARS_RE = re.compile(
    r'[?|!"#:=+_{}\[\]\-$%^&.,)(\\/~`><*@;\u00e2\u2020\u2019\u2192]'
)


def remove_chars(t): #function to clean the word of any punctuation/chars
    """Delete punctuation and symbol characters from ``t``.

    Apostrophes are left in place (they are not part of the removed set), so
    contractions such as ``don't`` stay intact.

    Parameters
    ----------
    t : str
        Text to clean.

    Returns
    -------
    str
        ``t`` with every character of the removal set deleted.
    """
    return _REMOVE_CHARS_RE.sub('', t)

In [None]:
# Regex for http/https URLs. Written as a raw string so the escaped
# parentheses reach the regex engine unchanged instead of being parsed as
# (invalid) Python string escapes.
re_url = r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'

In [None]:
def de_contract(phrase):
    """Expand common English contractions in ``phrase``.

    The substitutions are applied one after another in the order listed, so
    the specific forms ("won't", "can't") are rewritten before the generic
    "n't" / "'t" rules can touch them.

    Parameters
    ----------
    phrase : str
        Text possibly containing contractions.

    Returns
    -------
    str
        The text with the listed contractions expanded.
    """
    rewrites = (
        (r"won't", "will not"),
        (r"can\'t", "can not"),
        (r"n\'t", " not"),
        (r"\'s", " is"),
        (r"\'t", " not"),
        (r"\'ve", " have"),
        (r"\'m", " am"),
    )
    for pattern, replacement in rewrites:
        phrase = re.sub(pattern, replacement, phrase)
    return phrase

In [None]:
def preprocessing_text(tt, index, stopword_set=None):
    """Strip non-alphanumeric characters from each word and drop stop words.

    Parameters
    ----------
    tt : str
        Text to process. Any non-string value (an int, a NaN float, None)
        yields an empty string.
    index : int
        Position of the text in its dataset. Accepted for compatibility with
        existing callers; not used.
    stopword_set : collection of str, optional
        Words to drop. Defaults to the notebook-level ``stop_words`` set.

    Returns
    -------
    str
        The kept words joined by single spaces.
    """
    if not isinstance(tt, str):
        return ""
    if stopword_set is None:
        stopword_set = stop_words
    kept = []
    for raw_word in tt.split():
        word = "".join(ch for ch in raw_word if ch.isalnum())
        # Tokens made only of punctuation collapse to "" and are skipped.
        if word and word not in stopword_set:
            kept.append(word)
    return " ".join(kept)

In [None]:
def preprocess_dataset(posts_text):
    for i in range(len(posts_text)):
        posts_text[i] = remove_htmltags(posts_text[i])
        posts_text[i] = re.sub(re_url, '', posts_text[i] )
        posts_text[i] = remove_chars(posts_text[i])
        posts_text[i] = de_contract(posts_text[i])
        posts_text[i] = preprocessing_text(posts_text[i], i)

In [None]:
# Clean a private, writable copy of the bodies and store the result back
# explicitly. Whether `.values` hands out a view or a copy of the column is a
# pandas implementation detail, so the cleaned text must not be left to reach
# `df` through a shared buffer.
body_text = df["Body"].values.copy()
preprocess_dataset(body_text)
df["Body"] = body_text

Clean the tags

In [None]:
# Find all "nan" strings in Tags and replace them with np.nan
def find_nan(x):
    """Turn the literal string ``"nan"`` into ``np.nan``.

    Accepts either a single value or an indexable sequence, so the function
    behaves the same whichever call style is used.

    Parameters
    ----------
    x : str, scalar or mutable sequence
        * a string: returned unchanged unless it equals ``"nan"``;
        * any other value without a length: returned unchanged;
        * a sequence: every element equal to ``"nan"`` is overwritten in
          place via ``x[i]``.

    Returns
    -------
    object
        ``np.nan``, the unchanged scalar, or the same (mutated) sequence.
    """
    if isinstance(x, str):
        return np.nan if x == "nan" else x
    try:
        size = len(x)
    except TypeError:
        # Non-string scalar (e.g. a float that is already NaN): nothing to do.
        return x
    for i in range(size):
        if x[i] == "nan":
            x[i] = np.nan
    return x
        

In [None]:
def clean_tags(t):
    """Replace the angle brackets around tags with spaces.

    Parameters
    ----------
    t : pandas.Series
        Tag strings such as ``"<python><pandas>"``; values are cast to str
        first.

    Returns
    -------
    pandas.Series
        New Series in which every ``<`` and ``>`` is a single space.
    """
    brackets_to_spaces = str.maketrans({'<': ' ', '>': ' '})
    as_text = t.astype(str)
    return as_text.apply(lambda tag_string: tag_string.translate(brackets_to_spaces))

In [None]:
def clean_all_tags(text):
    """Clean a Series of tag strings and mark missing entries as NaN.

    ``clean_tags`` casts every value to str, which spells a missing value as
    the text ``"nan"``. The NaN restoration therefore has to run after the
    cleaning; running it first would be undone by that cast.

    Parameters
    ----------
    text : pandas.Series
        Raw tag strings.

    Returns
    -------
    pandas.Series
        Cleaned tags, with ``"nan"`` entries replaced by ``np.nan``.
    """
    cleaned = clean_tags(text)
    return find_nan(cleaned)

In [None]:
# clean_tags processes the whole column in one call, so it is applied once
# rather than once per row.
df["Tags"] = clean_tags(df["Tags"])

### Extract keywords using PyTextrank

In [None]:
import spacy
import pytextrank

In [None]:
# Load the spaCy pipeline named "en_core_web_sm".
nlp = spacy.load("en_core_web_sm")
# Create the pytextrank TextRank object and register its component as the last
# pipeline stage under the name "textrank".
# NOTE(review): passing `tr.PipelineComponent` to add_pipe looks like the
# spaCy 2.x / pytextrank 2.x calling convention; confirm the pinned versions.
tr = pytextrank.TextRank()
nlp.add_pipe(tr.PipelineComponent, name="textrank", last=True)

In [None]:
# Renumber the rows 0..n-1. drop=True discards the old index instead of
# inserting it as an extra "index" column, which also keeps this cell safe to
# re-run.
df = df.reset_index(drop=True)

In [None]:
# Remove a leftover "index" column if one exists (errors='ignore' makes this a no-op otherwise).
df = df.drop(columns=["index"], errors='ignore')
# Re-read the bodies from the frame for keyword extraction.
body_text = df["Body"].values

In [None]:
phrase_list = []

for body in body_text:
    doc = nlp(body)
    # Keep only the first phrase pytextrank yields for the post (presumably
    # the top-ranked one), split into words. A post that yields no phrase
    # still gets an (empty) entry so phrase_list stays aligned one-to-one
    # with body_text.
    first_phrase = next(iter(doc._.phrases), None)
    phrase_list.append(str(first_phrase).split() if first_phrase is not None else [])

In [None]:
# Store the per-post keyword lists as a new 'Keywords' column.
keywords_column = pd.Series(phrase_list)
df["Keywords"] = keywords_column

In [None]:
# Stringify the keyword lists and drop the square brackets of the list repr.
s = (
    df["Keywords"]
    .astype(str)
    .str.replace("[", "", regex=False)
    .str.replace("]", "", regex=False)
)
df["Keywords"] = s

In [None]:
# Stringify the code lists and drop every square bracket from the result.
s = (
    df["Code"]
    .astype(str)
    .str.replace("[", "", regex=False)
    .str.replace("]", "", regex=False)
)
df["Code"] = s

In [None]:
# Remove the single quotes and commas left over from the list repr.
t = (
    df["Keywords"]
    .astype(str)
    .str.replace("'", "", regex=False)
    .str.replace(",", "", regex=False)
)
df["Keywords"] = t


In [None]:
# Remove every single quote and comma from the stringified code column.
t = (
    df["Code"]
    .astype(str)
    .str.replace("'", "", regex=False)
    .str.replace(",", "", regex=False)
)
df["Code"] = t

In [None]:
def find_nan(x):
    """Turn the literal string ``"nan"`` into ``np.nan``.

    Works element-wise (e.g. through ``Series.apply``) and also on an
    indexable sequence, so both call styles give the same result.

    Parameters
    ----------
    x : str, scalar or mutable sequence
        * a string: returned unchanged unless it equals ``"nan"``;
        * any other value without a length: returned unchanged;
        * a sequence: every element equal to ``"nan"`` is overwritten in
          place via ``x[i]``.

    Returns
    -------
    object
        ``np.nan``, the unchanged scalar, or the same (mutated) sequence.
    """
    if isinstance(x, str):
        return np.nan if x == "nan" else x
    try:
        size = len(x)
    except TypeError:
        # Non-string scalar (e.g. a float that is already NaN): nothing to do.
        return x
    for i in range(size):
        if x[i] == "nan":
            x[i] = np.nan
    return x
        

In [None]:
# Map the text "nan" in Keywords to a real missing value.
t = df["Keywords"].astype(str).apply(find_nan)
df["Keywords"] = t


In [None]:
# Lift the display limits on both rows and columns.
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)

In [None]:
# Drop the two-character sequence backslash + "r" from the code text.
m = df["Code"].astype(str).str.replace("\\r", "", regex=False)
df["Code"] = m

In [None]:
# Map the text "nan" in Code to a real missing value.
m = df["Code"].astype(str).apply(find_nan)
df["Code"] = m

In [None]:
# Persist the cleaned frame.
CLEAN_POSTS_CSV = "Clean_post_data.csv"
df.to_csv(CLEAN_POSTS_CSV)

In [None]:
# Preview only the first rows. With display.max_rows lifted, a bare `df`
# would render every row into the notebook output.
df.head()