Text Preprocessing
--------
Text preprocessing applied to the competition datasets.
The preprocessing consists of 4 steps:

 1. **Removing tags and URIs from contents**
 2. **Removing punctuation from titles and contents**
 3. **Removing stopwords from titles and contents**
 4. **Converting the tags from string to a list of tags**

These operations can be used as a first step for any further processing in the competition.

In [101]:
import pandas as pd
from bs4 import BeautifulSoup
from nltk.corpus import stopwords
import re
import string
import os
import itertools
import operator

Datasets loading
---------

In [113]:
# Pair every dataset name with the CSV file it is stored in, then load each
# file into a name -> DataFrame mapping (insertion order follows this list).
csv_paths = [
    ("cooking", "./input/cooking.csv"),
    ("crypto", "./input/crypto.csv"),
    ("robotics", "./input/robotics.csv"),
    ("biology", "./input/biology.csv"),
    ("travel", "./input/travel.csv"),
    ("diy", "./input/diy.csv"),
]
dataframes = {name: pd.read_csv(path) for name, path in csv_paths}

For simplicity, I'll show each preprocessing step on one item of the robotics dataset.

In [104]:
# Print the row at position 1 of the robotics dataset; it is the running example.
print(dataframes["robotics"].iloc[1])

id                                                         2
title      How can I modify a low cost hobby servo to run...
content    <p>I've got some hobby servos (<a href="http:/...
tags                                         control rcservo
Name: 1, dtype: object


Removing HTML tags and URIs from contents
-----------

In [22]:
uri_re = r'(?i)\b((?:https?://|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:\'".,<>?«»“”‘’]))'

def stripTagsAndUris(x):
    """Return the plain text of an HTML fragment, without code blocks or URIs.

    Args:
        x: HTML content of a post. Empty strings and missing values
            (``None`` or NaN) are accepted.

    Returns:
        The text of ``x`` with every ``<code>`` element dropped, the
        remaining markup stripped and everything matching ``uri_re``
        removed; ``""`` when ``x`` is empty or missing.
    """
    # NaN is truthy, so a plain truthiness test would pass a float on to
    # BeautifulSoup; treat missing values like empty content instead.
    if not x or pd.isnull(x):
        return ""
    soup = BeautifulSoup(x, "html.parser")
    # `soup.code` only ever yields the first <code> element, so keep removing
    # it until none is left; nested <code> elements go away with their parent.
    while soup.code is not None:
        soup.code.decompose()
    # Get all the text out of the html
    text = soup.get_text()
    # Returning text stripping out all uris
    return re.sub(uri_re, "", text)

In [114]:
# Run stripTagsAndUris over the "content" column of every dataset.
# This could take a while
for frame in dataframes.values():
    raw_content = frame["content"]
    frame["content"] = raw_content.map(stripTagsAndUris)

In [24]:
# Print the sample robotics row again to show the effect on "content".
print(dataframes["robotics"].iloc[1])

id                                                         2
title      How can I modify a low cost hobby servo to run...
content    I've got some hobby servos (Power HD 1501MGs) ...
tags                                         control rcservo
Name: 1, dtype: object


Removing punctuation from titles and contents
-----------

In [25]:
def removePunctuation(x):
    """Lowercase ``x`` and blank out non-ASCII and punctuation characters.

    Every affected character is replaced by a single space (not deleted), so
    the length of the lowercased text is preserved.

    Note: inside the character class the backslash from
    ``string.punctuation`` escapes the following ``]``, so backslashes
    themselves are left in place.
    """
    lowered = x.lower()
    # Anything outside the 7-bit ASCII range becomes a space.
    ascii_only = re.sub(r'[^\x00-\x7f]', r' ', lowered)
    # Character class built from the standard punctuation set.
    punctuation_class = "[" + string.punctuation + "]"
    return re.sub(punctuation_class, " ", ascii_only)

In [115]:
# Apply removePunctuation to the title first, then the content, of every dataset.
for frame in dataframes.values():
    for column in ("title", "content"):
        frame[column] = frame[column].map(removePunctuation)

In [27]:
# Print the sample robotics row again to show the lowercased, punctuation-free text.
print(dataframes["robotics"].iloc[1])

id                                                         2
title      how can i modify a low cost hobby servo to run...
content    i ve got some hobby servos  power hd 1501mgs  ...
tags                                         control rcservo
Name: 1, dtype: object


Removing stopwords from titles and contents
-----------

In [35]:
# NLTK's English stopword list, held in a set for fast membership tests.
stops = set(stopwords.words("english"))
# Matches tokens that are purely numeric, optionally followed by an ordinal
# suffix ("2", "1st", "22nd", "3rd", "4th"). The "st" suffix was previously
# missing, so "1st" and "21st" were not filtered out.
numberRegex = re.compile(r"^\d+(st|nd|rd|th)?$")

def shouldInclude(word):
    """Decide whether a token survives the stopword-removal step.

    Args:
        word: A single token (no surrounding whitespace).

    Returns:
        False for empty or one-character tokens, NLTK stopwords, tokens
        matched by ``numberRegex`` (like "2" or "3rd") and tokens starting
        with a backslash (LaTeX commands); True otherwise.
    """
    # Checked first so an empty string is rejected here instead of raising
    # IndexError on the first-character test.
    if len(word) <= 1:
        return False
    if word in stops or numberRegex.match(word) is not None:
        return False
    # Skip LaTeX commands beginning with \
    return not word.startswith('\\')

def removeStopwords(x):
    """Return ``x`` without the tokens that shouldInclude rejects.

    The text is split on whitespace and the surviving tokens are re-joined
    with single spaces.
    """
    kept_tokens = filter(shouldInclude, x.split())
    return " ".join(kept_tokens)

In [116]:
def create2grams(text):
    """Append hyphen-joined 2-grams of adjacent tokens to ``text``.

    Tokens are obtained by splitting on single spaces. The result is the
    original text, one space, then the space-separated 2-grams, so a text
    with fewer than two tokens comes back with just a trailing space.
    """
    tokens = text.split(" ")
    bigrams = ["-".join(pair) for pair in zip(tokens, tokens[1:])]
    return text + " " + " ".join(bigrams)

In [117]:
# For every dataset, drop stopwords from the title and content columns and
# then append the 2-grams of what is left.
for frame in dataframes.values():
    for column in ("title", "content"):
        without_stopwords = frame[column].map(removeStopwords)
        frame[column] = without_stopwords.map(create2grams)

In [119]:
# Show the sample robotics row, then its complete title so the appended
# 2-grams (truncated in the row summary) are visible.
sample_row = dataframes["robotics"].iloc[1]
print(sample_row)
print(sample_row['title'])

id                                                         2
title      modify low cost hobby servo run freely modify-...
content    got hobby servos power hd 1501mgs like able co...
tags                                         control rcservo
Name: 1, dtype: object
modify low cost hobby servo run freely modify-low low-cost cost-hobby hobby-servo servo-run run-freely


Splitting tags string in a list of tags
-----------

In [120]:
# Turn each whitespace-separated tag string into a list of individual tags.
for frame in dataframes.values():
    tag_strings = frame["tags"]
    frame["tags"] = tag_strings.map(lambda tag_string: tag_string.split())

In [121]:
# Print the sample robotics row again; "tags" is now a list.
print(dataframes["robotics"].iloc[1])

id                                                         2
title      modify low cost hobby servo run freely modify-...
content    got hobby servos power hd 1501mgs like able co...
tags                                      [control, rcservo]
Name: 1, dtype: object


Saving preprocessed dataframes to csv
-----------

In [122]:
for name, df in dataframes.items():
    # Each dataset is written to its own folder; create it when missing so
    # the export does not fail on a directory that was never set up by hand.
    out_dir = os.path.join(".", "preprocessed-input", "data-" + name)
    if not os.path.isdir(out_dir):
        os.makedirs(out_dir)
    # Saving to file
    df.to_csv(os.path.join(out_dir, name + "_light.csv"), index=False)

In [123]:
for name, df in dataframes.items():
    # Create the per-dataset output folder when missing so open() cannot fail
    # on a directory that was never set up by hand.
    out_dir = os.path.join(".", "preprocessed-input", "data-" + name)
    if not os.path.isdir(out_dir):
        os.makedirs(out_dir)
    with open(os.path.join(out_dir, name + "_gibbs.txt"), "w") as f:
        # First line: the number of rows (documents) that follow.
        f.write("{}\n".format(df.shape[0]))
        # One document per line: title and content joined by a space.
        # Zipping the two columns avoids building a Series for every row,
        # which is what iterrows() does.
        f.writelines(
            title + " " + content + "\n"
            for title, content in zip(df["title"], df["content"])
        )

In [48]:
def process_lda_results():
    # NOTE(review): this cell is truncated in the exported notebook - the
    # innermost loop has no body and nothing is returned, so the function does
    # not parse as written. The comments below cover only what is visible.
    for name, df in dataframes.items():
        # Open the "model-final.tassign" file stored in this dataset's
        # preprocessed-input folder.
        with open(os.path.join(".", "preprocessed-input", "data-" + name, "model-final.tassign"), "r") as f:
            tag_topics = {}
            # Walk the dataframe rows and the lines of the file in lockstep.
            # NOTE(review): itertools.izip exists only in Python 2; on
            # Python 3 the built-in zip is the equivalent.
            for question, tassign in itertools.izip(df.iterrows(), f):
                # NOTE(review): `for topic, in ...` unpacks every
                # space-separated token as a one-element sequence, which only
                # works for single-character tokens - presumably unfinished;
                # confirm the intended parsing of each token.
                for topic, in tassign.split(" "):


In [0]:
# Keep the result of process_lda_results in `res`; display_ds reads this name.
res = process_lda_results()

In [90]:
def display_ds(ds, n=10):
    """Print, for every tag of dataset ``ds``, its ``n`` highest-valued entries.

    Reads the module-level ``res`` mapping. For each tag the (key, value)
    items are ordered by value, largest first, and the first ``n`` are
    printed next to the tag name.
    """
    for tag, counts in res[ds].items():
        ranked = sorted(counts.items(), key=operator.itemgetter(1), reverse=True)
        topN = ranked[:n]
        print(tag, ":", topN)

In [91]:
# Print the 10 highest-valued entries (the default n) for every robotics tag.
display_ds("robotics")
# res["robotics"]["soccer"]

soccer : [('57', 13), ('42', 12), ('21', 11), ('60', 6), ('95', 6), ('9', 6), ('25', 5), ('17', 5), ('87', 5), ('18', 5)]
control : [('60', 1773), ('99', 1773), ('24', 802), ('61', 755), ('95', 702), ('63', 691), ('42', 531), ('50', 501), ('58', 456), ('82', 437)]
rcservo : [('89', 441), ('22', 211), ('24', 143), ('95', 130), ('42', 123), ('54', 104), ('73', 100), ('29', 77), ('61', 69), ('83', 53)]
gait : [('12', 29), ('95', 16), ('57', 8), ('18', 6), ('89', 4), ('74', 3), ('84', 2), ('40', 2), ('14', 2), ('10', 2)]
walk : [('12', 96), ('24', 18), ('5', 12), ('42', 12), ('54', 12), ('36', 12), ('57', 11), ('83', 10), ('17', 10), ('98', 9)]
microcontroller : [('6', 724), ('99', 541), ('24', 482), ('56', 417), ('16', 303), ('7', 297), ('54', 280), ('42', 264), ('61', 240), ('95', 202)]
arduino : [('54', 1676), ('25', 1286), ('42', 1118), ('24', 824), ('7', 723), ('80', 719), ('60', 650), ('95', 547), ('35', 484), ('73', 445)]
raspberry-pi : [('79', 1766), ('60', 807), ('23', 735), ('24'

In [124]:
# Report how many tags `res` holds for each dataset.
for name in dataframes:
    tag_count = len(res[name])
    print("{} has {} tags".format(name, tag_count))


cooking has 736 tags
crypto has 392 tags
robotics has 231 tags
biology has 678 tags
travel has 1645 tags
diy has 734 tags
