# 2. Text Clustering Algorithm Development.
---

<img src = "https://ogrisel.github.io/scikit-learn.org/sklearn-tutorial/_images/plot_mean_shift_1.png" width = "400px">


We need a way to identify which nodes are worth to explore. That's why in this module, we develop a text clustering algorithm.

These module makes usage of web scraping techniques together with an aplication of deep learning for NLP (Natural Language Processing) models with the aim of obtaining valuable information from web sources.

In [75]:
import pandas as pd
import gensim
from gensim.models.ldamulticore import LdaMulticore
from gensim import corpora, models
import pyLDAvis.gensim

import nltk
from nltk.corpus import stopwords
import string
from nltk.stem.wordnet import WordNetLemmatizer

import requests
from bs4 import BeautifulSoup
import pandas as pd

from itertools import chain

import warnings
warnings.simplefilter("ignore")

# Scraping

In [76]:
%%time

#Categories we took into account due to it's relation with human traffic dynamics.
key_words_labels = [["sex_services",["scorts", "prepagos", "webcams sexuales"]],
                     ["job_offers", ["trabajos", "empleos", "oferta laboral"]],
                     ["models", ["modelaje", "academia modelos", "agencia modelos"]],
                    ["human_traffic", ["trata de personas", "trafico de personas", "comercio de personas"]],
                    ["massages", ["masajista", "masajista bogota", "masajista relajación"]]
                    ]

num_topics = len(key_words_labels)

#Geting the information for training purposes

def search_google(key_words, cluster, num_pages = 10):
    final_text = []
    clusters = []
    for page in range(num_pages):
    
        url_base = "https://www.google.com/search?q="
        result = requests.get(url_base + key_words+"&start="+ str(page) + "0")
        src = result.content
        soup = BeautifulSoup(src, 'html.parser')

        test = soup.find_all(class_ ="BNeawe s3v9rd AP7Wnd")
        for element in test:
            if element.string not in final_text and element.string != None:
                final_text.append(element.string)
                clusters.append(cluster)
    return [final_text, clusters]

def get_info(list_, num_pages):
    num_pages = num_pages
    final_search = []
    final_clusters = []

    for cluster in list_:
        
        text_cluster = cluster[0]
        
        for key_word in cluster[1]:
            key_words = key_word
            search = search_google(key_words, text_cluster, num_pages)
            final_search += search[0]
            final_clusters += search[1]

    return [final_clusters, final_search]
                              
search = get_info(key_words_labels, 10)

Wall time: 2min 26s


In [77]:
# Data obtained

df = pd.DataFrame({"topic":search[0], "text":search[1]})
df

Unnamed: 0,topic,text


# 1. Preprocessing of Data

In [78]:
#Preprocessing of data

stop = set(stopwords.words('english')).union(stopwords.words('spanish'))
exclude = set(string.punctuation)
lemma = WordNetLemmatizer()

def clean(text):
    stop_free = " ".join([word for word in text.lower().split() if word not in stop])
    punct_free = "".join(ch for ch in stop_free if ch not in exclude)
    normalized = " ".join([lemma.lemmatize(word) for word in punct_free.split()])
    return normalized.split()

In [79]:
df['text_clean'] = df["text"].apply(clean)
df

Unnamed: 0,topic,text,text_clean


In [80]:
#Dictionary of words

dictionary = corpora.Dictionary(df["text_clean"])
dictionary.num_nnz

0

In [81]:
#Term Matrix

doc_term_matrix = [dictionary.doc2bow(doc) for doc in df["text_clean"]]
doc_term_matrix

[]

# Implementation of LDA Model

In natural language processing, the latent Dirichlet allocation model (LDA) is a generative statistical model that allows sets of observations to be explained by unobserved groups that explain why some parts of the data are similar. Here we are using it to cluster texts and by doing so we get a good insight about what information is usefull and what information should be discarted.

In [82]:
# Creating object
lda = gensim.models.ldamodel.LdaModel

In [None]:
#Training the model, here is where magic occurs :)
num_topics = len(key_words_labels)
%time ldamodel = lda(doc_term_matrix, num_topics = num_topics, id2word= dictionary, passes = 50, minimum_probability = 0)

In [13]:
# Probabilities of each cluster of text of interest, so in this case the probability for cluster 0 is about 94%.
corpus = ldamodel[doc_term_matrix]
corpus[10]

[(0, 0.9379641),
 (1, 0.015516543),
 (2, 0.015449975),
 (3, 0.015541942),
 (4, 0.015527468)]

# Analyzing the Model

In [15]:
#Information about clusters
ldamodel.print_topics(num_topics = num_topics)

[(0,
  '0.035*"·" + 0.012*"masaje" + 0.011*"webcam" + 0.009*"porno" + 0.008*"chicas" + 0.008*"bogotá" + 0.008*"sexo" + 0.007*"gratis" + 0.007*"empleo" + 0.006*"masajes"'),
 (1,
  '0.038*"modelos" + 0.022*"agencia" + 0.017*"modelaje" + 0.013*"academia" + 0.007*"porno" + 0.006*"masajes" + 0.006*"webcam" + 0.006*"masaje" + 0.006*"model" + 0.005*"bogotá"'),
 (2,
  '0.023*"persona" + 0.013*"trata" + 0.009*"colombia" + 0.007*"laboral" + 0.007*"oferta" + 0.005*"comercio" + 0.005*"mundo" + 0.005*"fotos" + 0.005*"masaje" + 0.004*"masajista"'),
 (3,
  '0.014*"persona" + 0.012*"trabajo" + 0.012*"bogotá" + 0.009*"trata" + 0.009*"escort" + 0.009*"empleo" + 0.009*"masajes" + 0.007*"·" + 0.007*"ofertas" + 0.007*"prepagos"'),
 (4,
  '0.016*"persona" + 0.011*"bogotá" + 0.011*"trabajo" + 0.010*"trata" + 0.007*"masajista" + 0.007*"empleos" + 0.006*"empleo" + 0.005*"ofertas" + 0.005*"tráfico" + 0.005*"scorts"')]

At the left a graphic of the different clusters that our model has learnt. This is done by the representation of words as mathematical objects such as vectors. At the right the keywords related to each cluster. 

In [17]:
lda_display = pyLDAvis.gensim.prepare(ldamodel, doc_term_matrix, dictionary, sort_topics = False, mds = "mmds")
pyLDAvis.display(lda_display)

In [20]:
#Geting a treshold for text_custering of unseen text
lda_corpus = ldamodel[doc_term_matrix]
scores = list(chain(*[[score for topic_id, score in topic] \
                      for topic in [doc for doc in lda_corpus]]))

threshold = sum(scores)/ len(scores)
threshold

0.1999999997639326

In [25]:
# Saving the model for usage in other modules.
from gensim.test.utils import datapath

temp_file = datapath("model")
ldamodel.save(temp_file)

dictionary.save("dictio")