## PROCON

In [1]:
import numpy as np
import pandas as pd
import keras
import tensorboard
import re
import os
import codecs
import nltk 

from nltk.stem import RSLPStemmer
from nltk.corpus import stopwords
from sklearn import metrics
from sklearn.preprocessing import StandardScaler

np.random.seed(13)


  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [2]:
allFiles = [
    "data//raw//reclamacoes-fundamentadas-sindec-2012.csv",
    "data//raw//reclamacoes-fundamentadas-sindec-2013.csv",
    "data//raw//reclamacoes-fundamentadas-sindec-2014.csv",
    "data//raw//reclamacoes-fundamentadas-sindec-2015.csv",
    "data//raw//reclamacoes-fundamentadas-sindec-2016.csv"    
]


lista_dfs = []
procon = pd.DataFrame()
for file in allFiles:
    df = pd.read_csv(file, index_col=None, header=0, low_memory=False)
    lista_dfs.append(df)
procon = pd.concat(lista_dfs)


In [42]:
print("Procon data, shape: ",  procon.shape)

#numSampleRow = 5000
#procon = procon.sample(numSampleRow)

DescricaoProblema = (procon.iloc[:,19])
DescricaoAssunto = (procon.iloc[:,17])

DescricaoAssuntoProblema =  DescricaoAssunto + " "+ DescricaoProblema


Procon data, shape:  (1206072, 23)


In [21]:
from nltk.stem.snowball import SnowballStemmer
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('perluniprops')


#nltk's stopwords
stopwords = nltk.corpus.stopwords.words('portuguese')

#nltk's SnowballStemmer
stemmer = SnowballStemmer("portuguese")

#Def of functions to process problems description texts
def removeStopWords(sentence):
    stopwords1 = nltk.corpus.stopwords.words('portuguese')
    phrase = []
    for word in sentence:
        if word not in stopwords1:
            phrase.append(word)
    return phrase

def tokenizeAndStem(text):
    tokens = [word for sent in nltk.sent_tokenize(text) for word in nltk.word_tokenize(sent)]
    filtered_tokens = []
    for token in tokens:
        if re.search('[a-zA-Z]', token):
            filtered_tokens.append(token)
    stems = [stemmer.stem(t) for t in filtered_tokens]
    return stems


def tokenize(text):
    tokens = [word.lower() for sent in nltk.sent_tokenize(text) for word in nltk.word_tokenize(sent)]
    filtered_tokens = []
    for token in tokens:
        if re.search('[a-zA-Z]', token):
            filtered_tokens.append(token)
    return filtered_tokens


[nltk_data] Downloading package punkt to /home/hadoopen/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/hadoopen/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package perluniprops to
[nltk_data]     /home/hadoopen/nltk_data...
[nltk_data]   Package perluniprops is already up-to-date!


In [22]:
from nltk.tokenize.moses import MosesDetokenizer
detokenizer = MosesDetokenizer()

vocabularyTokenized = []
vocabularyTokenizedAndStemmed = []
descricaoAssuntoProblemaAdjusted = []

for i in DescricaoAssuntoProblema:
    #Not considering 'nan' data
    if type(i) == str:
        i = i.replace('/', ' ') 
        i = i.replace('etc', ' ') 
        
        #Remove stopwords and tokenize
        newString = detokenizer.detokenize(removeStopWords(tokenize(i)), return_str=True)
        descricaoAssuntoProblemaAdjusted.append(newString)
        
        wordsTokenizedAndStemmed = tokenizeAndStem(newString) 
        vocabularyTokenizedAndStemmed.extend(wordsTokenizedAndStemmed)
        wordsTokenized = tokenize(newString)
        vocabularyTokenized.extend(wordsTokenized)
    else: #in case of 'nan'
        descricaoAssuntoProblemaAdjusted.append("")
        vocabularyTokenizedAndStemmed.extend("")
        vocabularyTokenized.extend("")
        

#Vocabulary of dataframe with stemmed vocabulary as index and the tokenized words as the column.
vocabularyDescription = pd.DataFrame({"words": vocabularyTokenized}, index = vocabularyTokenizedAndStemmed)
vocabularyDescription = vocabularyDescription.drop_duplicates()

vocabularyDescription.count()

words    1176
dtype: int64

In [24]:
from sklearn.feature_extraction.text import TfidfVectorizer

#Vectorizer parameters
tfidfVectorizer = TfidfVectorizer(max_df=0.8,
                                   min_df=0.01,
                                   stop_words=nltk.corpus.stopwords.words('portuguese'),
                                   use_idf=True,
                                   tokenizer=tokenizeAndStem, ngram_range=(1,3))

tfidfMatrix = tfidfVectorizer.fit_transform(descricaoAssuntoProblemaAdjusted) #fit the vectorizer to synopses

#Add terms on vocabulary
terms = tfidfVectorizer.get_feature_names()      


In [26]:
#Dimentional reduction
from sklearn.manifold import TSNE
from sklearn.decomposition import TruncatedSVD

tfidfReduced = TruncatedSVD(n_components=2, random_state=0).fit_transform(tfidfMatrix)
tfidfEmbedded = TSNE(n_components=2, verbose=2, perplexity=40,).fit_transform(tfidfReduced)

In [27]:
#Selecting the number of clusters with silhouette analysis
from sklearn.datasets import make_blobs
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_samples, silhouette_score

rangeClusters = [2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, 35, 40, 50, 60, 70, 80, 90, 100]

for numberClusters in rangeClusters:
    clusterer = KMeans(n_clusters=numberClusters, random_state=10)
    cluster_labels = clusterer.fit_predict(tfidfEmbedded)

    # The silhouette_score gives the average value for all the samples.
    silhouette_avg = silhouette_score(tfidfEmbedded, cluster_labels)
    print("For n clusters =", numberClusters,
          "The average silhouette_score is :", silhouette_avg)


In [29]:
#KMeans Custering
from sklearn.cluster import KMeans

#Best cluster number based on silhouette analysis
numClusters = 80

km = KMeans(n_clusters=numClusters)

km.fit(tfidfMatrix)

clusters = km.labels_.tolist()

In [30]:
#ploting main finds about clusters

problems = {'problem': descricaoAssuntoProblemaAdjusted, 'cluster': clusters}
clustersFrame = pd.DataFrame(problems, index = [clusters] , columns = ['problem', 'cluster'])

orderCentroids = km.cluster_centers_.argsort()[:, ::-1] 

for i in range(numClusters):
    clusterCount = clustersFrame[clustersFrame['cluster'] == i]['cluster'].value_counts()
    print("Cluster %d, Count: %d \nMain terms:" % (i, clusterCount), end='')
    
    for ind in orderCentroids[i, :6]: #replace 6 with n words per cluster
        print(' %s' % vocabularyDescription.ix[terms[ind].split(' ')].values.tolist()[0][0], end=',')
    print("\n")

Cluster 0, Count: 9859 
Main terms: lavar, lavar, lavar, louça, roupa, roupa,

Cluster 1, Count: 7176 
Main terms: crédito, crédito, cartão, cartão, crédito, cartão,

Cluster 2, Count: 19235 
Main terms: entrega, etc, etc, entrega, demora, demora,

Cluster 3, Count: 21221 
Main terms: produtos, informática, informática, microcomputador, produtos, microcomputador,

Cluster 4, Count: 15385 
Main terms: banco, banco, comercial, entrega, indevida, retenção,

Cluster 5, Count: 43331 
Main terms: produtos, produtos, informática, informática, produtos, microcomputador,

Cluster 6, Count: 52550 
Main terms: fixa, plano, venda, compra, compra, telefone,

Cluster 7, Count: 65245 
Main terms: interfone, etc, etc, interfone, convencional, telefone,

Cluster 8, Count: 12907 
Main terms: video-laser, filmadora, video-laser, televisão, televisão, televisão,

Cluster 9, Count: 21471 
Main terms: serviço, cumprimento, entrega, entrega, instalação, instalação,

Cluster 10, Count: 5191 
Main terms: curso

.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
  del sys.path[0]


Cluster 26, Count: 14662 
Main terms: incorporação, construtoras, construtoras, incorporação, incorporação, contrato,

Cluster 27, Count: 13318 
Main terms: duvidas, duvidas, duvidas, sobre, sobre, cobrança,

Cluster 28, Count: 35283 
Main terms: entrega, entrega, demora, entrega, entrega, demora,

Cluster 29, Count: 3774 
Main terms: entrega, financeira, contrato, compra, valor, rescisão,

Cluster 30, Count: 12895 
Main terms: publicidade, publicidade, enganosa, oferta, venda, cursos,

Cluster 31, Count: 14153 
Main terms: compra, desistência, cancelamento, desistência, compra, compra,

Cluster 32, Count: 9803 
Main terms: calculo, calculo, prestação, taxas, cartão, cartão,

Cluster 33, Count: 12131 
Main terms: turísticos, pacotes, pacotes, viagens, operadoras, agências,

Cluster 34, Count: 29932 
Main terms: cassete, video-laser, televisão, filmadora, filmadora, cassete,

Cluster 35, Count: 13276 
Main terms: seguro, saúde, outros, indevida, cobrança, cobrança,

Cluster 36, Count: 1

In [31]:
import matplotlib.pyplot as plt
import matplotlib.cm as cm
import numpy as np

#Ploting clusters distribution chart
fig = plt.figure(figsize = (10, 10))
ax = plt.axes()
plt.scatter(tfidfEmbedded[:, 0], tfidfEmbedded[:, 1], marker = "x", c = km.labels_)
plt.show()

In [69]:
#Add cluster to procon database
proconResult = procon

clusterSerie = pd.Series(clusters)
assuntoProblemaSerie = pd.Series(descricaoAssuntoProblemaAdjusted)
proconResult['cluster'] = clusterSerie.values
proconResult['descAssuntoProblema'] = assuntoProblemaSerie.values


In [70]:
#create data frame with clusters final results with problems description

#Print the final procon database
proconResult.to_csv("data//final//consumer-complaints-clustering-result.csv", sep='\t', encoding='utf-8')


#Print only descricaoAssuntoProblema with clusters
#dfResult = pd.DataFrame(dict(label=clusters, title=descricaoAssuntoProblemaAdjusted))
#dfResult.to_csv("data//clustersResult.csv", sep='\t', encoding='utf-8')