In [109]:
import pandas as pd
from string import punctuation
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer
import unidecode
from gensim import corpora
import gensim

import nltk
stemmer = nltk.stem.SnowballStemmer('portuguese')

#login = 108561

#archackathon.database.windows.net
#AcrConsultada
#Arc123456

### Lendo os dados

In [110]:
ranking_complete = pd.read_csv('data/ranking_complete.csv', encoding='latin-1')
ranking_complete = ranking_complete[ranking_complete['Pages'] > 2]

In [111]:
ranking_complete.head()

Unnamed: 0,Conforto / Acabamento,Consumo,Custo / Benefício,Design,Dirigibilidade,Manutenção,Performance,Marca,Modelo,Ano,Pages,Texto Positivo,Price,ranking,Value,vendas_normalizada
5,3.0,9.0,9.0,3.0,7.0,8.0,7.0,FIAT,UNO FURGÃO,2016,3,\r\r\nPontos positivos: responde bem ao que se...,30252.0,,0.0,0.0
6,5.8,8.2,7.3,6.8,7.3,7.8,7.2,VOLKSWAGEN,BRASÍLIA,1981,5,\r\r\nPontos positivos: todo bom\r\r\r\n ...,17800.0,,0.0,0.0
7,9.0,9.0,9.5,10.0,10.0,9.0,10.0,VOLKSWAGEN,FUSCA,2016,7,\r\r\nPontos positivos: confiança no uso diári...,107352.0,,0.0,0.0
8,5.0,0.0,5.0,10.0,7.0,8.0,7.0,FIAT,FIORINO,2019,4,\r\r\nPontos positivos: bom carro para trabalh...,60070.0,,0.0,0.0
9,6.5,8.5,9.0,6.2,9.0,9.5,9.2,FORD,COURIER,2013,5,\r\r\nPontos positivos: resist?ncia e performa...,24940.0,,0.0,0.0


### Limpando os textos

In [112]:
def clean_text(text):
    text = text.replace('Pontos positivos', '').replace('\n', '').replace('\r', '')
    
    tokens = text.split()
    table = str.maketrans('', '', punctuation)
    tokens = [w.translate(table) for w in tokens if len(w) > 2]
    
    stop_words = stopwords.words('portuguese')
    stop_words.extend(['carro', 'bom', 'ótimo', 'boa', 'bem', 'excelente', 'otimo'])
    
    stop_words = set(stop_words)
    
    tokens = [unidecode.unidecode(stemmer.stem(w)) for w in tokens if not w in stop_words]
    
    
    return tokens

### Fitando o modelo

In [113]:
transformer = CountVectorizer(analyzer=clean_text)
transformer.fit(ranking_complete['Texto Positivo'])
result = transformer.transform(ranking_complete['Texto Positivo'])

In [114]:
result_df = pd.DataFrame(result.todense(), columns= transformer.get_feature_names())

In [115]:
pd.DataFrame(result_df.sum()).sort_values(0, ascending=False)

Unnamed: 0,0
confort,1925
espac,1033
motor,954
consum,869
dirigibil,845
econom,808
design,803
intern,748
segur,701
acab,613


### Aplicando LDA

In [116]:
ranking_complete['bag_words'] = ranking_complete['Texto Positivo'].apply(lambda x: clean_text(x))

In [117]:
dictionary = corpora.Dictionary(ranking_complete['bag_words'])
corpus = [dictionary.doc2bow(text) for text in ranking_complete['bag_words']]

In [118]:
NUM_TOPICS = 5
ldamodel = gensim.models.ldamodel.LdaModel(corpus, num_topics = NUM_TOPICS, id2word=dictionary, passes=15)
#ldamodel.save('model5.gensim')
topics = ldamodel.print_topics(num_words=6)
for topic in topics:
    print(topic)

(0, '0.008*"econom" + 0.004*"fort" + 0.003*"barat" + 0.003*"veicul" + 0.003*"motor" + 0.003*"baix"')
(1, '0.016*"design" + 0.015*"confort" + 0.013*"consum" + 0.012*"motor" + 0.012*"acab" + 0.011*"segur"')
(2, '0.012*"confort" + 0.011*"design" + 0.006*"segur" + 0.005*"estabil" + 0.005*"espac" + 0.004*"motor"')
(3, '0.034*"confort" + 0.020*"espac" + 0.016*"motor" + 0.015*"consum" + 0.015*"econom" + 0.014*"dirigibil"')
(4, '0.028*"confort" + 0.018*"design" + 0.014*"motor" + 0.013*"dirigibil" + 0.012*"espac" + 0.011*"segur"')


In [119]:
#new_doc = prepare_text_for_lda(new_doc)
def get_group_lda(list_text):
    #new_doc = clean_text(list_text)
    new_doc_bow = dictionary.doc2bow(list_text)
    #print(new_doc_bow)
    result = ldamodel.get_document_topics(new_doc_bow)[0][0]
    
    return result

In [120]:
ranking_complete['Grupo'] = ranking_complete['bag_words'].apply(lambda x: get_group_lda(x))

In [129]:
ranking_complete[ranking_complete['Grupo'] == 4]

Unnamed: 0,Conforto / Acabamento,Consumo,Custo / Benefício,Design,Dirigibilidade,Manutenção,Performance,Marca,Modelo,Ano,Pages,Price,ranking,Value,vendas_normalizada,Grupo
14,6.2,6.2,5.1,6.9,7.2,4.1,6.8,FIAT,DUCATO,2018,3,108500.0,,0.0,0.0,4
68,7.6,7.6,7.6,8.7,8.4,6.7,8.0,PEUGEOT,206 SW,2008,7,15518.0,,0.0,0.0,4
85,8.7,7.0,7.4,9.5,9.0,7.6,7.6,HYUNDAI,I30,2016,7,70160.0,,0.0,0.0,4
105,9.5,7.5,9.0,10.0,9.5,9.0,9.0,CITROËN,C4 PICASSO,2019,6,134990.0,,0.0,0.0,4
108,7.6,6.4,7.1,8.6,7.6,7.5,7.7,HYUNDAI,HB20,2019,7,43990.0,2.0,50.419,0.562586,4
109,8.7,8.3,8.0,9.4,9.3,8.5,9.5,CHEVROLET,TRACKER,2018,7,85890.0,26.0,13.44,0.149967,4
111,7.9,8.4,8.2,9.1,8.6,6.2,8.9,JAC,J2,2016,5,42390.0,,0.0,0.0,4
123,9.8,7.8,8.8,7.8,9.6,7.8,9.2,FIAT,MAREA,2007,7,16050.0,,0.0,0.0,4
144,8.6,7.2,7.8,9.3,9.1,7.1,8.3,FIAT,TORO,2019,6,90990.0,,0.0,0.0,4
151,8.7,6.3,7.8,9.4,8.9,7.5,8.7,VOLKSWAGEN,NEW BEETLE,2010,4,39175.0,,0.0,0.0,4


In [130]:
ranking_complete[ranking_complete['Grupo'] == 0]

Unnamed: 0,Conforto / Acabamento,Consumo,Custo / Benefício,Design,Dirigibilidade,Manutenção,Performance,Marca,Modelo,Ano,Pages,Price,ranking,Value,vendas_normalizada,Grupo
24,7.6,9.4,8.4,7.2,7.8,6.4,7.7,PEUGEOT,106,2001,5,7331.0,,0.0,0.0,0
27,8.5,7.5,6.5,9.5,8.5,7.0,6.5,KIA,SOUL,2019,3,89990.0,,0.0,0.0,0
30,8.5,7.7,8.3,8.3,8.5,8.0,8.7,FORD,PAMPA,1997,4,11941.0,,0.0,0.0,0
43,6.1,6.2,7.1,7.0,7.2,6.9,7.3,CHEVROLET,CLASSIC,2016,7,28914.0,,0.0,0.0,0
56,8.0,7.5,7.0,7.5,9.0,7.0,7.5,TROLLER,T4,2018,6,128994.0,,0.0,0.0,0
67,7.9,8.3,8.2,8.0,8.6,8.3,8.1,CHERY,FACE,2015,5,23605.0,,0.0,0.0,0
92,9.5,9.2,6.5,8.5,9.8,6.5,9.2,LAND ROVER,FREELANDER,2015,4,121437.0,,0.0,0.0,0
103,7.8,8.1,7.7,9.0,8.5,6.9,7.8,SEAT,IBIZA,2002,3,8827.0,,0.0,0.0,0
147,8.2,8.5,8.1,9.2,8.7,8.1,8.0,CITROËN,C3,2019,7,50390.0,47.0,3.639,0.040605,0
220,8.1,8.7,8.6,9.1,9.3,8.4,9.1,VOLKSWAGEN,POLO,2019,7,50490.0,4.0,34.138,0.380919,0


In [123]:
ranking_complete = ranking_complete.drop(['Texto Positivo', 'bag_words'], 1)

In [124]:
ranking_complete.to_csv('ranking_complete_group.csv', index=False)