In [1]:
from joblib import Parallel, delayed

import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel

import matplotlib.pyplot as plt
import numpy as np
from gensim.models.ldamulticore import LdaMulticore
import pandas as pd 
import pyLDAvis
import pyLDAvis.gensim


def sent_to_words(sentences):
    """Tokenise each text into a list of word tokens.

    Every item of ``sentences`` is converted with ``str()`` and passed to
    ``gensim.utils.simple_preprocess`` with ``deacc=True`` (accent marks are
    stripped from the tokens).

    Missing values (``None`` or a float NaN, which is what pandas produces for
    empty CSV cells) yield an empty token list instead of being stringified
    into the spurious tokens "none" / "nan". One list is always yielded per
    input item, so the output stays aligned with the input rows.

    Args:
        sentences: iterable of texts (normally ``str``).

    Yields:
        list[str]: the tokens of each text, in input order.
    """
    for sentence in sentences:
        # NaN is the only float that is not equal to itself.
        is_missing = sentence is None or (
            isinstance(sentence, float) and sentence != sentence
        )
        if is_missing:
            yield []
            continue
        yield gensim.utils.simple_preprocess(str(sentence), deacc=True)



In [2]:
# Load the pre-processed tweets. The path is hard-coded here; the disabled
# alternative was to receive it on the command line (sys.argv[1]).
df = pd.read_csv("datosProcesados.csv")

# Keep only the United tweets labelled as negative (__target__ == 0).
# Positive and neutral tweets can be analysed later in the same way.
X = df[(df['__target__'] == 0) & (df['airline'] == 'United')]

# Gather every tweet text into a plain Python list.
textos = X['text'].tolist()

# Turn each tweet from a single string into a list of word tokens.
data_words = [tokens for tokens in sent_to_words(textos)]

# Build the dictionary: every distinct word gets an integer id.
# It is required to build the corpus below.
id2word = corpora.Dictionary(data_words)

# Build the bag-of-words corpus, one entry per tweet.
# Each entry is a list of (word_id, word_frequency) pairs; e.g. (47, 3)
# means the word with id 47 appears 3 times in that document.
corpus = list(map(id2word.doc2bow, data_words))



In [3]:
# Show the filtered frame (rows with __target__ == 0 and airline == 'United')
# as a visual sanity check of the selection made in the previous cell.
X

Unnamed: 0.1,Unnamed: 0,tweet_id,airline_sentiment_confidence,negativereason,negativereason_confidence,airline,name,retweet_count,text,tweet_coord,tweet_created,tweet_location,user_timezone,__target__
506,506,5.703071e+17,1.0000,Cancelled Flight,0.7030,United,CoralReefer420,0.0,unit still refund word via dm pleas resolv iss...,"[64.4459613, -149.680909]",2015-02-24 11:39:45 -0800,Alaska,Alaska,0
507,507,5.703070e+17,1.0000,Late Flight,1.0000,United,lsalazarll,0.0,unit delay due lack crew delay there long line...,"[39.7392364, -104.984862]",2015-02-24 11:39:25 -0800,Denver,Mountain Time (US & Canada),0
509,509,5.703062e+17,0.3475,Can't Tell,0.3475,United,samidip,0.0,unit eriord express connect huge popular could...,"[40.7587979, -73.9623427]",2015-02-24 11:36:12 -0800,"New York City, New York",Eastern Time (US & Canada),0
511,511,5.703049e+17,0.6667,Can't Tell,0.3333,United,andycheco,0.0,unit think board flight au1066 earli think,"[19.43706642, -99.07927123]",2015-02-24 11:31:01 -0800,"New York City, New York",Eastern Time (US & Canada),0
513,513,5.703020e+17,0.6735,Bad Flight,0.3476,United,slandail,0.0,unit gate agent hook altern flight way prevent...,"[40.7587979, -73.9623427]",2015-02-24 11:19:32 -0800,"New York City, New York",Eastern Time (US & Canada),0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4321,4321,5.676140e+17,1.0000,Customer Service Issue,0.3545,United,JustOGG,0.0,unit link current statu flightsairport fli bwi...,"[40.7587979, -73.9623427]",2015-02-17 01:18:29 -0800,"New York City, New York",Eastern Time (US & Canada),0
4322,4322,5.675957e+17,1.0000,Late Flight,1.0000,United,CRomerDome,0.0,unit like delay less im one plane connect voucher,"[37.7790262, -122.419906]",2015-02-17 00:05:27 -0800,"San Francisco, California",Pacific Time (US & Canada),0
4323,4323,5.675946e+17,1.0000,Bad Flight,0.6707,United,brenduch,0.0,unit dont hope nicer flight time tri thing rig...,"[-34.6075682, -58.4370894]",2015-02-17 00:01:07 -0800,Buenos Aires,Buenos Aires,0
4324,4324,5.675924e+17,1.0000,Late Flight,1.0000,United,brenduch,0.0,unit got gate iah time given seat close flight...,"[-34.6075682, -58.4370894]",2015-02-16 23:52:20 -0800,Buenos Aires,Buenos Aires,0


In [4]:
# Fit an LDA topic model on the bag-of-words corpus.
lda_model = LdaMulticore(
    # Data
    corpus=corpus,
    id2word=id2word,
    # Model size and training schedule
    num_topics=5,
    passes=20,
    iterations=100,
    chunksize=100,
    eval_every=None,
    per_word_topics=True,
    # Fixed seed so repeated runs start from the same random state
    random_state=100,
    # NOTE(review): worker count is hard-coded; confirm the machine running
    # the notebook actually has this many cores available.
    workers=8,
)

# Score the fitted model with gensim's 'c_v' coherence measure, computed from
# the tokenised texts and the dictionary. The aim is topics whose texts are
# similar to each other and dissimilar to the texts of other topics.
# NOTE(review): coherence_lda is stored but never displayed in this cell.
coherence_model_lda = CoherenceModel(
    model=lda_model,
    texts=data_words,
    dictionary=id2word,
    coherence='c_v',
)
coherence_lda = coherence_model_lda.get_coherence()

# Print the topics: for each one, its id followed by its top words, each
# prefixed with the weight of that word within the topic.
print(lda_model.print_topics())


[(0, '0.083*"unit" + 0.072*"flight" + 0.031*"delay" + 0.018*"hour" + 0.017*"cancel" + 0.015*"get" + 0.012*"miss" + 0.012*"plane" + 0.010*"connect" + 0.010*"wait"'), (1, '0.084*"unit" + 0.022*"flight" + 0.016*"bag" + 0.011*"check" + 0.008*"seat" + 0.008*"im" + 0.008*"ua" + 0.008*"go" + 0.007*"fli" + 0.007*"problem"'), (2, '0.086*"unit" + 0.016*"flight" + 0.013*"servic" + 0.012*"custom" + 0.012*"gate" + 0.010*"hour" + 0.009*"ticket" + 0.008*"bag" + 0.008*"plane" + 0.008*"get"'), (3, '0.084*"unit" + 0.012*"time" + 0.012*"flight" + 0.009*"call" + 0.009*"amp" + 0.009*"get" + 0.008*"help" + 0.007*"book" + 0.007*"email" + 0.007*"refund"'), (4, '0.096*"unit" + 0.023*"custom" + 0.022*"servic" + 0.013*"airlin" + 0.011*"bag" + 0.010*"worst" + 0.009*"ever" + 0.008*"get" + 0.007*"im" + 0.007*"time"')]


In [5]:
# Make pyLDAvis render its output inline in the notebook.
pyLDAvis.enable_notebook()

# Prepare the interactive topic visualisation from the fitted model,
# the bag-of-words corpus and the dictionary.
vis = pyLDAvis.gensim.prepare(
    lda_model,
    corpus,
    id2word,
    mds="mmds",  # scaling method used to lay the topics out (see pyLDAvis docs)
    R=30,        # presumably the number of terms listed per topic; confirm in pyLDAvis docs
)

# Last expression of the cell: displays the visualisation.
vis