In [2]:
from joblib import Parallel, delayed

import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel

import matplotlib.pyplot as plt
import numpy as np
from gensim.models.ldamulticore import LdaMulticore
import pandas as pd 
import pyLDAvis
import pyLDAvis.gensim


# Tokenize each text into a list of words
def sent_to_words(sentences):
    """Lazily turn every item of `sentences` into a list of word tokens.

    Each item is coerced to `str` and handed to
    `gensim.utils.simple_preprocess` with `deacc=True`; per the gensim
    documentation that flag strips accent marks from the tokens.

    Args:
        sentences: iterable of texts (non-string items are stringified).

    Yields:
        The token list returned by `simple_preprocess` for each item, in
        input order. Nothing is processed until the generator is consumed.
    """
    for raw_text in sentences:
        as_text = str(raw_text)
        tokens = gensim.utils.simple_preprocess(as_text, deacc=True)
        yield tokens



In [42]:
# Load the preprocessed tweets from a hard-coded CSV path
df = pd.read_csv("datosProcesados.csv")

# Keep only the rows labelled 0 in '__target__' (negative opinions, per the original author's note)
# that belong to US Airways; positive and neutral tweets can be tried later
X = df[(df['__target__'] == 0) & (df['airline'] == 'US Airways')]

# Gather every tweet text into a plain Python list
textos = X['text'].tolist()

# Turn each tweet from a single string into a list of word tokens
data_words = [tokens for tokens in sent_to_words(textos)]

# Build the dictionary that assigns an integer id to every unique word; it is needed to create the corpus
id2word = corpora.Dictionary(data_words)

# Build the bag-of-words corpus: one list of (word_id, word_frequency) pairs per document.
# For example, (47, 3) means the word with id 47 appears 3 times in that document.
corpus = list(map(id2word.doc2bow, data_words))



In [43]:
# Display the filtered DataFrame so the row selection can be checked by eye
X

Unnamed: 0.1,Unnamed: 0,tweet_id,airline_sentiment_confidence,negativereason,negativereason_confidence,airline,name,retweet_count,text,tweet_coord,tweet_created,tweet_location,user_timezone,__target__
8966,8966,5.703106e+17,0.6292,Flight Booking Problems,0.3146,US Airways,jhazelnut,0.0,usairway better time call flight friday need c...,"[46.3144754, 11.0480288]",2015-02-24 11:53:37 -0800,,,0
8967,8967,5.703101e+17,1.0000,Customer Service Issue,1.0000,US Airways,GAKotsch,0.0,usairway one agent avail speak,"[45.1960403, -63.1653789]",2015-02-24 11:51:48 -0800,"Nova Scotia, Canada",Atlantic Time (Canada),0
8970,8970,5.703088e+17,1.0000,Customer Service Issue,0.6452,US Airways,retardedlarry,0.0,usairway hung anoth wast hour time suppos book...,"[46.3144754, 11.0480288]",2015-02-24 11:46:28 -0800,,,0
8972,8972,5.703076e+17,1.0000,Customer Service Issue,1.0000,US Airways,Matt_Bernanke,0.0,usairway your kill insid,"[-0.2201641, -78.5123274]",2015-02-24 11:41:43 -0800,Quito,Quito,0
8973,8973,5.703071e+17,0.7020,Flight Attendant Complaints,0.7020,US Airways,jeremyleewhite,0.0,usairway new americanair like new spiritairlin...,"[35.21979387, -80.94498281]",2015-02-24 11:39:45 -0800,,,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11873,11873,5.677126e+17,0.6716,Late Flight,0.6716,US Airways,sankeshw,0.0,usairway 2pm flight fll phl connect man howev ...,"[33.5945144, -7.6200284]",2015-02-17 07:50:06 -0800,Casablanca,Casablanca,0
11874,11874,5.677102e+17,1.0000,Customer Service Issue,0.6838,US Airways,CharNewsJunkie,0.0,usairway hold gold reserv line 3 hour flight c...,"[40.7587979, -73.9623427]",2015-02-17 07:40:44 -0800,"New York City, New York",Eastern Time (US & Canada),0
11875,11875,5.676980e+17,1.0000,Customer Service Issue,0.6963,US Airways,MarkKersten,0.0,usairway choic pay anoth 50 go airport hope tr...,"[-0.2201641, -78.5123274]",2015-02-17 06:52:12 -0800,Quito,Quito,0
11876,11876,5.676795e+17,1.0000,Customer Service Issue,0.7188,US Airways,DonnyYardas,0.0,usairway reserv hold 2 hour hang upsmh,"[40.7587979, -73.9623427]",2015-02-17 05:38:31 -0800,"New York City, New York",Eastern Time (US & Canada),0


In [36]:
# Train a 5-topic LDA model on the bag-of-words corpus with 8 worker processes.
# random_state pins the seed; eval_every=None turns off the periodic perplexity
# estimate during training (per the gensim documentation).
lda_model = LdaMulticore(
    corpus=corpus,
    id2word=id2word,
    num_topics=5,
    workers=8,
    passes=20,
    iterations=100,
    chunksize=100,
    random_state=100,
    eval_every=None,
    per_word_topics=True,
)

# Score the model with the 'c_v' coherence measure, computed from the tokenized
# texts and the dictionary. The aim is topics whose top words belong together;
# a higher score is generally read as more interpretable topics.
coherence_model_lda = CoherenceModel(
    coherence='c_v',
    model=lda_model,
    texts=data_words,
    dictionary=id2word,
)
coherence_lda = coherence_model_lda.get_coherence()


# Print the topics: for each one, its id followed by its most significant words
# and the weight of each word within that topic
print(lda_model.print_topics())


[(0, '0.038*"jetblu" + 0.031*"unit" + 0.022*"servic" + 0.020*"custom" + 0.017*"bag" + 0.014*"wait" + 0.013*"plane" + 0.012*"gate" + 0.009*"southwestair" + 0.009*"delay"'), (1, '0.056*"usairway" + 0.018*"unit" + 0.016*"flight" + 0.012*"americanair" + 0.012*"seat" + 0.009*"jetblu" + 0.009*"day" + 0.007*"help" + 0.007*"book" + 0.007*"ticket"'), (2, '0.130*"flight" + 0.043*"cancel" + 0.033*"delay" + 0.028*"unit" + 0.021*"flightl" + 0.020*"hour" + 0.017*"late" + 0.016*"miss" + 0.013*"connect" + 0.011*"hr"'), (3, '0.043*"southwestair" + 0.030*"unit" + 0.022*"fli" + 0.019*"airlin" + 0.019*"never" + 0.016*"custom" + 0.015*"worst" + 0.013*"ever" + 0.013*"get" + 0.013*"servic"'), (4, '0.133*"usairway" + 0.032*"hold" + 0.022*"hour" + 0.021*"get" + 0.020*"call" + 0.017*"phone" + 0.015*"flight" + 0.015*"help" + 0.014*"southwestair" + 0.013*"tri"')]


In [39]:
# Make pyLDAvis render its output inline in the notebook
pyLDAvis.enable_notebook()

# Prepare the interactive topic visualisation from the trained model, the corpus and the dictionary.
# Per the pyLDAvis documentation, R is the number of terms shown per topic and
# mds="mmds" selects metric multidimensional scaling for the topic layout.
vis = pyLDAvis.gensim.prepare(
    lda_model,
    corpus,
    id2word,
    R=30,
    mds="mmds",
)

# Leaving `vis` as the last expression makes the notebook display it
vis