In [15]:
import pandas as pd
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.stem import PorterStemmer

import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import LdaModel
from gensim.models import CoherenceModel

# Fetch the NLTK resource packages this notebook relies on. nltk.download
# logs a short status for each one (including "already up-to-date").
for resource in ('punkt', 'stopwords', 'wordnet', 'omw-1.4'):
    nltk.download(resource)

# English stop words, stored as a set so membership checks are cheap.
stop_words = set(stopwords.words("english"))

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\gorka\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\gorka\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\gorka\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\gorka\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\gorka\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\gorka\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Us

In [16]:
# Load the dataset.
# NOTE: the whole CSV is read here; no 100-row subset is taken in this cell.
data = pd.read_csv('justice.csv')

# 'facts' is the text that gets tokenized further down; the other two columns
# are kept as y_train1 / y_train2 (presumably the target labels - confirm).
x_train, y_train1, y_train2 = (
    data[column] for column in ('facts', 'first_party_winner', 'issue_area')
)

In [17]:
# Tokenize every document into lowercase word tokens.
def sent_to_words(sentences):
    """Lazily yield one token list per item of ``sentences``.

    Every item is coerced to ``str`` and handed to
    ``gensim.utils.simple_preprocess`` with ``deacc=True``. Note that
    ``deacc=True`` strips accent marks from the tokens; punctuation is
    discarded by the tokenizer itself, independently of that flag.
    """
    tokenize = gensim.utils.simple_preprocess
    yield from (tokenize(str(document), deacc=True) for document in sentences)

# Materialize the lazy tokenizer output: one token list per entry of x_train.
data_words = [*sent_to_words(x_train)]

In [18]:
# Remove stop words
def remove_stopwords(texts):
    """Return ``texts`` with every token found in ``stop_words`` removed.

    Args:
        texts: iterable of documents. A document that is already a token
            sequence (``list`` or ``tuple``) is filtered as-is. Any other
            value is converted with ``str`` and tokenized with
            ``simple_preprocess`` first, as before.

    Returns:
        A list with one list of remaining tokens per input document, in the
        original order.
    """
    cleaned = []
    for doc in texts:
        if isinstance(doc, (list, tuple)):
            # Already tokenized: do not round-trip through str(list) and a
            # second tokenization pass, which is wasted work and could drop
            # or alter tokens that fail simple_preprocess's length rules.
            tokens = doc
        else:
            tokens = simple_preprocess(str(doc))
        cleaned.append([word for word in tokens if word not in stop_words])
    return cleaned

# Drop stop words from every tokenized document.
data_words_nostops = remove_stopwords(texts=data_words)

In [19]:
# Shared NLTK normalizers: `ps` stems tokens and `lem` lemmatizes them
# (both are used by lemmatization()).
ps, lem = PorterStemmer(), WordNetLemmatizer()

In [20]:
# Normalize tokens (stem, then lemmatize)
def lemmatization(texts):
    """Return a new list of token lists with every token normalized.

    Each token is first reduced with the module-level Porter stemmer ``ps``;
    the resulting stem is then passed to the module-level WordNet lemmatizer
    ``lem``. Document order and token order are preserved.

    NOTE(review): the lemmatizer is fed stems rather than the original
    words - confirm that this ordering is intended.
    """
    return [
        [lem.lemmatize(ps.stem(token)) for token in document]
        for document in texts
    ]

# Stem + lemmatize every stop-word-free document.
data_lemmatized = lemmatization(texts=data_words_nostops)

In [32]:
# Vectorize the corpus and fit the LDA topic model.
NUM_TOPICS = 20  # number of latent topics to fit

# Map every distinct token to an integer id, then encode each document as a
# bag-of-words list of (token_id, count) pairs.
id2word = corpora.Dictionary(data_lemmatized)
texts = data_lemmatized
corpus = [id2word.doc2bow(text) for text in texts]

lda_model = LdaModel(corpus=corpus, id2word=id2word, num_topics=NUM_TOPICS, random_state=100, passes=100, chunksize=100, eval_every=None)

# num_topics=-1 requests every topic in id order, so the listing stays
# complete and correctly labelled even if NUM_TOPICS is raised above
# print_topics' default limit of 20. Each entry is (topic_id, terms), and the
# label printed is the topic's own id instead of a hand-maintained counter.
for topic in lda_model.print_topics(num_topics=-1):
    print("Topic:", topic[0], topic)


Topic: 0 (0, '0.123*"properti" + 0.110*"tax" + 0.056*"bank" + 0.048*"purchas" + 0.036*"market" + 0.030*"ga" + 0.027*"credit" + 0.026*"stock" + 0.018*"revenu" + 0.018*"paid"')
Topic: 1 (1, '0.234*"sentenc" + 0.089*"death" + 0.079*"convict" + 0.072*"juri" + 0.048*"murder" + 0.045*"penalti" + 0.042*"crime" + 0.038*"guilti" + 0.036*"robberi" + 0.026*"home"')
Topic: 2 (2, '0.029*"charg" + 0.021*"drug" + 0.019*"convict" + 0.018*"violat" + 0.017*"jame" + 0.017*"evid" + 0.016*"inform" + 0.016*"offens" + 0.015*"juri" + 0.015*"product"')
Topic: 3 (3, '0.112*"board" + 0.081*"agreement" + 0.075*"union" + 0.070*"collect" + 0.044*"although" + 0.037*"good" + 0.033*"ongo" + 0.028*"recogn" + 0.024*"contract" + 0.024*"exclus"')
Topic: 4 (4, '0.123*"patent" + 0.113*"citi" + 0.090*"child" + 0.027*"social" + 0.027*"licens" + 0.022*"never" + 0.022*"parent" + 0.021*"mother" + 0.019*"commission" + 0.018*"father"')
Topic: 5 (5, '0.186*"offic" + 0.070*"polic" + 0.051*"search" + 0.044*"arrest" + 0.035*"posse" + 

In [34]:
# Build one dense topic vector per document.
# The vector length is read from the trained model, so it can no longer drift
# out of sync with the num_topics value used to train the LDA.
n_topics = lda_model.num_topics
vectorized = []
count = 0  # not used in this cell; kept so the notebook's global state is unchanged

for doc_topics in lda_model[corpus]:
    # doc_topics is a sparse list of (topic_id, probability) pairs. Topics
    # that gensim leaves out (those below its minimum_probability threshold)
    # keep the 0 placeholder.
    row = [0] * n_topics
    for topic_id, probability in doc_topics:
        row[topic_id] = probability
    vectorized.append(row)
print(vectorized)

[[0, 0, 0.10462261, 0, 0, 0, 0, 0.036971785, 0.06498543, 0.2847864, 0, 0.30699164, 0, 0, 0, 0, 0, 0, 0.18735221, 0], [0, 0, 0, 0, 0.51094043, 0, 0, 0.017925622, 0, 0.12746774, 0, 0, 0, 0.07890858, 0, 0.093628265, 0, 0, 0.16019025, 0], [0, 0, 0.11680355, 0, 0, 0.07267137, 0, 0, 0, 0.22756556, 0, 0.061788663, 0.02500145, 0, 0, 0.38427666, 0, 0, 0, 0.09641437], [0, 0.105189994, 0, 0, 0, 0, 0, 0, 0, 0.7042715, 0.036187466, 0, 0, 0, 0, 0, 0, 0, 0.12675792, 0], [0, 0, 0.19038504, 0, 0.040374473, 0.04451272, 0, 0.052262098, 0, 0, 0, 0, 0, 0.30632433, 0, 0.16538122, 0, 0, 0.17575371, 0], [0, 0, 0.030423468, 0.01905182, 0, 0, 0, 0, 0.19594093, 0.15223353, 0, 0.32036522, 0, 0, 0, 0.2073087, 0, 0, 0, 0.0594597], [0, 0.017273482, 0.116971746, 0, 0, 0.12593444, 0, 0, 0.03779687, 0.2691844, 0, 0.010753889, 0, 0.034325793, 0, 0.31710687, 0, 0, 0.057024527, 0], [0, 0.16010886, 0.05720194, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.7523167, 0, 0, 0, 0], [0, 0, 0.13122962, 0, 0.25687072, 0.16132163, 0.036211