In [1]:
import pandas as pd
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.stem import PorterStemmer

import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import LdaModel
from gensim.models import CoherenceModel

# Make sure the NLTK resources used by this notebook are available locally.
for nltk_package in ('punkt', 'stopwords', 'wordnet', 'omw-1.4'):
    nltk.download(nltk_package)

# English stop words, held in a set so the membership tests done while
# filtering tokens are cheap.
stop_words = set(stopwords.words("english"))

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\asd\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\asd\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\asd\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\asd\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [2]:
# Load the data.
# NOTE(review): the original comment said only the first 100 descriptions are
# taken as a trial run, but no row limit is applied here -- the whole CSV is
# loaded. Confirm which behaviour is intended.
data = pd.read_csv('justice.csv')

# x_train is the text column that gets tokenized further down; y_train1 and
# y_train2 are not used in the cells visible here (presumably labels -- verify).
x_train, y_train1, y_train2 = (
    data['facts'],
    data['first_party_winner'],
    data['issue_area'],
)

In [3]:
# Tokenize: strip punctuation and lowercase everything
def sent_to_words(sentences):
    """Lazily turn each entry of ``sentences`` into a list of tokens.

    Every entry is converted with ``str()`` and passed to gensim's
    ``simple_preprocess``, which lowercases and tokenizes the text (dropping
    punctuation in the process). ``deacc=True`` additionally strips accent
    marks from the tokens.

    Yields:
        list[str]: the tokens of one entry, in input order.
    """
    tokenize = gensim.utils.simple_preprocess
    yield from (tokenize(str(raw_text), deacc=True) for raw_text in sentences)

# Materialise the generator: one token list per document.
data_words = list(sent_to_words(x_train))

In [4]:
# Remove the stop words
def remove_stopwords(texts, stopword_set=None):
    """Drop stop words from every document in ``texts``.

    Documents that are already token sequences (``list`` or ``tuple``) are
    filtered directly. Previously each token list was converted to its Python
    repr with ``str()`` and tokenized a second time, which was redundant work
    and only produced the right tokens by accident. Any other document (for
    example a raw string) is still tokenized with ``simple_preprocess`` first.

    Args:
        texts: iterable of documents; each one is a sequence of tokens or a
            raw text value.
        stopword_set: container of words to discard. Defaults to the
            notebook-level ``stop_words`` set.

    Returns:
        list[list[str]]: one filtered token list per input document, in order.
    """
    if stopword_set is None:
        # Looked up at call time so the notebook's current set is used.
        stopword_set = stop_words

    filtered_docs = []
    for doc in texts:
        if isinstance(doc, (list, tuple)):
            tokens = doc  # already tokenized upstream
        else:
            tokens = simple_preprocess(str(doc))
        filtered_docs.append([word for word in tokens if word not in stopword_set])
    return filtered_docs

# Filter the stop words out of every tokenized document.
data_words_nostops = remove_stopwords(data_words)

In [5]:
# NLTK helpers, created once and shared by the token-normalisation step
# (stemming followed by lemmatization) later in the notebook.
lem = WordNetLemmatizer()
ps = PorterStemmer()

In [6]:
# Normalise tokens (stem, then lemmatize)
def lemmatization(texts):
    """Stem and then lemmatize every token of every document.

    Each token goes through the Porter stemmer (``ps``) first and the result
    is handed to the WordNet lemmatizer (``lem``).

    NOTE(review): the lemmatizer receives stems rather than dictionary words,
    so it likely changes little -- confirm this order is intended.

    Args:
        texts: iterable of documents, each an iterable of string tokens.

    Returns:
        list[list[str]]: the normalised tokens, same document and token order.
    """
    return [
        [lem.lemmatize(ps.stem(token)) for token in doc]
        for doc in texts
    ]

data_lemmatized = lemmatization(data_words_nostops)

In [10]:
# Vectorize
# Topic modeling

# Model settings, named so they are easy to find and tune.
NUM_TOPICS = 5
RANDOM_STATE = 100

# Map every distinct token to an integer id, then encode each document as a
# bag of words: a list of (token_id, count) pairs.
id2word = corpora.Dictionary(data_lemmatized)
texts = data_lemmatized
corpus = [id2word.doc2bow(text) for text in texts]

# Earlier, heavier configuration kept for reference:
#lda = LdaModel(corpus=corpus, id2word=id2word, num_topics=20, random_state=100, update_every=1, chunksize=100, passes=20, alpha='auto',per_word_topics=True)
#print(f"TOPICOS --> {lda.print_topics()}")

# Single training pass; eval_every=None turns off the periodic perplexity
# evaluation during training.
lda_model = LdaModel(corpus=corpus, id2word=id2word, num_topics=NUM_TOPICS, random_state=RANDOM_STATE, passes=1, eval_every=None)

# enumerate() supplies the running index, replacing the hand-maintained
# counter that had to be reset between the two loops.
for topic_idx, topic in enumerate(lda_model.print_topics()):
    print("Topic:", topic_idx, topic)

# Bag-of-words encoding of the first document, as a sanity check.
print(corpus[0])

# Topic distribution inferred for every document in the corpus.
for doc_idx, doc_topics in enumerate(lda_model[corpus]):
    print("doc:", doc_idx, doc_topics)

Topic: 0 (0, '0.033*"court" + 0.015*"district" + 0.013*"appeal" + 0.009*"state" + 0.007*"circuit" + 0.007*"violat" + 0.007*"act" + 0.006*"file" + 0.006*"held" + 0.006*"affirm"')
Topic: 1 (1, '0.032*"court" + 0.012*"appeal" + 0.009*"state" + 0.009*"district" + 0.008*"sentenc" + 0.008*"claim" + 0.007*"circuit" + 0.006*"convict" + 0.006*"juri" + 0.006*"em"')
Topic: 2 (2, '0.043*"court" + 0.014*"appeal" + 0.014*"district" + 0.013*"state" + 0.009*"convict" + 0.008*"feder" + 0.008*"trial" + 0.007*"circuit" + 0.007*"claim" + 0.006*"file"')
Topic: 3 (3, '0.034*"court" + 0.017*"state" + 0.015*"district" + 0.012*"appeal" + 0.010*"claim" + 0.009*"act" + 0.009*"feder" + 0.009*"circuit" + 0.007*"law" + 0.005*"violat"')
Topic: 4 (4, '0.021*"court" + 0.011*"state" + 0.010*"appeal" + 0.009*"em" + 0.006*"district" + 0.006*"act" + 0.006*"circuit" + 0.005*"right" + 0.005*"feder" + 0.005*"violat"')
[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1), (6, 1), (7, 1), (8, 1), (9, 1), (10, 1), (11, 1), (12, 1), 