In [2]:
import pandas as pd
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.stem import PorterStemmer

import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import LdaModel
from gensim.models import CoherenceModel

nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')
# load all metadata

stop_words = set(stopwords.words("english"))

[nltk_data] Downloading package punkt to /home/marcos/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /home/marcos/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /home/marcos/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /home/marcos/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [18]:
#Cargamos los datos
#Cogemos las 100 primeras descripciones como prueba
data = pd.read_csv('justice.csv')
x_train = data['facts']
y_train1 = data['first_party_winner']
y_train2 = data['issue_area']

In [4]:
#Elimina signos de puntuacion y todo a minusculas
def sent_to_words(sentences):
    for sentence in sentences:
        yield (gensim.utils.simple_preprocess(str(sentence), deacc=True))  # deacc=True removes punctuations

data_words = list(sent_to_words(x_train))

In [6]:
#Eliminamos las stop words
def remove_stopwords(texts):
    return [[word for word in simple_preprocess(str(doc)) if word not in stop_words] for doc in texts]

data_words_nostops = remove_stopwords(data_words)

In [9]:
lem = WordNetLemmatizer()
ps = PorterStemmer()

In [10]:
#Lematizamos
def lemmatization(texts):
    texts_out = []
    for text in texts:
        lemText = []
        for sent in text:
            stemmed = ps.stem(sent)
            lemText.append(lem.lemmatize(stemmed))
        texts_out.append(lemText)
    return texts_out

data_lemmatized = lemmatization(data_words_nostops)

In [None]:
#Vectorizar
#Topic modeling
id2word = corpora.Dictionary(data_lemmatized)
texts = data_lemmatized
corpus = [id2word.doc2bow(text) for text in texts]
lda = LdaModel(corpus=corpus, id2word=id2word, num_topics=20, random_state=100, update_every=1, chunksize=100, passes=20, alpha='auto',per_word_topics=True)
print(f"TOPICOS --> {lda.print_topics()}")

In [15]:
vectorized = []
for doc in data_lemmatized:
    bow = corpus[data_lemmatized.index(doc)]
    # Indices de los topicos mas significativos
    indices = [topico[0] for topico in lda[bow]]
    vectorized.append(indices)

print(vectorized)

[[(2, 0.07638781), (0, [11]), (0, [(11, 0.99989355)])], [(2, 0.023068186), (3, [18, 9, 2]), (3, [(2, 0.024859862), (9, 0.0636279), (18, 0.91149706)])], [(2, 0.1645913), (7, [9, 15]), (7, [(9, 2.2861538), (15, 0.7101885)])], [(1, 0.07076745), (5, [18, 9, 8, 15]), (5, [(8, 0.15804464), (9, 0.2545751), (15, 0.013560573), (18, 0.5737663)])], [(2, 0.12657705), (79, [18, 9, 2]), (79, [(2, 0.21750887), (9, 0.22885647), (18, 0.5536223)])], [(2, 0.03110703), (3, [18, 9, 2]), (3, [(2, 0.13067581), (9, 0.1434861), (18, 0.72579)])], [(1, 0.016331885), (3, [18, 2, 9]), (3, [(2, 0.3095133), (9, 0.18531908), (18, 1.5051332)])], [(1, 0.06900072), (4, [15]), (4, [(15, 0.9999841)])], [(2, 0.1351492), (10, [11]), (10, [(11, 0.99939615)])], [(2, 0.08031552), (3, [18, 9, 2]), (3, [(2, 0.17026418), (9, 0.22816212), (18, 0.60154945)])], [(1, 0.33215752), (34, [8]), (34, [(8, 0.9998766)])], [(2, 0.035034344), (5, [18, 8, 9, 15]), (5, [(8, 0.28589424), (9, 0.20764573), (15, 0.010367778), (18, 0.4960256)])], [(