# NLP Exploration Notebook
This notebook was used to set up the NLP workstream. It includes a working implementation of LDA and an incomplete implementation of Top2Vec. <br/>The NLP exploration was not continued because GDELT offered a better solution.
- Latent Dirichlet allocation
- Text Classification into fixed categories
- Embed text and build clusters from the embedding space

## Config Setup

In [None]:
from sqlalchemy import create_engine, Column, Integer, String, ForeignKey, select
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy.orm import sessionmaker, relationship

In [None]:
# Define table schema
# Declarative base class that the ORM model below derives from; its metadata
# is what the session-setup cell uses to create the table.
Base = declarative_base()

class Experiment(Base):
    """ORM model for one row of the ``lda_experiment`` table.

    A row is written after each LDA training run and records the settings
    that were used. Several Python attribute names differ from the database
    column names they map to: ``model`` -> ``model_path``,
    ``dataset`` -> ``dataset_path`` and ``epochs`` -> ``passes``.
    """
    __tablename__ = 'lda_experiment'
    # Integer primary key.
    id = Column('id', Integer, primary_key=True)
    # Identifier of the saved model (column ``model_path``, max. 100 chars).
    model = Column('model_path', String(100))
    # Identifier of the training dataset (column ``dataset_path``, max. 100 chars).
    dataset = Column('dataset_path', String(100))
    # Number of topics the LDA model was trained with.
    num_topics = Column('num_topics', Integer)
    # Number of training passes; stored in the column named ``passes``.
    epochs = Column('passes', Integer)


In [None]:
# Create session
# SQLite database file holding the experiment configurations; the path is
# relative to the directory the notebook is executed from.
engine = create_engine("sqlite:///../config/experiment_config.db")

# Emit CREATE TABLE for every model registered on Base.
Base.metadata.create_all(bind=engine)
# Session factory bound to the engine, and the single session object that the
# following cells use for queries and inserts.
Session = sessionmaker(bind=engine)
session = Session()

In [None]:
# List the stored model identifier of every recorded experiment
q1 = select(Experiment)
q1_result = session.execute(q1)
for experiment in q1_result.scalars():
    print(experiment.model)

## [Latent Dirichlet Allocation](https://towardsdatascience.com/nlp-extracting-the-main-topics-from-your-dataset-using-lda-in-minutes-21486f5aa925)


### Load Data

In [None]:
import pandas as pd
import gensim

In [None]:
# Load processed data from csv
import ast

dataset_name = 'cnbc_news_dataset_processed'
dataset_path = f'./../../data_engineering/nlp_data/{dataset_name}.csv'
df = pd.read_csv(dataset_path)

# Convert into list of lists.
# Each cell of `short_description_lemmatized` is expected to hold the string
# representation of a token list, so it has to be parsed back into a list.
# ast.literal_eval only accepts Python literals, so (unlike eval) it cannot
# execute code embedded in the data file.
# Cells that are not strings (e.g. empty cells read as NaN) are skipped.
processed_docs = [
    ast.literal_eval(raw_tokens)
    for raw_tokens in df['short_description_lemmatized']
    if isinstance(raw_tokens, str)
]
print("Sample datapoint:", processed_docs[0])

#### Data Preparation

In [None]:
# Build a gensim Dictionary (integer id <-> token mapping) from the tokenised
# documents in 'processed_docs' and show its first five (id, token) entries
dictionary = gensim.corpora.Dictionary(processed_docs)
dictionary_sample = list(dictionary.iteritems())[:5]
print("Dictionary sample: ", dictionary_sample)

In [None]:
# Bag-of-words representation: every document becomes a list of
# (token id, count) pairs based on the dictionary built above
bow_corpus = list(map(dictionary.doc2bow, processed_docs))

#### Model Training

In [None]:
# LDA training hyperparameters. Both values are passed to the model
# constructor and are also embedded in the saved model's file name.
num_topics = 10  # number of topics the model is asked to extract
passes = 5  # value handed to the trainer's `passes` argument

In [None]:
# Train the LDA topic model on the bag-of-words corpus with 8 workers
lda_model = gensim.models.LdaMulticore(
    corpus=bow_corpus,
    id2word=dictionary,
    num_topics=num_topics,
    passes=passes,
    workers=8,
)

In [None]:
# Save model
from pathlib import Path

model_name = f'lda_model_topics_{num_topics}_passes_{passes}'
model_path = f'./../models/LDA/{model_name}'
# Saving fails when the target folder is missing, so create it first.
Path(model_path).parent.mkdir(parents=True, exist_ok=True)
lda_model.save(model_path)

# Save configuration
# NOTE(review): `model` / `dataset` map to the columns `model_path` /
# `dataset_path`, but the bare names (not the paths) are stored here --
# confirm this is intended.
experiment_config = Experiment(
    model=model_name,
    dataset=dataset_name,
    num_topics=num_topics,
    epochs=passes,
)
session.add(experiment_config)
try:
    session.commit()
except Exception:
    # Roll back so the shared session stays usable in later cells, then
    # surface the original error.
    session.rollback()
    raise

### Inference

In [None]:
# Print the word/weight description of every topic of the trained model
for topic_id, topic_terms in lda_model.print_topics(num_topics=-1):
    print(f"Topic: {topic_id} \nWords: {topic_terms}")
    print("\n")

In [None]:
def make_inference_on_doc(doc):
    """Return the LDA topic assignment for one pre-processed document.

    Args:
        doc: One cell of the ``short_description_lemmatized`` column, expected
            to be the string representation of a list of tokens. Any
            non-string value (e.g. NaN for an empty cell) is treated as
            "no document".

    Returns:
        The result of ``lda_model[bow_vector]`` for the document, or ``None``
        when ``doc`` is not a string.

    Raises:
        ValueError, SyntaxError: If ``doc`` is a string that is not a valid
            Python literal.
    """
    import ast  # local import keeps this cell self-contained

    if not isinstance(doc, str):
        return None
    # literal_eval only parses Python literals, so (unlike eval) it cannot
    # execute code that might be embedded in the data.
    tokens = ast.literal_eval(doc)
    # Data preprocessing step for the unseen document
    bow_vector = dictionary.doc2bow(tokens)
    return lda_model[bow_vector]

In [None]:
# Run topic inference on every lemmatized description and keep the result
# next to the source row
lemmatized_descriptions = df['short_description_lemmatized']
df['topic_class'] = lemmatized_descriptions.apply(make_inference_on_doc)
df.head(3)

In [None]:
# Export publication timestamp and inferred topics to a CSV file whose name
# encodes the model and the dataset
output_columns = ['published_at', 'topic_class']
output_path = f"./output_data/data_{model_name}_{dataset_name}.csv"
df_out = df[output_columns]
df_out.to_csv(output_path)
df_out.head()

## [Top2Vec](https://top2vec.readthedocs.io/en/stable/Top2Vec.html#how-does-it-work)

The Top2Vec setup was not completed because, after GDELT was integrated, the focus of the modelling workstream shifted to forecasting. See the documentation for more details.

In [None]:
from top2vec import Top2Vec
import pandas as pd

In [None]:
# Read the processed dataset from csv and collect the article titles as the
# documents for Top2Vec
dataset_name = 'cnbc_news_dataset_processed'
dataset_path = f'../../data_engineering/nlp_data/{dataset_name}.csv'
df = pd.read_csv(dataset_path)
title_column = df['title']
documents = list(title_column)

In [None]:
# Build the Top2Vec model on the titles, embedding them with the pretrained
# 'universal-sentence-encoder' model
model = Top2Vec(
    documents=documents,
    embedding_model='universal-sentence-encoder',
)