# Modelling Exploration Notebook
- Latent dirichlet allocation
- Text Classification into fixed categories
- Embed text and build clusers form the embedding space

## Experiments

### Config Setup

In [59]:
from sqlalchemy import create_engine, Column, Integer, String, ForeignKey, select
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy.orm import sessionmaker, relationship

In [60]:
Base = declarative_base()

class Experiment(Base):
    __tablename__ = 'lda_experiment'
    id = Column('id', Integer, primary_key=True)
    model = Column('model_path', String(100))
    dataset = Column('dataset_path', String(100))
    num_topics = Column('num_topics', Integer)
    passes = Column('passes', Integer)


In [61]:
# engine = create_engine('sqlite:///:memory:', echo=True)
engine = create_engine("sqlite:///experiment_config.db") # To store database persistently

Base.metadata.create_all(bind=engine)
Session = sessionmaker(bind=engine)
session = Session()

### Query Experiments

In [62]:
q1 = select(Experiment).where(Experiment.dataset=='solar')
q1_result = session.execute(q1) 
for s in q1_result.scalars():
    print(f"{s.dataset} {s.configuration_2}")

## Load Dataset

In [63]:
import pandas as pd
import gensim

In [64]:
# Load processed data from csv
dataset_name = '../data_engineering/nlp_data/cnbc_news_dataset_processed.csv'
df = pd.read_csv(dataset_name)
# Convert into list of lists
processed_docs = []
for i in list(df.short_description_lemmatized):
    if type(i)==str:
        processed_docs.append(eval(i))
processed_docs[:1]

[['daily',
  'notebook',
  'mike',
  'santoli',
  'cnbcs',
  'senior',
  'market',
  'commentator',
  'idea',
  'trend',
  'stock',
  'market',
  'statistic']]

## [Latent Dirichlet Allocation](https://towardsdatascience.com/nlp-extracting-the-main-topics-from-your-dataset-using-lda-in-minutes-21486f5aa925)


#### Data Preperation

In [65]:
# Create a dictionary from 'processed_docs' containing the number of times a word appears in the training
dictionary = gensim.corpora.Dictionary(processed_docs)
list(dictionary.iteritems())[:5]

[(0, 'cnbcs'), (1, 'commentator'), (2, 'daily'), (3, 'idea'), (4, 'market')]

In [66]:
# Bag-of-words model for each document (dictionary per doc reporting how many words and how many times those words appear)
bow_corpus = [dictionary.doc2bow(doc) for doc in processed_docs]

#### Model Training

In [67]:
num_topics = 5
passes = 10

In [68]:
lda_model =  gensim.models.LdaMulticore(bow_corpus, 
                                   num_topics = num_topics, 
                                   id2word = dictionary,                                    
                                   passes = passes,
                                   workers = 8)



In [71]:
# Save model and configuration
model_path = f'./models/LDA/lda_model_topics_{num_topics}_passes_{passes}'
lda_model.save(model_path)
experiment_config = Experiment(model = model_path, dataset = dataset_name, num_topics=num_topics, passes=passes)
session.add(experiment_config)
session.commit()

FileNotFoundError: [Errno 2] No such file or directory: './models/LDA/lda_model_topics_5_passes_10.state'

### Inference

In [57]:
for idx, topic in lda_model.print_topics(-1):
    print("Topic: {} \nWords: {}".format(idx, topic ))
    print("\n")

Topic: 0 
Words: 0.006*"percent" + 0.006*"u" + 0.004*"billion" + 0.004*"bank" + 0.003*"said" + 0.003*"week" + 0.003*"market" + 0.003*"12" + 0.003*"change" + 0.003*"year"


Topic: 1 
Words: 0.010*"said" + 0.010*"percent" + 0.006*"year" + 0.006*"rate" + 0.005*"u" + 0.005*"market" + 0.005*"bank" + 0.004*"month" + 0.004*"new" + 0.004*"would"


Topic: 2 
Words: 0.011*"company" + 0.010*"said" + 0.005*"people" + 0.004*"u" + 0.004*"new" + 0.004*"year" + 0.003*"business" + 0.003*"million" + 0.003*"technology" + 0.003*"sale"


Topic: 3 
Words: 0.009*"market" + 0.008*"long" + 0.008*"said" + 0.007*"stock" + 0.005*"u" + 0.005*"year" + 0.004*"state" + 0.004*"money" + 0.004*"investor" + 0.004*"new"


Topic: 4 
Words: 0.009*"percent" + 0.008*"said" + 0.008*"stock" + 0.007*"market" + 0.006*"oil" + 0.005*"price" + 0.005*"year" + 0.005*"company" + 0.004*"u" + 0.004*"cramer"


Topic: 5 
Words: 0.009*"said" + 0.005*"trump" + 0.005*"one" + 0.005*"would" + 0.004*"cnbc" + 0.004*"year" + 0.004*"think" + 0.004*

In [58]:
# Data preprocessing step for the unseen document
bow_vector = dictionary.doc2bow(preprocess(unseen_document))
# Make inference on unseen document
for index, score in sorted(lda_model[bow_vector], key=lambda tup: -1*tup[1]):
    print("Score: {}\t Topic: {}".format(score, lda_model.print_topic(index, 5)))

NameError: name 'preprocess' is not defined