# NLP Exploration Notebook
- Latent dirichlet allocation
- Text Classification into fixed categories
- Embed text and build clusters from the embedding space

## Experiments

### Config Setup

In [8]:
import ast
from pathlib import Path

from sqlalchemy import create_engine, Column, Integer, String, ForeignKey, select
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy.orm import sessionmaker, relationship

In [9]:
# Declarative base class that the ORM model below inherits from.
Base = declarative_base()

class Experiment(Base):
    """ORM mapping for the 'lda_experiment' table: one row per recorded LDA model configuration."""
    __tablename__ = 'lda_experiment'
    # Integer primary key of the experiment row.
    id = Column('id', Integer, primary_key=True)
    # Stored in the 'model_path' column. NOTE(review): the save cell in this notebook
    # writes the model *name* here, not a filesystem path.
    model = Column('model_path', String(100))
    # Stored in the 'dataset_path' column. NOTE(review): the save cell writes the
    # dataset *name* here, not a filesystem path.
    dataset = Column('dataset_path', String(100))
    # Number of topics the model was configured with.
    num_topics = Column('num_topics', Integer)
    # Exposed as `epochs` on the Python side, persisted in the 'passes' column.
    epochs = Column('passes', Integer)


In [10]:
# Experiment configurations are kept in a SQLite file addressed relative to the
# working directory. For a throw-away in-memory database use instead:
# engine = create_engine('sqlite:///:memory:', echo=True)
#
# SQLite creates the database file on demand but not its parent directory,
# so make sure `config/` exists before the first connection is opened.
Path("config").mkdir(parents=True, exist_ok=True)
engine = create_engine("sqlite:///config/experiment_config.db")

# Create the tables declared on `Base`, then open the session shared by later cells.
Base.metadata.create_all(bind=engine)
Session = sessionmaker(bind=engine)
session = Session()

### Query Experiments

In [4]:
# Print the stored model name of every recorded experiment
q1 = select(Experiment)
q1_result = session.execute(q1)
for experiment in q1_result.scalars():
    print(experiment.model)

lda_model_topics_5_passes_10
lda_model_topics_10_passes_10
lda_model_topics_50_passes_10
lda_model_topics_100_passes_10
lda_model_topics_500_passes_10
lda_model_topics_1000_passes_10
lda_model_topics_10_passes_20
lda_model_topics_50_passes_20
lda_model_topics_100_passes_20
lda_model_topics_500_passes_20
lda_model_topics_1000_passes_20
lda_model_topics_10_passes_50
lda_model_topics_50_passes_50
lda_model_topics_100_passes_50
lda_model_topics_500_passes_50
lda_model_topics_1000_passes_50


## [Latent Dirichlet Allocation](https://towardsdatascience.com/nlp-extracting-the-main-topics-from-your-dataset-using-lda-in-minutes-21486f5aa925)


### Load Data

In [5]:
import pandas as pd
import gensim



In [19]:
# Load processed data from csv
dataset_name = 'cnbc_news_dataset_processed'
dataset_path = f'../data_engineering/nlp_data/{dataset_name}.csv'
df = pd.read_csv(dataset_path)

# Convert into list of lists.
# Each usable cell holds the string form of a token list (e.g. "['daily', 'notebook']").
# ast.literal_eval parses Python literals only, so a crafted CSV cell cannot execute
# code the way eval() would. Non-string cells (missing descriptions) are skipped.
processed_docs = [
    ast.literal_eval(tokens)
    for tokens in df.short_description_lemmatized
    if isinstance(tokens, str)
]
processed_docs[:1]

[['daily',
  'notebook',
  'mike',
  'santoli',
  'cnbcs',
  'senior',
  'market',
  'commentator',
  'idea',
  'trend',
  'stock',
  'market',
  'statistic']]

#### Data Preparation

In [20]:
# Build the gensim vocabulary from 'processed_docs': each distinct token gets an integer id
dictionary = gensim.corpora.Dictionary(documents=processed_docs)
# Peek at the first five (id, token) pairs
list(dictionary.iteritems())[:5]

[(0, 'cnbcs'), (1, 'commentator'), (2, 'daily'), (3, 'idea'), (4, 'market')]

In [21]:
# Encode every document as a bag of words: one doc2bow() result per entry of 'processed_docs'
bow_corpus = list(map(dictionary.doc2bow, processed_docs))

#### Model Training

In [23]:
# LDA hyper-parameters; both values also end up in the saved model's name
passes = 50        # passed to the trainer as `passes`
num_topics = 1000  # passed to the trainer as `num_topics`

In [24]:
# Train the LDA topic model on the bag-of-words corpus.
# random_state seeds the model's random initialisation so that re-running the
# notebook starts training from the same state instead of a fresh random one.
# NOTE(review): with several worker processes the result may still not be
# bit-identical between runs; confirm if exact reproducibility matters.
lda_model = gensim.models.LdaMulticore(
    bow_corpus,
    num_topics=num_topics,
    id2word=dictionary,
    passes=passes,
    workers=8,
    random_state=42,
)



In [25]:
# Save model and configuration
model_name = f'lda_model_topics_{num_topics}_passes_{passes}'
model_path = f'./models/LDA/{model_name}'
# Make sure the target directory exists so a long training run is not lost
# to a missing folder at save time.
Path(model_path).parent.mkdir(parents=True, exist_ok=True)
lda_model.save(model_path)

# Record the run in the experiment database. Note that the model and dataset
# *names* (not full paths) are what gets stored.
experiment_config = Experiment(
    model=model_name,
    dataset=dataset_name,
    num_topics=num_topics,
    epochs=passes,
)
session.add(experiment_config)
try:
    session.commit()
except Exception:
    # Roll back so the shared session stays usable by later cells, then
    # surface the original error.
    session.rollback()
    raise

### Inference

In [26]:
# for idx, topic in lda_model.print_topics(-1):
#     print("Topic: {} \nWords: {}".format(idx, topic ))
#     print("\n")

In [27]:
def make_inference_on_doc(doc):
    """Return the LDA topic distribution for one pre-processed document.

    Parameters
    ----------
    doc : str or any
        String form of a token list as stored in the processed CSV
        (e.g. "['daily', 'notebook']"). Any non-string value, such as the
        NaN pandas uses for a missing description, is skipped.

    Returns
    -------
    The result of ``lda_model[bow_vector]`` for the document, or ``None``
    when ``doc`` is not a string.

    Raises
    ------
    ValueError, SyntaxError
        If ``doc`` is a string that is not a valid Python literal.
    """
    if not isinstance(doc, str):
        return None
    # literal_eval accepts Python literals only, so a crafted CSV cell cannot
    # execute code the way eval() would.
    tokens = ast.literal_eval(doc)
    # Data preprocessing step for the unseen document
    bow_vector = dictionary.doc2bow(tokens)
    return lda_model[bow_vector]

In [28]:
# Attach the inferred topic distribution to every row (non-string descriptions yield None)
df['topic_class'] = df['short_description_lemmatized'].apply(make_inference_on_doc)
df.head(3)

Unnamed: 0.1,Unnamed: 0,title,published_at,short_description,keywords,description,title_lowered,title_tokenized,title_removed_stopwords,title_lemmatized,short_description_lowered,short_description_tokenized,short_description_removed_stopwords,short_description_lemmatized,description_lowered,description_tokenized,description_removed_stopwords,description_lemmatized,topic_class
0,0,Santoli’s Wednesday market notes: Could Septem...,2021-09-29T17:09:39+0000,"This is the daily notebook of Mike Santoli, CN...","cnbc, Premium, Articles, Investment strategy, ...","This is the daily notebook of Mike Santoli, CN...",santoli’s wednesday market notes could septemb...,"['santoli', '’', 's', 'wednesday', 'market', '...","['santoli', 'wednesday', 'market', 'notes', 'c...","['santoli', 'wednesday', 'market', 'note', 'co...",this is the daily notebook of mike santoli cnb...,"['this', 'is', 'the', 'daily', 'notebook', 'of...","['daily', 'notebook', 'mike', 'santoli', 'cnbc...","['daily', 'notebook', 'mike', 'santoli', 'cnbc...",this is the daily notebook of mike santoli cnb...,"['this', 'is', 'the', 'daily', 'notebook', 'of...","['daily', 'notebook', 'mike', 'santoli', 'cnbc...","['daily', 'notebook', 'mike', 'santoli', 'cnbc...","[(128, 0.091083094), (266, 0.09107704), (346, ..."
1,1,My take on the early Brexit winners and losers,2016-06-24T13:50:48-0400,This commentary originally ran on Facebook. Bo...,"Articles, Politics, Europe News, European Cent...",,my take on the early brexit winners and losers,"['my', 'take', 'on', 'the', 'early', 'brexit',...","['take', 'early', 'brexit', 'winners', 'losers']","['take', 'early', 'brexit', 'winner', 'loser']",this commentary originally ran on facebook bor...,"['this', 'commentary', 'originally', 'ran', 'o...","['commentary', 'originally', 'ran', 'facebook'...","['commentary', 'originally', 'ran', 'facebook'...",,,,,"[(402, 0.06519049), (444, 0.91871387)]"
2,2,Europe's recovery depends on Renzi's Italy,2014-03-25T13:29:45-0400,"In spring, ambitious reforms began in Italy. U...","Articles, Business News, Economy, Europe Econo...",,europes recovery depends on renzis italy,"['europes', 'recovery', 'depends', 'on', 'renz...","['europes', 'recovery', 'depends', 'renzis', '...","['europe', 'recovery', 'depends', 'renzis', 'i...",in spring ambitious reforms began in italy und...,"['in', 'spring', 'ambitious', 'reforms', 'bega...","['spring', 'ambitious', 'reforms', 'began', 'i...","['spring', 'ambitious', 'reform', 'began', 'it...",,,,,"[(14, 0.07672071), (78, 0.033589777), (122, 0...."


In [29]:
# Keep only the publication timestamp and the inferred topics, then write them to CSV
output_columns = ['published_at', 'topic_class']
df_out = df[output_columns]
output_path = f"./output_data/data_{model_name}_{dataset_name}.csv"
df_out.to_csv(output_path)
df_out.head()

Unnamed: 0,published_at,topic_class
0,2021-09-29T17:09:39+0000,"[(128, 0.091083094), (266, 0.09107704), (346, ..."
1,2016-06-24T13:50:48-0400,"[(402, 0.06519049), (444, 0.91871387)]"
2,2014-03-25T13:29:45-0400,"[(14, 0.07672071), (78, 0.033589777), (122, 0...."
3,2009-04-22T19:49:03+0000,"[(5, 0.19374597), (227, 0.074674495), (253, 0...."
4,2018-04-14T14:59:04+0000,


## [Top2Vec](https://top2vec.readthedocs.io/en/stable/Top2Vec.html#how-does-it-work)

In [2]:
# from top2vec import Top2Vec
import pandas as pd

In [11]:
# Read the processed dataset and use the article titles as the Top2Vec corpus.
# NOTE(review): this path climbs two directories, while the LDA section reads
# '../data_engineering/...'; confirm which one matches the notebook's location.
dataset_name = 'cnbc_news_dataset_processed'
dataset_path = f'../../data_engineering/nlp_data/{dataset_name}.csv'
df = pd.read_csv(dataset_path)
documents = list(df['title'])

In [14]:
# Fit Top2Vec on the titles using a pretrained sentence encoder.
# Prerequisites: `Top2Vec` must be imported (the import in the cell above is
# commented out) and the sentence-encoder extras must be installed; see the
# ImportError captured below this cell.
model = Top2Vec(documents=documents, embedding_model='universal-sentence-encoder')

ImportError: universal-sentence-encoder is not available.

Try: pip install top2vec[sentence_encoders]

Alternatively try: pip install tensorflow tensorflow_hub tensorflow_text