## Import data from S3

In [3]:
from utils.db_conn import query
import pandas as pd
import requests

# get all the file names from db
list_ids = [id[0] for id in query('query_video_ids', ['2022-05-01', '2022-05-30'])]

# load the data into a list
data = []
for id in list_ids:
    url = f'https://youtube-joao-crypto.s3.eu-central-1.amazonaws.com/{id}.txt'
    response = requests.get(url)
    for t in response.text.split('\n'):
        # we only want text or sentences with numbers
        if any(map(str.isdigit, t)):
            data.append(t)

data[:5]

['currently sitting at 29',
 '740 dollars so continues to battle',
 'shorting off of the 21 ema targeting',
 '26 000 and as low down as 19 thousand',
 'into that 21 ema on a weekly time frame']

## Save the data to txt

In [4]:
# with open('utils/prodigy/data.txt', 'w') as f:
#     for t in data:
#         f.write(t + '\n')

## Use spacy NER module to clean text

In [5]:
import spacy

nlp = spacy.load("utils/prodigy/model/model-best", disable=["tagger", "parser", "attribute_ruler", "lemmatizer"])

def clean_text(text):
    text = text.replace('000', '')
    text = text.replace('00', '')
    text = text.replace('k', '')
    text = text.replace('.', '')

    return text

final = []
for line in data:
    doc = nlp(line)
    final.append(" ".join([clean_text(word.text)+'k' if word.ent_type_ else word.text for word in doc]))

final[:5]



['currently sitting at 29k',
 '740 dollars so continues to battle',
 'shorting off of the 21 ema targeting',
 '26k k and as low down as 19k thousand',
 'into that 21 ema on a weekly time frame']

## Remove stopwords

In [6]:
from nltk.corpus import stopwords

stopwords = list(stopwords.words('english'))

## Vectorize data with TFIDF

In [7]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf_text = TfidfVectorizer(use_idf=True, min_df=3, max_df=0.8,
                             stop_words=stopwords, ngram_range=(1, 2))
# Fit and transform to our data
# vectors_text is going to be used later in the NMF algorithm
vectors_text = tfidf_text.fit_transform(final)

In [8]:
tfidf = pd.DataFrame(vectors_text[0].T.todense(), index=tfidf_text.get_feature_names(), columns=["TF-IDF"])
# Sort from the more important to least important
tfidf = tfidf.sort_values('TF-IDF', ascending=False)
tfidf



Unnamed: 0,TF-IDF
sitting,0.657189
currently,0.596867
29k,0.460274
00,0.000000
major level,0.000000
...,...
back december,0.000000
back 50,0.000000
back 40k,0.000000
back 31k,0.000000


## Decomposition with NMF

In [9]:
from sklearn.decomposition import NMF

nmf_text_model = NMF(n_components=3, random_state=42)
w_text_matrix = nmf_text_model.fit_transform(vectors_text)
w_text_matrix # rows are documents, columns are topics

array([[2.86460041e-04, 2.74081509e-02, 0.00000000e+00],
       [2.43577931e-05, 5.48998099e-03, 2.29217430e-04],
       [5.96036813e-03, 5.93234869e-03, 7.66166755e-05],
       ...,
       [0.00000000e+00, 7.20930923e-02, 0.00000000e+00],
       [0.00000000e+00, 3.56873555e-02, 7.05529466e-04],
       [0.00000000e+00, 1.89094320e-02, 1.20718259e-03]])

## Topics

In [10]:
dicts = {}
new_list = []

for topic, word_vector in enumerate(nmf_text_model.components_):
    largest = word_vector.argsort()[::-1]
    dicts["Tópico " + str(topic+1)] = new_list

    for i in range(0, 5):
        new_list.append(tfidf_text.get_feature_names()[largest[i]])
        if i == 4:
            new_list = []

df_topicos = pd.DataFrame.from_dict(dicts)
df_topicos



Unnamed: 0,Tópico 1,Tópico 2,Tópico 3
0,200,50,wave
1,week,30k,would
2,moving,2018,wave wave
3,average,bitcoin,yeah
4,200 week,back,came
