In [27]:
import json
import os
from datetime import datetime

import numpy as np
import pandas as pd
import plotly.express as px
import yfinance as yf
from pandas_datareader import data as pdr

In [2]:
# Directory (relative to the working directory) whose files are read as JSON
# by the loading loop.
PATH_PETR4 = 'data/PETR4'

In [3]:
# Top-level keys kept from every raw record; every other key is dropped.
RELEVANT_FIELDS = [
    'type', 'id', 'text',
    'retweetCount', 'replyCount', 'likeCount',
    'createdAt', 'bookmarkCount', 'isReply',
    # Nested object: its sub-keys are filtered with RELEVANT_AUTHOR_FIELDS and
    # flattened into `author_<key>` entries by the loading loop.
    'author',
]

# Keys kept from the nested `author` object of every raw record.
RELEVANT_AUTHOR_FIELDS = [
    'type', 'userName', 'name',
    'isVerified', 'description',
    'followers', 'following',
    'createdAt', 'favouritesCount',
]

In [18]:
# Flatten every record of every JSON file under PATH_PETR4 into one flat dict
# per record: the selected top-level keys, the selected author keys prefixed
# with `author_`, and the name of the file the record came from.
dict_list = []

# os.listdir returns entries in arbitrary order; sorting makes the row order
# (and therefore the default index of any DataFrame built from dict_list)
# identical on every run.
for filename in sorted(os.listdir(PATH_PETR4)):
    # Decode explicitly as UTF-8: the text contains accented characters and
    # the platform's locale default encoding is not guaranteed to be UTF-8.
    with open(os.path.join(PATH_PETR4, filename), 'rt', encoding='utf-8') as f:
        curr_json_list = json.load(f)

    for curr_json in curr_json_list:
        relevant_json = {k: v for k, v in curr_json.items() if k in RELEVANT_FIELDS}

        # Take the nested author object out of the flat dict. A record with a
        # missing or null `author` yields no author_* keys instead of raising.
        author = relevant_json.pop('author', None) or {}
        relevant_json_author = {
            f'author_{k}': v for k, v in author.items() if k in RELEVANT_AUTHOR_FIELDS
        }

        new_dict = {**relevant_json, **relevant_json_author}
        new_dict['src_file'] = filename  # provenance of the record
        dict_list.append(new_dict)


In [19]:
# One row per flattened record; columns are the union of the dicts' keys.
df = pd.DataFrame(dict_list)

In [21]:
# Parse the raw `createdAt` strings into timezone-aware timestamps. The format
# is: weekday name, month name, day, HH:MM:SS, UTC offset, year.
df['createdAt_datetime'] = pd.to_datetime(df['createdAt'], format='%a %b %d %H:%M:%S %z %Y')

In [26]:
# Spot-check: text next to the parsed timestamp.
df[['text', 'createdAt_datetime']]

Unnamed: 0,text,createdAt_datetime
0,"Não só a máxima do ano, mas #PETR4 teve a seu ...",2023-12-26 22:21:39+00:00
1,Como o gestor do fundo “IBIUNA LONG BIASED FIM...,2023-12-18 17:12:45+00:00
2,Anual de #PETR4 https://t.co/Yxa38SOuTM,2023-12-27 20:25:53+00:00
3,Patrocinado por futuro breve short em Ibovespa...,2023-12-31 16:55:28+00:00
4,#PETR4 *gráfico não ajustado! Há 3 anos que ro...,2023-12-30 13:29:27+00:00
...,...,...
2395,"Ibovespa: Em dia de queda generalizada, PETR4 ...",2022-08-30 20:56:53+00:00
2396,Não foi dessa vez que a Petrobras (#PETR4) con...,2022-08-28 17:01:49+00:00
2397,"Ibovespa cai 1,68% com pressão sobre as duas e...",2022-08-30 20:48:41+00:00
2398,TELEPROMPTER \nA Petrobras (PETR4) teve a vend...,2022-08-30 17:21:11+00:00


In [33]:
# New frame indexed by the parsed timestamp so it can be resampled by time;
# `df` itself is left unchanged.
df_dt = df.set_index('createdAt_datetime')

In [42]:
# Group rows into one-day bins and count the non-null values of each column.
# NOTE(review): the saved output is a single Series named `type`, which looks
# like it came from an earlier version of this cell — confirm and re-run.
df_dt.resample('1D').count()

createdAt_datetime
2022-06-20 00:00:00+00:00     2
2022-06-21 00:00:00+00:00     3
2022-06-22 00:00:00+00:00     4
2022-06-23 00:00:00+00:00     3
2022-06-24 00:00:00+00:00     5
                             ..
2024-05-27 00:00:00+00:00    10
2024-05-28 00:00:00+00:00    22
2024-05-29 00:00:00+00:00     5
2024-05-30 00:00:00+00:00     1
2024-05-31 00:00:00+00:00    22
Freq: D, Name: type, Length: 712, dtype: int64

In [35]:
# NOTE(review): this only displays the resampler object's repr; no aggregation
# is computed. Looks like leftover exploration — consider removing the cell.
df_dt.resample('1D')

<pandas.core.resample.DatetimeIndexResampler object at 0x76344dea8be0>

In [32]:
# Timestamps rendered as 'YYYY/MM/DD' strings (display only; nothing is stored).
df['createdAt_datetime'].dt.strftime('%Y/%m/%d')

0       2023/12/26
1       2023/12/18
2       2023/12/27
3       2023/12/31
4       2023/12/30
           ...    
2395    2022/08/30
2396    2022/08/28
2397    2022/08/30
2398    2022/08/30
2399    2022/08/24
Name: createdAt_datetime, Length: 2400, dtype: object

In [43]:
# Daily volume: bucket the timestamp-indexed frame into one-day bins and count
# the non-null values of each column.
daily_counts = df_dt.resample('1D').count()

# `type` is plotted as the per-day record count.
# NOTE(review): count() skips nulls, so this assumes every record has a
# `type` value — TODO confirm.
# A title and axis labels are set so the figure can be read on its own.
px.line(
    daily_counts,
    y='type',
    title='PETR4 tweets per day',
    labels={'createdAt_datetime': 'Date (UTC)', 'type': 'Tweets per day'},
)

In [40]:
# Download daily PETR4.SA quotes from Yahoo Finance (requires network access).
# pdr_override() must run before get_data_yahoo is called.
yf.pdr_override()
start_date = datetime(2022, 1, 1)
end_date = datetime(2024, 4, 25)
# NOTE(review): this rebinds `df`, which the cells above use for the tweet
# DataFrame; re-running those cells after this one will operate on price data.
# Consider a distinct name (e.g. `df_prices`) once every user of `df` has been
# checked.
df = pdr.get_data_yahoo("PETR4.SA", start=start_date, end=end_date)
df.index = pd.to_datetime(df.index)  # author's note: the index does not arrive as datetime
df.tail()

[*********************100%%**********************]  1 of 1 completed


Unnamed: 0_level_0,Open,High,Low,Close,Adj Close,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2024-04-18,40.0,40.59,39.650002,39.849998,37.15099,45078800
2024-04-19,40.049999,41.07,39.84,40.529999,37.784931,80546900
2024-04-22,40.759998,41.59,40.52,41.5,38.689236,51775500
2024-04-23,41.400002,41.630001,40.959999,41.419998,38.614651,35456900
2024-04-24,41.549999,41.990002,41.209999,41.23,38.437523,45388300
