In [1]:
import pandas as pd
from config.path_config import lakefs_s3_path
import os

In [5]:
# Location of the tweets parquet object: repository, branch, and path within the branch.
repo_name = "tweets-repo"
branch_name = "main"
path = "tweets.parquet"
# NOTE(review): this rebinds `lakefs_s3_path`, which the imports cell also pulls in from
# config.path_config; the imported value is discarded here. Confirm which one is intended.
lakefs_s3_path = f"s3://{repo_name}/{branch_name}/{path}"
def data_from_lakefs(lakefs_endpoint: str = "http://localhost:8001/", columns: list[str] = None):
    """Read the parquet object at the module-level ``lakefs_s3_path`` into a DataFrame.

    Args:
        lakefs_endpoint: URL handed to the S3 client as ``endpoint_url``.
        columns: Forwarded unchanged to ``pd.read_parquet``; ``None`` by default.

    Returns:
        Whatever ``pd.read_parquet`` returns for that object (read with the
        ``pyarrow`` engine).
    """
    # Credentials come from the environment; an unset variable yields None.
    access_key = os.environ.get("ACCESS_KEY")
    secret_key = os.environ.get("SECRET_KEY")
    return pd.read_parquet(
        lakefs_s3_path,
        engine='pyarrow',
        columns=columns,
        storage_options={
            "key": access_key,
            "secret": secret_key,
            "client_kwargs": {"endpoint_url": lakefs_endpoint},
        },
    )

# Load the tweets, rename the raw post-time column, and cast the date parts in one chain.
df = (
    data_from_lakefs()
    .rename(columns={'postTimeRaw': 'timestamp'})
    .astype({'year': 'int32', 'month': 'int32', 'day': 'int32'})
)
# Label the timestamps as UTC (tz_localize attaches a zone; it does not shift values).
df['timestamp'] = df['timestamp'].dt.tz_localize('UTC')
# Persist a local copy, then display the resulting schema as the cell output.
df.to_parquet('../data/data.parquet', engine='pyarrow')
df.dtypes

category           string[python]
tag                string[python]
username           string[python]
tweetText          string[python]
timestamp     datetime64[ns, UTC]
scrapeTime         datetime64[ns]
tweet_link         string[python]
index                       int64
year                        int32
month                       int32
day                         int32
dtype: object

In [6]:
# Read ../data/data.parquet back from disk so the persisted schema can be inspected.
# NOTE(review): `df_verlify` looks like a typo of `df_verify`; kept because later cells use this name.
df_verlify = pd.read_parquet('../data/data.parquet', engine='pyarrow')
df_verlify.dtypes

category           string[python]
tag                string[python]
username           string[python]
tweetText          string[python]
timestamp     datetime64[ns, UTC]
scrapeTime         datetime64[ns]
tweet_link         string[python]
index                       int64
year                        int32
month                       int32
day                         int32
dtype: object

In [7]:
# Compare the dtypes of the original in-memory DataFrame with those of the one read back from parquet.
df.dtypes == df_verlify.dtypes

category      True
tag           True
username      True
tweetText     True
timestamp     True
scrapeTime    True
tweet_link    True
index         True
year          True
month         True
day           True
dtype: bool

In [8]:
# Sanity check: the reloaded dataset holds more than 1000 rows.
df_verlify.shape[0] > 1000

True

In [9]:
# Count rows that are exact duplicates of an earlier row (0 means every row is unique).
df_verlify.duplicated().sum()

np.int64(0)

In [10]:
# Per-column count of missing values.
df_verlify.isna().sum()

category      0
tag           0
username      0
tweetText     0
timestamp     0
scrapeTime    0
tweet_link    0
index         0
year          0
month         0
day           0
dtype: int64