From 9b1655978c494a82929e66c398af457d6a4e754d Mon Sep 17 00:00:00 2001 From: thelazyliz Date: Sun, 12 May 2024 09:17:17 +0100 Subject: [PATCH] Added new blocks and refresh prompts (#11) --- dune/loader.py | 1 + dune/process.py | 229 ----------------------- dune/queries.py | 14 +- graphing/graph.py | 142 +++++++------- llm/blocks.py | 13 +- llm/prompts.py | 201 +++++++++++++------- main.py | 10 +- utils/__init__.py | 0 utils/cex_data_loader.py | 381 ++++++++++++++++++++++++++++++++++++++ utils/data_transformer.py | 307 ++++++++++++++++++++++++++++++ 10 files changed, 931 insertions(+), 367 deletions(-) delete mode 100644 dune/process.py create mode 100644 utils/__init__.py create mode 100644 utils/cex_data_loader.py create mode 100644 utils/data_transformer.py diff --git a/dune/loader.py b/dune/loader.py index 25a344a..cbbcd95 100644 --- a/dune/loader.py +++ b/dune/loader.py @@ -14,6 +14,7 @@ def get_query_result(queries, query_name, cluster="medium"): while attempts < 3: try: pd = dune.refresh_into_dataframe(query, performance=cluster) + # pd = dune.get_latest_result_dataframe(query.query_id) return pd except Exception as err: attempts += 1 diff --git a/dune/process.py b/dune/process.py deleted file mode 100644 index 31e578a..0000000 --- a/dune/process.py +++ /dev/null @@ -1,229 +0,0 @@ -import pandas as pd - - -def process_tvl(df: pd.DataFrame) -> str: - # Select the row corresponding to 'Total' - total_row = df[df["chain"] == "Total"] - - # Extract the 'TVL' and 'TVL change, %' from the total row - total_tvl = total_row["TVL"].values[0] - total_tvl_change = total_row["TVL change, %"].values[0] * 100 - - # Format the total TVL and total TVL change into strings - total_tvl_str = f"${total_tvl / 1e9:.2f}b" - total_tvl_change_str = f"{total_tvl_change:.2f}%" - - # Select the rows corresponding to 'Ethereum' and 'Polygon' - eth_row = df[df["chain"] == "Ethereum"] - polygon_row = df[df["chain"] == "Polygon"] - - # Extract the 'Token price change, %' from the Ethereum 
and Polygon rows - eth_price_change = float(eth_row["Token price change, %"].values[0]) * 100 - polygon_price_change = float(polygon_row["Token price change, %"].values[0]) * 100 - - # Format the Ethereum and Polygon price changes into strings - eth_price_change_str = f"{eth_price_change:.2f}%" - polygon_price_change_str = f"{polygon_price_change:.2f}%" - - # Combine everything into a result string - result_string = ( - f"TVL: {total_tvl_str}\n" - f"TVL Percentage Change: {total_tvl_change_str}\n" - f"Ethereum Token Price Change: {eth_price_change_str}\n" - f"Polygon Token Price Change: {polygon_price_change_str}" - ) - - return result_string - - -def process_netDepositGrowthLeaders(df: pd.DataFrame) -> str: - # Calculate rank by net deposit growth - df = df.sort_values("eth_deposits_growth", ascending=False) - df["rank"] = range(1, len(df) + 1) - - # Find Lido's stats - lido_stats = df[df["name"] == "Lido"] - - # If Lido is not in the list, return None for both values - if lido_stats.empty: - return "" - - lido_net_deposit_growth = round(lido_stats.iloc[0]["eth_deposits_growth"], 0) - lido_rank = lido_stats.iloc[0]["rank"] - - return f"Lido had net deposit growth of {lido_net_deposit_growth} ETH. 
ETH Growth Leaderboard rank: {lido_rank}" - - -def process_stETHApr(df: pd.DataFrame) -> str: - # Get the most recent 7d moving average - recent_7d_ma = df["stakingAPR_ma_7"].values[0] - - # Format the result into a string - result_string = f"7d MA: {recent_7d_ma:.2%}" - - return result_string - - -def process_stEthToEth(df: pd.DataFrame) -> str: - # Convert 'time' column to datetime - df["time"] = pd.to_datetime(df["time"]) - - # Sort DataFrame by time in descending order - df = df.sort_values("time", ascending=False) - - # Get the most recent 'weight_avg_price' - recent_price = df["weight_avg_price"].iloc[0] - - # Calculate the volatility of the 'weight_avg_price' - volatility = df["weight_avg_price"].std() - - # Return results as a string - return f"The week ended with a ratio of {recent_price}. The ratio over the week had a standard deviation of {volatility}" - - -def process_dexLiquidityReserves(df: pd.DataFrame) -> str: - # Select the row corresponding to 'total' - total_row = df[df["token"] == "total"] - - # Extract the 'end value' and 'period_change' from this row - end_value = total_row["end value"].values[0] - period_change = total_row["period_change"].values[0] - - # Format the end value into a string in billions with 2 decimal places - end_value_str = f"${end_value / 1e9:.0f}b" - - # Format the period change into a string as a percentage with 2 decimal places - period_change_str = f"{period_change * 100:.2f}%" - - # Combine these into a result string - result_string = f"Total End Value: {end_value_str}\nPeriod Change: {period_change_str}" - - return result_string - - -def process_totalStEthInDeFi(df: pd.DataFrame) -> str: - - # Order the dataframe from oldest to latest - df = df.sort_values("time") - - # Calculate the changes - liquidity_pools_diff = df["liquidity_pools"].diff().dropna() - lending_pools_diff = df["lending_pools"].diff().dropna() - stETH_in_DeFi_diff = df["stETH_in_DeFi"].diff().dropna() - - # Get the total changes - 
total_liquidity_pools_change = liquidity_pools_diff.sum() - total_lending_pools_change = lending_pools_diff.sum() - total_stETH_in_DeFi_change = stETH_in_DeFi_diff.sum() - - # Get the total percentage changes - total_liquidity_pools_pct_change = ( - (df["liquidity_pools"].iloc[-1] - df["liquidity_pools"].iloc[0]) / df["liquidity_pools"].iloc[0] * 100 - ) - total_lending_pools_pct_change = ( - (df["lending_pools"].iloc[-1] - df["lending_pools"].iloc[0]) / df["lending_pools"].iloc[0] * 100 - ) - total_stETH_in_DeFi_pct_change = ( - (df["stETH_in_DeFi"].iloc[-1] - df["stETH_in_DeFi"].iloc[0]) / df["stETH_in_DeFi"].iloc[0] * 100 - ) - - # Get the final (most recent) value of stETH in DeFi - final_stETH_in_DeFi = df["stETH_in_DeFi"].iloc[-1] - - # Get the final (most recent) percentage of stETH_DeFi_share - final_stETH_DeFi_share = df["stETH_DeFi_share"].iloc[-1] - - # Format the changes into a string - result = f""" - Liquidity Pools: - Absolute change: {total_liquidity_pools_change:.0f} - Percentage change: {total_liquidity_pools_pct_change:.2f}% - - Lending Pools: - Absolute change: {total_lending_pools_change:.0f} - Percentage change: {total_lending_pools_pct_change:.2f}% - - Total stETH in DeFi: - Absolute change: {total_stETH_in_DeFi_change:.0f} - Percentage change: {total_stETH_in_DeFi_pct_change:.2f}% - Final value: {final_stETH_in_DeFi:.0f} - - Percentage of stETH in DeFi: {final_stETH_DeFi_share:.2f}% - """ - - return result - - -def process_stEthOnL2(df: pd.DataFrame) -> str: - # Select the row corresponding to 'total' - total_row = df[df["bridge"] == "total"] - - # Extract the 'end_amount' and 'period_change' from the total row - total_end_amount = total_row["end_amount"].values[0] - total_period_change = total_row["period_change"].values[0] - - # Format the total end amount and total period change into strings - total_end_amount_str = f"{round(total_end_amount, 0)} wstETH" - total_period_change_str = f"{total_period_change:.2f}%" - - # Initialize an empty 
string to store the individual bridge data - bridge_data_str = "" - - # Loop over the rows of the DataFrame - for i, row in df.iterrows(): - # Skip the total row - if row["bridge"] == "total": - continue - - # Extract the 'bridge', 'end_amount' and 'period_change' for each row - bridge = row["bridge"] - end_amount = row["end_amount"] - period_change = row["period_change"] - - # Format the end amount and period change into strings - end_amount_str = f"{end_amount:.0f} wstETH" - period_change_str = f"{period_change:.2f}%" - - # Append this bridge's data to the bridge data string - bridge_data_str += f"{bridge}: {end_amount_str} (7d: {period_change_str})\n" - - # Combine the total and bridge data into a result string - result_string = f"The amount of wstETH on L2 changed by {total_period_change_str}, hitting {total_end_amount_str}:\n\n{bridge_data_str}" - - return result_string - - -def process_bridgeChange(df): - # Apply rounding logic to the 'period_change' column - df['period_change'] = df['period_change'].apply(lambda x: round(x, 2)) - df['end_amount'] = df['end_amount'].apply(lambda x: int(round(x, 0))) - df['start_amount'] = df['start_amount'].apply(lambda x: int(round(x, 0))) - - # Convert the DataFrame to a string - df_string = df.to_string(index=False) - - return df_string - - -# Define a dictionary mapping the DataFrame names to their respective processing functions -process_functions = { - "tvl": process_tvl, - "netDepositGrowthLeaders": process_netDepositGrowthLeaders, - "stETHApr": process_stETHApr, - "stEthToEth": process_stEthToEth, - # "dexLiquidityReserves": process_dexLiquidityReserves, - "bridgeChange": process_bridgeChange, - "totalStEthInDeFi": process_totalStEthInDeFi, -} - - -def process_dune(dune_results: dict[str, pd.DataFrame]) -> dict[str, str]: - res = {} - - for df_name, df in dune_results.items(): - process_func = process_functions.get(df_name) - if process_func is not None: - s = process_func(df) - res[df_name] = s - - return res diff 
--git a/dune/queries.py b/dune/queries.py index 5bcf85a..fbc6822 100644 --- a/dune/queries.py +++ b/dune/queries.py @@ -63,11 +63,23 @@ def get_queries(start_date: str, end_date: str, sol_start_deposits: float, sol_e ), "totalStEthInDeFi": QueryBase( name="totalStEthInDeFi", - query_id=2740414, + query_id=2740000, params=[ QueryParameter.date_type("digest_start_date", start_date), QueryParameter.date_type("digest_end_date", end_date), ], ), + "bridgedToCosmos": QueryBase( + name="bridgedToCosmos", + query_id=3215855, + params=[ + QueryParameter.date_type("digest_start_date", start_date), + QueryParameter.date_type("digest_end_date", end_date), + ], + ), + "stethVolumes": QueryBase( + name="stethVolumes", + query_id=2810114 + ), } return queries diff --git a/graphing/graph.py b/graphing/graph.py index f19eb61..31940f5 100644 --- a/graphing/graph.py +++ b/graphing/graph.py @@ -11,7 +11,8 @@ class Grapher: - def __init__(self, end_date: str): + def __init__(self, start_date: datetime, end_date: datetime): + self.start_date = start_date self.end_date = end_date self.graphing_functions = { "netDepositGrowthLeaders": self.graph_netDepositGrowthLeaders, @@ -20,6 +21,8 @@ def __init__(self, end_date: str): "stEthToEth": self.graph_stEthToEth, "tvl": self.graph_tvl, "totalStEthInDeFi": self.graph_totalStEthInDeFi, + "bridgedToCosmos": self.graph_bridgedToCosmos, + "stethVolumes": self.graph_stethVolumes # "dexLiquidityReserves": self.graph_dexLiquidityReserves, } self.graph_location = f"/tmp/digest/{end_date}/graphs" @@ -65,69 +68,29 @@ def graph_dexLiquidityReserves(self, df: pd.DataFrame,): def graph_totalStEthInDeFi(self, df: pd.DataFrame): - original_df = df.copy() - # Convert time column to datetime format and set it as the index - df["time"] = pd.to_datetime(df["time"]) - df.set_index("time", inplace=True) - - # Create a figure and axis for the plot - fig, ax = plt.subplots(figsize=(10, 7)) - - # Create the line plot - sns.lineplot(data=df, x=df.index, 
y="stETH_DeFi_share", ax=ax, color="blue") - - # Set the x-axis formatter to display date in "Month Day" format - ax.xaxis.set_major_formatter(DateFormatter("%B %d")) + # Order the dataframe from oldest to latest + df = df.sort_values("time") - # Rotate x-axis labels for better visibility - plt.xticks(rotation=45) + # Get the latest row, that is the data at the end_date + latest_row = df.iloc[-1] - # Set plot title and labels - ax.set_title("Total (w)stETH in DeFi") - ax.set_xlabel("") - ax.set_ylabel("") + # Extract the data that we need + lending_balance = f"{latest_row['lending_pools_balance']:,.0f}" + liquidity_balance = f"{latest_row['liquidity_pools_balance']:,.0f}" + lending_pct_change = f"{float(latest_row['lending_pct_change']):.2f}" + liquidity_pct_change = f"{float(latest_row['liquidity_pct_change']):.2f}" - self.save_figure(fig, "totalStEthInDeFi") + # Reshape df into desired structure + reshaped_df = pd.DataFrame({ + 'Type of pools': ['Lending', 'Liquidity'], + 'stETH deposited': [lending_balance, liquidity_balance], + 'Change, %': [lending_pct_change, liquidity_pct_change] + }) - def extract_relevant_data(df): - """ - Takes in a DataFrame and returns a table with specific columns for the latest data point - and the data point 7 days before that. 
- """ - # Convert 'time' to datetime format for easier manipulation - df['time'] = pd.to_datetime(df['time']) - - # Sort the DataFrame by 'time' in ascending order - df.sort_values('time', inplace=True) - - # Get the latest data point and the data point 7 days before that - latest_data = df.iloc[-1] - seven_days_before = df[df['time'] <= (latest_data['time'] - pd.Timedelta(days=7))].iloc[-1] - - # Create a new DataFrame to store these two rows - result_df = pd.DataFrame([latest_data, seven_days_before]) - - # Keep only the relevant columns - result_df = result_df[['time', 'liquidity_pools', 'lending_pools', 'stETH_in_DeFi', 'stETH_DeFi_share']] - - return result_df - def format_and_save_table(df): """ Formats and saves a DataFrame as a table image. """ - # Replace NaN and other null-like values with an empty string - df.replace(["", "nil", "undefined", "null", np.nan], "", inplace=True) - - # Format the values in the table - for col in df.columns: - if col == "liquidity_pools" or col == "lending_pools" or col == "stETH_in_DeFi": - df[col] = df[col].apply(lambda x: "{:,.0f}".format(x)) # Express in whole numbers with comma separators - elif col == "stETH_DeFi_share": - df[col] = df[col].apply(lambda x: "{:.2f}%".format(x)) # Express as percentage - elif col == "time": - df[col] = df[col].dt.strftime('%Y-%m-%d') # Format datetime to string - # Create a figure and a subplot fig, ax = plt.subplots(figsize=(12, 2)) @@ -135,7 +98,7 @@ def format_and_save_table(df): ax.axis("off") # Create the table and scale it to the subplot - table = plt.table(cellText=df.values, cellLoc="center", loc="center", colLabels=[col.replace('_', ' ').title() for col in df.columns]) + table = plt.table(cellText=df.values, cellLoc="center", loc="center", colLabels=df.columns) table.auto_set_font_size(False) table.set_fontsize(10) table.scale(1, 1.5) @@ -144,14 +107,12 @@ def format_and_save_table(df): table.auto_set_column_width(col=list(range(len(df.columns)))) for key, cell in 
table.get_celld().items(): if key[0] == 0: - cell.set_fontsize(14) - cell.set_facecolor('gray') + cell.get_text().set_fontweight('bold') + cell.set_facecolor('whitesmoke') self.save_figure(fig, "totalStEthInDeFi_table") - extracted_data = extract_relevant_data(original_df.copy()) - - format_and_save_table(extracted_data.copy()) + format_and_save_table(reshaped_df) @@ -320,7 +281,7 @@ def graph_stEthOnL2Bridges(self, df: pd.DataFrame): df["day"] = pd.to_datetime(df["day"]) # Filter out the data after July 16, 2023 - df = df[df["day"] <= datetime.strptime(self.end_date, "%Y-%m-%d %H:%M:%S")] + df = df[df["day"] <= self.end_date] # Set 'day' as the index df.set_index("day", inplace=True) @@ -363,6 +324,61 @@ def format_date(x, pos=None): self.save_figure(fig, "stEthOnL2Bridges") + def graph_bridgedToCosmos(self, df: pd.DataFrame): + # Convert the 'day' column to datetime format + df["day"] = pd.to_datetime(df["day"]) + + # Reset default settings + plt.rcdefaults() + + # Plot area chart + fig, ax = plt.subplots(figsize=(12,6)) + + ax.stackplot('day', 'balance_cumu', data=df[['day', 'balance_cumu']]) + + # Set plot title + ax.set_title("wstETH on Cosmos bridge over time", fontsize=16) + + # Remove axis labels + ax.set_xlabel("") + ax.set_ylabel("") + + # Define a function to format the date + def format_date(x, pos=None): + return mdates.num2date(x).strftime("%B %-d") + + # Set the date formatter for the x-axis + ax.xaxis.set_major_formatter(mtick.FuncFormatter(format_date)) + + # # Rotate date labels slightly + plt.xticks(rotation=30) + + self.save_figure(plt, "bridgedToCosmos") + + def graph_stethVolumes(self, df: pd.DataFrame): + + df = df[(pd.to_datetime(df.index) <= self.end_date) & (pd.to_datetime(df.index) >= self.start_date)] + + # Reset default settings + plt.rcdefaults() + + fig, ax = plt.subplots(figsize=(10,8)) + + # sum across all dates + category_sums = df.sum() + + # create pie chart + patches, labels, pct_texts = ax.pie( + category_sums, 
labels=category_sums.index, autopct='%1.1f%%', startangle=140, pctdistance=0.8, wedgeprops=dict(width=0.5), rotatelabels=True) + + # rotate the % as well + for label, pct_text in zip(labels, pct_texts): + pct_text.set_rotation(label.get_rotation()) + + plt.title('stETH volume aggregated by chain') + plt.tight_layout() + self.save_figure(plt, "stethVolumes") + def process_all(self, dune_dataframes: dict[str, pd.DataFrame]): for df_name, df in dune_dataframes.items(): graph_func = self.graphing_functions.get(df_name) diff --git a/llm/blocks.py b/llm/blocks.py index d5323ba..15932af 100644 --- a/llm/blocks.py +++ b/llm/blocks.py @@ -10,6 +10,8 @@ stEthToEth_prompt, dexLiquidityReserves_prompt, totalStEthInDeFi_prompt, + bridgedToCosmos_prompt, + stethVolumes_prompt ) from datetime import datetime from langchain.chat_models.openai import ChatOpenAI @@ -27,12 +29,14 @@ def __init__(self, end_date: str, start_date: str): "dexLiquidityReserves": self.write_dexLiquidityReserves, "bridgeChange": self.write_bridgeChange, "totalStEthInDeFi": self.write_totalStEthInDeFi, + "bridgedToCosmos": self.write_bridgedToCosmos, + "stethVolumes": self.write_stethVolumes } def write_block(self, processed_input: str, system_prompt: str) -> str: today = datetime.today().strftime("%B %d %Y") - chat = ChatOpenAI(temperature=0, model="gpt-3.5-turbo-16k") # type: ignore + chat = ChatOpenAI(temperature=0, model="gpt-4-turbo-2024-04-09") # type: ignore thread = chat.predict_messages( [ SystemMessage(content=system_prompt.format(DATE=today) + "\n" + block_append_prompt), @@ -60,10 +64,15 @@ def write_dexLiquidityReserves(self, processed): def write_bridgeChange(self, processed_bridge_change): return self.write_block(processed_bridge_change, stEthOnL2Bridges_prompt) + + def write_bridgedToCosmos(self, processed): + return self.write_block(processed, bridgedToCosmos_prompt) def write_totalStEthInDeFi(self, processed): - print(processed) return self.write_block(processed, totalStEthInDeFi_prompt) + 
+ def write_stethVolumes(self, processed): + return self.write_block(processed, stethVolumes_prompt) def compose_thread(self, processed: dict[str, str]): print(processed) diff --git a/llm/prompts.py b/llm/prompts.py index 52cddba..ca2f5a9 100644 --- a/llm/prompts.py +++ b/llm/prompts.py @@ -4,11 +4,15 @@ --- ⚡️ stETH APR -stETH APR saw an increase last week, with the 7d MA reaching 4.26%. +The 7d stETH APR is 36 basis points down to 3.18%. --- ⚡️ stETH APR -The stETH APR has not experienced significant change with the 7d MA sitting at 4.04%. +The 7d stETH APR decreased 14bp last week to 3.03%. +--- +⚡️ stETH APR + +The 7d stETH APR is 8 basis points up to 3.29%. --- Use your knowledge of Lido, the data provided by your boss, and the examples above to write a section of the thread about stETH APR. @@ -16,39 +20,53 @@ tvl_prompt = """ You are a data analytics professional at Lido DAO. Your boss has assigned you to the team that writes weekly twitter threads about Lido statistics. -Today, you are responsible for writing the section of the thread about Total Value Locked (TVL). It's important to note that typically, the price of Ethereum and other tokens are positively correlated with Lido's TVL. However, there are instances where this trend does not hold. For example, when token prices increase and Lido's TVL decreases, or when token prices decrease and Lido's TVL increases. The latter scenario indicates strong inflows into Lido, showcasing robust investor confidence, which is highly positive. Here are some examples: +Today, you are responsible for writing the section of the thread about Total Value Locked (TVL). + +Here are some examples: --- ⚡️Lido TVL -Despite a decrease in Ethereum and Polygon token prices, Lido's TVL saw a slight increase to $14.88b (7d: +0.19%). This divergence suggests strong inflows, underscoring robust investor confidence in Lido. +Lido TVL dropped -9.75% to $29.81b. 
This decrease was driven by drastic token prices drop and reinforced by unstaking. --- ⚡️Lido TVL -Lido TVL increased slightly (7d: +0.17%) due to the inflow of new ETH and SOL staking deposits despite the token prices drop. - +Lido TVL grew +3.81% to $30.59b as a result of token prices growth and ETH staking inflow. +--- +⚡️Lido TVL -TVL at the end of the week - $14.94b. +Lido TVL dropped by -1.14% to $29.47b due to unstaking. --- Use your knowledge of Lido, the data provided by your boss, and the examples above to write a section of the thread about Lido's TVL data this week, keeping in mind the correlation between token prices and TVL. You do not need to refer to this correlation directly, but it should influence your tone and terminology. You should write "Lido TVL" instead of "Lido Total Value Locked (TVL)". -Increases and decreases of less than a percent are "slight", but you should use more emphatic words for larger changes. +In general, TVL changes can be described by indirect forces like token prices dropping or increasing substantially, or by direct forces such as deposits and withdrawals. +If token price changes are small relative to the TVL change, then it is unlikely this was a factor, and the reason should be attributed to unstaking if it was a TVL decrease, and deposits if increase. There is no need to report token price changes in these cases. + +Increases and decreases of less than a percent are "slight", but you should use more emphatic words for larger changes. Keep the text short and within 2 sentences. """ netDepositGrowthLeaders_prompt = """ You are a data analytics professional at Lido DAO. Your boss has assigned you to the team that writes weekly twitter threads about Lido statistics. -Today, you are responsible for writing the section of the thread about the top protocols in net new deposits on the Ethereum Beacon Chain, and Lido's standing in the list. 
Here are some examples: +Today, you are responsible for writing the section of the thread about Lido's depositing and unstaking activity on the Ethereum Beacon Chain. +Here are some examples: --- ⚡️ Lido on Ethereum -Lido was at 1st place in net new deposits to the Ethereum Beacon Chain, with 116,945 ETH attracted in 7d. +Lido secured the 2nd position in net deposits to the Beacon Chain, with a growth of 63,560 ETH in the last 7 days. --- ⚡️ Lido on Ethereum -Lido took the 2nd place - after Abyss Finance - in net new deposits to the Ethereum Beacon Chain, with 65,943 ETH attracted in 7d. +-60,992 net ETH was unstaked from Beacon Chain through Lido in the last 7 days. --- +⚡️ Lido on Ethereum + +9,856 net ETH was deposited to the Beacon Chain through Lido in the last 7 days. +--- + +Use your knowledge of Lido, the data provided by your boss, and the examples above to present the depositing and unstaking activity of Lido on the Beacon Chain. +If the data shows that it is a net deposit and the rank is <= 5, you may include it like in the first example. In all other cases, including the unstaking cases, do not include the rank information. -Use your knowledge of Lido, the data provided by your boss, and the examples above to write a section of the thread about Lido's standing in the list of top protocols in net new deposits on the Ethereum Beacon Chain. +Follow the examples closely. """ stEthToEth_prompt = """ @@ -78,46 +96,102 @@ stEthOnL2Bridges_prompt = """ You are a data analytics professional at Lido DAO. Your boss has assigned you to the team that writes weekly twitter threads about Lido statistics. -Today, you are responsible for writing the section of the thread about the amount of stETH on L2 bridges. Here are some examples: +Today, you are responsible for writing the section of the thread about the amount of wstETH on L2 bridges. 
Here are some examples: --- 🖥️ Lido on L2 -The amount of wstETH bridged to L2 grew by +5.58% reaching 97,704 wstETH: -Arbitrum: 52,466 wstETH (7d: +7.97%) -Optimism: 40,464 wstETH (7d: +2.66%) -Polygon: 4,774 wstETH (7d: +5.33%) +The amount of wstETH bridged to L2 increased by +1.38%, reaching a total of 156,391 wstETH: + +Arbitrum: 91,652 wstETH (7d: -1.06%) +Optimism: 35,833 wstETH (7d: +4.18%) +Base: 12,230 wstETH (7d: +15.95%) +Polygon: 7,728 wstETH (7d: -2.53%) +Scroll: 5,771 wstETH (7d: +0.68%) +Linea: 1,920 wstETH (7d: +8.68%) +zkSync Era: 1,246 wstETH (7d: -0.78%) + --- -The amount of wstETH on L2 increased by +1.53%, hitting 124,255 wstETH: +🖥️ Lido on L2 + +The amount of wstETH bridged to L2 decreased by -1.18% to 154,268 wstETH: + +Arbitrum: 92,633 wstETH (7d: -3.04%) +Optimism: 34,394 wstETH (7d: -1.16%) +Base: 10,548 wstETH (7d: +11.79%) +Polygon: 7,929 wstETH (7d: -0.16%) +Scroll: 5,732 wstETH (7d: +5.82%) +Linea: 1,767 wstETH (7d: +3.06%) +zkSync Era: 1,256 wstETH (7d: -0.28%) -Arbitrum: 73,275 wstETH (7d: +2.35%) -Optimism: 46,223 wstETH (7d: +0.67%) -Polygon: 4,758 wstETH (7d: -2.45%) --- -Use your knowledge of Lido, the data provided by your boss, and the examples above to write a section of the thread about the amount of stETH on L2 bridges. +Use your knowledge of Lido, the data provided by your boss, and the examples above to write a section of the thread about the amount of wstETH on L2 bridges. +Follow the examples closely. """ totalStEthInDeFi_prompt = """ You are a data analytics professional at Lido DAO. Your boss has assigned you to the team that writes weekly twitter threads about Lido statistics. -Today, you are responsible for writing the section of the thread about the total amount of stETH in DeFi. Here are some examples: +Today, you are responsible for writing the section of the thread about the total amount of (w)stETH in DeFi. 
Here are some examples: --- 🌊 stETH in DeFi -Total amount of (w)stETH deposited to DeFi pools decreased by 0.98% sitting at 3.0m stETH, which accounts for 38.37% of total stETH supply. - -This estimation takes into account all major L1/L2 lending & liquidity pools. +The amount of (w)stETH in lending pools reduced by -3.04% to 2.64m stETH. In the meantime the amount in liquidity pools rose by +5.17% to 108.7k stETH. -The amount of (w)stETH in liquidity pools decreased -0.98% and the amount of (w)stETH in lending pools decrease -0.28%, reaching 124.3k and 2.87m stETH correspondingly --- 🌊 stETH in DeFi -The amount of (w)stETH in liquidity pools increased +1.98% and the amount of (w)stETH in lending pools rose +0.14%, reaching 122.7k and 3.12m stETH correspondingly. +The amount of (w)stETH in lending pools decreased by -3.02% to 2.56m stETH and the amount in liquidity pools dropped by -14.76% to 92.7k stETH. -Total (w)stETH deposited to DeFi pools is at 3,242,510 stETH or 34.81% of total stETH supply. --- Use your knowledge of Lido, the data provided by your boss, and the examples above to write a section of the thread about the total amount of stETH in DeFi. """ +bridgedToCosmos_prompt = """ +You are a data analytics professional at Lido DAO. Your boss has assigned you to the team that writes weekly twitter threads about Lido statistics. +Today, you are responsible for writing the section of the thread about the amount of wstETH bridged to Cosmos. Here are some examples: +--- +🖥️ Lido on Cosmos + +wstETH bridged to Cosmos rose to 1,977 wstETH (7d: +1.07%). + +--- +🖥️ Lido on Cosmos + +wstETH bridged to Cosmos is -2.60% down to 1,875 wstETH. + +--- +🖥️ Lido on Cosmos + +wstETH bridged to Cosmos is at 1,956 wstETH (7d: -0.15%). + +--- +Use your knowledge of Lido, the data provided by your boss, and the examples above to write a section of the thread about the amount of wstETH bridged to Cosmos. +Follow the examples closely. 
+""" + +stethVolumes_prompt = """ +You are a data analytics professional at Lido DAO. Your boss has assigned you to the team that writes weekly twitter threads about Lido statistics. +Today, you are responsible for writing the section of the thread about the amount of wstETH bridged to Cosmos. Here are some examples: +--- +🌊 stETH volumes + +(w)stETH 7d trading volume is $2.01b, -18.7% lower than last week. + +--- +🌊 stETH volumes + +(w)stETH 7d trading volume is $1.79b, -10.75% lower compared to the previous week. + +--- +🌊 stETH volumes + +(w)stETH 7d trading volume is $2.47b, +66.0% higher than last week. + +--- +Use your knowledge of Lido, the data provided by your boss, and the examples above to write a section of the thread about the amount of wstETH bridged to Cosmos. +Follow the examples closely. +""" + thread_prompt = """ You are writing a thread about Lido statistics for the week of {start_date} to {end_date}. You are a data analytics professional at Lido DAO. Your boss has assigned you to the team that writes weekly twitter threads about Lido statistics. @@ -128,81 +202,72 @@ Here are some examples of your past work: --- 1/ -📊 Lido Weekly Digest: January 8 - January 15, 2024 +📊 Lido Weekly Digest: April 22 - April 29, 2024 TLDR: -- TVL up 12.51% to $23.30b. -- 2nd in net new staked ETH with +61,728 ETH. -- stETH APR stable with 7d MA at 3.59%. -- wstETH on to L2 down -8.60% to 153,756 wstETH. +- TVL up 3.81% to $30.59b. +- 9,856 ETH net staked. +- 7d stETH APR at 3.17%. +- wstETH on L2 dropped -2.29% to 150,674 wstETH. +- (w)stETH 7d trading volume at $1.79b. 2/ ⚡️Lido TVL -Lido TVL surged to $23.30b last week driven by ETH price growth, this represents +12.51% increase compared to the previous week. - - - +Lido TVL grew +3.81% to $30.59b as a result of token prices growth and ETH staking inflow. 3/ ⚡️ Lido on Ethereum -Lido secured the 2nd place in net new deposits to the Ethereum Beacon Chain, with the growth of 61,728 ETH in 7 days. 
- - +9,856 ETH net was deposited to the Beacon Chain through Lido in the last 7 days. 4/ ⚡️ stETH APR -The stETH APR remained at the same level as previous week, with the 7-day moving average at 3.59%. - - +The 7d stETH APR remained stable during the last week going down 1bp to 3.17%. 5/ 🌊 stETH in DeFi -The amount of (w)stETH in liquidity pools increased +1.98% and the amount of (w)stETH in lending pools rose +0.14%, reaching 122.7k and 3.12m stETH correspondingly. - -Total (w)stETH deposited to DeFi pools is at 3,242,510 stETH or 34.81% of total stETH supply. - -6/ -🖥️ Lido on L2 +The amount of (w)stETH in lending pools dropped by -1.86% to 2.52m stETH while the amount in liquidity pools shrank by -3.18% to 89.7k stETH. -The total amount of wstETH bridged to L2 decreased by -8.60% to 153,756 wstETH. -At the same time the L2s with smaller wstETH supply - Polygon, Base and Linea - gained traction this week. +6/ +🌊 stETH volumes -Arbitrum: 101,573 wstETH (7d: -11.88%) -Optimism: 42,848 wstETH (7d: -2.61%) -Polygon: 6,619 wstETH (7d: +3.24%) -Base: 2,716 wstETH (7d: +6.31%) -Linea: 1,595 wstETH (7d: +9.64%) +(w)stETH 7d trading volume is $1.79b, -10.75% lower compared to previous week. +7/ +🖥️ Lido on L2 +The total amount of wstETH bridged to L2 decreased by -2.29%, current amount is 150,674 wstETH. +Arbitrum: 85,963 wstETH (7d: -3.78%) +Optimism: 34,333 wstETH (7d: -1.99%) +Base: 12,714 wstETH (7d: +1.44%) +Polygon: 7,727 wstETH (7d: -1.41%) +Scroll: 7,022 wstETH (7d: +8.84%) +Linea: 1,787 wstETH (7d: -2.20%) +zkSync Era: 1,118 wstETH (7d: -4.61%) -7/ +8/ 🖥️ Lido on Cosmos -The amount of wstETH on Cosmos has increased moderately to 2,940 wstETH (7d: +2.98%). +wstETH bridged to Cosmos is at 1,956 wstETH (7d: -0.15%). - - -8/ +9/ Note that by default the data is presented for Monday, 00:00 UTC (unless otherwise specified), with the change presented for the last 7 days. 
-9/ +10/ Check out the Lido Dashboards Catalogue https://dune.com/lido/lido-dashboards-catalogue to keep up with further Lido developments. 🏝️ - - ------------------- Final instructions: Be sure to create a succint TL;DR section that summarizes the most important information from the thread. -You must be sure to include every provided block in the thread, and follow the format of the examples. -You should use a more varied vocabulary than the examples provided. -For example, instead of always saying "increase" or "decrease", you can use words like "dropped", "soared", "plumetted", "rose", "shrank", "jumped up", etc. +You must be sure to include every provided block in the thread, and follow the format of the examples closely. Do not omit any data in any block. +You can use a more varied vocabulary than the examples provided. +For example, instead of always saying "increase" or "decrease", you can use words like "dropped", "soared", "plummeted", "rose", "shrank", "jumped up", etc. Be sure to use the correct word for the situation. For example, a small change should not be described as "soaring" or "plummeting". 
diff --git a/main.py b/main.py index 05b1655..848414e 100644 --- a/main.py +++ b/main.py @@ -1,6 +1,6 @@ import argparse from dune.loader import load -from dune.process import process_dune +from utils.data_transformer import DataTransformer from graphing.graph import Grapher from pathlib import Path import datetime @@ -21,7 +21,9 @@ def main( start_time = time.time() # start timing dune_loaded = load(str(start_date), str(end_date), sol_start_deposits, sol_end_deposits) # dune_loaded = pickle.load(open('data/dune_data_2024-02-18_13-23.pkl', 'rb')) - processed = process_dune(dune_loaded) + data_transformer = DataTransformer(start_date, end_date) + dune_enriched = data_transformer.enrich_dune(dune_loaded) + processed = data_transformer.process_dune(dune_enriched) writer = BlockWriter(str(end_date), str(start_date)) thread = writer.compose_thread(processed) @@ -36,8 +38,8 @@ def main( print(f"Wrote thread to file in {save_location}/thread.md") print("Graphing") - grapher = Grapher(str(end_date)) - grapher.process_all(dune_loaded) + grapher = Grapher(start_date, end_date) + grapher.process_all(dune_enriched) print(f"Done Graphing. 
Graphs are saved in {save_location}/graphs folder") end_time = time.time() print(f"Time taken: {end_time - start_time} seconds") diff --git a/utils/__init__.py b/utils/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/utils/cex_data_loader.py b/utils/cex_data_loader.py new file mode 100644 index 0000000..de8d71e --- /dev/null +++ b/utils/cex_data_loader.py @@ -0,0 +1,381 @@ +import pandas as pd +import numpy as np +import requests +import json +from datetime import datetime, timedelta +import logging + +logging.basicConfig( + format='%(asctime)s %(levelname)s %(message)s', + datefmt='%Y-%m-%d %H:%M:%S', + level=logging.INFO +) + +class CEXDataLoader: + + def __init__(self, start_date: datetime, end_date: datetime): + # all functions are inclusive of start_date and end_date + self.start_date = start_date + self.end_date = end_date + self.period = (self.end_date - self.start_date).days + 1 # to be inclusive of both dates + self.exchange_functions = { + 'hitbtc': self.fetch_hitbtc_daily_data, + 'mexc': self.fetch_mexc_daily_data, + 'coinsbit': self.fetch_coinsbit_daily_data, + 'gateio': self.fetch_gateio_daily_data, + 'okx': self.fetch_okx_daily_data, + 'bybit': self.fetch_bybit_daily_data, + 'huobi': self.fetch_huobi_daily_data, + 'bitrue': self.fetch_bitrue_daily_data, + 'dcoin': self.fetch_dcoin_daily_data, + 'azbit': self.fetch_azbit_daily_data, + 'cointr': self.fetch_cointr_daily_data, + 'bitget': self.fetch_bitget_daily_data, + } + + def get_data_formated(self, data: pd.DataFrame, pair: str) -> pd.DataFrame: + data['symbol'] = pair + data = data.set_index('date') + for col in ['open','close','high','low','volume','volume_quote','baseVol' ,'c','o','h','l','vol','v','currencyVol']: + if col in data.columns: + if type(data[col].values[0]) == str: + data[col] = data[col].str.replace(',','').astype(np.float64) + return data + + # https://api.hitbtc.com/#candles + def fetch_hitbtc_daily_data(self, pair: str) -> pd.DataFrame: + symbol = 
pair.replace('/', '') + url = f'https://api.hitbtc.com/api/3/public/candles/{symbol}?period=D1&from={self.start_date}&till={self.end_date}' + response = requests.get(url) + if response.status_code == 200: + data = pd.DataFrame(json.loads(response.text), columns=['timestamp', 'open', 'close', 'min', 'max', 'volume', 'volume_quote']) + if data.empty: + logging.info(f"Did not return any data from HitBTC for {pair}") + return pd.DataFrame() + data.rename(columns = {'min':'low', 'max':'high'}, inplace = True) + data['date'] = pd.to_datetime(data['timestamp']).dt.tz_localize(None) + data = self.get_data_formated(data, pair) + return data[['volume']] + else: + logging.info(f"Did not receive OK response from HitBTC API for {pair}") + return pd.DataFrame() + + # https://mxcdevelop.github.io/APIDoc/ + def fetch_mexc_daily_data(self, pair: str) -> pd.DataFrame: + timestamp_from = int(datetime.timestamp(self.start_date)) + symbol = pair.replace('/', '_') + url = f'https://www.mexc.com/open/api/v2/market/kline?interval=1d&symbol={symbol}&start_time={timestamp_from}&limit={self.period}' + response = requests.get(url) + if response.status_code == 200: + data = pd.DataFrame(json.loads(response.text)['data'], columns=['timestamp', 'open', 'close', 'high', 'low', 'volume', 'volume_quote']) + if data.empty: + logging.info(f"Did not return any data from MEXC for {pair}") + return pd.DataFrame() + data['date'] = pd.to_datetime(data['timestamp'], unit='s') + data = self.get_data_formated(data, pair) + return data[['volume']] + else: + logging.info(f"Did not receive OK response from MEXC API for {pair}") + return pd.DataFrame() + + # https://www.notion.so/coinsbitwsapi/API-COINSBIT-WS-API-COINSBIT-cf1044cff30646d49a0bab0e28f27a87 + def fetch_coinsbit_daily_data(self, pair: str) -> pd.DataFrame: + timestamp_from = int(datetime.timestamp(self.start_date)) + timestamp_to = int(datetime.timestamp(self.end_date)) + symbol = pair.replace('/', '_') + params = {"market": symbol, "interval": 
86400, "start": timestamp_from, "end": timestamp_to} + url = 'https://coinsbit.io/api/v1/public/kline' + response = requests.get(url, params=params) + if response.status_code == 200 and json.loads(response.text)['success'] == True: + data = pd.DataFrame(json.loads(response.text)['result']['kline'], columns=['time', 'close', 'open', 'highest', 'lowest', 'volume', 'amount', 'market' ]) + if data.empty: + logging.info(f"Did not return any data from Coinsbit for {pair}") + return pd.DataFrame() + data.rename(columns = {'time':'timestamp', 'highest':'high', 'lowest':'low', 'amount':'volume_quote'}, inplace = True) + data['date'] = pd.to_datetime(data['timestamp'], unit='s') + data = self.get_data_formated(data, pair) + return data[['volume']] + else: + logging.info(f"Did not receive OK response from Coinsbit API for {pair}") + return pd.DataFrame() + + # https://www.gate.io/api2#kline + def fetch_gateio_daily_data(self, pair: str) -> pd.DataFrame: + symbol = pair.replace('/', '_') + url = f'https://data.gateapi.io/api2/1/candlestick2/{symbol}?group_sec=86400&range_hour={(datetime.now() - self.start_date).days*24}' + response = requests.get(url) + if response.status_code == 200 and len(response.text)>100: + data = pd.DataFrame(json.loads(response.text)['data'], columns=["timestamp", "volume", "open", "high", "low", "close"]) + if data.empty: + logging.info(f"Did not return any data from Gate.io for {pair}") + return pd.DataFrame() + data['timestamp'] = data['timestamp'].astype(int) + data['date'] = pd.to_datetime(data['timestamp'], unit='ms') + data = self.get_data_formated(data, pair).query('@self.start_date<=index<=@self.end_date') + data['volume_quote'] = data['volume']*data['close'] + return data[['volume']] + else: + logging.info(f"Did not receive OK response from Gate.io API for {pair}") + return pd.DataFrame() + + # https://www.okx.com/docs-v5/en/#order-book-trading-market-data-get-candlesticks + def fetch_okx_daily_data(self, pair: str) -> pd.DataFrame: + # * 
1000 to convert to ms, and we need to add 1 more day as the API result is exclusive of end_date + timestamp_to = (int(datetime.timestamp(self.end_date))+86400) * 1000 + symbol = pair.replace('/', '-') + url = f'https://www.okx.com/api/v5/market/candles?instId={symbol}&bar=1Dutc&after={timestamp_to}&limit={self.period}' + response = requests.get(url) + if response.status_code == 200: + data = pd.DataFrame( + json.loads(response.text)['data'], + columns=['timestamp','open', 'high', 'low', 'close', 'volume', 'volume_currency', 'volume_quote', 'confirm'] + ) + if data.empty: + logging.info(f"Did not return any data from OKX for {pair}") + return pd.DataFrame() + data['timestamp'] = data['timestamp'].astype(int) + data['date'] = pd.to_datetime(data['timestamp'], unit='ms') + data['date'] = data['date'].dt.date + data = self.get_data_formated(data, pair) + return data[['volume']] + else: + logging.info(f"Did not receive OK response from OKX API for {pair}") + return pd.DataFrame() + + # https://bybit-exchange.github.io/docs/v5/market/kline + def fetch_bybit_daily_data(self, pair: str) -> pd.DataFrame: + timestamp_from = int(datetime.timestamp(self.start_date)) * 1000 # as ms + timestamp_to = int(datetime.timestamp(self.end_date)+86400) * 1000 # as ms + symbol = pair.replace('/', '') + url = f'https://api.bybit.com/v5/market/kline?category=spot&symbol={symbol}&interval=D&end={timestamp_to}&start={timestamp_from}' + response = requests.get(url) + if response.status_code == 200 and len(json.loads(response.text)['result']) > 0: + data = pd.DataFrame( + json.loads(response.text)['result']['list'], + columns=['t', 'o', 'h', 'l', 'c', 'v', 'volume_quote'] + ) + if data.empty: + logging.info(f"Did not return any data from Bybit for {pair}") + return pd.DataFrame() + data['t'] = data['t'].astype(int) + data['date'] = pd.to_datetime(data['t'], unit='ms') + data['date'] = data['date'].dt.date + + data = self.get_data_formated(data, pair) + return data[['v']] + else: + 
logging.info(f"Did not receive OK response from Bybit API for {pair}") + return pd.DataFrame() + + # https://huobiapi.github.io/docs/spot/v1/en/#get-klines-candles + def fetch_huobi_daily_data(self, pair: str) -> pd.DataFrame: + symbol = pair.replace('/', '').lower() + params = {"symbol": symbol, "period": "1day", "size": (datetime.now() - self.start_date).days + 1} + response = requests.get('https://api.huobi.pro/market/history/kline', params=params) + if response.status_code == 200 and json.loads(response.text)['status'] == 'ok': + data = pd.DataFrame(json.loads(response.text)['data']) + if data.empty: + logging.info(f"Did not return any data from Huobi for {pair}") + return pd.DataFrame() + data['date'] = pd.to_datetime(data['id'], unit='s') + data['date'] = data['date'].dt.date + data.rename(columns = {'amount':'volume', 'vol':'volume_quote'}, inplace = True) + data = self.get_data_formated(data, pair).query('@self.start_date.date()<=index<=@self.end_date.date()') + return data[['volume']] + else: + logging.info(f"Did not receive OK response from Huobi API for {pair}") + return pd.DataFrame() + + #https://github.com/Bitrue-exchange/Spot-official-api-docs#kline_endpoint + def fetch_bitrue_daily_data(self, pair: str) -> pd.DataFrame: + symbol = pair.replace('/', '') + url = f'https://openapi.bitrue.com/api/v1/market/kline?symbol={symbol}&scale=1D&limit={(datetime.now() - self.start_date).days + 1}' + response=requests.get(url) + if response.status_code == 200: + data = pd.DataFrame(json.loads(response.text)['data'], columns=['is', 'o', 'h', 'l', 'c', 'v']) + if data.empty: + logging.info(f"Did not return any data from BiTrue for {pair}") + return pd.DataFrame() + data['date'] = pd.to_datetime(data['is'],unit='ms').dt.tz_localize(None).dt.date + data = self.get_data_formated(data, pair) + data=data.sort_values(['date'],ascending=False) + data['volume_quote'] = data['v']*data['c'] + return data[['v']] + else: + logging.info(f"Did not receive OK response from 
BiTrue API for {pair}") + return pd.DataFrame() + + #https://github.com/dcoinapi/openapi/wiki/HttpApi#get-kline-records + def fetch_dcoin_daily_data(self, pair: str) -> pd.DataFrame: + symbol = pair.replace('/', '').lower() + url = f'https://openapi.dcoin.com/open/api/get_records?symbol={symbol}&period=1day&size={(datetime.now() - self.start_date).days + 1}' + response=requests.get(url) + if response.status_code == 200: + data = pd.DataFrame(json.loads(response.text)['data'], columns=['id', 'amount', 'vol', 'open', 'close', 'high', 'low']) + if data.empty: + logging.info(f"Did not return any data from DCoin for {pair}") + return pd.DataFrame() + data['date'] = pd.to_datetime(data['id']*1000,unit='ms').dt.tz_localize(None).dt.date + data = self.get_data_formated(data, pair) + data=data.sort_values(['date'],ascending=False) + data['volume_quote_corr'] = 0.5*data['vol']*data['close'] + return data[['vol']] + else: + logging.info(f"Did not receive OK response from DCoin API for {pair}") + return pd.DataFrame() + + #https://data.azbit.com/swagger/index.html + def fetch_azbit_daily_data(self, pair: str) -> pd.DataFrame: + symbol = pair.replace('/', '_') + params = { + 'interval': 'day', + 'currencyPairCode': symbol, + 'start': self.start_date, + 'end': self.end_date + timedelta(days=1) # the API is exclusive of end date, so we must add a day + } + url = 'https://data.azbit.com/api/ohlc' + response=requests.get(url, params=params) + if response.status_code == 200: + data = pd.DataFrame(json.loads(response.text), columns=['date', 'open', 'max', 'min', 'close', 'volume', 'volumeBase']) + if data.empty: + logging.info(f"Did not return any data from Azbit for {pair}") + return pd.DataFrame() + data['date'] = pd.to_datetime(data['date']).dt.date + + data = self.get_data_formated(data, pair) + data=data.sort_values(['date'],ascending=False) + data['volume_quote'] = data['volume']*data['close'] + return data[['volume']] + else: + logging.info(f"Did not receive OK response from 
Azbit API for {pair}") + return pd.DataFrame() + + #https://cointr-ex.github.io/openapis/spot.html#market-specifications + def fetch_cointr_daily_data(self, pair: str) -> pd.DataFrame: + timestamp_to = int(datetime.timestamp(self.end_date)) + symbol = pair.replace('/', '') + params = { + 'instId': symbol, + 'bar': '1D', + 'endTs': timestamp_to, + 'limit': self.period + } + url = 'https://api.cointr.pro/v1/spot/market/candlesticks' + response=requests.get(url, params=params) + if response.status_code == 200: + data = pd.DataFrame(json.loads(response.text)['data'], columns=['time', 'open', 'high', 'low','close', 'volume','quotevolume']) + if data.empty: + logging.info(f"Did not return any data from CoinTR for {pair}") + return pd.DataFrame() + data['date'] = pd.to_datetime(data['time'], unit='s').dt.tz_localize(None).dt.date + data['date'] = pd.to_datetime(data['date']).dt.date + + data = self.get_data_formated(data, pair) + data=data.sort_values(['date'],ascending=False) + data['volume_quote'] = data['volume']*data['close'] + return data[['volume']] + else: + logging.info(f"Did not receive OK response from CoinTR API for {pair}") + return pd.DataFrame() + + #https://bitgetlimited.github.io/apidoc/en/spot/#get-candle-data + def fetch_bitget_daily_data(self, pair: str) -> pd.DataFrame: + timestamp_from = int(datetime.timestamp(self.start_date)) * 1000 + timestamp_to = (int(datetime.timestamp(self.end_date))+86400) * 1000 + symbol = pair.replace('/', '') + '_SPBL' + params = { + 'symbol': symbol, + 'period': '1day', + 'after': timestamp_from, + 'before': timestamp_to + } + url = 'https://api.bitget.com/api/spot/v1/market/candles' + response=requests.get(url, params=params) + if response.status_code == 200: + data = pd.DataFrame(json.loads(response.text)['data'], columns=['open', 'high', 'low','close', 'quoteVol','baseVol', 'usdtVol', 'ts']) + if data.empty: + logging.info(f"Did not return any data from BitGet for {pair}") + return pd.DataFrame() + data['ts'] = 
data['ts'].astype(int) + data['date'] = pd.to_datetime(data['ts'],unit='ms').dt.tz_localize(None).dt.date + data = self.get_data_formated(data, pair) + data=data.sort_values(['date'],ascending=False) + data['volume_quote'] = data['baseVol']*data['close'] + return data[['baseVol']] + else: + logging.info(f"Did not receive OK response from BitGet API for {pair}") + return pd.DataFrame() + + def get_klines_by_exchange_pair(self, exchange: str, pair: str) -> pd.DataFrame: + fetch_func = self.exchange_functions.get(exchange) + if fetch_func: + result = fetch_func(pair) + return result + else: + logging.info(f"No data for {exchange}") + + def get_klines(self, pairs: list[str]) -> dict[tuple[str, str], pd.DataFrame]: + klines_by_exchange = {} + for exchange in self.exchange_functions.keys(): + for pair in pairs: + klines_by_exchange.update({(exchange, pair): self.get_klines_by_exchange_pair(exchange, pair)}) + return klines_by_exchange + + def get_trading_volume(self, symbol: str) -> pd.DataFrame: + timestamp_to = int(datetime.timestamp(self.end_date)) + 86400 + period_required = max(self.period, 91) # for this API endpoint, to return daily data, it requires minimum 90 day period + timestamp_from = timestamp_to - period_required * 86400 + url = f'https://api.coingecko.com/api/v3/coins/{symbol}/market_chart/range?vs_currency=usd&from={timestamp_from}&to={timestamp_to}' + response = requests.get(url) + if response.status_code == 200: + data = pd.DataFrame(json.loads(response.text)['prices'], columns=['timestamp','price']) + data['date'] = pd.to_datetime(data['timestamp'], unit='ms').dt.date + data = data[['date', 'price']].set_index('date') + data_ = pd.DataFrame(json.loads(response.text)['total_volumes'], columns=['timestamp','total_volume_usd']) + data_['date'] = pd.to_datetime(data_['timestamp'], unit='ms').dt.date + data_ = data_[['date', 'total_volume_usd']].set_index('date') + data = pd.merge(data, data_, left_index = True, right_index = True) + data['total_volume'] 
= data.total_volume_usd/data.price + data = data.query('@self.start_date.date()<=index<=@self.end_date.date()') + return data[['total_volume', 'price']] + + def get_offchain_df(self) -> pd.DataFrame: + all_steth_pairs = [ + "STETH/USDT", "STETH/USDC", "STETH/LUSD", "STETH/USD", "STETH/DAI", + "STETH/BUSD", "STETH/USDP", "STETH/TUSD", "STETH/WBTC", "STETH/BTC", + "STETH/LDO", "STETH/BTC","STETH/EUR", "STETH/WETH", "STETH/ETH" + ] + + # get coingecko price + steth_trading_volume = self.get_trading_volume('staked-ether') + + # get volume on exchanges + stethtot_klines = self.get_klines(all_steth_pairs) + stethtot_offchain_all = [] + for key in stethtot_klines.keys(): + if stethtot_klines[key].empty == False: + k = stethtot_klines[key].copy() + key1 = (key[0], key[1].replace('STETH/USDT', 'Stablecoins').replace('STETH/USDC', 'Stablecoins').replace('STETH/ETH', '(W)ETH').replace('STETH/BTC', 'Others').replace('STETH/USD', 'Stablecoins')) + k.columns = [key1] + stethtot_offchain_all.append(k) + + df_stethtot_offchain = steth_trading_volume.copy() + for kline in stethtot_offchain_all: + df_stethtot_offchain = pd.merge(df_stethtot_offchain, kline, how = 'left', left_index = True, right_index = True).fillna(0) + + # remove first 2 columns 'total_volume' and 'price' from index + cols = df_stethtot_offchain.columns + cols = cols.delete(0) + cols = cols.delete(0) + # multiply other columns (individual exchange+pair volumes) with price + df_stethtot_offchain[cols] = df_stethtot_offchain[cols].mul(df_stethtot_offchain.price, axis = 0) + + # drop total_volume and price from dataframe + df_stethtot_offchain.drop(['total_volume', 'price'], axis=1, inplace=True) + + # calculate total volume by summing across row + df_stethtot_offchain['total_volume'] = df_stethtot_offchain.loc[:,cols].sum(axis=1) + + df_stethtot_offchain = df_stethtot_offchain[['total_volume']] + df_stethtot_offchain = df_stethtot_offchain.rename(columns = {'total_volume': 'volume'}) + return df_stethtot_offchain 
diff --git a/utils/data_transformer.py b/utils/data_transformer.py new file mode 100644 index 0000000..85918f7 --- /dev/null +++ b/utils/data_transformer.py @@ -0,0 +1,307 @@ +import pandas as pd +from datetime import datetime, timedelta +from utils.cex_data_loader import CEXDataLoader +import logging + +logging.basicConfig( + format='%(asctime)s %(levelname)s %(message)s', + datefmt='%Y-%m-%d %H:%M:%S', + level=logging.INFO +) + +class DataTransformer: + + def __init__(self, start_date: datetime, end_date: datetime): + self.start_date = start_date + self.end_date = end_date + # the enrich functions enrich dune data with additional off-chain data where applicable + self.enrich_functions = { + "stethVolumes": DataTransformer.enrich_stethVolumes + } + # the processing functions is after enrichment + # it takes in the enriched dataframe and produces a string, to be used in the prompt + self.process_functions = { + "tvl": DataTransformer.process_tvl, + "netDepositGrowthLeaders": DataTransformer.process_netDepositGrowthLeaders, + "stETHApr": DataTransformer.process_stETHApr, + "stEthToEth": DataTransformer.process_stEthToEth, + # "dexLiquidityReserves": DataTransformer.process_dexLiquidityReserves, + "bridgeChange": DataTransformer.process_bridgeChange, + "totalStEthInDeFi": DataTransformer.process_totalStEthInDeFi, + "bridgedToCosmos": DataTransformer.process_bridgedToCosmos, + "stethVolumes": DataTransformer.process_stethVolumes + } + + + def enrich_stethVolumes(df: pd.DataFrame, start_date: datetime, end_date: datetime) -> pd.DataFrame: + # on-chain section (dune) + df['date'] = pd.to_datetime(df['day']).dt.date + df = df[['date','chain','volume']].groupby(['date','chain']).agg({'volume': ['sum']}).reset_index() + df.columns = ['date', 'chain', 'volume'] + chainlist = [] + for d in df['chain']: + if d not in chainlist: + chainlist.append(d) + tv_by_chain = {} + for chain in chainlist: + tv_by_chain.update({(chain): 
df.query('chain==@chain')[['date','volume']].set_index('date')}) + + stethtot_klines_chain = [] + for key in tv_by_chain.keys(): + if tv_by_chain[key].empty == False: + k = tv_by_chain[key].copy() + k.columns = [key] + stethtot_klines_chain.append(k) + + # off-chain section (exchange APIs) + + # first we need to extend the start date to include 1 more period before + # e.g. if start_date = 2024-04-29 and end_date = 2024-05-05, we want to have extended_start_date = 2024-04-22 + # we need this to calculate the values for 1 week before, so we can say, the values rose / drop by X% + period = (end_date - start_date).days + 1 + extended_start_date = end_date - timedelta(days = 2 * period - 1) + logging.info(f"Loading offchain CEX data with start date as {extended_start_date} and end date as {end_date}...") + cex_data_loader = CEXDataLoader(extended_start_date, end_date) + df_stethtot_offchain = cex_data_loader.get_offchain_df() + + # merge on-chain with off-chain + df_stethtot_chain = df_stethtot_offchain + for kline in stethtot_klines_chain: + df_stethtot_chain = pd.merge(df_stethtot_chain, kline, how = 'left', left_index = True, right_index = True).fillna(0) + df_stethtot_chain.rename(columns = {'volume': 'off_chain'}, inplace = True) + + # df_stethtot_chain.to_csv('df_stethtot_chain.csv') + # df_stethtot_chain = pd.read_csv('df_stethtot_chain.csv', index_col='date') + return df_stethtot_chain + + + def enrich_dune(self, dune_results: dict[str, pd.DataFrame]) -> dict[str, pd.DataFrame]: + res = {} + + for df_name, df in dune_results.items(): + enrich_func = self.enrich_functions.get(df_name) + if enrich_func is not None: + res[df_name] = enrich_func(df, self.start_date, self.end_date) + else: + res[df_name] = df + return res + + def process_tvl(df: pd.DataFrame) -> str: + # Select the row corresponding to 'Total' + total_row = df[df["chain"] == "Total"] + + # Extract the 'TVL' and 'TVL change, %' from the total row + total_tvl = total_row["TVL"].values[0] + 
total_tvl_change = total_row["TVL change, %"].values[0] * 100 + + # Format the total TVL and total TVL change into strings + total_tvl_str = f"${total_tvl / 1e9:.2f}b" + total_tvl_change_str = f"{total_tvl_change:.2f}%" + + # Select the rows corresponding to 'Ethereum' and 'Polygon' + eth_row = df[df["chain"] == "Ethereum"] + polygon_row = df[df["chain"] == "Polygon"] + + # Extract the 'Token price change, %' from the Ethereum and Polygon rows + eth_price_change = float(eth_row["Token price change, %"].values[0]) * 100 + polygon_price_change = float(polygon_row["Token price change, %"].values[0]) * 100 + + # Format the Ethereum and Polygon price changes into strings + eth_price_change_str = f"{eth_price_change:.2f}%" + polygon_price_change_str = f"{polygon_price_change:.2f}%" + + # Combine everything into a result string + result_string = ( + f"TVL: {total_tvl_str}\n" + f"TVL Percentage Change: {total_tvl_change_str}\n" + f"Ethereum Token Price Change: {eth_price_change_str}\n" + f"Polygon Token Price Change: {polygon_price_change_str}" + ) + + return result_string + + + def process_netDepositGrowthLeaders(df: pd.DataFrame) -> str: + + # Find Lido's stats + lido_stats = df[df["name"] == "Lido"] + + # If Lido is not in the list, return None for both values + if lido_stats.empty: + return "" + + lido_net_deposit_growth = round(lido_stats.iloc[0]["eth_deposits_growth"], 0) + + # If net deposit is positive (i.e. more deposits than withdrawals) + if lido_net_deposit_growth >= 0: + lido_rank = lido_stats.iloc[0]["eth_deposits_rank"] + return f"Lido had net deposit growth of {lido_net_deposit_growth} ETH. ETH deposits rank: {lido_rank}" + # If net deposit is negative + else: + lido_rank = lido_stats.iloc[0]["eth_withdrawals_rank"] + return f"Lido had net unstake of {lido_net_deposit_growth} ETH. 
ETH unstaking rank: {lido_rank}" + + def process_stETHApr(df: pd.DataFrame) -> str: + # Sort DataFrame by time in descending order + df = df.sort_values("time", ascending=False) + + # Get the most recent 7d moving average + recent_7d_ma = df["stakingAPR_ma_7"].values[0] + week_ago_7d_ma = df["stakingAPR_ma_7"].values[7] + + difference_in_bps = (recent_7d_ma - week_ago_7d_ma) * 10000 + # Format the result into a string + result_string = f"7d MA: {recent_7d_ma:.2%}, change from 7d ago: {difference_in_bps:.0f}bps" + + return result_string + + def process_stEthToEth(df: pd.DataFrame) -> str: + # Convert 'time' column to datetime + df["time"] = pd.to_datetime(df["time"]) + + # Sort DataFrame by time in descending order + df = df.sort_values("time", ascending=False) + + # Get the most recent 'weight_avg_price' + recent_price = df["weight_avg_price"].iloc[0] + + # Calculate the volatility of the 'weight_avg_price' + volatility = df["weight_avg_price"].std() + + # Return results as a string + return f"The week ended with a ratio of {recent_price}. 
The ratio over the week had a standard deviation of {volatility}" + + + def process_dexLiquidityReserves(df: pd.DataFrame) -> str: + # Select the row corresponding to 'total' + total_row = df[df["token"] == "total"] + + # Extract the 'end value' and 'period_change' from this row + end_value = total_row["end value"].values[0] + period_change = total_row["period_change"].values[0] + + # Format the end value into a string in billions with 2 decimal places + end_value_str = f"${end_value / 1e9:.0f}b" + + # Format the period change into a string as a percentage with 2 decimal places + period_change_str = f"{period_change * 100:.2f}%" + + # Combine these into a result string + result_string = f"Total End Value: {end_value_str}\nPeriod Change: {period_change_str}" + + return result_string + + + def process_totalStEthInDeFi(df: pd.DataFrame) -> str: + + # Order the dataframe from oldest to latest + df = df.sort_values("time") + + # Get the latest row, that is the data at the end_date + latest_row = df.iloc[-1] + + # Format the changes into a string + result = ( + f"Liquidity Pools Balance: {latest_row['liquidity_pools_balance']:.0f}\n" + f"Liquidity Pools Percentage Change: {float(latest_row['liquidity_pct_change']):.2f}%\n" + f"Lending Pools Balance: {latest_row['lending_pools_balance']:.0f}\n" + f"Lending Pools Percentage Change: {float(latest_row['lending_pct_change']):.2f}%" + ) + + return result + + + def process_stEthOnL2(df: pd.DataFrame) -> str: + # Select the row corresponding to 'total' + total_row = df[df["bridge"] == "total"] + + # Extract the 'end_amount' and 'period_change' from the total row + total_end_amount = total_row["end_amount"].values[0] + total_period_change = total_row["period_change"].values[0] + + # Format the total end amount and total period change into strings + total_end_amount_str = f"{round(total_end_amount, 0)} wstETH" + total_period_change_str = f"{total_period_change:.2f}%" + + # Initialize an empty string to store the individual bridge 
data + bridge_data_str = "" + + # Loop over the rows of the DataFrame + for i, row in df.iterrows(): + # Skip the total row + if row["bridge"] == "total": + continue + + # Extract the 'bridge', 'end_amount' and 'period_change' for each row + bridge = row["bridge"] + end_amount = row["end_amount"] + period_change = row["period_change"] + + # Format the end amount and period change into strings + end_amount_str = f"{end_amount:.0f} wstETH" + period_change_str = f"{period_change:.2f}%" + + # Append this bridge's data to the bridge data string + bridge_data_str += f"{bridge}: {end_amount_str} (7d: {period_change_str})\n" + + # Combine the total and bridge data into a result string + result_string = f"The amount of wstETH on L2 changed by {total_period_change_str}, hitting {total_end_amount_str}:\n\n{bridge_data_str}" + + return result_string + + + def process_bridgeChange(df: pd.DataFrame) -> str: + # Apply rounding logic to the 'period_change' column + df['period_change'] = df['period_change'].apply(lambda x: round(x, 2)) + df['end_amount'] = df['end_amount'].apply(lambda x: int(round(x, 0))) + df['start_amount'] = df['start_amount'].apply(lambda x: int(round(x, 0))) + + # Convert the DataFrame to a string + df_string = df.to_string(index=False) + + return df_string + + def process_bridgedToCosmos(df: pd.DataFrame) -> str: + # Sort by day + df = df.sort_values("day") + + # Get values + balance = df.iloc[-1]['balance_cumu'] + balance_7d_ago = df.iloc[-8]['balance_cumu'] + pct_change = (balance/balance_7d_ago - 1) * 100 + + result_string = f"The balance of wstETH on Cosmos is {balance:.0f}, with a 7d change of {pct_change:.2f}%." + + return result_string + + def process_stethVolumes(df: pd.DataFrame) -> str: + + dates = pd.to_datetime(df.index) + # the df here contains 2 periods. 
so we need to split into current period and previous period + min_date = dates.min() + max_date = dates.max() + + period_length = (max_date - min_date + timedelta(days = 1)) / 2 + # this is start_date of current period + start_date = min_date + period_length + previous_sum = df[pd.to_datetime(df.index) < start_date].sum().sum() + current_sum = df[pd.to_datetime(df.index) >= start_date].sum().sum() + pct_change = (current_sum/previous_sum - 1) * 100 + + result_string = ( + f"{period_length.days}d trading volume: ${current_sum}\n" + f"Previous trading volume: ${previous_sum}\n" + f"Percentage change: {pct_change}" + ) + return result_string + + def process_dune(self, dune_results: dict[str, pd.DataFrame]) -> dict[str, str]: + res = {} + + for df_name, df in dune_results.items(): + process_func = self.process_functions.get(df_name) + if process_func is not None: + s = process_func(df) + res[df_name] = s + + return res \ No newline at end of file