# **<div align="center"> CREATING SUBSETS FOR EACH CRYPTOCURRENCY AND SUPERFICIAL CLEAN </div>** 

In [1]:
import numpy as np
import pandas as pd
from typing import List

In [2]:
def eliminate_columns(df: pd.DataFrame, columns: List[str]) -> pd.DataFrame:

    '''
    Delete columns in a dataframe

    Parameters:
    df (pd.DataFrame): A DataFrame
    columns: A list with the name of columns for deleting

    Returns:
    pd.DataFrame: A dataframe without the colums
    '''
    
    df.drop(columns = columns, inplace=True)
    return df

In [3]:
def fill_dates(df: pd.DataFrame) -> pd.DataFrame:

    '''
    Fills missing dates in a time series DataFrame by inserting all dates 
    in the full range (from the earliest to the latest date). Missing rows 
    will be filled with NaNs.

    Parameters:
    df (pd.DataFrame): A DataFrame with a 'date' column and daily data 
                       (may have missing days).

    Returns:
    pd.DataFrame: The reindexed DataFrame with all dates in the range included. Missing dates will have NaN values in the other columns.
    '''

    # Set column date as index
    df = df.set_index('date')

    # Create the complete set of days inside the range
    complete_dates = pd.date_range(start=df.index.min(), end=df.index.max(), freq='D')

    # Reindex with the complete dates
    df = df.reindex(complete_dates)
    df.index.name = 'date' 

    return df

    
    

In [4]:
def export_dataset(df: pd.DataFrame, path: str, include_index: bool = False) -> None:

    '''
    Export the DataFrame to a CSV file.

    Parameters:
    df (pd.DataFrame): The DataFrame to export.
    path (str): The file path where to save the CSV.
    include_index (bool): Whether to include the index in the CSV file (default False).

    Returns:
    None: This function prints status messages but does not return a value.
    '''
    
    if not isinstance(path, str):
        raise ValueError("The path must be a string.")

    try:
        df.to_csv(path, index=include_index)
        print(f"Data exported successfully to {path}")
    except Exception as e:
        print(f"Error exporting data: {e}")

## **Load data**

In [5]:
crypto = pd.read_csv("../data/raw/db_23062025.csv")
crypto['date'] = pd.to_datetime(crypto['date'])
crypto.head()

Unnamed: 0,id,cryptocurrency_id,date,price_usd,cryptocurrency_name,market_cap,volume,change
0,1,1,2025-02-05 12:03:40,97864.0,bitcoin,1940957000000.0,63657360000.0,-1.048094
1,2,2,2025-02-05 12:03:40,2780.5,ethereum,335253800000.0,44211260000.0,1.025537
2,3,3,2025-02-05 12:03:40,1.0,tether,140530700000.0,107324400000.0,-0.0383
3,4,4,2025-02-05 12:03:40,571.03,binancecoin,83275330000.0,1241403000.0,-2.18004
4,5,1,2025-02-06 12:05:49,99038.0,bitcoin,1960398000000.0,42439410000.0,1.30476


In [6]:
# As I took data each 24h, I am going to delete hours

crypto['date'] = pd.to_datetime(crypto['date']).dt.date # delete hour
# crypto['date'] = pd.to_datetime(crypto['date']).dt.normalize() #convert all hours to 00:00:00

crypto.head()

Unnamed: 0,id,cryptocurrency_id,date,price_usd,cryptocurrency_name,market_cap,volume,change
0,1,1,2025-02-05,97864.0,bitcoin,1940957000000.0,63657360000.0,-1.048094
1,2,2,2025-02-05,2780.5,ethereum,335253800000.0,44211260000.0,1.025537
2,3,3,2025-02-05,1.0,tether,140530700000.0,107324400000.0,-0.0383
3,4,4,2025-02-05,571.03,binancecoin,83275330000.0,1241403000.0,-2.18004
4,5,1,2025-02-06,99038.0,bitcoin,1960398000000.0,42439410000.0,1.30476


## **Data description**

In [7]:
crypto.shape

(463, 8)

In [8]:
crypto.columns

Index(['id', 'cryptocurrency_id', 'date', 'price_usd', 'cryptocurrency_name',
       'market_cap', 'volume', 'change'],
      dtype='object')

In [9]:
crypto.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 463 entries, 0 to 462
Data columns (total 8 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   id                   463 non-null    int64  
 1   cryptocurrency_id    463 non-null    int64  
 2   date                 463 non-null    object 
 3   price_usd            463 non-null    float64
 4   cryptocurrency_name  463 non-null    object 
 5   market_cap           463 non-null    float64
 6   volume               463 non-null    float64
 7   change               463 non-null    float64
dtypes: float64(4), int64(2), object(2)
memory usage: 29.1+ KB


In [10]:
eliminate_columns(crypto, ["id", "cryptocurrency_id"])

Unnamed: 0,date,price_usd,cryptocurrency_name,market_cap,volume,change
0,2025-02-05,97864.000000,bitcoin,1.940957e+12,6.365736e+10,-1.048094
1,2025-02-05,2780.500000,ethereum,3.352538e+11,4.421126e+10,1.025537
2,2025-02-05,1.000000,tether,1.405307e+11,1.073244e+11,-0.038300
3,2025-02-05,571.030000,binancecoin,8.327533e+10,1.241403e+09,-2.180040
4,2025-02-06,99038.000000,bitcoin,1.960398e+12,4.243941e+10,1.304760
...,...,...,...,...,...,...
458,2025-06-20,105978.000000,bitcoin,2.106682e+12,2.138831e+10,0.921048
459,2025-06-20,2554.030000,ethereum,3.082086e+11,1.019583e+10,0.575543
460,2025-06-20,1.000000,tether,1.558786e+11,2.016412e+10,-0.012625
461,2025-06-20,647.780000,binancecoin,9.452889e+10,5.647578e+08,0.387885


In [11]:
crypto.duplicated().sum()

np.int64(0)

In [12]:
crypto.isnull().sum()

date                   0
price_usd              0
cryptocurrency_name    0
market_cap             0
volume                 0
change                 0
dtype: int64

## **Split dataset according to crytocurrencies**

In [13]:
crypto["cryptocurrency_name"].unique()

array(['bitcoin', 'ethereum', 'tether', 'binancecoin', 'usd-coin',
       'ripple'], dtype=object)

In [14]:
bitcoin = crypto[crypto["cryptocurrency_name"] == "bitcoin"]
ethereum = crypto[crypto["cryptocurrency_name"] == "ethereum"]
tether = crypto[crypto["cryptocurrency_name"] == "tether"]
binancecoin = crypto[crypto["cryptocurrency_name"] == "binancecoin"]
usdcoin = crypto[crypto["cryptocurrency_name"] == "usd-coin"]
ripple = crypto[crypto["cryptocurrency_name"] == "ripple"]

## **Create a register for each day**

I know that I don't have data for each day, so I am going to create this date and full with NAs

In [15]:
bitcoin = fill_dates(bitcoin)
bitcoin.head()

Unnamed: 0_level_0,price_usd,cryptocurrency_name,market_cap,volume,change
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2025-02-05,97864.0,bitcoin,1940957000000.0,63657360000.0,-1.048094
2025-02-06,99038.0,bitcoin,1960398000000.0,42439410000.0,1.30476
2025-02-07,97683.0,bitcoin,1932662000000.0,46127890000.0,-1.056848
2025-02-08,,,,,
2025-02-09,,,,,


In [16]:
ethereum = fill_dates(ethereum)
ethereum.head()

Unnamed: 0_level_0,price_usd,cryptocurrency_name,market_cap,volume,change
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2025-02-05,2780.5,ethereum,335253800000.0,44211260000.0,1.025537
2025-02-06,2825.73,ethereum,340841300000.0,30540690000.0,1.559076
2025-02-07,2753.45,ethereum,331887400000.0,30198320000.0,-2.207391
2025-02-08,,,,,
2025-02-09,,,,,


In [17]:
tether = fill_dates(tether)
tether.head()

Unnamed: 0_level_0,price_usd,cryptocurrency_name,market_cap,volume,change
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2025-02-05,1.0,tether,140530700000.0,107324400000.0,-0.0383
2025-02-06,1.0,tether,141226200000.0,81109220000.0,0.003668
2025-02-07,1.0,tether,141406700000.0,67820900000.0,-0.010029
2025-02-08,,,,,
2025-02-09,,,,,


In [18]:
binancecoin = fill_dates(binancecoin)
binancecoin.head()

Unnamed: 0_level_0,price_usd,cryptocurrency_name,market_cap,volume,change
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2025-02-05,571.03,binancecoin,83275330000.0,1241403000.0,-2.18004
2025-02-06,580.05,binancecoin,84578180000.0,471188900.0,1.49192
2025-02-07,586.11,binancecoin,85563000000.0,1075487000.0,1.422947
2025-02-08,,,,,
2025-02-09,,,,,


In [19]:
usdcoin = fill_dates(usdcoin)
usdcoin.head()

Unnamed: 0_level_0,price_usd,cryptocurrency_name,market_cap,volume,change
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2025-02-06,1.0,usd-coin,55549030000.0,11227900000.0,0.004891
2025-02-07,0.99999,usd-coin,55748070000.0,10904410000.0,0.000362
2025-02-08,,,,,
2025-02-09,,,,,
2025-02-10,0.99995,usd-coin,56279340000.0,8631852000.0,-0.005034


In [20]:
ripple = fill_dates(ripple)
ripple.head()

Unnamed: 0_level_0,price_usd,cryptocurrency_name,market_cap,volume,change
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2025-02-07,2.42,ripple,139975400000.0,6647127000.0,-0.823918
2025-02-08,,,,,
2025-02-09,,,,,
2025-02-10,,,,,
2025-02-11,,,,,


### **Delete colums**

In [21]:
eliminate_columns(bitcoin, ["cryptocurrency_name"])
eliminate_columns(ethereum, ["cryptocurrency_name"])
eliminate_columns(tether, ["cryptocurrency_name"])
eliminate_columns(binancecoin, ["cryptocurrency_name"])
eliminate_columns(usdcoin, ["cryptocurrency_name"])
eliminate_columns(ripple, ["cryptocurrency_name"])

Unnamed: 0_level_0,price_usd,market_cap,volume,change
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2025-02-07,2.42,1.399754e+11,6.647127e+09,-0.823918
2025-02-08,,,,
2025-02-09,,,,
2025-02-10,,,,
2025-02-11,,,,
...,...,...,...,...
2025-05-18,,,,
2025-05-19,,,,
2025-05-20,,,,
2025-05-21,,,,


## **Export dataset**

In [22]:
export_dataset(bitcoin, "../data/raw/bitcoin.csv",  include_index = True)
export_dataset(ethereum, "../data/raw/ethereum.csv", include_index = True)
export_dataset(tether, "../data/raw/tether.csv", include_index = True)
export_dataset(binancecoin, "../data/raw/binancecoin.csv", include_index = True)
export_dataset(usdcoin, "../data/raw/usdcoin.csv", include_index = True)
export_dataset(ripple, "../data/raw/ripple.csv", include_index = True)

Data exported successfully to ../data/raw/bitcoin.csv
Data exported successfully to ../data/raw/ethereum.csv
Data exported successfully to ../data/raw/tether.csv
Data exported successfully to ../data/raw/binancecoin.csv
Data exported successfully to ../data/raw/usdcoin.csv
Data exported successfully to ../data/raw/ripple.csv
