In [None]:
"""
This dataset contains information about cryptocurrency prices, market capitalization, and other metrics. 
The data is collected from CoinMarketCap (https://coinmarketcap.com/), a popular website that tracks cryptocurrency prices.

This dataset can be used to:
- Analyze the price trends of different cryptocurrencies.
- Compare the market capitalization of different cryptocurrencies.
- Examine the circulating supply of different cryptocurrencies.
- Analyze the trading volume of different cryptocurrencies.
- Study the volatility of different cryptocurrencies.
- Compare the performance of different cryptocurrencies against each other or against a benchmark index.
- Identify correlations between different cryptocurrency prices.
- Use the data to build models to predict future prices or other trends.

+Info: https://www.kaggle.com/datasets/harshalhonde/coinmarketcap-cryptocurrency-dataset-2023
"""

In [None]:
import sys
import os
from utils.processing import DataLoader
from utils.analyzer import DataAnalyzer

---------------- LOAD DATASET -------------------#

In [None]:
# Detect the project's root directory.
# `__file__` exists when this code runs as a script, but it is not defined inside a
# Jupyter kernel, so fall back to the current working directory in that case.
try:
    project_root = os.path.dirname(os.path.abspath(__file__))
except NameError:
    project_root = os.getcwd()

# Register the root only once so that re-running the cell does not keep growing sys.path.
# NOTE(review): the `utils` imports in the previous cell run before this append, so they
# only succeed if the project root is already importable at that point — confirm.
if project_root not in sys.path:
    sys.path.append(project_root)
print("Dynamically detected root directory:", project_root)

In [None]:
# File name of the CSV to load.
# Note: `df` holds a plain string here; the loading cell later rebinds the same
# name to whatever `loader.load_data()` returns.
df = "currencies_data_Kaggle_2023_unique.csv"

# Folder that holds the dataset files, resolved relative to the project root
df_path = os.path.join(project_root, "dataSet")
print(f"Dataset path: {df_path}")

In [None]:
# ---------------- LOAD AND ANALYZE DATA -------------------#
# Load the dataset through the project's DataLoader. On a missing file or an
# invalid dataset, the problem is reported and `df` is set to None so that the
# analysis cell can detect the failure.
try:
    loader = DataLoader(df_path=df_path, df=df)
    df = loader.load_data()
    print("\n--- Dataset successfully loaded ---")
except (FileNotFoundError, ValueError) as e:
    # Both failure modes end the same way; only the message differs.
    if isinstance(e, FileNotFoundError):
        print(f"Error loading dataset: {e}")
    else:
        print(f"Dataset value error: {e}")
    df = None

In [None]:
# Run the exploratory checks only when the dataset was actually loaded
if df is None:
    print("\n--- Could not load the dataset. Analysis aborted ---")
else:
    # The analyzer is built from the loaded frame and reused by later cells
    analyzer = DataAnalyzer(df)

    # Exercise each analyzer report in turn
    analyzer.overview()
    analyzer.duplicates_analysis()
    analyzer.missing_values_analysis()  # Takes 7-10 minutes; please be patient...
    analyzer.data_types_analysis()

---------------- PROCESS DATA -------------------#

In [None]:
# Section note for the processing stage. It is written as a bare string literal
# rather than a comment, so when run in Jupyter it is also echoed as this cell's output.
'''
We will handle dates, NaN values, and categorical variables
'''

In [None]:
# Remove the 'name.1' column when it is present
# (presumably a duplicate of 'name' created at load time — TODO confirm).
# The drop is done in place on the same `df` object that `analyzer` was built from.
if 'name.1' in df:
    df.drop('name.1', axis=1, inplace=True)
    print("Column 'name.1' removed.")
    analyzer.data_types_analysis()

In [None]:
# Count missing values per column and show only the columns that have any
nan_by_column = df.isna().sum()
print(nan_by_column.loc[nan_by_column.gt(0)])
'''The column maxSupply contains all NaN values
and this is because the data is unavailable, so we will fill it with 0.'''

In [None]:
# Replace missing values with 0, but only in numeric columns.
# A blanket `df.fillna(0)` would also write an integer 0 into text and date columns
# (e.g. 'name', 'symbol', 'dateAdded'), which leaves mixed-type columns and bogus
# dates once they are parsed later on.
# The assignment updates the existing `df` object, so `analyzer` keeps seeing the
# same data (assuming it holds a reference to this frame — TODO confirm).
numeric_columns = df.select_dtypes(include='number').columns
df[numeric_columns] = df[numeric_columns].fillna(0)

# Report what is still missing; anything left is in a non-numeric column.
print(f"Remaining NaN values: {df.isnull().sum().sum()}")
analyzer.missing_values_analysis()

Convert dates to datetime format and prepare for time series analysis

In [None]:
import pandas as pd

In [None]:
# Parse both timestamp columns as datetimes; values that cannot be parsed become NaT
for date_column in ('lastUpdated', 'dateAdded'):
    df[date_column] = pd.to_datetime(df[date_column], errors='coerce')
analyzer.overview()

In [None]:
# Use 'dateAdded' as the index while also keeping it as a regular column
# (drop=False), so it stays available for later time-series work.
df.set_index(keys='dateAdded', drop=False, inplace=True)
print(df.index)

In [None]:
# Order the rows chronologically by the 'dateAdded' index (in place, ascending)
df.sort_index(axis=0, ascending=True, inplace=True)
# Sanity check: expected to print True once the index is sorted
print(df.index.is_monotonic_increasing)

In [None]:
# Split the 'dateAdded' index into calendar parts, to study when
# cryptocurrencies were added.
added_on = df.index
calendar_parts = {
    'year_added': added_on.year,
    'month_added': added_on.month,
    'day_added': added_on.day,
    'weekday_added': added_on.weekday,  # 0 = Monday, 6 = Sunday
}
for column_name, part_values in calendar_parts.items():
    df[column_name] = part_values

print(df[list(calendar_parts)].head())
analyzer.overview()
df.head()

In [None]:
# Bring the text labels to one canonical spelling so that duplicates which differ
# only by case or surrounding whitespace can be detected in the next step.
df['symbol'] = df['symbol'].str.strip().str.upper()  # symbols: upper case
df['name'] = df['name'].str.strip().str.title()  # names: title case
print(df[['name', 'symbol']].head())

In [None]:
# Collect every row whose ('name', 'symbol') pair occurs more than once;
# keep=False marks all members of each duplicate group, not just the repeats.
# NOTE(review): the note at the end of this cell says the duplicates are removed,
# but no rows are dropped here — confirm whether a removal step is missing.
duplicates = df.loc[df.duplicated(subset=['name', 'symbol'], keep=False)]
print(duplicates)
print(f"Found duplicates: {duplicates.shape[0]}")
'''After normalizing to title case for names and uppercase for symbols,
we found 62 duplicates for Symbol, which corresponds to USD, indicating the value in dollars
for these cryptocurrencies as a pair value. This is not relevant, so we remove them, focusing on their symbol value.'''

In [None]:
# Handle the two remaining categorical variables: name and symbol
# Only 'name' is label-encoded in the cells that follow; 'symbol' is dropped
# later without being encoded.
# The plan below is a bare string literal used as a note, not a docstring.
'''The strategy is as follows:
Create a dictionary mapping names to their LabelEncoder values.
This allows us to reference this file for future visualizations or mappings.'''

In [None]:
from sklearn.preprocessing import LabelEncoder

In [None]:
# Learn one integer code per distinct name, then store the codes in a new column
le = LabelEncoder()
df['name_encoded'] = le.fit(df['name']).transform(df['name'])

In [None]:
# Lookup table from each name to its integer code
# (repeated names collapse into a single entry)
name_to_encoded = {
    name: code for name, code in zip(df['name'], df['name_encoded'])
}

In [None]:
# Show the first few names next to their codes as a quick check
print("First encoded values:", df[['name', 'name_encoded']].head(), sep="\n")

In [None]:
# Save the name -> code mapping as a CSV file.
# The target is built from `df_path` (the folder the input CSV was loaded from)
# instead of a hard-coded 'EDA/CoinMarketCap/dataSet/' path, which only resolved
# when the working directory happened to be the repository root.
# NOTE(review): assumes the outputs belong in that same 'dataSet' folder — confirm.
mapping_path = os.path.join(df_path, 'name_encoded_mapping.csv')
mapping_df = pd.DataFrame(list(name_to_encoded.items()), columns=['name', 'name_encoded'])
mapping_df.to_csv(mapping_path, index=False)
print("Mapping dictionary created and saved as 'name_encoded_mapping.csv'")

In [None]:
# Drop the raw text columns. This rebinds `df` to a new frame rather than
# modifying the existing one in place.
df = df.drop(['name', 'symbol'], axis=1)
print("Remaining columns after removing 'name' and 'symbol':")
print(list(df.columns))

In [None]:
# Save the cleaned dataset, ready for further analysis and/or training.
# The target is built from `df_path` (the folder the input CSV was loaded from)
# instead of a hard-coded 'EDA/CoinMarketCap/dataSet/' path, which only resolved
# when the working directory happened to be the repository root.
# NOTE(review): assumes the outputs belong in that same 'dataSet' folder — confirm.
# index=False drops the 'dateAdded' index; the date is still saved because it was
# also kept as a regular column.
ready_path = os.path.join(df_path, 'currencies_data_ready.csv')
df.to_csv(ready_path, index=False)
print("Dataset ready and saved as 'currencies_data_ready.csv'")

-------> VISUALIZATION

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

In [None]:
# Keep only rows with a positive max supply and a positive price. Rows whose
# maxSupply was missing were filled with 0 earlier, so they are excluded here.
# (No outlier removal happens in this step.)
filtered_df = df[(df['maxSupply'] > 0) & (df['price'] > 0)].copy()

# Add uniform jitter to maxSupply so overlapping points separate in the plot.
# A seeded generator makes the figure identical on every run.
# NOTE(review): a half-width of 1e16 is far larger than many supply values and
# can push points below zero — confirm this scale is intended.
JITTER_SEED = 42
JITTER_HALF_WIDTH = 1e16
jitter_rng = np.random.default_rng(JITTER_SEED)
filtered_df['maxSupply_jittered'] = filtered_df['maxSupply'] + jitter_rng.uniform(
    -JITTER_HALF_WIDTH, JITTER_HALF_WIDTH, size=len(filtered_df)
)

In [None]:
# Make sure 'dateAdded' is a datetime column before plotting
# (a no-op if it was already converted; unparseable values become NaT)
date_added_parsed = pd.to_datetime(filtered_df['dateAdded'], errors='coerce')
filtered_df['dateAdded'] = date_added_parsed

In [None]:
# Build the whole figure in one cell. With Jupyter's inline backend, each cell's
# figure is rendered and closed when the cell finishes, so creating the figure,
# drawing on it and labelling it in separate cells would act on different figures.
fig, ax = plt.subplots(figsize=(14, 8))

# Scatter plot of the jittered max supply against the date each coin was added.
# 'price' is continuous, so a named sequential colormap is used: passing a list of
# colors makes seaborn treat the hue as categorical (one legend entry per distinct
# price, with the colors recycled).
sns.scatterplot(
    data=filtered_df,
    x='dateAdded',
    y='maxSupply_jittered',
    hue='price',
    size='price',
    sizes=(20, 200),
    palette='viridis',
    alpha=0.8,
    ax=ax,
)

# Titles, axis labels and a legend placed outside the plotting area
ax.set_title('Cryptocurrency Max Supply Over Time, Colored by Price (With Jitter)', fontsize=16, pad=20)
ax.set_xlabel('Date Added', fontsize=12)
ax.set_ylabel('Max Supply (Jittered)', fontsize=12)
ax.legend(title='Price ($)', fontsize=10, loc='upper left', bbox_to_anchor=(1.02, 1))
ax.grid(axis='y', linestyle='--', alpha=0.7)
fig.tight_layout()

# Show the plot
plt.show()