In [None]:
from sklearn.preprocessing import RobustScaler
from datetime import datetime
import pandas as pd
import numpy as np
import hashlib
import matplotlib.pyplot as plt
import seaborn as sns
import logging

# Set up logging
logging.basicConfig(
    level=logging.DEBUG, format="%(asctime)s - %(levelname)s - %(message)s"
)

In [None]:
filepath = "../../data/music_data.csv"
data = pd.read_csv(filepath)
df = data.copy()
df.head()

In [None]:
df.shape

In [None]:
df.columns

In [None]:
filepath = "/home/martinson/Lhydra_rs/data/raw_data/enriched_synthetic_data.csv"
test_df = pd.read_csv(filepath)
data = test_df.copy()
data.columns
# check for missing release_dates
# data.head()

In [None]:
data["release_date"].isnull().sum()


In [None]:
# 1. correct encoding of user ids with hashing
# 2. correct encoding of music ids with hashing
# 3. verify number of unique users and songs match by shape

In [None]:
df["music"].value_counts()

In [None]:
# import numpy as np
# import pandas as pd

# # Assuming you have a DataFrame 'df' with columns 'music' and 'user_id'
# songs = df['music'].unique()
# users = df['user_id'].unique()

# # Create a DataFrame to store playcounts
# playcounts_df = pd.DataFrame(index=users, columns=songs)

# # Generate playcounts using Poisson distribution in a vectorized manner
# lambda_value = 5  # Desired average playcount
# playcounts = np.random.poisson(lam=lambda_value, size=(len(users), len(songs)))

# # Assign playcounts to the DataFrame
# playcounts_df[:] = playcounts

# # Display the playcounts DataFrame
# playcounts_df

In [None]:
# Draw one synthetic playcount per observed (user_id, music) pair.
# This cell used to melt a dense users x songs `playcounts_df`, but that frame is
# only built in commented-out code, so the cell raised NameError on a fresh kernel.
# Sampling per observed pair still gives every pair an independent Poisson draw
# and avoids materialising the full matrix.
PLAYCOUNT_LAMBDA = 5  # desired average playcount
playcount_rng = np.random.default_rng(42)  # fixed seed so a full re-run is reproducible

playcounts_long = df[["user_id", "music"]].drop_duplicates().reset_index(drop=True)
playcounts_long["playcount"] = playcount_rng.poisson(
    lam=PLAYCOUNT_LAMBDA, size=len(playcounts_long)
)

# Merge the playcounts with the existing DataFrame (keys are unique on the right,
# so the row count of df is preserved)
df_updated = df.merge(playcounts_long, on=["user_id", "music"], how="left")

# Display a preview of the updated DataFrame
df_updated.head()

In [None]:
# plot distribution of playcount
plt.figure(figsize=(10, 6))
sns.histplot(df_updated["playcount"], kde=True, bins=30)
plt.title("Distribution of Playcount")
plt.xlabel("Playcount")
plt.ylabel("Frequency")
plt.show()

In [None]:
df_updated.drop(columns=["plays"], inplace=True)

In [None]:
df = df_updated.copy()

In [None]:
# Step 1: Check for unique music_ids
unique_music_ids = df["music_id"].nunique()
total_rows = len(df)
if unique_music_ids != total_rows:
    logging.warning(
        f"Mismatch in music_ids: Expected {total_rows} unique IDs but found {unique_music_ids}"
    )
    # Assuming we need to rehash music_ids.
    # str(x) keeps the hash well-defined for non-string titles (e.g. NaN),
    # where x.encode() would raise AttributeError.
    df["music_id"] = df["music"].apply(
        lambda x: hashlib.sha256(str(x).encode()).hexdigest()
    )
    logging.info("Rehashed music_ids to ensure uniqueness.")
else:
    logging.info("All music_ids are unique.")

# Debug: Show the updated data after rehashing music_ids (if needed).
# A bare `df.head()` in the middle of a cell displays nothing, so print it.
print("\nData after music_id rehashing:")
print(df.head())

# Step 2: Restructure user_ids as consecutive integers starting at 1,
# numbered in order of first appearance
unique_users = df["user_id"].unique()
user_id_mapping = {
    old_id: new_id for new_id, old_id in enumerate(unique_users, start=1)
}

# Apply the mapping to create new user_ids
df["new_user_id"] = df["user_id"].map(user_id_mapping)

# Debug: Show the updated data after restructuring user_ids
print("\nData after user_id restructuration:")
print(df.head())


# Optional Step 3: Hash new_user_ids for robustness
def hash_user_id(user_id):
    """Return the SHA-256 hex digest of the string form of ``user_id``."""
    digest = hashlib.sha256()
    digest.update(str(user_id).encode())
    return digest.hexdigest()


df["hashed_user_id"] = df["new_user_id"].apply(hash_user_id)

# Debug: Show the final data after hashing user_ids (if needed)
print("\nFinal data after hashing user_ids:")
df.head()

In [None]:
df["music_id"].nunique(), df["music"].nunique()
print(
    f'Number of unique music ids: {df["music_id"].nunique()}\nNumber of unique songs: {df["music"].nunique()}'
)

In [None]:
df.drop(columns=["user_id", "new_user_id"], inplace=True)

In [None]:
# rename hashed_user_id to user_id
df.rename(columns={"hashed_user_id": "user_id"}, inplace=True)
df.head()

### Trying genre extraction with the Spotify API

In [None]:
df["genre"].value_counts()

In [None]:
import pandas as pd
import numpy as np
import requests
from bs4 import BeautifulSoup
import time

# Rows whose genre is missing or a placeholder. Take an explicit copy so that the
# `genre_spotify` column added to this frame later is written to an independent
# DataFrame instead of a slice of df (avoids SettingWithCopyWarning).
missing_genres = df[df["genre"].isin(["Unknown", "Other", np.nan])].copy()

In [None]:
import spotipy
from spotipy.oauth2 import SpotifyClientCredentials
import configparser
import time
from requests.exceptions import RequestException

# Read and clean credentials
config = configparser.ConfigParser()
config.read("config.ini")

client_id = config["SPOTIFY"]["CLIENT_ID"].strip("'")  # Remove quotes
client_secret = config["SPOTIFY"]["CLIENT_SECRET"].strip("'")


def create_spotify_client(max_retries=3):
    """Build a Spotify client using the client-credentials flow.

    Construction is attempted up to ``max_retries`` times, sleeping one second
    between failed attempts.

    Args:
        max_retries: Number of attempts; must be at least 1.

    Returns:
        A ``spotipy.Spotify`` client built from the module-level ``client_id``
        and ``client_secret``.

    Raises:
        ValueError: If ``max_retries`` is smaller than 1 (previously this
            silently returned ``None``).
        Exception: If every attempt fails; the last underlying error is
            attached as ``__cause__``.
    """
    if max_retries < 1:
        raise ValueError(f"max_retries must be at least 1, got {max_retries}")

    for attempt in range(max_retries):
        try:
            client_credentials_manager = SpotifyClientCredentials(
                client_id=client_id, client_secret=client_secret
            )
            return spotipy.Spotify(
                client_credentials_manager=client_credentials_manager
            )
        except Exception as e:
            if attempt == max_retries - 1:
                # Chain the original error so its traceback is not lost.
                raise Exception(f"Failed to authenticate with Spotify: {e}") from e
            time.sleep(1)


def get_genre_from_spotify(artist_name, track_name, sp, max_retries=3):
    """Look up the first genre of a track's primary artist on Spotify.

    Searches for the track, takes the first artist of the first hit and
    returns the first entry of that artist's genre list.

    Args:
        artist_name: Artist name used in the search query.
        track_name: Track title used in the search query.
        sp: Client object exposing ``search`` and ``artist``.
        max_retries: Number of attempts before giving up on request errors.

    Returns:
        The genre string, or ``None`` when there is no search hit, the hit
        lists no artists, the artist has no genres, or all attempts failed.
    """
    for attempt in range(max_retries):
        try:
            results = sp.search(
                q=f"artist:{artist_name} track:{track_name}", type="track", limit=1
            )
            items = results["tracks"]["items"]
            if not items:
                return None

            # A hit without artists would otherwise raise IndexError and abort
            # the surrounding row-wise apply.
            artists = items[0]["artists"]
            if not artists:
                return None

            artist = sp.artist(artists[0]["id"])
            genres = artist["genres"]
            return genres[0] if genres else None

        # spotipy reports HTTP-level failures as SpotifyException, which is not
        # a requests exception, so it has to be caught explicitly.
        except (RequestException, spotipy.SpotifyException) as e:
            if attempt == max_retries - 1:
                print(f"Error fetching data from Spotify: {e}")
                return None
            time.sleep(1)

    return None


# Initialize Spotify client
sp = create_spotify_client()

# Query Spotify once per distinct (music, artist_name) pair instead of once per
# row: repeated pairs would otherwise trigger redundant API calls, and merging on
# duplicated keys multiplies the matching rows of df.
pair_keys = ["music", "artist_name"]
genre_lookup = missing_genres[pair_keys].drop_duplicates().reset_index(drop=True)
genre_lookup["genre_spotify"] = [
    get_genre_from_spotify(artist, track, sp)
    for track, artist in zip(genre_lookup["music"], genre_lookup["artist_name"])
]

# Keep the per-row `genre_spotify` column on missing_genres for later cells.
# Dropping any stale copy first makes the cell safe to re-run.
missing_genres = missing_genres.drop(columns="genre_spotify", errors="ignore").merge(
    genre_lookup, on=pair_keys, how="left"
)

# Merge results (right-hand keys are unique, so df keeps its row count)
df = df.merge(genre_lookup, on=pair_keys, how="left")
df["genre"] = df["genre"].fillna(df["genre_spotify"])
df = df.drop("genre_spotify", axis=1)

In [None]:
# Attach the Spotify genre to every row of df.
# - Drop a stale `genre_spotify` first so re-running the cell does not create
#   `genre_spotify_x` / `genre_spotify_y` columns.
# - De-duplicate the lookup keys so the left merge cannot multiply rows of df.
df = df.drop(columns="genre_spotify", errors="ignore").merge(
    missing_genres[["music", "artist_name", "genre_spotify"]].drop_duplicates(
        subset=["music", "artist_name"]
    ),
    on=["music", "artist_name"],
    how="left",
)
# df['genre'] = df['genre'].fillna(df['genre_spotify'])
# df = df.drop('genre_spotify', axis=1)

In [None]:
df.columns

In [None]:
# df.drop(columns=['genre_spotify_x','genre_spotify_y'], inplace=True)

In [None]:
# 1. Flag rows whose genre is absent or a placeholder value
missing_mask = df["genre"].isna() | df["genre"].isin(["Unknown", "Other"])

# 2. Reuse the Spotify lookups already stored on missing_genres
missing_df = missing_genres.copy()

# 3. Lookup rows for which Spotify actually returned a genre
genre_updates = missing_df[["music", "artist_name", "genre_spotify"]].dropna()

# 4. Where the genre is missing/placeholder take df's genre_spotify value,
#    otherwise keep the existing genre
df["genre"] = df["genre"].mask(missing_mask, df["genre_spotify"])

# 5. Verify results
n_unique_genres = df["genre"].nunique()
n_missing_genres = df["genre"].isna().sum()
print(f"Unique genres: {n_unique_genres}")
print(f"Missing genres: {n_missing_genres}")

In [None]:
df.head()

In [None]:
df.to_csv("../../data/synthetic_cleaned_data.csv", index=False)

In [None]:
# fill missing genre with "Unknown"
df["genre"] = df["genre"].fillna("Unknown")

In [None]:
df["genre"].value_counts()

In [None]:
unique_genres = df["genre"].unique()

In [None]:
# plot distribution of genre
plt.figure(figsize=(10, 6))
sns.countplot(y=df["genre"], order=df["genre"].value_counts().index)
plt.title("Distribution of Genre")
plt.xlabel("Frequency")
plt.ylabel("Genre")
plt.show()

In [None]:
# missing_genres['genre_spotify'].value_counts()

In [None]:
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import AgglomerativeClustering

# List of genre names

# Vectorize the genre names using TF-IDF
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(unique_genres)

# Perform hierarchical clustering
clustering = AgglomerativeClustering(n_clusters=None, distance_threshold=0.5)
clustering.fit(X.toarray())

# Get the cluster labels for each genre
labels = clustering.labels_

# Group the unique genres by their cluster labels
from collections import defaultdict

genre_groups = defaultdict(list)
for genre, label in zip((unique_genres), labels):
    genre_groups[label].append(genre)

# Print the genre groups
for label, group in genre_groups.items():
    print(f"Cluster {label}: {', '.join(group)}")

# how many clusters
n_clusters = len(set(labels))

In [None]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import AgglomerativeClustering
import networkx as nx
import matplotlib.pyplot as plt
from collections import defaultdict


def cluster_genres(genres, n_clusters=20, plot=True):
    """
    Group genre names by TF-IDF similarity and optionally draw the groups.

    Genre names are TF-IDF vectorized and clustered with agglomerative
    clustering into ``n_clusters`` groups. With ``plot=True`` a graph is drawn
    in which genres of the same cluster are fully connected and nodes are
    coloured by cluster label.

    Returns a mapping of cluster label -> list of genre names.
    """
    # Vectorize and cluster
    tfidf_matrix = TfidfVectorizer().fit_transform(genres)
    model = AgglomerativeClustering(n_clusters=n_clusters)
    assignments = model.fit_predict(tfidf_matrix.toarray())

    # Collect genre names per cluster label
    genre_groups = defaultdict(list)
    for name, cluster_id in zip(genres, assignments):
        genre_groups[cluster_id].append(name)

    if not plot:
        return genre_groups

    # Build a graph with one clique per cluster
    graph = nx.Graph()
    for cluster_id, members in genre_groups.items():
        for member in members:
            graph.add_node(member, cluster=cluster_id)
            graph.add_edges_from(
                (member, peer) for peer in members if peer != member
            )

    # Draw it, colouring each node by its cluster label
    plt.figure(figsize=(15, 10))
    layout = nx.spring_layout(graph)
    node_colors = [attrs["cluster"] for _, attrs in graph.nodes(data=True)]
    nx.draw_networkx(
        graph,
        layout,
        node_color=node_colors,
        node_size=1000,
        font_size=8,
        cmap=plt.cm.tab20,
        with_labels=True,
    )
    plt.title(f"Genre Clusters (n={n_clusters})")
    plt.axis("off")
    plt.show()

    return genre_groups


# Usage
unique_genres = df["genre"].unique()
clusters = cluster_genres(unique_genres, n_clusters=20)

# Print clusters
for label, genres in clusters.items():
    print(f"\nCluster {label}:")
    print(", ".join(genres))

In [None]:
import numpy as np
import pandas as pd
from gensim.models import Word2Vec
import umap
import hdbscan
import plotly.express as px
from sklearn.feature_extraction.text import TfidfVectorizer
from sentence_transformers import SentenceTransformer


def cluster_genres_advanced(genres, n_neighbors=15, min_cluster_size=5):
    """
    Advanced genre clustering using SentenceTransformer embeddings + UMAP + HDBSCAN

    Note: despite the Word2Vec import in this cell, the embeddings come from the
    "all-MiniLM-L6-v2" SentenceTransformer model.

    Args:
        genres: Iterable of genre names; entries whose string form is "nan"
            are dropped and the rest are lower-cased.
        n_neighbors: Passed to ``umap.UMAP``.
        min_cluster_size: Passed to ``hdbscan.HDBSCAN``.

    Returns:
        (clusters, fig): ``clusters`` maps "Cluster_<label>" to the list of
        genres with that label (label -1 is skipped); ``fig`` is the plotly
        scatter figure of the 2-D UMAP coordinates coloured by cluster label.
    """
    # 1. Prepare text data
    genres = [str(g).lower() for g in genres if str(g) != "nan"]

    # 2. Get embeddings using SentenceTransformer
    model = SentenceTransformer("all-MiniLM-L6-v2")
    embeddings = model.encode(genres)

    # 3. Dimensionality reduction (fixed random_state=42)
    umap_reducer = umap.UMAP(
        n_neighbors=n_neighbors, min_dist=0.0, n_components=2, random_state=42
    )
    umap_embeddings = umap_reducer.fit_transform(embeddings)

    # 4. Clustering on the 2-D UMAP coordinates
    clusterer = hdbscan.HDBSCAN(
        min_cluster_size=min_cluster_size, min_samples=1, prediction_data=True
    )
    cluster_labels = clusterer.fit_predict(umap_embeddings)

    # 5. Create DataFrame for visualization
    viz_df = pd.DataFrame(
        {
            "genre": genres,
            "x": umap_embeddings[:, 0],
            "y": umap_embeddings[:, 1],
            "cluster": cluster_labels,
        }
    )

    # 6. Interactive visualization
    fig = px.scatter(
        viz_df,
        x="x",
        y="y",
        color="cluster",
        hover_data=["genre"],
        title="Genre Clusters",
        template="plotly_dark",
    )

    # 7. Group results
    clusters = {}
    for label in set(cluster_labels):
        if label != -1:  # Exclude noise points
            clusters[f"Cluster_{label}"] = viz_df[viz_df["cluster"] == label][
                "genre"
            ].tolist()

    return clusters, fig


# Usage
unique_genres = df["genre"].dropna().unique()
clusters, fig = cluster_genres_advanced(unique_genres)

# Display interactive plot
fig.show()

# Print clusters
for name, genres in clusters.items():
    print(f"\n{name}:")
    print(", ".join(genres))

In [None]:
pd.set_option("display.max_rows", None)
df["genre"].value_counts()

In [None]:
import pandas as pd
import numpy as np
from collections import defaultdict


class GenreClustering:
    """Group fine-grained genre names into a fixed set of main genres.

    A genre is assigned to a main genre when one of the key terms listed in
    ``MAIN_GENRES`` occurs as a substring of the lower-cased genre name.
    Longer (more specific) key terms are tried first, so "dance pop" resolves
    to "pop" as listed below instead of hitting the shorter "dance" term.
    """

    MAIN_GENRES = {
        "rock": ["rock", "metal", "punk", "alternative", "grunge", "indie"],
        "electronic": ["electronic", "dance", "edm", "house", "techno", "dubstep"],
        "hip_hop": ["hip-hop", "rap", "trap", "drill"],
        "pop": ["pop", "teen pop", "dance pop", "k-pop"],
        "classical": ["classical", "baroque", "orchestra"],
        "jazz": ["jazz", "bebop", "fusion"],
        "folk": ["folk", "acoustic", "singer-songwriter"],
        "rb_soul": ["rnb", "r&b", "soul", "motown"],
        "country": ["country", "bluegrass", "americana"],
        "world": ["latin", "reggae", "afrobeat", "world"],
        "religious": ["christian", "gospel", "spiritual", "worship"],
    }

    def __init__(self, genres_series):
        """Store the genre -> count mapping (anything with ``.items()``)."""
        self.genres = genres_series
        self.genre_map = self._create_genre_map()

    def _create_genre_map(self):
        """Create mapping of subgenres to main genres.

        The returned dict is ordered by key-term length, longest first (ties
        keep their ``MAIN_GENRES`` order); ``classify_genre`` relies on this
        order to prefer the most specific term.
        """
        genre_map = {
            subgenre: main_genre
            for main_genre, subgenres in self.MAIN_GENRES.items()
            for subgenre in subgenres
        }
        # sorted() is stable, so equal-length terms keep declaration order.
        return dict(
            sorted(genre_map.items(), key=lambda item: len(item[0]), reverse=True)
        )

    def classify_genre(self, genre):
        """Map a genre to its main category.

        Returns the main genre of the first (i.e. longest) key term contained
        in the lower-cased genre name, or "other" when no term matches.
        """
        genre = str(genre).lower()
        for key_term, main_genre in self.genre_map.items():
            if key_term in genre:
                return main_genre
        return "other"

    def cluster_genres(self):
        """Group genres into main categories.

        Returns:
            (clustered, counts): ``clustered`` maps main genre to a list of
            ``(genre, count)`` tuples; ``counts`` maps main genre to the
            summed count.
        """
        clustered = defaultdict(list)
        counts = defaultdict(int)

        for genre, count in self.genres.items():
            main_genre = self.classify_genre(genre)
            clustered[main_genre].append((genre, count))
            counts[main_genre] += count

        return clustered, counts


# Usage
# Genre -> number of rows, ordered from most to least frequent
genres_series = pd.Series(df["genre"].value_counts().to_dict())
clusterer = GenreClustering(genres_series)
clusters, counts = clusterer.cluster_genres()

# Print each main genre followed by its sub-genres, largest first
for main_genre, members in clusters.items():
    print(f"\n## {main_genre.upper()} (Total: {counts[main_genre]})")
    ranked = sorted(members, key=lambda pair: pair[1], reverse=True)
    for subgenre, count in ranked:
        print(f"- {subgenre}: {count}")

In [None]:
genres_series

In [None]:
from fuzzywuzzy import process
import pandas as pd
import numpy as np


class GenreProcessor:
    """Attach a main-genre label to a DataFrame's ``genre`` column via fuzzy matching."""

    GENRE_HIERARCHY = {
        "rock": ["rock", "metal", "punk", "alternative", "grunge", "indie"],
        "electronic": ["electronic", "dance", "edm", "house", "techno", "dubstep"],
        "hip_hop": ["hip-hop", "rap", "trap", "drill"],
        "pop": ["pop", "teen pop", "dance pop", "k-pop"],
        "classical": ["classical", "baroque", "orchestra"],
        "jazz": ["jazz", "bebop", "fusion"],
        "folk": ["folk", "acoustic", "singer-songwriter"],
        "rb_soul": ["rnb", "r&b", "soul", "motown"],
        "country": ["country", "bluegrass", "americana"],
        "world": ["latin", "reggae", "afrobeat", "world"],
        "religious": ["christian", "gospel", "spiritual", "worship"],
    }

    def __init__(self, df):
        """Keep a private copy of ``df`` and build the subgenre lookup."""
        self.df = df.copy()
        self.genre_map = self._create_full_genre_map()

    def _create_full_genre_map(self):
        """Flatten ``GENRE_HIERARCHY`` into a subgenre -> main genre dict."""
        return {
            subgenre: main_genre
            for main_genre, subgenres in self.GENRE_HIERARCHY.items()
            for subgenre in subgenres
        }

    def get_main_genre(self, genre):
        """Return the main genre for ``genre``.

        Missing values give "unknown". Otherwise the lower-cased name is
        fuzzy-matched against the known subgenre terms (score cutoff 60) and
        the main genre of the first returned match is used; with no match the
        result is "other".
        """
        if pd.isna(genre):
            return "unknown"
        candidates = process.extractBests(
            str(genre).lower(), self.genre_map.keys(), score_cutoff=60
        )
        if not candidates:
            return "other"
        best_term = candidates[0][0]
        return self.genre_map[best_term]

    def process_genres(self):
        """Add a ``main_genre`` column to the stored DataFrame and return it."""
        self.df["main_genre"] = self.df["genre"].apply(self.get_main_genre)
        return self.df

    def match_user_genre(self, user_input):
        """Resolve free-text user input to a main genre and its listed subgenres."""
        user_genre = str(user_input).lower()
        main_genre = self.get_main_genre(user_genre)
        siblings = [
            term for term, parent in self.genre_map.items() if parent == main_genre
        ]
        return {
            "input_genre": user_genre,
            "main_genre": main_genre,
            "similar_genres": siblings,
        }


# Usage Example
processor = GenreProcessor(df)
df_processed = processor.process_genres()

# Example of handling user input
user_genre = "indie rock"
genre_info = processor.match_user_genre(user_genre)
print(f"Main Genre: {genre_info['main_genre']}")
print(f"Similar Genres: {', '.join(genre_info['similar_genres'])}")

In [None]:
df_processed["main_genre"].value_counts()

In [None]:
df = df_processed.copy()
missing_values = df.isnull().sum()
print("\nMissing Values in Each Column:")
print(missing_values)

In [None]:
# import pandas as pd
# import matplotlib.pyplot as plt
# import seaborn as sns
# import hashlib
# import logging
# from datetime import datetime

# # Set up logging
# logging.basicConfig(
#     level=logging.DEBUG, format="%(asctime)s - %(levelname)s - %(message)s"
# )

# df = df_processed.copy()

# # Step 1: Check for unique music_ids
# unique_music_ids = df["music_id"].nunique()
# total_rows = len(df)
# if unique_music_ids != total_rows:
#     logging.warning(
#         f"Mismatch in music_ids: Expected {total_rows} unique IDs but found {unique_music_ids}"
#     )
#     # Assuming we need to rehash music_ids
#     df["music_id"] = df["music"].apply(lambda x: hashlib.sha256(x.encode()).hexdigest())
#     logging.info("Rehashed music_ids to ensure uniqueness.")
# else:
#     logging.info("All music_ids are unique.")

# # Debug: Show the updated data after rehashing music_ids (if needed)
# print("\nData after music_id rehashing:")
# print(df.head())

# # Step 2: Restructure user_ids
# # unique_users = df["user_id"].unique()
# # user_id_mapping = {
# #     old_id: new_id for new_id, old_id in enumerate(unique_users, start=1)
# # }

# # # Apply the mapping to create new user_ids
# # df["new_user_id"] = df["user_id"].map(user_id_mapping)

# # # Debug: Show the updated data after restructuring user_ids
# # print("\nData after user_id restructuration:")
# # print(df.head())


# # Optional Step 3: Hash new_user_ids for robustness
# # def hash_user_id(user_id):
# #     return hashlib.sha256(str(user_id).encode()).hexdigest()


# # df["hashed_user_id"] = df["new_user_id"].apply(hash_user_id)

# # # Debug: Show the final data after hashing user_ids (if needed)
# # print("\nFinal data after hashing user_ids:")
# # print(df.head())

# # Step 4: Check for class imbalances
# categorical_columns = ["gender", "main_genre", "explicit"]

# for column in categorical_columns:
#     plt.figure(figsize=(8, 6))
#     sns.countplot(x=column, data=df)
#     plt.title(f"Distribution of {column}")
#     plt.show()

# # Step 5: Handle missing values
# # Check for missing values in each column
# missing_values = df.isnull().sum()
# print("\nMissing Values in Each Column:")
# print(missing_values)

# # Handle missing values in 'genre' and 'featured_artists'
# # Example: Fill 'Unknown' with 'Other'
# # df["genre"].replace("Unknown", "Other", inplace=True)
# df["featured_artists"].fillna("None", inplace=True)

# # Debug: Show the updated data after handling missing values
# print("\nData after handling missing values:")
# print(df.head())

# # Step 6: Data Type Conversion
# # Convert 'release_date' to datetime
# df["release_date"] = pd.to_datetime(df["release_date"], errors="coerce")

# # Debug: Show the updated data types
# print("\nData Types after conversion:")
# print(df.dtypes)

# # Step 7: Data Consistency
# # Check for duplicate entries
# duplicates = df.duplicated().sum()
# print(f"\nNumber of duplicate entries: {duplicates}")

# # Check for unexpected values in 'release_year'
# if (df["release_year"] < 1900).any() or (
#     df["release_year"] > datetime.now().year
# ).any():
#     logging.warning("Unexpected values found in 'release_year'.")

# # Additional feature engineering can be done here if necessary

In [None]:
# Assign the filled column back instead of calling fillna(inplace=True) on a
# column selection: that chained in-place form is deprecated and does not
# modify df when pandas Copy-on-Write is enabled.
df["featured_artists"] = df["featured_artists"].fillna("None")

In [None]:
df.columns

In [None]:
df.drop(columns=["genre_spotify", "genre"], inplace=True)

In [None]:
df.to_csv("../../data/synthetic_cleaned_data_v2.csv", index=False)

In [None]:
df.head()

In [None]:
df.columns

In [None]:
df.info()

In [None]:
df = pd.read_csv("../../data/synthetic_cleaned_data_v2.csv")

In [None]:
# List of columns to convert to integer
todo = ["playcount", "age"]


def convert_to_int(columns, dataframe):
    """Cast the listed columns of ``dataframe`` to ``int`` where possible.

    Columns that are not present are skipped. A column that cannot be cast
    (e.g. it contains NaN, None or non-numeric text) is left unchanged and the
    error is printed, so one bad column does not stop the others.

    Args:
        columns: Iterable of column names to convert.
        dataframe: DataFrame to convert; it is modified in place.

    Returns:
        The same ``dataframe`` object.
    """
    for col in columns:
        if col not in dataframe.columns:
            continue
        try:
            dataframe[col] = dataframe[col].astype(int)
        # Object columns holding None raise TypeError rather than ValueError.
        except (ValueError, TypeError) as e:
            print(f"Error converting column {col} to integer: {e}")

    return dataframe


# Convert specified columns to integer
df = convert_to_int(todo, df)
print(df.dtypes)

In [None]:
# Assuming df is already defined and contains the cleaned data from previous steps

# Step 1: Descriptive Statistics
numeric_columns = df.select_dtypes(
    include=[
        np.number,
    ]
).columns
descriptive_stats = df[numeric_columns].describe()
print("\nDescriptive Statistics for Numeric Columns:")
print(descriptive_stats)

# Step 2: Box Plots
plt.figure(figsize=(16, 12))

# Calculate the number of rows needed
num_columns = len(numeric_columns)
num_rows = (num_columns + 3) // 4  # 4 columns per row

# Plotting box plots for each numeric column
for i, column in enumerate(numeric_columns, 1):
    plt.subplot(num_rows, 4, i)
    sns.boxplot(y=df[column])
    plt.title(f"Boxplot of {column}")
    plt.ylabel(column)

plt.tight_layout()
plt.show()

# Step 3: Correlation Analysis
correlation_matrix = df[numeric_columns].corr()
print("\nCorrelation Matrix:")
print(correlation_matrix)

plt.figure(figsize=(12, 8))
sns.heatmap(
    correlation_matrix,
    annot=True,
    fmt=".2f",
    cmap="coolwarm",
    cbar_kws={"label": "Correlation Coefficient"},
)
plt.title("Correlation Matrix Heatmap")
plt.show()


# Step 4: Outlier Detection Using IQR
def detect_outliers_iqr(df, column):
    """Return the rows of ``df`` whose ``column`` value lies outside the 1.5*IQR fences."""
    values = df[column]
    q1, q3 = values.quantile(0.25), values.quantile(0.75)
    spread = q3 - q1
    low_fence = q1 - 1.5 * spread
    high_fence = q3 + 1.5 * spread
    is_outlier = values.lt(low_fence) | values.gt(high_fence)
    return df[is_outlier]


# Detecting outliers for each numeric column
outliers = {}
for column in numeric_columns:
    outliers[column] = detect_outliers_iqr(df, column)
    if not outliers[column].empty:
        logging.warning(f"Outliers detected in {column}:")
        print(outliers[column])


# Step 5: Handle Outliers (Optional)
# For demonstration, let's handle outliers by capping them at the 1.5*IQR range
def cap_outliers_iqr(df, column):
    """Clamp ``df[column]`` to the 1.5*IQR fences, in place, and return ``df``."""
    values = df[column]
    q1, q3 = values.quantile(0.25), values.quantile(0.75)
    spread = q3 - q1
    low_fence = q1 - 1.5 * spread
    high_fence = q3 + 1.5 * spread
    # Below the lower fence -> lower fence; above the upper fence -> upper fence.
    df[column] = np.where(
        values < low_fence,
        low_fence,
        np.where(values > high_fence, high_fence, values),
    )
    return df


# Capping outliers for each numeric column
for column in numeric_columns:
    df = cap_outliers_iqr(df, column)

# Debug: Show the updated data after capping outliers
print("\nData after capping outliers:")
print(df.head())

In [None]:
data.shape, df.shape
print(
    f"Original data shape: {data.shape}\nDataframe shape after processing: {df.shape}"
)

In [None]:
import numpy as np
import pandas as pd
import powerlaw
import scipy.stats as stats
import matplotlib.pyplot as plt
import seaborn as sns


class PlaycountGenerator:
    """Generate and compare synthetic playcount samples, one value per row of ``df``.

    No random seed is set here, so results vary between runs.
    """

    def __init__(self, df):
        """Keep a reference to ``df`` (not a copy) and record its row count."""
        self.df = df
        self.n_tracks = len(df)  # Use total number of rows instead of unique tracks

    def power_law(self, alpha=2.0, xmin=1):
        """Draw ``n_tracks`` power-law samples and round them to integers."""
        playcounts_raw = powerlaw.Power_Law(
            xmin=xmin, parameters=[alpha]
        ).generate_random(self.n_tracks)
        return np.round(playcounts_raw).astype(int)

    def log_normal(self, mu=5, sigma=1.5):
        """Draw ``n_tracks`` log-normal samples (exp of a normal) rounded to integers."""
        return np.round(
            np.exp(stats.norm.rvs(loc=mu, scale=sigma, size=self.n_tracks))
        ).astype(int)

    def combined_factors(
        self,
        energy_factor_weight=0.5,
        danceability_factor_weight=0.3,
        alpha=2.0,
        xmin=1,
    ):
        """Scale power-law base counts by min-max-normalised energy and danceability.

        Each factor is ``1 + normalised_feature * weight``, so a row with the
        maximum feature value is boosted by ``weight`` and the minimum not at all.
        """
        base_playcounts = np.round(
            powerlaw.Power_Law(xmin=xmin, parameters=[alpha]).generate_random(
                self.n_tracks
            )
        ).astype(int)
        energy_factor = (
            1
            + (self.df["energy"] - self.df["energy"].min())
            / (self.df["energy"].max() - self.df["energy"].min())
            * energy_factor_weight
        )
        danceability_factor = (
            1
            + (self.df["danceability"] - self.df["danceability"].min())
            / (self.df["danceability"].max() - self.df["danceability"].min())
            * danceability_factor_weight
        )
        return np.round(base_playcounts * energy_factor * danceability_factor).astype(
            int
        )

    def negative_binomial(self, n=5, p=0.3):
        """Draw ``n_tracks`` negative-binomial samples."""
        return stats.nbinom.rvs(n, p, size=self.n_tracks)

    def add_noise(self, playcounts, noise_std=0.2):
        """Multiply ``playcounts`` by ``1 + N(0, noise_std)`` noise and round to integers."""
        noise = np.random.normal(0, noise_std, self.n_tracks)
        return np.round(playcounts * (1 + noise)).astype(int)

    def evaluate(self, playcounts, method_name):
        """Print feature correlations and plot the distribution of ``playcounts``.

        The candidate playcounts are evaluated on a temporary frame. Previously
        they were written into ``self.df["playcount"]``, which overwrote the
        caller's real playcount column with whichever method ran last.
        """
        scored = self.df[["age", "duration", "energy", "danceability"]].assign(
            playcount=playcounts
        )
        print(f"Evaluation for {method_name}:")
        print(scored.corr())  # correlation with other features
        plt.figure(figsize=(12, 6))
        plt.subplot(1, 2, 1)
        sns.histplot(scored["playcount"], kde=True)
        plt.title(f"{method_name} Playcount Distribution")
        plt.subplot(1, 2, 2)
        plt.scatter(scored.index, scored["playcount"])
        plt.title(f"{method_name} Playcount Scatter Plot")
        plt.tight_layout()
        plt.show()
        print("\n")


# Example usage:
# Assuming you have your DataFrame 'df'
generator = PlaycountGenerator(df)

# Generate playcounts using different methods
playcounts_powerlaw = generator.power_law()
playcounts_lognormal = generator.log_normal()
playcounts_combined = generator.combined_factors()
playcounts_negbin = generator.negative_binomial()

# Evaluate each method
generator.evaluate(playcounts_powerlaw, "Power Law")
generator.evaluate(playcounts_lognormal, "Log Normal")
generator.evaluate(playcounts_combined, "Combined Factors")
generator.evaluate(playcounts_negbin, "Negative Binomial")

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from scipy import stats


class PlaycountGenerator:
    """Generate synthetic playcounts from several count distributions.

    Every generator returns one value per row of ``df`` and draws from the
    global NumPy random state; no seed is set here.
    """

    def __init__(self, df):
        """Keep a reference to ``df`` and the feature columns used for weighting."""
        self.df = df
        self.features = [
            "age",
            "duration",
            "energy",
            "danceability",
            "acousticness",
            "valence",
            "tempo",
        ]

    def analyze_correlations(self):
        """
        Analyze correlations between features and existing playcounts
        Returns correlation matrix and plots heatmap
        """
        corr_matrix = self.df[self.features + ["playcount"]].corr()
        plt.figure(figsize=(10, 8))
        sns.heatmap(corr_matrix, annot=True, cmap="coolwarm")
        plt.title("Feature Correlations with Playcount")
        plt.show()
        return corr_matrix

    def generate_poisson(self, lambda_param=10):
        """
        Generate playcounts using Poisson distribution
        Params:
            lambda_param: Mean playcount value
        """
        return np.random.poisson(lambda_param, len(self.df))

    def generate_negative_binomial(self, n=5, p=0.5):
        """
        Generate playcounts using Negative Binomial
        Useful for overdispersed count data
        """
        return np.random.negative_binomial(n, p, len(self.df))

    def generate_zero_inflated(self, lambda_param=10, zero_prob=0.2):
        """
        Generate zero-inflated playcounts
        Each row is forced to zero with probability ``zero_prob``; otherwise it
        keeps its Poisson draw
        """
        zeros = np.random.binomial(1, zero_prob, len(self.df))
        counts = self.generate_poisson(lambda_param)
        return counts * (1 - zeros)

    def generate_feature_weighted(self):
        """
        Generate playcounts shifted by each row's average feature z-score

        Poisson(10) base counts are multiplied by
        ``1 + 0.2 * mean(z-scores of self.features)`` for the row, floored at
        zero and truncated to int. All features contribute equally: the
        correlation-based ``weights`` that used to be computed here were never
        applied, so that dead computation has been removed.
        """
        features_norm = stats.zscore(self.df[self.features])
        base_counts = np.random.poisson(10, len(self.df))
        weighted_counts = base_counts * (1 + 0.2 * features_norm.mean(axis=1))
        return np.maximum(weighted_counts, 0).astype(int)

    def plot_distributions(self, n_samples=1000):
        """
        Plot different playcount distributions

        Params:
            n_samples: Currently unused (each sample has one value per row of
                ``df``); kept so existing calls keep working.
        Returns a dict mapping distribution name to the generated sample.
        """
        fig, axes = plt.subplots(2, 2, figsize=(15, 10))

        distributions = {
            "Poisson": self.generate_poisson(),
            "Negative Binomial": self.generate_negative_binomial(),
            "Zero-inflated": self.generate_zero_inflated(),
            "Feature-weighted": self.generate_feature_weighted(),
        }

        for (title, counts), ax in zip(distributions.items(), axes.ravel()):
            sns.histplot(counts, ax=ax)
            ax.set_title(f"{title} Distribution")
            ax.set_xlabel("Playcount")

        plt.tight_layout()
        plt.show()

        return distributions

# Usage
generator = PlaycountGenerator(df)
correlations = generator.analyze_correlations()
distributions = generator.plot_distributions()

# Select best distribution based on data characteristics
df["synthetic_playcount"] = generator.generate_feature_weighted()

In [None]:
class PlaycountGenerator:
    """
    Generate synthetic playcounts for the rows of a listening DataFrame.

    Every generator returns one value per row of ``self.df`` and draws from
    the global NumPy random state, so results change between calls unless the
    caller seeds ``np.random`` first.
    """

    def __init__(self, df):
        """
        Params:
            df: DataFrame holding the columns named in ``self.features``.
                ``analyze_correlations`` additionally reads 'playcount' and
                ``generate_user_aware_counts`` reads 'user_id'.
        """
        self.df = df
        self.features = ['age', 'duration', 'energy', 'danceability',
                         'acousticness', 'valence', 'tempo']

    def analyze_correlations(self):
        """
        Analyze correlations between features and existing playcounts
        Returns correlation matrix and plots heatmap
        """
        corr_matrix = self.df[self.features + ['playcount']].corr()
        plt.figure(figsize=(10, 8))
        sns.heatmap(corr_matrix, annot=True, cmap='coolwarm')
        plt.title('Feature Correlations with Playcount')
        plt.show()
        return corr_matrix

    def generate_poisson(self, lambda_param=10):
        """
        Generate playcounts using Poisson distribution
        Params:
            lambda_param: Mean playcount value
        """
        return np.random.poisson(lambda_param, len(self.df))

    def generate_negative_binomial(self, n=5, p=0.5):
        """
        Generate playcounts using Negative Binomial
        Useful for overdispersed count data
        Params:
            n: number of successes, passed to np.random.negative_binomial
            p: success probability, passed to np.random.negative_binomial
        """
        return np.random.negative_binomial(n, p, len(self.df))

    def generate_zero_inflated(self, lambda_param=10, zero_prob=0.2):
        """
        Generate zero-inflated playcounts
        Combines zeros with Poisson distribution
        Params:
            lambda_param: Poisson mean of the non-zeroed counts
            zero_prob: per-row probability that the count is forced to zero
        """
        zeros = np.random.binomial(1, zero_prob, len(self.df))
        counts = self.generate_poisson(lambda_param)
        return counts * (1 - zeros)

    def _feature_scaled_counts(self):
        """Poisson(10) base counts scaled by 1 + 0.2 * row-mean feature z-score (unclipped floats)."""
        features_norm = stats.zscore(self.df[self.features])
        base_counts = np.random.poisson(10, len(self.df))
        return base_counts * (1 + 0.2 * features_norm.mean(axis=1))

    def generate_feature_weighted(self):
        """
        Generate playcounts modulated by each row's standardized features.

        The only feature signal is the row-wise mean z-score of
        ``self.features``; no feature/playcount correlation is applied (an
        unused correlation ``weights`` vector was removed).

        Returns integer counts floored at zero, one per row.
        """
        weighted_counts = self._feature_scaled_counts()
        return np.maximum(weighted_counts, 0).astype(int)

    def generate_user_aware_counts(self):
        """
        Generate playcounts considering user behavior patterns

        Each distinct 'user_id' gets a multiplier drawn from Normal(1, 0.2),
        applied on top of the feature-scaled counts of that user's rows.

        Returns integer counts floored at zero, one per row.
        """
        # User activity factors (some users listen more than others).
        # Drawn before the base counts so the random stream order is fixed.
        user_factors = np.random.normal(1, 0.2, len(np.unique(self.df['user_id'])))

        # Base song popularity from the standardized features
        song_popularity = self._feature_scaled_counts()

        # Map user factors to each song play
        user_indices = pd.factorize(self.df['user_id'])[0]
        weighted_counts = (song_popularity * user_factors[user_indices])

        return np.maximum(weighted_counts, 0).astype(int)

    def plot_comparison(self):
        """
        Compare original and user-aware playcount distributions

        Plots both histograms side by side and returns the two generated
        samples as ``(feature_weighted, user_aware)``.
        """
        fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 5))

        original_counts = self.generate_feature_weighted()
        user_aware_counts = self.generate_user_aware_counts()

        sns.histplot(original_counts, ax=ax1)
        ax1.set_title('Feature-weighted Distribution')
        ax1.set_xlabel('Playcount')

        sns.histplot(user_aware_counts, ax=ax2)
        ax2.set_title('User-aware Distribution')
        ax2.set_xlabel('Playcount')

        plt.tight_layout()
        plt.show()

        return original_counts, user_aware_counts

# Test the new implementation
# (rebinds `generator` to an instance of the class defined in this cell)
generator = PlaycountGenerator(df)
original, user_aware = generator.plot_comparison()

# Basic statistics comparison
# One column per generator; rows are the mean, standard deviation and 95th
# percentile of the two samples returned by plot_comparison().
print("\nDistribution Statistics:")
print(pd.DataFrame({
    'Original': [original.mean(), original.std(), np.percentile(original, 95)],
    'User-aware': [user_aware.mean(), user_aware.std(), np.percentile(user_aware, 95)]
}, index=['Mean', 'Std', '95th percentile']))

In [None]:
class PlaycountGenerator:
    """
    Generate synthetic playcounts for the rows of a listening DataFrame.

    Every generator returns one value per row of ``self.df`` and draws from
    the global NumPy random state, so results change between calls unless the
    caller seeds ``np.random`` first.
    """

    def __init__(self, df):
        """
        Params:
            df: DataFrame holding the columns named in ``self.features``.
                Individual generators additionally read 'playcount',
                'user_id', 'age' and 'main_genre'.
        """
        self.df = df
        self.features = ['duration', 'energy', 'danceability',
                         'acousticness', 'valence', 'tempo']

    def analyze_correlations(self):
        """
        Analyze correlations between features and existing playcounts
        Returns correlation matrix and plots heatmap
        """
        corr_matrix = self.df[self.features + ['playcount']].corr()
        plt.figure(figsize=(10, 8))
        sns.heatmap(corr_matrix, annot=True, cmap='coolwarm')
        plt.title('Feature Correlations with Playcount')
        plt.show()
        return corr_matrix

    def generate_poisson(self, lambda_param=10):
        """
        Generate playcounts using Poisson distribution
        Params:
            lambda_param: Mean playcount value
        """
        return np.random.poisson(lambda_param, len(self.df))

    def generate_negative_binomial(self, n=5, p=0.5):
        """
        Generate playcounts using Negative Binomial
        Useful for overdispersed count data
        Params:
            n: number of successes, passed to np.random.negative_binomial
            p: success probability, passed to np.random.negative_binomial
        """
        return np.random.negative_binomial(n, p, len(self.df))

    def generate_zero_inflated(self, lambda_param=10, zero_prob=0.2):
        """
        Generate zero-inflated playcounts
        Combines zeros with Poisson distribution
        Params:
            lambda_param: Poisson mean of the non-zeroed counts
            zero_prob: per-row probability that the count is forced to zero
        """
        zeros = np.random.binomial(1, zero_prob, len(self.df))
        counts = self.generate_poisson(lambda_param)
        return counts * (1 - zeros)

    def _feature_scaled_counts(self):
        """Poisson(10) base counts scaled by 1 + 0.2 * row-mean feature z-score (unclipped floats)."""
        features_norm = stats.zscore(self.df[self.features])
        base_counts = np.random.poisson(10, len(self.df))
        return base_counts * (1 + 0.2 * features_norm.mean(axis=1))

    def generate_feature_weighted(self):
        """
        Generate playcounts modulated by each row's standardized features.

        The only feature signal is the row-wise mean z-score of
        ``self.features``; no feature/playcount correlation is applied (an
        unused correlation ``weights`` vector was removed).

        Returns integer counts floored at zero, one per row.
        """
        weighted_counts = self._feature_scaled_counts()
        return np.maximum(weighted_counts, 0).astype(int)

    def generate_user_aware_counts(self):
        """
        Generate playcounts considering user behavior patterns

        Each distinct 'user_id' gets a multiplier drawn from Normal(1, 0.2),
        applied on top of the feature-scaled counts of that user's rows.

        Returns integer counts floored at zero, one per row.
        """
        # User activity factors (some users listen more than others).
        # Drawn before the base counts so the random stream order is fixed.
        user_factors = np.random.normal(1, 0.2, len(np.unique(self.df['user_id'])))

        # Base song popularity from the standardized features
        song_popularity = self._feature_scaled_counts()

        # Map user factors to each song play
        user_indices = pd.factorize(self.df['user_id'])[0]
        weighted_counts = (song_popularity * user_factors[user_indices])

        return np.maximum(weighted_counts, 0).astype(int)

    def generate_enhanced_playcounts(self):
        """
        Generate playcounts combining feature, user, recency and genre factors.

        final = feature-scaled Poisson(10) count
                * per-user Normal(1, 0.2) factor
                * (1 + 0.3 * exp(-0.5 * years since the date in 'age'))
                * (1 + 0.2 * min-max-normalized mean 'playcount' of 'main_genre')

        Rows with a missing date or genre, and frames where every genre has
        the same mean playcount, get a neutral (zero) boost for that factor
        instead of propagating NaN into the integer cast.

        Returns integer counts floored at zero, aligned with ``self.df``.
        """
        # Age factor
        # NOTE(review): the 'age' column is parsed as a date here, so it is
        # presumably a release date rather than a numeric age; confirm.
        current_date = pd.Timestamp.now()
        release_dates = pd.to_datetime(self.df['age'])
        song_age = (current_date - release_dates).dt.days / 365.25
        age_factor = np.exp(-0.5 * song_age).fillna(0.0)

        # Genre popularity, min-max scaled to [0, 1]
        genre_popularity = self.df.groupby('main_genre')['playcount'].mean()
        genre_factor = self.df['main_genre'].map(genre_popularity)
        genre_range = genre_factor.max() - genre_factor.min()
        if genre_range > 0:
            genre_factor = (genre_factor - genre_factor.min()) / genre_range
        else:
            # Single genre, identical means, or no usable genre at all:
            # dividing by the zero/NaN range would turn every row into NaN.
            genre_factor = pd.Series(0.0, index=self.df.index)
        genre_factor = genre_factor.fillna(0.0)

        # User factors (drawn before the base counts; keeps the random order fixed)
        user_factors = np.random.normal(1, 0.2, len(np.unique(self.df['user_id'])))
        user_indices = pd.factorize(self.df['user_id'])[0]

        # Base counts with features
        song_popularity = self._feature_scaled_counts()

        # Combine all factors
        final_counts = (song_popularity *
                        user_factors[user_indices] *
                        (1 + 0.3 * age_factor) *
                        (1 + 0.2 * genre_factor))

        return np.maximum(final_counts, 0).astype(int)

    def plot_comparison(self):
        """
        Plot the base, user-aware and enhanced samples side by side.

        Returns a DataFrame with the mean, standard deviation and 95th
        percentile of each of the three samples.
        """
        fig, axes = plt.subplots(1, 3, figsize=(20, 6))

        base_counts = self.generate_feature_weighted()
        user_aware = self.generate_user_aware_counts()
        enhanced = self.generate_enhanced_playcounts()

        distributions = {
            'Base': base_counts,
            'User-aware': user_aware,
            'Enhanced': enhanced
        }

        for (title, counts), ax in zip(distributions.items(), axes):
            sns.histplot(counts, ax=ax)
            ax.set_title(f'{title} Distribution')
            ax.set_xlabel('Playcount')

        plt.tight_layout()
        # Render explicitly so the figure also appears outside inline backends.
        plt.show()

        # Statistics comparison
        stats_df = pd.DataFrame({
            'Base': [base_counts.mean(), base_counts.std(), np.percentile(base_counts, 95)],
            'User-aware': [user_aware.mean(), user_aware.std(), np.percentile(user_aware, 95)],
            'Enhanced': [enhanced.mean(), enhanced.std(), np.percentile(enhanced, 95)]
        }, index=['Mean', 'Std', '95th percentile'])

        return stats_df

# Run comparison
# Rebinds `generator` to an instance of the class defined in this cell; the
# next cell reuses it to write the `enhanced_playcount` column.
generator = PlaycountGenerator(df)
comparison_stats = generator.plot_comparison()
print("\nDistribution Statistics:")
print(comparison_stats)

In [None]:
df["enhanced_playcount"] = generator.generate_enhanced_playcounts()

In [None]:
df.head(1)

In [None]:
df.columns

In [None]:
df.head(5)

In [None]:
# Sum the synthetic playcounts per (user_id, music_id) pair.
# Fail loudly when the column is missing: the display below would otherwise
# raise a NameError, or silently show an `aggregated_data` left over from an
# earlier run of this cell.
if 'enhanced_playcount' not in df.columns:
    raise KeyError(
        "'enhanced_playcount' column is missing; run the cell that generates it first"
    )

aggregated_data = (
    df.groupby(['user_id', 'music_id'])
    .agg({"enhanced_playcount": 'sum'})
    .reset_index()
)

aggregated_data.head()

# apply to the original data


In [None]:
df = df.merge(aggregated_data, on=['user_id', 'music_id'], how='left')

In [None]:
df.head()

In [None]:
# The merge in the previous cell suffixes the clashing column: `_x` is the
# per-row draw already on df, `_y` the per-(user_id, music_id) sum. Keep the sum.
df.drop(columns=["enhanced_playcount_x","plays"], inplace=True)
# NOTE(review): if df already carries a `playcount` column (the generator
# class reads one), this rename leaves two columns with the same name; after
# the CSV round-trip pandas loads the second as `playcount.1`, which is
# presumably why a later cell renames `playcount.1`. Confirm before changing.
df.rename(columns={"enhanced_playcount_y": "playcount"}, inplace=True)

In [None]:
df.isna().sum()

In [None]:
df.to_csv("../../data/o1_data.csv", index=False)

In [None]:
df = pd.read_csv("../../data/o1_data.csv")

In [None]:
# check for duplicates in the data
duplicates = df.duplicated().sum()
print(f"Number of duplicate entries: {duplicates}")

In [None]:
exact_duplicates = df[df.duplicated(keep=False)]
print(f"Number of exact duplicates: {exact_duplicates.shape[0]}")

In [None]:
exact_duplicates.head()

In [None]:
df = df.drop_duplicates(keep='first')

In [None]:
# assert that the duplicates are gone
duplicates = df.duplicated().sum()
assert duplicates == 0, f"{duplicates} duplicate rows remain after drop_duplicates"
print(f"Number of duplicate entries: {duplicates}")

In [None]:
df.shape

In [None]:
df.columns

In [None]:
# drop first playcount column, should be next to synthetic_playcount
# NOTE(review): presumably the aggregated values were loaded from the CSV as
# `playcount.1` (renamed in a later cell), so only the older `playcount` is
# removed here; confirm against the df.columns output above.
df.drop(columns=["playcount","synthetic_playcount"], inplace=True)
# rename id_artist to artist_id (rename skips keys that are not present)
df.rename(columns={"id_artist": "artist_id"}, inplace=True)

In [None]:
df.columns

In [None]:
df.head()

In [None]:
# rename playcount.1 to playcount
df.rename(columns={"playcount.1": "playcount"}, inplace=True)

In [None]:
# rename id_artists to artist_id
df.rename(columns={"id_artists": "artist_id"}, inplace=True)

In [None]:
# check for equal name/id matches in the data
names =['artist_name', 'music']
ids = ['artist_id', 'music_id']

def check_name_id_match(df, names, ids):
    """
    Report, for each (name column, id column) pair, whether any name maps to
    more than one distinct id.

    Params:
        df: DataFrame containing every column listed in ``names`` and ``ids``
        names: name column labels
        ids: id column labels, paired positionally with ``names``
    Prints one line per pair and returns nothing.
    """
    for name, id in zip(names, ids):
        # Number of distinct ids observed for each distinct name.
        distinct_ids = df.groupby(name)[id].nunique()
        if not distinct_ids.max() > 1:
            print(f"Name and {id} match is consistent.")
            continue
        print(f"Warning: {name} has multiple {id} entries.")

check_name_id_match(df, names, ids)

In [None]:
# count repeated artist names and artist IDs across rows (the name/ID mismatch itself is fixed in the next cell)
duplicate_artists = df['artist_name'].duplicated().sum()
duplicate_artist_ids = df['artist_id'].duplicated().sum()
print(f'Number of duplicate artist names: {duplicate_artists}\nNumber of duplicate artist IDs: {duplicate_artist_ids}')

In [None]:
# use hashing to encode artist names
# Deriving artist_id from the name makes the mapping deterministic: one name,
# one ID. `na_action="ignore"` leaves missing names as NaN instead of crashing
# on `.encode`, which floats (NaN) do not have.
df["artist_id"] = df["artist_name"].map(
    lambda name: hashlib.sha256(name.encode("utf-8")).hexdigest(),
    na_action="ignore",
)
df.head()

In [None]:
df.to_csv("../../data/o2_data.csv", index=False)

In [None]:
class MusicPreprocessor:
    """
    Preprocess a music DataFrame: derive ``music_age``, filter unexpected time
    signatures (fit only), handle outliers in the audio features, add two
    interaction features and drop ``loudness``.

    ``fit_transform`` learns the per-feature state (RobustScaler instances or
    IQR clipping bounds); ``transform`` re-applies that state to new data.
    """

    # Audio columns that receive outlier handling when present in the frame.
    AUDIO_FEATURES = (
        "acousticness",
        "danceability",
        "energy",
        "instrumentalness",
        "liveness",
        "loudness",
        "speechiness",
        "valence",
        "tempo",
    )

    # Rows whose time signature is not listed here are dropped during fitting.
    EXPECTED_TIME_SIGNATURES = (3, 4)

    def __init__(self, handle_outliers="robust", reference_year=2024):
        """
        Initialize the preprocessor

        Parameters:
        handle_outliers (str): Strategy for handling outliers
            'robust': Use RobustScaler
            'cap': Use IQR-based capping
            'none': Leave outliers as is
        reference_year (int): Year that ``release_year`` is subtracted from to
            obtain ``music_age`` (defaults to the previously hard-coded 2024)
        """
        self.handle_outliers = handle_outliers
        self.reference_year = reference_year
        self.scalers = {}     # feature -> fitted RobustScaler ('robust' mode)
        self.cap_bounds = {}  # feature -> (lower, upper) IQR bounds ('cap' mode)

    @staticmethod
    def _iqr_bounds(series):
        """Return the (Q1 - 1.5*IQR, Q3 + 1.5*IQR) fences of a numeric series."""
        q1 = series.quantile(0.25)
        q3 = series.quantile(0.75)
        iqr = q3 - q1
        return q1 - 1.5 * iqr, q3 + 1.5 * iqr

    def _cap_outliers(self, series):
        """Clip a series to its own IQR fences and return the clipped series."""
        lower_bound, upper_bound = self._iqr_bounds(series)
        return np.clip(series, lower_bound, upper_bound)

    def _handle_time_signature_outliers(self, df):
        """
        Keep only rows whose ``time_signature`` is in EXPECTED_TIME_SIGNATURES.

        Frames without that column are returned unchanged. The filtered frame
        is copied so later column assignments do not write into a slice.
        """
        if "time_signature" not in df.columns:
            return df
        keep = df["time_signature"].isin(list(self.EXPECTED_TIME_SIGNATURES))
        return df[keep].copy()

    def _add_music_age(self, df_processed):
        """Add ``music_age`` in place when ``release_year`` is available."""
        if "release_year" in df_processed.columns:
            df_processed["music_age"] = (
                self.reference_year - df_processed["release_year"]
            )

    @staticmethod
    def _finalize_features(df_processed):
        """Add the interaction features whose inputs exist, then drop ``loudness``."""
        columns = set(df_processed.columns)
        if {"energy", "loudness"} <= columns:
            df_processed["energy_loudness"] = (
                df_processed["energy"] * df_processed["loudness"]
            )
        if {"danceability", "valence"} <= columns:
            df_processed["dance_valence"] = (
                df_processed["danceability"] * df_processed["valence"]
            )
        # loudness is dropped because it is highly correlated with energy
        return df_processed.drop(["loudness"], axis=1, errors="ignore")

    def fit_transform(self, df):
        """
        Preprocess the music dataset and remember the fitted state.

        The input frame is not modified. Rows with unexpected time signatures
        are removed, so the result may have fewer rows than ``df``.
        """
        df_processed = df.copy()
        # Refit from scratch so state from an earlier fit cannot leak through.
        self.scalers = {}
        self.cap_bounds = {}

        # 1. Handle temporal features
        self._add_music_age(df_processed)

        # 2. Handle plays (use log transformation due to heavy skew)
        # if "plays" in df_processed.columns:
        #     df_processed["plays_log"] = np.log1p(df_processed["plays"])

        # 3. Handle time signature outliers
        df_processed = self._handle_time_signature_outliers(df_processed)

        # 4. Process audio features
        for feature in self.AUDIO_FEATURES:
            if feature not in df_processed.columns:
                continue
            if self.handle_outliers == "cap":
                bounds = self._iqr_bounds(df_processed[feature])
                self.cap_bounds[feature] = bounds
                df_processed[feature] = np.clip(df_processed[feature], *bounds)
            elif self.handle_outliers == "robust":
                scaler = RobustScaler()
                self.scalers[feature] = scaler
                df_processed[feature] = scaler.fit_transform(
                    df_processed[feature].values.reshape(-1, 1)
                ).ravel()

        # 5./6. Interaction features, then drop the collinear loudness column
        return self._finalize_features(df_processed)

    def transform(self, df):
        """
        Transform new data using fitted preprocessor

        Applies the scalers / clipping bounds learned by ``fit_transform``.
        Unlike ``fit_transform`` it does not drop rows by time signature, so
        the output has one row per input row.
        """
        df_processed = df.copy()

        self._add_music_age(df_processed)

        # if "plays" in df_processed.columns:
        #     df_processed["plays_log"] = np.log1p(df_processed["plays"])

        # 'cap' mode: reuse the fences learned during fitting
        for feature, (lower_bound, upper_bound) in self.cap_bounds.items():
            if feature in df_processed.columns:
                df_processed[feature] = np.clip(
                    df_processed[feature], lower_bound, upper_bound
                )

        # 'robust' mode: reuse the fitted scalers
        for feature, scaler in self.scalers.items():
            if feature in df_processed.columns:
                df_processed[feature] = scaler.transform(
                    df_processed[feature].values.reshape(-1, 1)
                ).ravel()

        return self._finalize_features(df_processed)


# Assuming df is already defined and contains the cleaned data from previous steps

# Initialize the preprocessor with 'robust' outlier handling
preprocessor = MusicPreprocessor(handle_outliers="robust")

# Fit and transform the data
df_processed = preprocessor.fit_transform(df)

# Show the processed data
print(df_processed.head())

# Visualize the distribution before and after transformation

# The plays / plays_log comparison is disabled because the preprocessor no
# longer produces `plays_log`. Its tight_layout()/show() calls are disabled
# with it: left active, they create and render an empty figure.

# Original 'plays' distribution
# plt.figure(figsize=(12, 6))
# plt.subplot(1, 2, 1)
# sns.histplot(df["plays"], bins=10, kde=True)
# plt.title("Original Plays Distribution")

# Transformed 'plays_log' distribution
# plt.subplot(1, 2, 2)
# sns.histplot(df_processed["plays_log"], bins=10, kde=True)
# plt.title("Transformed Plays Log Distribution")

# plt.tight_layout()
# plt.show()

# Energy before (left) and after (right) preprocessing
fig, (ax_before, ax_after) = plt.subplots(1, 2, figsize=(12, 6))

sns.histplot(df["energy"], bins=10, kde=True, ax=ax_before)
ax_before.set_title("Original Energy Distribution")

sns.histplot(df_processed["energy"], bins=10, kde=True, ax=ax_after)
ax_after.set_title("Transformed Energy Distribution")

plt.tight_layout()
plt.show()

# time_signature counts before (left) and after (right) preprocessing
fig, (ax_before, ax_after) = plt.subplots(1, 2, figsize=(12, 6))

sns.countplot(x="time_signature", data=df, ax=ax_before)
ax_before.set_title("Original Time Signature Distribution")

sns.countplot(x="time_signature", data=df_processed, ax=ax_after)
ax_after.set_title("Processed Time Signature Distribution")

plt.tight_layout()
plt.show()

In [None]:
"synthetic_playcount" in df.columns

In [None]:
df.shape

In [None]:
# use robust scaling on the playcounts and plot for visualization using seaborn
# `synthetic_playcount` is dropped by an earlier cleanup cell, so on a
# top-to-bottom run it is gone by now; fall back to `playcount` in that case.
plays_column = (
    "synthetic_playcount" if "synthetic_playcount" in df.columns else "playcount"
)
scaler = RobustScaler()
# fit_transform returns an (n, 1) array; flatten it before storing as a column
df["plays_scaled"] = scaler.fit_transform(df[[plays_column]]).ravel()
sns.histplot(df["plays_scaled"], bins=10, kde=True)
plt.title("Robust Scaled Plays Distribution")

In [None]:
# MusicPreprocessor has its `plays_log` step commented out, so the column is
# normally absent from df_processed; derive the log-transformed counts locally.
# NOTE(review): assumes `playcount` is the column that replaced `plays`; confirm.
if "plays_log" in df_processed.columns:
    plays_log = df_processed["plays_log"]
else:
    plays_log = np.log1p(df_processed["playcount"]).rename("plays_log")
sns.histplot(plays_log, bins=10, kde=True)
plt.title("Log Transformed Plays Distribution")

In [None]:
df_processed.shape

In [None]:
df = df_processed.copy()

In [None]:
# # handle missing values in 'genre' and 'featured_artists'
# df_processed[""].replace("Unknown", "Other", inplace=True)
# df_processed["featured_artists"].fillna("None", inplace=True)

In [None]:
# # make datframe of all unknown values in 'genre' and 'featured_artists' along with the music
# unknown_genre = df_processed[df_processed["genre"] == "Other"]
# unknown_artists = df_processed[df_processed["featured_artists"] == "NaN"]
# unknown_artists
# # unknown_genre

In [None]:
df_processed.columns

In [None]:
# confirm that artist names and artist ids are unique
# (the bare tuple expression that used to precede the print was discarded
# without being displayed, so it has been removed)
print(
    f'Number of unique artists: {df_processed["artist_id"].nunique()}\nNumber of unique artist names: {df_processed["artist_name"].nunique()}'
)

In [None]:
'main_genre' in df.columns

In [None]:
df = pd.read_csv("../../data/o2_data.csv")

In [None]:
df["main_genre"].value_counts()

In [None]:
df.columns

In [None]:
df.head()