Importing the user listening-history dataset

In [3]:
pip install zstandard

Collecting zstandardNote: you may need to restart the kernel to use updated packages.

  Downloading zstandard-0.23.0-cp311-cp311-win_amd64.whl.metadata (3.0 kB)
Downloading zstandard-0.23.0-cp311-cp311-win_amd64.whl (495 kB)
   ---------------------------------------- 0.0/495.4 kB ? eta -:--:--
    --------------------------------------- 10.2/495.4 kB ? eta -:--:--
   -- ------------------------------------ 30.7/495.4 kB 435.7 kB/s eta 0:00:02
   ------- ------------------------------- 92.2/495.4 kB 871.5 kB/s eta 0:00:01
   ---------------------------------------  491.5/495.4 kB 3.1 MB/s eta 0:00:01
   ---------------------------------------- 495.4/495.4 kB 2.8 MB/s eta 0:00:00
Installing collected packages: zstandard
Successfully installed zstandard-0.23.0



[notice] A new release of pip is available: 24.0 -> 25.0.1
[notice] To update, run: C:\Users\Gartb\AppData\Local\Microsoft\WindowsApps\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\python.exe -m pip install --upgrade pip


In [1]:
import os
import sqlite3
import tarfile
import zstandard as zstd
import pandas as pd

# Locations and names used by the import pipeline
TAR_FILE = r"C:\Users\Taylor\mlhdplus-partial-0.tar"  # Change this to your actual .tar file
EXTRACT_PATH = "./mlhd_extracted"  # Directory the archive is unpacked into
DB_FILE = "mlhd_user_history.db"  # SQLite database file that receives the rows
TABLE_NAME = "user_history"  # Table inside DB_FILE holding the listening history

# Create SQLite table if it doesn't exist
def create_table(db_file=None, table_name=None):
    """Create the listening-history table if it does not already exist.

    Args:
        db_file: Path of the SQLite database file. Defaults to the
            module-level ``DB_FILE``.
        table_name: Name of the table to create. Defaults to the module-level
            ``TABLE_NAME``. The name is interpolated into the SQL text, so it
            must be a trusted identifier.
    """
    db_file = DB_FILE if db_file is None else db_file
    table_name = TABLE_NAME if table_name is None else table_name

    conn = sqlite3.connect(db_file)
    try:
        conn.execute(f"""
            CREATE TABLE IF NOT EXISTS {table_name} (
                timestamp INTEGER,
                artist_mbid TEXT,
                release_mbid TEXT NULL,
                recording_mbid TEXT NULL
            );
        """)
        conn.commit()
    finally:
        # Release the database handle even when the statement fails.
        conn.close()

# Extract .tar archive
def extract_tar(tar_file=None, extract_path=None):
    """Unpack the tar archive into the extraction directory.

    Args:
        tar_file: Path of the archive to unpack. Defaults to the module-level
            ``TAR_FILE``.
        extract_path: Destination directory. Defaults to the module-level
            ``EXTRACT_PATH``.
    """
    tar_file = TAR_FILE if tar_file is None else tar_file
    extract_path = EXTRACT_PATH if extract_path is None else extract_path

    print(f"\U0001F4E6 Extracting {tar_file} to {extract_path}...")
    with tarfile.open(tar_file, "r") as tar:
        if hasattr(tarfile, "data_filter"):
            # Extraction filters are available: use the "data" filter so that
            # archive members cannot be written outside extract_path.
            tar.extractall(extract_path, filter="data")
        else:
            # Older Python without extraction filters: plain extraction.
            tar.extractall(extract_path)
    print("✅ Extraction complete!")

# Function to read and process a single .zst file
def read_mlhd_file(filepath):
    """Decompress one zstd-compressed, tab-separated file into a DataFrame.

    Every non-blank line is split on tabs into at most four fields. Empty
    fields become missing values and short rows are padded with missing
    values. Blank (whitespace-only) lines are skipped, so an empty file yields
    an empty DataFrame rather than a single all-NULL row.

    Args:
        filepath: Path of the ``.zst`` file to read.

    Returns:
        DataFrame with columns ``timestamp``, ``artist_mbid``,
        ``release_mbid`` and ``recording_mbid``. ``timestamp`` is numeric;
        values that cannot be parsed as numbers become NaN.

    Raises:
        ValueError: If a line contains more than four tab-separated fields.
    """
    columns = ["timestamp", "artist_mbid", "release_mbid", "recording_mbid"]

    with open(filepath, 'rb') as f:
        dctx = zstd.ZstdDecompressor()
        with dctx.stream_reader(f) as reader:
            text = reader.read().decode("utf-8")  # Decompress and read file

    rows = []
    for line_no, raw_line in enumerate(text.split("\n"), start=1):
        line = raw_line.rstrip("\r")  # tolerate CRLF line endings
        # Only skip lines that are entirely blank. The text is deliberately
        # not stripped as a whole: a leading tab marks an empty first field,
        # and removing it would shift the remaining fields one column left.
        if not line.strip():
            continue

        # Map empty fields to None here instead of calling
        # Series.replace("", None) afterwards, whose handling of an explicit
        # None value has differed between pandas versions.
        fields = [field or None for field in line.split("\t")]
        if len(fields) > len(columns):
            raise ValueError(
                f"{filepath}: line {line_no} has {len(fields)} fields, "
                f"expected at most {len(columns)}"
            )
        fields.extend([None] * (len(columns) - len(fields)))
        rows.append(fields)

    # Convert to DataFrame with correct column names
    df = pd.DataFrame(rows, columns=columns)

    # Unparseable or missing timestamps become NaN instead of raising.
    df["timestamp"] = pd.to_numeric(df["timestamp"], errors="coerce")

    return df

# Function to insert data into SQLite
def insert_data(df, db_file=None, table_name=None):
    """Append the rows of a DataFrame to the SQLite table.

    Args:
        df: DataFrame whose columns match the target table.
        db_file: Path of the SQLite database file. Defaults to the
            module-level ``DB_FILE``.
        table_name: Table to append to. Defaults to the module-level
            ``TABLE_NAME``.
    """
    db_file = DB_FILE if db_file is None else db_file
    table_name = TABLE_NAME if table_name is None else table_name

    conn = sqlite3.connect(db_file)
    try:
        df.to_sql(table_name, conn, if_exists="append", index=False)
        conn.commit()
    finally:
        # Release the database handle even when the insert fails.
        conn.close()

# Process all extracted .zst files
def process_mlhd_folder():
    """Walk EXTRACT_PATH and load every ``.zst`` file found into SQLite."""
    for dirpath, _subdirs, filenames in os.walk(EXTRACT_PATH):
        for name in filenames:
            # Anything that is not a .zst file is ignored.
            if not name.endswith(".zst"):
                continue
            zst_path = os.path.join(dirpath, name)
            print(f"\U0001F4C2 Processing file: {zst_path}")
            # Parse the file, then append its rows to the database.
            insert_data(read_mlhd_file(zst_path))

# MAIN EXECUTION
if __name__ == "__main__":
    # Run the pipeline stages in order: create the table, unpack the
    # archive, then import every extracted .zst file.
    for stage in (create_table, extract_tar, process_mlhd_folder):
        stage()
    print("✅ All MLHD files imported successfully!")

📦 Extracting C:\Users\Taylor\mlhdplus-partial-0.tar to ./mlhd_extracted...
✅ Extraction complete!
📂 Processing file: ./mlhd_extracted\00\0000119c-0f93-4e28-894b-14d32aa2ed2e.txt.zst
📂 Processing file: ./mlhd_extracted\00\00003c83-a3a0-4720-821a-1f34890d21c7.txt.zst
📂 Processing file: ./mlhd_extracted\00\00004b7e-33e5-43d2-a1e3-a089459a23a9.txt.zst
📂 Processing file: ./mlhd_extracted\00\00006438-6f47-48f6-a41f-09f335502969.txt.zst
📂 Processing file: ./mlhd_extracted\00\000074e8-72eb-48f3-9c8e-ea55b306ebb4.txt.zst
📂 Processing file: ./mlhd_extracted\00\00008118-11c8-4d45-b112-a4c9ab7d823b.txt.zst
📂 Processing file: ./mlhd_extracted\00\0000a92b-f14b-4f08-b175-3d456ffe0d65.txt.zst
📂 Processing file: ./mlhd_extracted\00\0000c6cd-c8c2-4d83-8243-5d0a8d63f5cb.txt.zst
📂 Processing file: ./mlhd_extracted\00\0000c8a0-7f76-4c7a-8602-0805dad51315.txt.zst
📂 Processing file: ./mlhd_extracted\00\0000ef9e-b9ea-4a08-b164-0a6510afdfe3.txt.zst
📂 Processing file: ./mlhd_extracted\00\00010029-b8aa-46d8-82cd

Testing to see if the data was imported correctly


In [None]:
import sqlite3
import pandas as pd

DB_FILE = "mlhd_user_history.db"  # Database file

def query_data(query, limit=10, db_file=None):
    """Run a SQL query against the SQLite database and print a preview.

    Args:
        query: SQL text to execute.
        limit: Maximum number of result rows to print. The printed header
            always reports this value, even when the query returns fewer rows.
        db_file: Path of the SQLite database file. Defaults to the
            module-level ``DB_FILE``.

    Returns:
        DataFrame holding the full query result (not only the printed rows).
    """
    conn = sqlite3.connect(DB_FILE if db_file is None else db_file)
    try:
        df = pd.read_sql_query(query, conn)  # Execute SQL query
    finally:
        # Release the database handle even when the query fails.
        conn.close()

    # Display results
    print(f"🔍 Query Results (Showing {limit} rows):")
    print(df.head(limit))  # Show first 'limit' rows

    return df

# Example: fetch the first 10 records (the row cap is applied by the SQL LIMIT clause)
query_data("SELECT * FROM user_history LIMIT 10;")



🔍 Query Results (Showing 10 rows):
    timestamp                           artist_mbid  \
0  1368202881  75167b8b-44e4-407b-9d35-effe87b223cf   
1  1368201449  b822008d-f2eb-46fe-8ec4-3ad7499a6062   
2  1368200652  e4d7cfe5-0bed-46cf-acad-ab9a4dcb7aa6   
3  1368200543  e4d7cfe5-0bed-46cf-acad-ab9a4dcb7aa6   
4  1368197632                                  None   
5  1365521481  bcabb743-60ed-406b-94da-9bf82e032e58   
6  1365521085                                  None   
7  1365520881  7f3ab0d9-6fff-48e3-be28-6119e6bbf8a0   
8  1365520689  7b36bd95-064e-4501-8027-5bcdb9f4c0d3   
9  1365520492                                  None   

                           release_mbid recording_mbid  
0  03754f6b-bc0e-4ddd-8c78-82287811aa7f           None  
1  852303fd-7651-451c-96f8-5193122b8490           None  
2                                  None           None  
3                                  None           None  
4                                  None           None  
5                

Unnamed: 0,timestamp,artist_mbid,release_mbid,recording_mbid
0,1368202881,75167b8b-44e4-407b-9d35-effe87b223cf,03754f6b-bc0e-4ddd-8c78-82287811aa7f,
1,1368201449,b822008d-f2eb-46fe-8ec4-3ad7499a6062,852303fd-7651-451c-96f8-5193122b8490,
2,1368200652,e4d7cfe5-0bed-46cf-acad-ab9a4dcb7aa6,,
3,1368200543,e4d7cfe5-0bed-46cf-acad-ab9a4dcb7aa6,,
4,1368197632,,,
5,1365521481,bcabb743-60ed-406b-94da-9bf82e032e58,,
6,1365521085,,,
7,1365520881,7f3ab0d9-6fff-48e3-be28-6119e6bbf8a0,,
8,1365520689,7b36bd95-064e-4501-8027-5bcdb9f4c0d3,,
9,1365520492,,,


In [4]:
import sqlite3
import pandas as pd

DB_FILE = "mlhd_user_history.db"  # Database file

def query_data(query, limit=10, db_file=None):
    """Run a SQL query against the SQLite database and print a preview.

    Args:
        query: SQL text to execute.
        limit: Maximum number of result rows to print. The printed header
            always reports this value, even when the query returns fewer rows.
        db_file: Path of the SQLite database file. Defaults to the
            module-level ``DB_FILE``.

    Returns:
        DataFrame holding the full query result (not only the printed rows).
    """
    conn = sqlite3.connect(DB_FILE if db_file is None else db_file)
    try:
        df = pd.read_sql_query(query, conn)  # Execute SQL query
    finally:
        # Release the database handle even when the query fails.
        conn.close()

    # Display results
    print(f"🔍 Query Results (Showing {limit} rows):")
    print(df.head(limit))  # Show first 'limit' rows

    return df

# Example: count the total number of rows in the table
query_data("SELECT count(*) FROM user_history;")


🔍 Query Results (Showing 10 rows):
    count(*)
0  396472658


Unnamed: 0,count(*)
0,396472658
