In [1]:
import json
import pandas as pd

In [2]:
# Read the raw streaming-history export from disk and parse it as JSON.
file_path = "../uncleaned_data/Streaming_History_Audio_2015-2016_0.json"  # adjust if the export lives elsewhere
with open(file_path, mode="r", encoding="utf-8") as f:
    # Equivalent to json.load(f): read the whole text, then decode it.
    data = json.loads(f.read())

In [3]:
# Wrap the parsed JSON payload in a DataFrame for tabular processing.
# NOTE(review): presumably `data` is a list of per-play records - confirm against the export.
df = pd.DataFrame(data=data)

In [4]:
# Parse the "ts" strings into timezone-aware (UTC) datetimes, stored in a new
# "date" column. errors="coerce" turns unparseable values into NaT instead of
# raising.
df["date"] = pd.to_datetime(
    df["ts"],
    utc=True,
    errors="coerce",
)

In [5]:
# Keep only the rows whose UTC timestamp falls in calendar year 2015.
# Rows whose timestamp could not be parsed (NaT) never match and are left out.
df_2015 = df.loc[df["date"].dt.year.eq(2015)]

In [6]:
# Columns that make up the cleaned export.
required_columns = [
    "ts",
    "ms_played",
    "master_metadata_track_name",
    "master_metadata_album_artist_name",
    "master_metadata_album_album_name",
]

# Project onto the required columns and discard rows in which every one of
# them is missing. The trailing .copy() makes df_2015 an independent frame,
# so the later assignment to df_2015["ts"] cannot trigger
# SettingWithCopyWarning by writing into something pandas still tracks as a
# slice of `df`.
df_2015 = df_2015[required_columns].dropna(how="all").copy()

In [7]:
# Sanity check: report (rows, columns) and stop right here if the 2015 filter
# matched nothing, so that empty output files are never written further down.
print(df_2015.shape)
assert len(df_2015) > 0, "No 2015 rows found - check file_path and the 'ts' column"

(11013, 5)


In [12]:
# Re-render "ts" as plain "YYYY-MM-DD HH:MM:SS" text: parse the existing
# values as datetimes, then format them back to strings.
df_2015["ts"] = (
    pd.to_datetime(df_2015["ts"])
    .dt.strftime("%Y-%m-%d %H:%M:%S")
)

In [13]:
# Strip any literal double-quote characters out of the column labels.
# regex=False states explicitly that '"' is matched as plain text.
df_2015.columns = df_2015.columns.str.replace('"', '', regex=False)

In [14]:
# Drop the helper "date" column from the frame that is about to be exported.
# The drop is applied to df_2015 rather than to the raw `df`: removing "date"
# from `df` does not change the saved files, and it makes the year-filter
# cell (which reads df["date"]) fail with a KeyError when it is re-run.
# errors="ignore" keeps this a no-op when the column is already absent, which
# is the normal case once df_2015 has been reduced to required_columns.
df_2015 = df_2015.drop(columns=["date"], errors="ignore")

In [15]:
# Export the cleaned 2015 rows as pretty-printed JSON, one object per row.
df_2015.to_json(
    "../cleaned_data/json/Streaming_History_2015.json",
    indent=4,
    orient="records",
)

In [18]:
# Export the same rows as CSV, without the DataFrame index.
# quoting=1 is csv.QUOTE_ALL: every field, including those of the header row,
# is written wrapped in double quotes.
df_2015.to_csv(
    "../cleaned_data/csv/spotify_2015_fixed.csv",
    header=True,
    quoting=1,
    index=False,
)

In [19]:
# Load the freshly written CSV back as a list of raw text lines (UTF-8), so
# that the header line can be rewritten without its quotes.
with open("../cleaned_data/csv/spotify_2015_fixed.csv", mode="r", encoding="utf-8") as file:
    lines = list(file)

In [21]:
# Rewrite only the header line: remove every double quote, trim surrounding
# whitespace (including the old line ending) and restore a single "\n".
# Data rows are left untouched and therefore stay quoted.
lines[0] = lines[0].replace('"', "").strip() + "\n"

In [22]:
# Persist the edited lines, overwriting the CSV in place (UTF-8).
with open("../cleaned_data/csv/spotify_2015_fixed.csv", mode="w", encoding="utf-8") as file:
    file.write("".join(lines))