# 🧪 ETL Data Project (CSV, JSON, TXT)
DATA INGESTION AND WRANGLING

Joachim Justesen

In [None]:
# Step 1: Imports and Setup
import pandas as pd
import numpy as np
import json
import matplotlib.pyplot as plt
import seaborn as sns
import re
import os

# Display plots inline
# NOTE: the line below is an IPython magic, so this cell only runs inside a
# Jupyter/IPython kernel, not as a plain Python script.
%matplotlib inline
# None removes the column limit, so wide DataFrames are shown without truncation.
pd.set_option('display.max_columns', None)

# Paths
# All three input files are expected inside the BASE_PATH directory, which is
# resolved relative to the current working directory.
BASE_PATH = "data"
CSV_FILE = os.path.join(BASE_PATH, "structured_data.csv")
JSON_FILE = os.path.join(BASE_PATH, "semi_structured_data.json")
TXT_FILE = os.path.join(BASE_PATH, "unstructured_data.txt")

# Confirm paths
# Raise explicitly instead of using `assert`: assertions are stripped when
# Python runs with -O, and collecting first lets the error name every missing
# input at once rather than only the first one.
missing_files = [f for f in (CSV_FILE, JSON_FILE, TXT_FILE) if not os.path.exists(f)]
if missing_files:
    raise FileNotFoundError(f"File not found: {', '.join(missing_files)}")

print(" All files found")

In [None]:
#  Loaders

def load_csv_to_df(file_path):
    """Read the CSV file at ``file_path`` into a DataFrame.

    Parsing uses the defaults of ``pd.read_csv``; no options are overridden.

    Args:
        file_path: Location of the CSV file to read.

    Returns:
        The DataFrame produced by ``pd.read_csv``.
    """
    frame = pd.read_csv(file_path)
    return frame

def load_json_to_df(file_path):
    """Load a JSON file and flatten it into a DataFrame.

    The parsed document is handed to ``pd.json_normalize``, which flattens
    nested objects into columns whose names join the nested keys with a dot
    (for example ``profile.email``).

    Args:
        file_path: Location of the JSON file to read.

    Returns:
        The DataFrame produced by ``pd.json_normalize``.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file does not contain valid JSON.
    """
    # Decode as UTF-8 explicitly so the result does not depend on the
    # platform's default locale encoding; this also matches how the text
    # loader in this file opens its input.
    with open(file_path, 'r', encoding='utf-8') as f:
        data = json.load(f)
    return pd.json_normalize(data)

def load_txt_to_list(file_path):
    """Return the non-blank lines of a UTF-8 text file, stripped of whitespace.

    Args:
        file_path: Location of the text file to read.

    Returns:
        A list of strings in file order. Each entry has its leading and
        trailing whitespace removed; lines that are empty after stripping
        are left out.
    """
    kept = []
    with open(file_path, 'r', encoding='utf-8') as handle:
        for raw in handle:
            stripped = raw.strip()
            if stripped:
                kept.append(stripped)
    return kept

In [None]:
#  Ingest the Data
# Load each source with its matching loader: the CSV and JSON files become
# DataFrames, the text file becomes a list of stripped, non-empty lines.
csv_data = load_csv_to_df(CSV_FILE)
json_data = load_json_to_df(JSON_FILE)
txt_data = load_txt_to_list(TXT_FILE)

# Preview the first rows of each frame.
# NOTE: display() is provided by IPython/Jupyter, not by plain Python.
print("CSV Preview:")
display(csv_data.head())

print("JSON Preview:")
display(json_data.head())

# The text data is a plain list, so preview it by slicing the first three lines.
print("TXT Preview:")
print(txt_data[:3])

In [None]:
#  Clean the Data

def explore_and_clean(df):
    """Print a summary of ``df`` and drop rows with more than one missing value.

    The summary consists of the ``DataFrame.info()`` report followed by the
    number of missing values per column.

    Args:
        df: The DataFrame to inspect. It is not modified.

    Returns:
        A new DataFrame containing only the rows that have at least
        ``len(df.columns) - 1`` non-missing values.
    """
    print("Info:")
    # DataFrame.info() prints its report itself and returns None, so it is
    # called directly: wrapping it in display() only rendered that None and
    # made the function depend on IPython.
    df.info()
    print("\nMissing Values:")
    print(df.isnull().sum())
    # thresh is the minimum number of non-missing values a row needs to be
    # kept, so a row may have at most one missing value.
    return df.dropna(thresh=len(df.columns) - 1)

# Rebind both names to the returned frames: explore_and_clean returns the
# result of dropna rather than changing the frame it was given.
csv_data = explore_and_clean(csv_data)
json_data = explore_and_clean(json_data)

def clean_text(lines):
    """Lower-case each line and strip out punctuation.

    Every character that is neither a word character (letters, digits,
    underscore) nor whitespace is removed.

    Args:
        lines: Iterable of strings to normalise.

    Returns:
        A new list with one cleaned string per input line, in the same order.
    """
    cleaned = []
    for text in lines:
        lowered = text.lower()
        cleaned.append(re.sub(r'[^\w\s]', '', lowered))
    return cleaned

# Replace the raw lines with their cleaned form, then preview the first three.
txt_data = clean_text(txt_data)
print(txt_data[:3])

In [None]:
#  Anonymisation

def anonymise_column(df, column_name):
    """Overwrite every value in one column with the placeholder 'ANONYMIZED'.

    The frame is modified in place and also returned, so the call can be used
    either as a statement or in an assignment.

    Args:
        df: The DataFrame holding the column to mask.
        column_name: Name of the column to overwrite.

    Returns:
        The same DataFrame object that was passed in.

    Warns:
        UserWarning: If ``column_name`` is not a column of ``df``. The frame
            is then returned unchanged.
    """
    if column_name not in df.columns:
        # Skipping silently would let a mistyped or renamed column pass
        # through with its personal data intact, so make the no-op visible
        # while still returning the frame as before.
        import warnings

        warnings.warn(
            f"Column {column_name!r} not found; nothing was anonymised.",
            stacklevel=2,
        )
        return df
    df[column_name] = 'ANONYMIZED'
    return df

# Mask the "name" column of the CSV frame and the "profile.email" column of
# the JSON frame. The dotted name is presumably the flattened form of a nested
# profile -> email field produced by pd.json_normalize; verify against the data.
csv_data = anonymise_column(csv_data, "name")
json_data = anonymise_column(json_data, "profile.email")

# Preview both frames after masking.
display(csv_data.head())
display(json_data.head())

In [None]:
#  Visualisation

def visualize_data(df, column):
    """Show a histogram with a KDE curve for one column of ``df``.

    Missing values in the column are dropped before plotting. The figure is
    shown with ``plt.show()``; nothing is returned.

    Args:
        df: The DataFrame that holds the data.
        column: Name of the column to plot; also used for the title and the
            x-axis label.
    """
    plt.figure(figsize=(8, 5))

    values = df[column].dropna()
    sns.histplot(values, kde=True)

    # Labels and grid are applied after plotting so they act on the histogram's axes.
    plt.ylabel("Count")
    plt.xlabel(column)
    plt.title(f"Distribution of {column}")
    plt.grid(True)

    plt.show()

# Collect the names of the numeric columns in the CSV frame.
numeric_cols = csv_data.select_dtypes(include=np.number).columns.tolist()
# Plot only the first numeric column; skip plotting when there is none.
if numeric_cols:
    visualize_data(csv_data, numeric_cols[0])

In [None]:
#  Save Clean Data

# Write both cleaned frames as CSV into the current working directory;
# index=False leaves the row index out of the files.
csv_data.to_csv("cleaned_structured_data.csv", index=False)
json_data.to_csv("cleaned_semi_structured_data.csv", index=False)

# Write UTF-8 explicitly: the text was read as UTF-8 and the cleaning step
# keeps non-ASCII word characters, so relying on the platform default encoding
# could raise UnicodeEncodeError or produce a differently encoded file.
with open("cleaned_unstructured_text.txt", "w", encoding="utf-8") as f:
    for line in txt_data:
        f.write(f"{line}\n")

print(" Cleaned files saved.")