In [1]:
import wandb
import pandas as pd

# Open a W&B run for this EDA session (project "nyc_airbnb", group "eda")
run = wandb.init(
    project="nyc_airbnb",
    group="eda",
    save_code=True,
)

# Resolve the latest version of the sample artifact, get a local path to its
# file, and read it into a DataFrame
sample_artifact = wandb.use_artifact("sample.csv:latest")
local_path = sample_artifact.file()
df = pd.read_csv(local_path)


[34m[1mwandb[0m: Currently logged in as: [33mmkeadachik[0m ([33mmkeadachik-dynatrace[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


In [2]:
import ydata_profiling

# Profile the data exactly as loaded, before any cleaning.
# Work on a copy so `df` itself stays untouched by later steps.
original_report_path = "report_original.html"
df_original = df.copy()
profile_original = ydata_profiling.ProfileReport(df_original)
profile_original.to_file(original_report_path)

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

100%|██████████| 16/16 [00:00<00:00, 80.84it/s]


Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

In [3]:
# Data cleaning - derive the cleaned frame from the untouched original

# Keep only listings whose price lies within [min_price, max_price] (both ends inclusive)
min_price = 10
max_price = 350
idx = df_original['price'].between(min_price, max_price)

# Boolean selection followed by assign() yields a new frame, so df_original is
# never modified; last_review is parsed to datetime in the same step
df_cleaned = df_original.loc[idx].assign(
    last_review=lambda frame: pd.to_datetime(frame['last_review'])
)

# Report how many rows the price filter dropped
n_removed = df_original.shape[0] - df_cleaned.shape[0]
print(f"Original dataset shape: {df_original.shape}")
print(f"Cleaned dataset shape: {df_cleaned.shape}")
print(f"Removed {n_removed} outliers")


Original dataset shape: (20000, 16)
Cleaned dataset shape: (19001, 16)
Removed 999 outliers


In [4]:
# Profile the cleaned frame so it can be compared against the original report
cleaned_report_path = "report_cleaned.html"
profile_cleaned = ydata_profiling.ProfileReport(df_cleaned)
profile_cleaned.to_file(cleaned_report_path)

print(f"Created profile report for cleaned data: {cleaned_report_path}")


Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

100%|██████████| 16/16 [00:00<00:00, 96.35it/s]


Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

Created profile report for cleaned data: report_cleaned.html


In [5]:
# Upload cleaned data to W&B
# (`wandb` is already imported in the first cell, so it is not re-imported here.)

# Save cleaned data to a local CSV without the pandas index column
cleaned_csv_path = "sample_cleaned.csv"
df_cleaned.to_csv(cleaned_csv_path, index=False)

# Create artifact for cleaned data and attach the CSV written above
artifact = wandb.Artifact(
    "sample_cleaned.csv",
    type="clean_data",
    description="Cleaned sample data with outliers removed and date conversion"
)
artifact.add_file(cleaned_csv_path)

# log_artifact() only schedules the upload; wait() blocks until the artifact
# has actually been logged, so the confirmation printed below is accurate
logged_artifact = run.log_artifact(artifact)
logged_artifact.wait()

print(f"Cleaned dataset shape: {df_cleaned.shape}")
print(f"Price range: {df_cleaned['price'].min()} - {df_cleaned['price'].max()}")
print("Uploaded cleaned data to W&B")

Cleaned dataset shape: (19001, 16)
Price range: 10 - 350
Uploaded cleaned data to W&B


In [6]:
# Close the W&B run opened at the top of the notebook
run.finish()

# Confirmation for the notebook reader
print("W&B run finished successfully!")


W&B run finished successfully!
