<a href="https://colab.research.google.com/github/harika373/cleaning-data-data-Analysis-/blob/main/YouTube%20Trending%20Dataset%20Cleaning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [6]:
import pandas as pd
import numpy as np
import json
from scipy import stats
import csv

# 1. Load CSV and JSON
# Parse the CSV with the stdlib csv module first. Every value produced by this
# path is a string, so later sections must coerce types explicitly.
data = []
try:
    # newline='' is what the csv module expects: it lets the parser itself
    # handle line breaks embedded inside quoted fields instead of having the
    # text layer translate them first.
    with open("/content/USvideos.csv", "r", encoding='utf-8', newline='') as f:
        reader = csv.reader(f, escapechar='\\', quotechar='"')
        data = list(reader)  # data[0] is the header row, the rest are records
    df_yt = pd.DataFrame(data[1:], columns=data[0])

except Exception as e:
    # Deliberately broad: any read/parse/shape problem falls through to pandas.
    print(f"Error reading CSV with csv module: {e}")
    # Fallback to pandas read_csv with relaxed parameters if csv module fails
    try:
        df_yt = pd.read_csv("/content/USvideos.csv", engine='python', quotechar='"', escapechar='\\', on_bad_lines='skip')
    except Exception as e_fallback:
        print(f"Fallback read_csv also failed: {e_fallback}")
        # NOTE(review): an empty frame has none of the columns the following
        # sections index, so they will raise KeyError if this branch is hit.
        df_yt = pd.DataFrame() # Create empty dataframe if both fail


# Read the category definitions. The encoding is given explicitly (matching the
# CSV read) so the result does not depend on the platform's default encoding.
with open("/content/US_category_id.json", "r", encoding="utf-8") as f:
    categories = json.load(f)

# Extract category mapping: integer category id -> category title
cat_mapping = {
    int(item['id']): item['snippet']['title']
    for item in categories['items']
}

# 2. Data Integrity
numeric_cols = ['views', 'likes', 'dislikes', 'comment_count']
for col in numeric_cols:
    # Values that cannot be parsed as numbers become NaN and are then set to 0.
    df_yt[col] = pd.to_numeric(df_yt[col], errors='coerce').fillna(0)
    # no negatives allowed; .copy() makes the filtered result an independent
    # frame so the column assignments that follow do not write into a
    # view-like slice (the SettingWithCopyWarning pattern).
    df_yt = df_yt[df_yt[col] >= 0].copy()

# Unparseable timestamps become NaT
df_yt['publish_time'] = pd.to_datetime(df_yt['publish_time'], errors='coerce')

# 3. Missing Data Handling
# When the data was loaded through the csv module, blank fields are empty
# strings rather than NaN, so dropna/fillna would never see them. Normalise
# "" to NaN in the two text columns handled here so both load paths agree.
for col in ('title', 'tags'):
    df_yt[col] = df_yt[col].replace("", np.nan)

# Rows without a title or without a parseable publish_time are discarded.
df_yt.dropna(subset=['title', 'publish_time'], inplace=True)
df_yt['tags'] = df_yt['tags'].fillna("No Tags")

# 4. Duplicate Removal
# Keep the first row for each (video_id, trending_date) pair.
df_yt.drop_duplicates(subset=['video_id', 'trending_date'], inplace=True)

# 5. Standardization
df_yt['title'] = df_yt['title'].str.strip()
# Replace ID with category name. cat_mapping is keyed by int, but category_id
# holds strings when the file was read via the csv module; mapping those
# directly would match nothing and turn every category into NaN. Coerce to
# numeric first so the lookup works for either load path. IDs that are not
# numeric or not present in cat_mapping end up as NaN.
df_yt['category_id'] = pd.to_numeric(df_yt['category_id'], errors='coerce').map(cat_mapping)

# 6. Outlier Detection (Z-score method)
# Ensure numeric columns are actually numeric before calculating z-scores
for col in numeric_cols:
    df_yt[col] = pd.to_numeric(df_yt[col], errors='coerce')

# Drop rows where numeric columns became NaN after coercion for Z-score calculation
df_yt.dropna(subset=numeric_cols, inplace=True)

z_scores = np.abs(stats.zscore(df_yt[numeric_cols]))
# A row is an outlier when any of its metrics lies 3 or more standard
# deviations from that column's mean. The test is written as ">= 3" on the
# outliers (rather than "< 3" on the keepers) because a constant column has
# zero standard deviation and yields NaN z-scores; NaN compares False either
# way, and this form keeps those rows instead of discarding the whole dataset.
is_outlier = (z_scores >= 3).any(axis=1)
df_yt = df_yt[~is_outlier].copy()

# Write the cleaned frame to the working directory, omitting the row index,
# then report completion.
df_yt.to_csv(path_or_buf="cleaned_US_youtube.csv", index=False)
print("✅ US YouTube dataset cleaned and saved as cleaned_US_youtube.csv")

✅ US YouTube dataset cleaned and saved as cleaned_US_youtube.csv
