# Data Cleaning Pipeline

In [1]:
import pandas as pd, numpy as np, re, ast, logging
from typing import List
# Input and output locations used by the pipeline.
RAW_PATH   = "dataset.csv"
CLEAN_PATH = "cleaned-dataset.csv"
# Notebook-wide logger; records are rendered as "LEVEL | message".
logging.basicConfig(format="%(levelname)s | %(message)s", level=logging.INFO)
log = logging.getLogger("clean")

## 1  Load raw dataset & inspect

In [2]:
# Read the raw CSV and take a first look at its rows, dtypes and null counts.
df = pd.read_csv(RAW_PATH)
display(df.head(3))
# DataFrame.info() prints its report and returns None, so it is called
# directly; wrapping it in display() only adds a stray "None" to the output.
df.info()

Unnamed: 0,movie title,Run Time,Rating,User Rating,Generes,Overview,Plot Kyeword,Director,Top 5 Casts,Writer,year,path
0,Top Gun: Maverick,"$170,000,000 (estimated)",8.6,187K,"['Action', 'Drama']",After more than thirty years of service as one...,"['fighter jet', 'sequel', 'u.s. navy', 'fighte...",Joseph Kosinski,"['Jack Epps Jr.', 'Peter Craig', 'Tom Cruise',...",Jim Cash,-2022,/title/tt1745960/
1,Jurassic World Dominion,2 hours 27 minutes,6.0,56K,"['Action', 'Adventure', 'Sci-Fi']",Four years after the destruction of Isla Nubla...,"['dinosaur', 'jurassic park', 'tyrannosaurus r...",Colin Trevorrow,"['Colin Trevorrow', 'Derek Connolly', 'Chris P...",Emily Carmichael,-2022,/title/tt8041270/
2,Top Gun,"$15,000,000 (estimated)",6.9,380K,"['Action', 'Drama']",As students at the United States Navy's elite ...,"['pilot', 'male camaraderie', 'u.s. navy', 'gr...",Tony Scott,"['Jack Epps Jr.', 'Ehud Yonay', 'Tom Cruise', ...",Jim Cash,-1986,/title/tt0092099/


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 24402 entries, 0 to 24401
Data columns (total 12 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   movie title   24402 non-null  object
 1   Run Time      24402 non-null  object
 2   Rating        24402 non-null  object
 3   User Rating   24402 non-null  object
 4   Generes       24402 non-null  object
 5   Overview      24158 non-null  object
 6   Plot Kyeword  24402 non-null  object
 7   Director      24402 non-null  object
 8   Top 5 Casts   24402 non-null  object
 9   Writer        24402 non-null  object
 10  year          23624 non-null  object
 11  path          24402 non-null  object
dtypes: object(12)
memory usage: 2.2+ MB


None

## 2  Column hygiene – snake_case + typo fixes

In [3]:
# Normalise the headers to snake_case, then correct the two misspelled names
# that come with the raw file ("Generes", "Plot Kyeword").
df = (
    df.set_axis(
        df.columns.str.strip().str.lower().str.replace(" ", "_"),
        axis=1,
    )
    .rename(columns={"generes": "genres", "plot_kyeword": "plot_keyword"})
)

## 3  Numeric parsing – `rating` & `votes`

In [4]:
# errors="coerce": rating strings that are not numbers become NaN instead of raising.
df["rating"] = df["rating"].pipe(pd.to_numeric, errors="coerce")
def parse_votes(v):
    """Convert a vote count such as "187K", "1.5M" or "1,234" to an int.

    Parameters
    ----------
    v : Any
        Raw vote value; strings may use thousands separators and a trailing
        K (thousand) or M (million) suffix in either case.

    Returns
    -------
    int or None
        The vote count, or None when the value is missing or not numeric.
    """
    if pd.isna(v):
        return None
    if isinstance(v, (int, float)):
        try:
            return int(v)
        except OverflowError:  # +/- infinity has no integer value
            return None
    text = str(v).strip().upper().replace(",", "")
    # Scale by the suffix instead of pasting zeros onto the text: textual
    # substitution turns "1.5K" into "1.5000", which parses as 1, not 1500.
    multiplier = 1
    if text.endswith("K"):
        multiplier, text = 1000, text[:-1]
    elif text.endswith("M"):
        multiplier, text = 1000000, text[:-1]
    try:
        value = float(text)
        if multiplier == 1:
            # Plain numbers keep the original truncating conversion.
            return int(value)
        # round() absorbs binary float error (2.3 * 1000000 is 2299999.99...).
        return int(round(value * multiplier))
    except (ValueError, OverflowError):
        return None
# Derive a numeric vote count from the abbreviated "user_rating" text (e.g. "187K").
df["votes"] = df["user_rating"].map(parse_votes)

## 4 Runtime & Budget extraction (future‑proofing!)

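Both values are parsed from the raw `Run Time` column, which holds either a duration (`2 hours 27 minutes`) or a budget (`$170,000,000 (estimated)`) depending on the row.  
Even if the first version of this project **doesn't use runtime or budget**,  
capturing them now costs almost nothing and prevents re‑processing when someone asks:  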
* "Filter movies under 100 min"  
* "Random film that cost > \$100 M"  

In [5]:
# Matches "<h> hours", "<m> minutes" or "<h> hours <m> minutes" anywhere in the
# text; group 1 is the hours, group 2 the minutes.
# The lookahead pins the match to a digit run that is directly followed by a
# unit. Without it the all-optional body succeeds as an empty match at
# position 0, so search() never reaches a runtime that follows leading
# whitespace or any other prefix. The lookbehind keeps a match from starting
# in the middle of a number such as "1,200".
run_re = re.compile(
    r"(?<![\d.,])(?=\d+\s*(?:hours?|minutes?))"
    r"(?:(\d+)\s*hours?)?\s*(?:(\d+)\s*minutes?)?",
    re.I,
)
def runtime_min(txt):
    """Return a runtime in whole minutes parsed from text, or None.

    Parameters
    ----------
    txt : Any
        Text such as "2 hours 27 minutes", "1 hour", "45 minutes", or a bare
        number that is already a minute count.

    Returns
    -------
    int or None
        Total minutes, or None when the value is missing or holds neither an
        hours/minutes phrase nor a plain number.
    """
    if pd.isna(txt):
        return None
    text = str(txt)
    m = run_re.search(text)
    if m:
        # A match always carries at least one of the two groups.
        return int(m.group(1) or 0) * 60 + int(m.group(2) or 0)
    # No unit found: accept a bare number as a minute count.
    try:
        return int(float(text.replace(",", "")))
    except (ValueError, OverflowError):
        return None
def budget_usd(txt):
    """Return the US-dollar amount found in text as an int, or None.

    Parameters
    ----------
    txt : Any
        Text such as "$170,000,000 (estimated)".

    Returns
    -------
    int or None
        The amount with thousands separators removed, or None when the value
        is missing or holds no dollar amount.
    """
    if pd.isna(txt):
        return None
    # A "$" glued to a letter prefix ("CA$", "HK$") marks another currency, so
    # it is rejected rather than reported as USD; an explicit "US$" is allowed.
    m = re.search(r"(?<![A-Za-z])(?:US)?\$([\d,]+)", str(txt))
    if not m:
        return None
    digits = m.group(1).replace(",", "")
    # "$," captures only separators; int("") would raise.
    return int(digits) if digits else None
# The raw "Run Time" field holds either a duration ("2 hours 27 minutes") or a
# budget ("$170,000,000 (estimated)"), so both derived columns read from it.
df["runtime_min"] = df["run_time"].map(runtime_min)
df["budget_usd"]  = df["run_time"].map(budget_usd)

## 5  Fix `year` and convert stringified lists

In [6]:
# Raw values look like "-2022". Take the first 4-digit run as the year rather
# than deleting every non-digit: deletion glues separate numbers together
# ("2016–2019" would become 20162019, "2022.0" would become 20220).
# Values with no 4-digit run end up as <NA> in the nullable Int64 column.
df["year"] = (pd.to_numeric(df["year"].astype(str)
                                      .str.extract(r"(\d{4})", expand=False),
                            errors="coerce")
               .astype("Int64"))
def safe_eval(x):
    """Parse a stringified list such as "['Action', 'Drama']" into a list.

    Parameters
    ----------
    x : Any
        A cell value: the string repr of a list, an already-parsed
        list/tuple, or a missing value.

    Returns
    -------
    list
        The parsed items. An empty list is returned for missing values,
        unparseable text, and literals that are not a list or tuple.
    """
    # Already-parsed values pass through. Without this, re-running the cell
    # feeds real lists to pd.isna()/literal_eval, which fails and silently
    # replaces every parsed list with [].
    if isinstance(x, (list, tuple)):
        return list(x)
    try:
        if pd.isna(x):
            return []
        parsed = ast.literal_eval(x)
    except (ValueError, TypeError, SyntaxError, MemoryError, RecursionError):
        # The exceptions literal_eval is documented to raise on bad input.
        return []
    # Callers iterate the result, so scalars and dicts are not passed along.
    return list(parsed) if isinstance(parsed, (list, tuple)) else []
# These three columns arrive as the string repr of a Python list.
for col in ("genres", "plot_keyword", "top_5_casts"):
    df[col] = df[col].map(safe_eval)

## 6  Clean actor names in top_5_casts

In [7]:
def clean_actor_lists(actor_list):
    """Return the names in actor_list stripped of whitespace, dropping blanks.

    Falsy or missing entries, and entries that are empty after stripping,
    are left out. A falsy actor_list yields an empty list.
    """
    if not actor_list:
        return []

    # Skip falsy/NaN entries first, then normalise what is left to stripped text.
    stripped = (
        str(name).strip()
        for name in actor_list
        if name and not pd.isna(name)
    )
    return [name for name in stripped if name]

# Replace each row's cast list with its cleaned version.
df["top_5_casts"] = df["top_5_casts"].map(clean_actor_lists)

## 7  Drop dirty columns & de‑duplicate

In [8]:
# errors="ignore" skips any of these columns that are not present; duplicates
# are rows sharing the same title and year, of which the first is kept.
df = (
    df.drop(columns=["run_time", "overview", "path"], errors="ignore")
      .drop_duplicates(subset=["movie_title", "year"], keep="first")
)
log.info("Rows after cleaning: %d", len(df))

INFO | Rows after cleaning: 23922


## 8  Save cleaned dataset

In [9]:
# Write the cleaned frame to CLEAN_PATH without the positional index column.
df.to_csv(path_or_buf=CLEAN_PATH, index=False)
log.info("Saved %s", CLEAN_PATH)

INFO | Saved cleaned-dataset.csv
