In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
import os

In [2]:
# Input / output CSV paths (relative paths).
IN_CSV = "cleaned_movies.csv"
OUT_CSV = "movies_with_numeric_score.csv"

# Raw numeric features read by the scoring pipeline.
NUMERIC_COLS = [
    'vote_average', 'vote_count', 'revenue', 'runtime',
    'budget', 'popularity', 'imdb_rating', 'imdb_votes',
]

# Features used as-is versus features that get a "<name>_log" counterpart.
KEEP_COLS = ['vote_average', 'runtime', 'imdb_rating']
LOG_COLS = ['vote_count', 'revenue', 'budget', 'imdb_votes', 'popularity']

# Final feature list: kept columns first, then the "<name>_log" columns in LOG_COLS order.
TRANSFORMED_COLS = [*KEEP_COLS, *(f'{name}_log' for name in LOG_COLS)]

In [3]:
# Relative weight of each transformed feature in the weighted sum.
# Keys are transformed-column names; insertion order is kept as-is.
DEFAULT_WEIGHTS = dict(
    vote_average=0.18,
    imdb_rating=0.22,
    runtime=0.03,
    vote_count_log=0.15,
    imdb_votes_log=0.15,
    popularity_log=0.12,
    revenue_log=0.08,
    budget_log=0.07,
)

In [4]:
def compute_numeric_score(df, weights=None, transformed_cols=TRANSFORMED_COLS):
    """Compute a [0, 1] ``numeric_score`` column and return an augmented copy of ``df``.

    Steps:
    - Ensure every NUMERIC_COLS column exists (missing columns are created as 0.0)
    - Coerce those columns to numeric, then fill missing values with the column median
    - Log1p transform LOG_COLS into *_log columns (negatives clipped to 0)
    - Min-max scale ``transformed_cols`` into ``s_*`` columns
    - Compute weighted sum according to weights (normalized to sum to 1)
    - Final numeric_score normalized to [0,1]

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame. It is copied; the caller's object is not modified.
    weights : dict, optional
        Maps transformed column name -> weight. Defaults to DEFAULT_WEIGHTS.
        A weight whose ``s_<name>`` column is absent is skipped with a printed warning.
    transformed_cols : sequence of str, optional
        Columns to scale. Any that do not exist are created as 0.0.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with ``*_log``, ``s_*`` and ``numeric_score`` columns.
        Safe to call again on its own output: existing ``s_*`` columns are
        replaced rather than duplicated.
    """
    df = df.copy()
    # Accept any sequence (e.g. a tuple); DataFrame column selection needs a list.
    transformed_cols = list(transformed_cols)

    # Ensure columns exist; if missing create with zeros
    for c in NUMERIC_COLS:
        if c not in df.columns:
            df[c] = 0.0

    # 1) Coerce to numeric FIRST, then fill NaNs with the median. Doing the
    #    median on the raw column fails on string-typed data and would silently
    #    turn every missing value into 0 instead of the median.
    for c in NUMERIC_COLS:
        numeric = pd.to_numeric(df[c], errors='coerce')
        median_val = numeric.median(skipna=True)
        if pd.isna(median_val):
            # Column has no usable values at all (or the frame is empty).
            median_val = 0.0
        df[c] = numeric.fillna(median_val)

    # 2) Log transform skewed cols
    for c in LOG_COLS:
        # clip negatives (just in case) then log1p
        safe = pd.to_numeric(df[c], errors='coerce').fillna(0).astype(float).clip(lower=0)
        df[c + '_log'] = np.log1p(safe)

    # 3) Ensure keep cols are numeric
    for c in KEEP_COLS:
        df[c] = pd.to_numeric(df[c], errors='coerce').fillna(0).astype(float)

    # 4) Ensure TRANSFORMED_COLS exist
    for col in transformed_cols:
        if col not in df.columns:
            df[col] = 0.0

    # 5) Scale transformed features to [0,1]
    scaled_names = [f's_{c}' for c in transformed_cols]
    if len(df) > 0:
        scaled = MinMaxScaler().fit_transform(df[transformed_cols])
    else:
        # MinMaxScaler rejects zero-row input; an empty frame simply yields
        # empty scaled columns.
        scaled = np.empty((0, len(transformed_cols)), dtype=float)
    scaled_df = pd.DataFrame(scaled, columns=scaled_names, index=df.index)
    # Drop any pre-existing s_* columns (e.g. when re-scoring a previously
    # saved output) so concat does not create duplicate column labels.
    df = pd.concat([df.drop(columns=scaled_names, errors='ignore'), scaled_df], axis=1)

    # 6) Normalize weights
    if weights is None:
        weights = DEFAULT_WEIGHTS.copy()
    else:
        weights = weights.copy()

    # Fall back to a divisor of 1.0 when the weights do not sum to a positive value.
    weight_sum = sum(weights.values())
    total_w = weight_sum if weight_sum > 0 else 1.0
    weights = {k: v / total_w for k, v in weights.items()}

    # 7) Compute weighted sum over scaled columns
    score = np.zeros(len(df), dtype=float)
    for orig_col, wt in weights.items():
        scaled_col = 's_' + orig_col
        if scaled_col in df.columns:
            score += df[scaled_col].values * wt
        else:
            # missing weight column - skip
            print(f"[numeric_score] Warning: scaled column {scaled_col} not found; skipping weight {orig_col}")

    df['numeric_score'] = score

    # 8) Final normalization to [0,1]
    minv = df['numeric_score'].min()
    maxv = df['numeric_score'].max()
    if maxv - minv > 0:
        df['numeric_score'] = (df['numeric_score'] - minv) / (maxv - minv)
    else:
        # All scores identical (or no rows): no spread to normalize over.
        df['numeric_score'] = 0.0

    return df

In [5]:
if __name__ == '__main__':
    # Fail fast with an actionable message when the input CSV is absent.
    if not os.path.exists(IN_CSV):
        raise FileNotFoundError(f"Input file not found: {IN_CSV} - put cleaned_movies.csv in the working dir")

    print("Loading cleaned data from:", IN_CSV)
    df = pd.read_csv(IN_CSV)
    print("Rows:", len(df))

    # Compute score
    df_out = compute_numeric_score(df)

    # Diagnostics
    print('\nNumeric score summary:')
    print(df_out['numeric_score'].describe())

    if 'title' in df_out.columns:
        # 'id' is optional: show only the columns that actually exist so a CSV
        # without an 'id' column does not raise KeyError here.
        top_cols = [c for c in ('id', 'title', 'numeric_score') if c in df_out.columns]
        print('\nTop 15 movies by numeric_score:')
        print(df_out[top_cols].sort_values('numeric_score', ascending=False).head(15).to_string(index=False))

    # Save augmented CSV
    df_out.to_csv(OUT_CSV, index=False)
    print(f'\nSaved augmented dataframe to: {OUT_CSV}')


Loading cleaned data from: cleaned_movies.csv
Rows: 29937

Numeric score summary:
count    29937.000000
mean         0.448353
std          0.165903
min          0.000000
25%          0.326596
50%          0.420550
75%          0.564214
max          1.000000
Name: numeric_score, dtype: float64

Top 15 movies by numeric_score:
    id                                             title  numeric_score
   155                                   The Dark Knight       1.000000
157336                                      Interstellar       0.998608
   278                          The Shawshank Redemption       0.988171
 27205                                         Inception       0.986137
   122     The Lord of the Rings: The Return of the King       0.985217
   120 The Lord of the Rings: The Fellowship of the Ring       0.981564
   238                                     The Godfather       0.980437
   550                                        Fight Club       0.972336
    13                   