In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler

In [2]:
# File locations: the CSV that is read and the CSV the scored frame is written to.
IN_CSV = "cleaned_movies.csv"
OUT_CSV = "movies_with_numeric_score.csv"

# Every numeric input column the scoring function looks at.
NUMERIC_COLS = [
    'vote_average', 'vote_count', 'revenue', 'runtime',
    'budget', 'popularity', 'imdb_rating', 'imdb_votes',
]

# Columns that are log1p-transformed into '<name>_log' before scaling.
LOG_COLS = ['vote_count', 'revenue', 'budget', 'imdb_votes', 'popularity']

# Columns that are scaled as-is, without a log transform.
KEEP_COLS = ['vote_average', 'runtime', 'imdb_rating']

# Full feature list fed to the scaler: untouched columns first, then the log columns.
TRANSFORMED_COLS = [*KEEP_COLS, *(f'{name}_log' for name in LOG_COLS)]

In [3]:
# Relative weight of each transformed feature in the combined score.
# Keys are transformed column names (raw name, or '<name>_log'); the values sum to 1.0.
DEFAULT_WEIGHTS = dict(
    # rating-style features
    vote_average=0.18,
    imdb_rating=0.22,
    runtime=0.03,
    # log-transformed count / money / popularity features
    vote_count_log=0.15,
    imdb_votes_log=0.15,
    popularity_log=0.12,
    revenue_log=0.08,
    budget_log=0.07,
)

In [4]:
def compute_numeric_score(df,
                          numeric_cols=NUMERIC_COLS,
                          log_cols=LOG_COLS,
                          keep_cols=KEEP_COLS,
                          weights=DEFAULT_WEIGHTS,
                          out_csv=OUT_CSV):
    """Add a weighted, normalised ``numeric_score`` column to a copy of ``df``.

    Steps:
      1. Ensure every input column exists (missing ones are created as 0.0)
         and is numeric (values that cannot be parsed become NaN).
      2. Fill NaNs with the column median, or 0.0 if the column is all NaN.
      3. Add ``<col>_log = log1p(max(col, 0))`` for each column in ``log_cols``.
      4. Min-max scale ``keep_cols`` plus the ``*_log`` columns into new
         ``s_<col>`` columns.
      5. Sum the ``s_<col>`` columns using ``weights`` normalised to sum to 1.
      6. Rescale the sum to the 0-1 range, print diagnostics, optionally save.

    Parameters
    ----------
    df : pandas.DataFrame
        Input data. It is not modified; all work happens on a copy.
    numeric_cols : list of str
        Input columns to create if missing and to median-fill.
    log_cols : list of str
        Columns that receive a ``log1p`` transform before scaling.
    keep_cols : list of str
        Columns that are scaled without a log transform.
    weights : dict
        Maps a transformed column name (e.g. ``'vote_count_log'``) to its
        weight. A key with no matching ``s_<key>`` column is skipped with a
        printed warning. The dict itself is not modified.
    out_csv : str or None
        Path the resulting frame is written to (without the index). A falsy
        value skips saving.

    Returns
    -------
    pandas.DataFrame
        The copy of ``df`` with the filled inputs, the ``*_log`` columns, the
        ``s_*`` columns and ``numeric_score``.
    """
    # Work on a copy so the caller's frame is left untouched.
    df = df.copy()

    # Derive the feature list from the arguments (not from a module global) so
    # that custom keep_cols / log_cols are actually honoured.
    transformed_cols = list(keep_cols) + [c + '_log' for c in log_cols]
    scaled_cols = ['s_' + c for c in transformed_cols]

    # Drop scaled columns left over from an earlier run (e.g. when re-scoring a
    # previously saved output); otherwise the concat below would produce
    # duplicate column labels and break the weighted sum.
    df = df.drop(columns=[c for c in scaled_cols if c in df.columns])

    # 1) + 2) Ensure each input column exists, is float, and has no NaNs.
    input_cols = list(dict.fromkeys([*numeric_cols, *log_cols, *keep_cols]))
    for c in input_cols:
        if c in df.columns:
            col = pd.to_numeric(df[c], errors='coerce').astype(float)
        else:
            col = pd.Series(0.0, index=df.index, dtype=float)
        median_val = col.median(skipna=True)
        if pd.isna(median_val):
            # Column is empty or entirely NaN: the median is undefined.
            median_val = 0.0
        df[c] = col.fillna(median_val)

    # 3) Log-transform skewed columns; negatives are clipped to 0 first.
    for c in log_cols:
        df[c + '_log'] = np.log1p(df[c].clip(lower=0))

    # 4) Scale transformed features to [0, 1].
    features = df[transformed_cols]
    if features.size:
        scaled = MinMaxScaler().fit_transform(features)
    else:
        # No rows or no feature columns: the scaler cannot be fitted.
        scaled = np.zeros(features.shape, dtype=float)
    scaled_df = pd.DataFrame(scaled, columns=scaled_cols, index=df.index)
    df = pd.concat([df, scaled_df], axis=1)

    # 5) Normalise the weights so they sum to 1 (left as-is if the sum is <= 0).
    total_w = sum(weights.values())
    if not total_w > 0:
        total_w = 1.0
    w = {k: v / total_w for k, v in weights.items()}

    # 6) Weighted sum of the scaled features.
    score_arr = np.zeros(len(df), dtype=float)
    for original_col, wt in w.items():
        scaled_col = 's_' + original_col  # e.g. s_vote_count_log
        if scaled_col in df.columns:
            score_arr += df[scaled_col].to_numpy() * wt
        else:
            # A weight without a matching feature is skipped, not fatal.
            print(f"Warning: {scaled_col} missing; skipping that weight.")

    df['numeric_score'] = score_arr

    # 7) Final normalisation to 0-1; the epsilon guards against a zero range.
    minv = df['numeric_score'].min()
    maxv = df['numeric_score'].max()
    df['numeric_score'] = (df['numeric_score'] - minv) / (maxv - minv + 1e-12)

    # 8) Diagnostics
    print("Computed numeric_score. Summary:")
    print(df['numeric_score'].describe().to_string())
    print("\nTop 10 by numeric_score:")
    if 'title' in df.columns:
        print(df[['title', 'numeric_score']].sort_values('numeric_score', ascending=False).head(10))
    else:
        print(df['numeric_score'].sort_values(ascending=False).head(10))

    # 9) Save
    if out_csv:
        df.to_csv(out_csv, index=False)
        print(f"\nSaved augmented dataframe to {out_csv}")

    return df


In [5]:
# Script entry point: read the cleaned CSV, score it, and keep the result in `df`.
if __name__ == "__main__":
    df = compute_numeric_score(pd.read_csv(IN_CSV))

Computed numeric_score. Summary:
count    256969.000000
mean          0.303517
std           0.128881
min           0.000000
25%           0.195795
50%           0.271205
75%           0.387057
max           1.000000

Top 10 by numeric_score:
                                                   title  numeric_score
91                                       The Dark Knight       1.000000
49890                                       Interstellar       0.999078
179                             The Shawshank Redemption       0.991680
11483                                          Inception       0.988086
70         The Lord of the Rings: The Return of the King       0.987111
68     The Lord of the Rings: The Fellowship of the Ring       0.983093
151                                        The Godfather       0.981450
333                                           Fight Club       0.976402
69                 The Lord of the Rings: The Two Towers       0.972385
6                                    