# MovieLens

In [None]:
! pip install kagglehub
import kagglehub
import shutil
import pandas as pd
import numpy as np
import os
from scipy.special import expit

In [None]:
# Reproducibility: a single integer seed and the NumPy Generator built from it.
# `seed` is also passed to pandas sampling further down; `rng` drives the
# simulated noise.
seed = 0
rng = np.random.default_rng(seed=seed)

#### download data

In [None]:
# Fetch the Kaggle dataset "odedgolden/movielens-1m-dataset" with kagglehub and
# mirror the downloaded folder into a project-local directory, so the cells
# below can read the raw files from a fixed relative path.
data_dir = "./source_data/MovieLens"
src_dir = kagglehub.dataset_download("odedgolden/movielens-1m-dataset")
os.makedirs(data_dir, exist_ok=True)
# dirs_exist_ok=True lets the copy succeed when data_dir already exists.
shutil.copytree(src=src_dir, dst=data_dir, dirs_exist_ok=True)

#### read data

In [None]:
# Load ratings.dat: no header row, fields separated by "::".
# The multi-character separator is why the Python parsing engine is requested.
ratings = pd.read_csv(
    os.path.join(data_dir, "ratings.dat"),
    names=["user_id", "movie_id", "rating", "timestamp"],
    header=None,
    sep="::",
    engine="python",
)

In [None]:
# Load users.dat: no header row, fields separated by "::".
# The multi-character separator is why the Python parsing engine is requested.
users = pd.read_csv(
    os.path.join(data_dir, "users.dat"),
    names=["user_id", "gender", "age", "occupation", "zip"],
    header=None,
    sep="::",
    engine="python",
)

In [None]:
# Load movies.dat: no header row, fields separated by "::".
# The file is decoded as ISO-8859-1 rather than the default encoding.
movies = pd.read_csv(
    os.path.join(data_dir, "movies.dat"),
    names=["movie_id", "title", "genres"],
    header=None,
    sep="::",
    engine="python",
    encoding="ISO-8859-1",
)

#### process features

In [None]:
# Gender becomes a 0/1 indicator: "M" -> 1, "F" -> 0.
gender_codes = {"M": 1, "F": 0}
users["gender"] = users["gender"].map(gender_codes)

# Pick the four most frequent occupation codes; code 0 is excluded when it is
# among them (presumably a catch-all category -- confirm against the dataset
# README).
top_occ = users["occupation"].value_counts().index[:4]
top_occ = [o for o in top_occ if o != 0] if 0 in top_occ else top_occ

# Readable column names for the occupation codes used as features below.
occ_labels = {
    1: "academic",
    4: "student",
    7: "manager",
}

# One 0/1 column per retained occupation; codes without a label fall back to
# the column name "occ_<code>".
for occ_id in top_occ:
    label = occ_labels.get(occ_id, f"occ_{occ_id}")
    is_occ = users["occupation"] == occ_id
    users[label] = is_occ.astype(int)

# Everyone whose occupation is not one of the retained codes.
in_top_occ = users["occupation"].isin(top_occ)
users["other_occupation"] = (~in_top_occ).astype(int)

# Demographic feature table (keeps user_id as the merge key).
demo_cols = [
    "user_id",
    "gender",
    "age",
    "student",
    "manager",
    "academic",
    "other_occupation",
]
demo_features = users[demo_cols].copy()

In [None]:
# Per-user rating summaries: mean and standard deviation of the ratings given.
rating_stats = (
    ratings.groupby("user_id")["rating"]
    .agg(mean_rating="mean", std_rating="std")
    .reset_index()
)
# A missing standard deviation is replaced by 0.0 so the feature has no NaNs.
rating_stats = rating_stats.fillna({"std_rating": 0.0})

# Behavioural feature table (keeps user_id as the merge key).
behav_features = rating_stats

In [None]:
# genres: split each movie's pipe-separated genre string into a list
movies["genre_list"] = movies["genres"].str.split("|")
all_genres = sorted({g for gl in movies["genre_list"] for g in gl})

# count: number of movies carrying each genre; keep the three most frequent
genre_counts = {}
for glist in movies["genre_list"]:
    for g in glist:
        genre_counts[g] = genre_counts.get(g, 0) + 1
top_3_genres = sorted(genre_counts, key=genre_counts.get, reverse=True)[:3]

# merge: attach the genre list of the rated movie to every rating row
ratings_with_genres = ratings.merge(
    movies[["movie_id", "genre_list"]], on="movie_id", how="left")

# add indicator: 1 when the rated movie belongs to the genre, else 0
# (g is bound as a default argument so the lambda does not rely on the
# loop variable's late binding)
for g in top_3_genres:
    ratings_with_genres[g] = ratings_with_genres["genre_list"].apply(
        lambda gl, g=g: int(g in gl))

# fractions: per user, share of rated movies that fall in each top genre
genre_sums = ratings_with_genres.groupby("user_id")[top_3_genres].sum()
user_counts = ratings.groupby("user_id")["movie_id"].count()
genre_share = genre_sums.div(user_counts, axis=0).reset_index()

# Column names are derived from the genres actually selected rather than
# hard-coded, so a column can never carry the label of a different genre.
# Later cells reference share_drama / share_comedy / share_action; if the top
# three genres ever differ from Drama/Comedy/Action, those cells fail loudly
# with a KeyError instead of silently using mislabelled data.
genre_share.columns = ["user_id"] + [f"share_{g.lower()}" for g in top_3_genres]

In [None]:
# Join the three per-user feature tables on user_id (default inner join).
df = demo_features.merge(behav_features, on="user_id")
df = df.merge(genre_share, on="user_id")

# The identifier is not a feature: drop it, then draw 6000 users at random
# (seeded) and renumber the rows from 0.
df = df.drop(columns="user_id")
df = df.sample(n=6000, random_state=seed).reset_index(drop=True)

In [None]:
# Persist the sampled feature table. The row index is written as the first
# CSV column (the pandas default, made explicit here).
df.to_csv("./movielens_tmp.csv", index=True)

#### generate treatments and outcome

In [None]:
# Reload the stored feature table; the first CSV column holds the row index.
df = pd.read_csv(filepath_or_buffer="./movielens_tmp.csv", index_col=0)

In [None]:
# Standardise the continuous features to zero mean and unit (sample) standard
# deviation; the binary indicator columns are left untouched.
cont_cols = [
    'age',
    'mean_rating',
    'std_rating',
    'share_drama',
    'share_comedy',
    'share_action',
]
bin_cols = ['gender', 'student', 'manager', 'academic', 'other_occupation']
df[cont_cols] = df[cont_cols].sub(df[cont_cols].mean()).div(df[cont_cols].std())

In [None]:
# Work on an independent (deep) copy so the intermediate columns created during
# data generation do not end up in df.
gen_df = df.copy(deep=True)

In [None]:
# s(x): score built from the genre shares and (negated) age, including a
# quadratic drama term and an age-by-comedy interaction.
drama = gen_df['share_drama']
comedy = gen_df['share_comedy']
action = gen_df['share_action']
neg_age = -gen_df['age']

gen_df['s'] = (
    0.7 * drama
    + 0.2 * (drama**2 - 1)
    + 0.7 * comedy
    - 0.2 * action
    + 0.3 * neg_age
    + 0.3 * neg_age * comedy
)

In [None]:
# v(x): score built from the occupation indicators only.
is_manager = gen_df['manager']
is_academic = gen_df['academic']
is_student = gen_df['student']

gen_df['v'] = -0.5 * is_manager + 0.2 * is_academic + 0.7 * is_student

In [None]:
# e(x): propensity score, the logistic transform of a weighted mix of v(x)
# and s(x)
logit_e = 0.75 * gen_df['v'] + 0.25 * gen_df['s']
gen_df['e'] = expit(logit_e)

# T ~ Bernoulli(e(x)), drawn from the notebook's seeded Generator `rng`.
# The legacy global `np.random.binomial` is never seeded in this notebook, so
# using it made the treatment assignment (and therefore Y) differ on every run.
gen_df['T'] = rng.binomial(1, gen_df['e'].to_numpy())

In [None]:
# mu_0(x): control-arm mean outcome, driven by the user's mean rating, a
# saturating (tanh) function of the rating spread, and their interaction.
mean_r = gen_df['mean_rating']
std_r = gen_df['std_rating']

gen_df['M0'] = (
    2 * mean_r
    + 0.5 * np.tanh(1.5 * std_r)
    + 0.25 * mean_r * np.tanh(std_r)
)

In [None]:
# tau(x): conditional treatment effect as a function of s(x) -- a scaled
# logistic term plus a linear term.
sigmoid_part = 2 * expit(1.5 * gen_df['s'] - 0.25)
linear_part = 0.75 * gen_df['s']
gen_df['cate'] = sigmoid_part + linear_part

In [None]:
# mu_1(x): treated-arm mean outcome = control-arm mean plus treatment effect.
gen_df['M1'] = gen_df['M0'].add(gen_df['cate'])

In [None]:
# y(x): potential outcomes are the arm means plus independent Gaussian noise
# with standard deviation sigma_y; the observed outcome is the potential
# outcome of the arm actually assigned.
sigma_y = 0.6
n_rows = len(gen_df)

# Noise for Y0 is drawn before noise for Y1 (the order fixes the random stream).
noise_y0 = rng.normal(loc=0, scale=sigma_y, size=n_rows)
noise_y1 = rng.normal(loc=0, scale=sigma_y, size=n_rows)
gen_df['Y0'] = gen_df['M0'] + noise_y0
gen_df['Y1'] = gen_df['M1'] + noise_y1

is_treated = gen_df['T'] == 1
gen_df['Y'] = np.where(is_treated, gen_df['Y1'], gen_df['Y0'])

In [None]:
# Copy the generated columns that belong in the final dataset back onto df,
# in this order: treatment, arm means, treatment effect, observed outcome,
# propensity score and the score s(x).
for col in ("T", "M0", "M1", "cate", "Y", "e", "s"):
    df[col] = gen_df[col]

In [None]:
# Write the final dataset (features plus generated columns). The row index is
# written as the first CSV column (the pandas default, made explicit here).
df.to_csv("./movielens.csv", index=True)