### Imports

In [1]:
import pandas as pd 
import os
from tqdm import tqdm
import numpy as np
from typing import List, Union, Any

#### Load IMDB Movie Information 

In [2]:
imdb_dir = "./Datasets/imdb-data-cleaned"


# Dictionary to store dataframes
imdb_data = {}

# List all files in the IMDB directory
for filename in os.listdir(imdb_dir):
    if filename.endswith('.csv'):  # IMDB files are typically tab-separated
        file_path = os.path.join(imdb_dir, filename)
        # Remove the .tsv extension to create the dictionary key
        key = filename.replace('.csv', '')
        # Read the TSV file
        imdb_data[key] = pd.read_csv(file_path, sep=',', low_memory=False)
        print(f"Loaded {filename} with shape {imdb_data[key].shape}")

Loaded title.episode.csv with shape (46, 4)
Loaded title.crew.csv with shape (83955, 3)
Loaded title.basics.csv with shape (83955, 9)
Loaded title.principals.csv with shape (1523925, 6)
Loaded title.ratings.csv with shape (83955, 3)
Loaded title.basics.aka.csv with shape (135077, 9)


In [3]:
ratings = imdb_data["title.ratings"]
ratings

Unnamed: 0,tconst,averageRating,numVotes
0,tt0000001,5.7,2133
1,tt0000003,6.4,2167
2,tt0000007,5.3,900
3,tt0000008,5.4,2278
4,tt0000010,6.8,7865
...,...,...,...
83950,tt9911196,7.4,3429
83951,tt9914192,5.3,317
83952,tt9914644,8.1,194
83953,tt9916270,5.8,1501


In [4]:
ratings["averageRating"] = ratings["averageRating"].astype(np.float32)
ratings[ "averageRating"]

0        5.7
1        6.4
2        5.3
3        5.4
4        6.8
        ... 
83950    7.4
83951    5.3
83952    8.1
83953    5.8
83954    6.4
Name: averageRating, Length: 83955, dtype: float32

In [5]:
ratings["numVotes"] = ratings["numVotes"].astype(np.float32)
ratings["numVotes"].dtype

dtype('float32')

In [6]:
movie_features = imdb_data["title.basics"].loc[:, ["tconst", "startYear", "runtimeMinutes", "genres"]]
movie_features.loc[:, "tconst_numeric"] = movie_features["tconst"].str.extract(r'(\d+)').astype(np.float32)
movie_features["tconst_numeric"]

0              1.0
1              3.0
2              7.0
3              8.0
4             10.0
           ...    
83950    9911196.0
83951    9914192.0
83952    9914644.0
83953    9916270.0
83954    9916362.0
Name: tconst_numeric, Length: 83955, dtype: float32

In [7]:
movie_features["tconst_numeric"].isna().sum()

0

In [8]:
movie_features["startYear"] = movie_features["startYear"].str.extract(r'(\d+)').astype(np.float32)  
movie_features["startYear"]

0        1894.0
1        1892.0
2        1894.0
3        1894.0
4        1895.0
          ...  
83950    2020.0
83951    2020.0
83952    2018.0
83953    2020.0
83954    2020.0
Name: startYear, Length: 83955, dtype: float32

In [9]:
movie_features["runtimeMinutes"] = movie_features["runtimeMinutes"].str.extract(r'(\d+)').astype(np.float32)  
movie_features["runtimeMinutes"]

0          1.0
1          5.0
2          1.0
3          1.0
4          1.0
         ...  
83950    103.0
83951     98.0
83952     83.0
83953     84.0
83954     92.0
Name: runtimeMinutes, Length: 83955, dtype: float32

In [10]:
crew_df = imdb_data["title.crew"][["tconst", "directors", "writers"]]
crew_df

Unnamed: 0,tconst,directors,writers
0,tt0000001,nm0005690,\N
1,tt0000003,nm0721526,\N
2,tt0000007,"nm0005690,nm0374658",\N
3,tt0000008,nm0005690,\N
4,tt0000010,nm0525910,\N
...,...,...,...
83950,tt9911196,nm0631590,"nm2063122,nm0277932,nm0495599,nm3547655"
83951,tt9914192,nm0764367,"nm0193153,nm1148346,nm12654089"
83952,tt9914644,nm2603587,\N
83953,tt9916270,nm1480867,"nm1480867,nm10538402,nm5584269"


In [11]:
import numpy as np

def splitter(element):
    # Check if the element is a valid string and contains 'nm'
    if not isinstance(element, str) or "nm" not in element:
        return np.array([], dtype=np.float32)
    
    # Split the string by comma
    parts = element.split(',')
    processed = []
    
    # Process each part: remove the 'nm' prefix and convert to float32
    for part in parts:
        part = part.strip()  # remove extra whitespace
        if part.startswith("nm"):
            part = part[2:]  # remove the 'nm' prefix
        # Skip if part is empty after stripping
        if part == "":
            continue
        try:
            processed.append(np.float32(part))
        except ValueError:
            # Handle cases where conversion fails
            continue
    
    return np.array(processed, dtype=np.float32)


In [12]:
crew_df.loc[:, 'directors'] = crew_df['directors'].apply(splitter)
crew_df['directors'].head(100)

0               [5690.0]
1             [721526.0]
2     [5690.0, 374658.0]
3               [5690.0]
4             [525910.0]
             ...        
95            [674600.0]
96               [428.0]
97               [428.0]
98               [428.0]
99               [428.0]
Name: directors, Length: 100, dtype: object

In [13]:
crew_df.loc[:, 'writers'] = crew_df['writers'].apply(splitter)
crew_df['writers'].head(100)

0                             []
1                             []
2                             []
3                             []
4                             []
                 ...            
95          [275421.0, 304098.0]
96                       [428.0]
97    [590.0, 940488.0, 51304.0]
98             [428.0, 853193.0]
99                       [428.0]
Name: writers, Length: 100, dtype: object

In [17]:
def map_ids_to_contiguous_indices(df: pd.DataFrame, columns: List[str]) -> pd.DataFrame:
    """
    Maps the IDs in the specified columns of the DataFrame to contiguous indices as float32.
    
    For each cell:
      - If the cell contains a scalar, it is replaced by a np.float32 scalar.
      - If the cell contains a list or np.ndarray, it is replaced by a np.array of float32.
    
    Parameters:
        df (pd.DataFrame): The input DataFrame (modified in-place).
        columns (List[str]): List of column names to map.
    
    Returns:
        pd.DataFrame: The modified DataFrame with specified columns mapped.
    """
    # Collect all unique IDs from all specified columns
    unique_ids = set()
    for col in columns:
        for item in df[col]:
            if isinstance(item, (list, np.ndarray)):
                unique_ids.update(item)
            else:
                unique_ids.add(item)
    
    # Create a mapping from original IDs to contiguous indices starting from 0
    id_to_idx = {orig_id: idx for idx, orig_id in enumerate(sorted(unique_ids))}
    
    # Function to apply the mapping
    def map_item(item: Any) -> Union[np.float32, np.ndarray]:
        if isinstance(item, (list, np.ndarray)):
            # Return a NumPy array of float32 if item is a list or array
            return np.array([id_to_idx[x] for x in item], dtype=np.float32)
        else:
            # Return a float32 scalar if item is a scalar
            return np.float32(id_to_idx[item])
    
    # Process each specified column
    for col in columns:
        col_values = df[col].values
        mapped_list = [map_item(x) for x in col_values]
        
        # If every element is a scalar (not a NumPy array), create a homogeneous float32 array.
        if all(not isinstance(item, np.ndarray) for item in mapped_list):
            df.loc[:, col] = np.array(mapped_list, dtype=np.float32)
        else:
            # Otherwise, store the results as an object array.
            df.loc[:, col] = np.array(mapped_list, dtype=object)
    
    return df

In [18]:
map_ids_to_contiguous_indices(movie_features, ["tconst_numeric"])
movie_features["tconst_numeric"]

0            0.0
1            1.0
2            2.0
3            3.0
4            4.0
          ...   
83950    77028.0
83951    77029.0
83952    77030.0
83953    77031.0
83954    77032.0
Name: tconst_numeric, Length: 83955, dtype: float32

In [19]:
map_ids_to_contiguous_indices(crew_df, ["directors", "writers"])
crew_df[["directors", "writers"]]

Unnamed: 0,directors,writers
0,[1410.0],[]
1,[32283.0],[]
2,"[1410.0, 17343.0]",[]
3,[1410.0],[]
4,[24212.0],[]
...,...,...
83950,[28578.0],"[57502.0, 13009.0, 22794.0, 69352.0]"
83951,[34051.0],"[9541.0, 45536.0, 88586.0]"
83952,[62658.0],[]
83953,[50527.0],"[50527.0, 87075.0, 78241.0]"


In [23]:
crew_df["directors"][0]

array([1410.], dtype=float32)

In [24]:
movie_features.loc[:, 'genres'] = movie_features['genres'].fillna('').str.split(',')
movie_features

Unnamed: 0,tconst,startYear,runtimeMinutes,genres,tconst_numeric
0,tt0000001,1894.0,1.0,"[Documentary, Short]",0.0
1,tt0000003,1892.0,5.0,"[Animation, Comedy, Romance]",1.0
2,tt0000007,1894.0,1.0,"[Short, Sport]",2.0
3,tt0000008,1894.0,1.0,"[Documentary, Short]",3.0
4,tt0000010,1895.0,1.0,"[Documentary, Short]",4.0
...,...,...,...,...,...
83950,tt9911196,2020.0,103.0,"[Comedy, Drama]",77028.0
83951,tt9914192,2020.0,98.0,[Comedy],77029.0
83952,tt9914644,2018.0,83.0,[Documentary],77030.0
83953,tt9916270,2020.0,84.0,[Thriller],77031.0


In [25]:
movie_features = movie_features.merge(
    ratings, 
    on="tconst", 
    how="left"
)

In [26]:
print(movie_features.dtypes)
movie_features

tconst             object
startYear         float32
runtimeMinutes    float32
genres             object
tconst_numeric    float32
averageRating     float32
numVotes          float32
dtype: object


Unnamed: 0,tconst,startYear,runtimeMinutes,genres,tconst_numeric,averageRating,numVotes
0,tt0000001,1894.0,1.0,"[Documentary, Short]",0.0,5.7,2133.0
1,tt0000003,1892.0,5.0,"[Animation, Comedy, Romance]",1.0,6.4,2167.0
2,tt0000007,1894.0,1.0,"[Short, Sport]",2.0,5.3,900.0
3,tt0000008,1894.0,1.0,"[Documentary, Short]",3.0,5.4,2278.0
4,tt0000010,1895.0,1.0,"[Documentary, Short]",4.0,6.8,7865.0
...,...,...,...,...,...,...,...
83950,tt9911196,2020.0,103.0,"[Comedy, Drama]",77028.0,7.4,3429.0
83951,tt9914192,2020.0,98.0,[Comedy],77029.0,5.3,317.0
83952,tt9914644,2018.0,83.0,[Documentary],77030.0,8.1,194.0
83953,tt9916270,2020.0,84.0,[Thriller],77031.0,5.8,1501.0


### Create Dataframe with Movie Features 

In [27]:
movie_features = movie_features.merge(
    crew_df, 
    on="tconst", 
    how="left"
)

movie_features['genres'] = movie_features['genres'].fillna('').apply(lambda x: [] if x == '' else x)

print("Final dataset shape:", movie_features.shape)
print(movie_features.dtypes)
movie_features.head()

Final dataset shape: (83955, 9)
tconst             object
startYear         float32
runtimeMinutes    float32
genres             object
tconst_numeric    float32
averageRating     float32
numVotes          float32
directors          object
writers            object
dtype: object


Unnamed: 0,tconst,startYear,runtimeMinutes,genres,tconst_numeric,averageRating,numVotes,directors,writers
0,tt0000001,1894.0,1.0,"[Documentary, Short]",0.0,5.7,2133.0,[1410.0],[]
1,tt0000003,1892.0,5.0,"[Animation, Comedy, Romance]",1.0,6.4,2167.0,[32283.0],[]
2,tt0000007,1894.0,1.0,"[Short, Sport]",2.0,5.3,900.0,"[1410.0, 17343.0]",[]
3,tt0000008,1894.0,1.0,"[Documentary, Short]",3.0,5.4,2278.0,[1410.0],[]
4,tt0000010,1895.0,1.0,"[Documentary, Short]",4.0,6.8,7865.0,[24212.0],[]


### Make The Dataframe More Compact

In [28]:
movie_features["tconst"] = movie_features["tconst"].str.extract(r'(\d+)').astype(np.int32)  
movie_features["tconst"]

0              1
1              3
2              7
3              8
4             10
          ...   
83950    9911196
83951    9914192
83952    9914644
83953    9916270
83954    9916362
Name: tconst, Length: 83955, dtype: int32

In [29]:
if not os.path.exists("./Datasets/Features"):
    os.makedirs("./Datasets/Features")
movie_features.to_pickle("./Datasets/Features3.10/movie_features.pkl")

### Load User Information 

In [30]:
### Ok I want to add imdb ID to links.csv as a column
lens_links = pd.read_csv("./Datasets/ml-32m-cleaned/links.csv")
lens_ratings = pd.read_csv("./Datasets/ml-32m-cleaned/ratings.csv")


In [31]:
lens_data = lens_ratings.merge(
    lens_links[["imdbId", "movieId"]],
    on = "movieId",
    how = "inner"
              )

In [32]:
lens_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 31972560 entries, 0 to 31972559
Data columns (total 5 columns):
 #   Column     Dtype  
---  ------     -----  
 0   userId     int64  
 1   movieId    int64  
 2   rating     float64
 3   timestamp  int64  
 4   imdbId     int64  
dtypes: float64(1), int64(4)
memory usage: 1.2 GB


In [27]:
lens_data.drop(columns = ["movieId", "timestamp"], inplace = True)

In [28]:
lens_data["rating"] = lens_data["rating"].astype(np.float32)
lens_data["imdbId"] =lens_data["imdbId"].astype(np.int32)
lens_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 31972560 entries, 0 to 31972559
Data columns (total 3 columns):
 #   Column  Dtype  
---  ------  -----  
 0   userId  int64  
 1   rating  float32
 2   imdbId  int32  
dtypes: float32(1), int32(1), int64(1)
memory usage: 487.9 MB


In [29]:
pd.to_pickle(lens_data, "./Datasets/Features3.10/ratings.pkl")

In [33]:
movie_features

Unnamed: 0,tconst,startYear,runtimeMinutes,genres,tconst_numeric,averageRating,numVotes,directors,writers
0,1,1894.0,1.0,"[Documentary, Short]",0.0,5.7,2133.0,[1410.0],[]
1,3,1892.0,5.0,"[Animation, Comedy, Romance]",1.0,6.4,2167.0,[32283.0],[]
2,7,1894.0,1.0,"[Short, Sport]",2.0,5.3,900.0,"[1410.0, 17343.0]",[]
3,8,1894.0,1.0,"[Documentary, Short]",3.0,5.4,2278.0,[1410.0],[]
4,10,1895.0,1.0,"[Documentary, Short]",4.0,6.8,7865.0,[24212.0],[]
...,...,...,...,...,...,...,...,...,...
83950,9911196,2020.0,103.0,"[Comedy, Drama]",77028.0,7.4,3429.0,[28578.0],"[57502.0, 13009.0, 22794.0, 69352.0]"
83951,9914192,2020.0,98.0,[Comedy],77029.0,5.3,317.0,[34051.0],"[9541.0, 45536.0, 88586.0]"
83952,9914644,2018.0,83.0,[Documentary],77030.0,8.1,194.0,[62658.0],[]
83953,9916270,2020.0,84.0,[Thriller],77031.0,5.8,1501.0,[50527.0],"[50527.0, 87075.0, 78241.0]"


In [30]:
movie_features

Unnamed: 0,tconst,startYear,runtimeMinutes,genres,tconst_numeric,averageRating,numVotes,directors,writers
0,1,1894.0,1.0,"[Documentary, Short]",0.0,5.7,2133.0,[1410],[]
1,3,1892.0,5.0,"[Animation, Comedy, Romance]",1.0,6.4,2167.0,[32283],[]
2,7,1894.0,1.0,"[Short, Sport]",2.0,5.3,900.0,"[1410, 17343]",[]
3,8,1894.0,1.0,"[Documentary, Short]",3.0,5.4,2278.0,[1410],[]
4,10,1895.0,1.0,"[Documentary, Short]",4.0,6.8,7865.0,[24212],[]
...,...,...,...,...,...,...,...,...,...
83950,9911196,2020.0,103.0,"[Comedy, Drama]",77028.0,7.4,3429.0,[28578],"[57502, 13009, 22794, 69352]"
83951,9914192,2020.0,98.0,[Comedy],77029.0,5.3,317.0,[34051],"[9541, 45536, 88586]"
83952,9914644,2018.0,83.0,[Documentary],77030.0,8.1,194.0,[62658],[]
83953,9916270,2020.0,84.0,[Thriller],77031.0,5.8,1501.0,[50527],"[50527, 87075, 78241]"


In [40]:
import pandas as pd
import numpy as np
from tqdm import tqdm

def create_enhanced_user_profiles(ratings_df, movie_features):
    """
    Create user profiles by rating buckets, with summary statistics from `movie_features`.
    Includes:
      - Basic numeric stats (year/runtime/rating/etc.)
      - Proportion of genres
      - Top directors/writers lists (stored as float32 NumPy arrays)
    
    All computed integer values are converted to int32 and float values to float32.
    """
    # --------------------------------------------------------------------------
    # 1. Copy dataframes to avoid modifying the originals
    # --------------------------------------------------------------------------
    #movie_features = movie_features.copy()
    #ratings_df = ratings_df.copy()
    
    movie_features['startYear'] = pd.to_numeric(movie_features['startYear'], errors='coerce')
    movie_features['runtimeMinutes'] = pd.to_numeric(movie_features['runtimeMinutes'], errors='coerce')

    # --------------------------------------------------------------------------
    # 3. Helper function to gather bucket-level stats
    # --------------------------------------------------------------------------
    def get_bucket_stats(movies_df):
        """Given a subset of movie_features, return relevant statistics."""
        if len(movies_df) == 0:
            return {}
        
        stats = {}
        # A. Basic movie IDs and count
        # Assuming tconst is numeric; cast to int32.
        stats['movie_ids'] = np.array(movies_df['tconst'], dtype=np.int32)
        stats['count'] = np.int32(len(movies_df))
        
        # B. Numeric features
        numeric_features = {
            'year_avg': 'startYear',
            'runtime_avg': 'runtimeMinutes',
            'rating_avg': 'averageRating',
            'numvotes_avg': 'numVotes',
        }
        
        for stat_name, column in numeric_features.items():
            if column not in movies_df.columns:
                continue
            mean_val = movies_df[column].mean(skipna=True)
            stats[stat_name] = np.float32(mean_val) if pd.notnull(mean_val) else np.float32(0)
        
        # C. Genre proportions
        all_genres = []
        for genre_list in movies_df['genres']:
            if isinstance(genre_list, list):
                valid_genres = [g.strip() for g in genre_list if g and g != '\\N']
                all_genres.extend(valid_genres)
        if all_genres:
            genre_counts = pd.Series(all_genres).value_counts()
            genre_proportions = genre_counts / genre_counts.sum()
            # Each proportion is converted to float32.
            stats['genre_proportions'] = {k: np.float32(v) for k, v in genre_proportions.to_dict().items()}
        
        # D. Directors and Writers
        # Assumes each cell is either an empty array or a NumPy array of floats.
        all_directors = []
        for directors_arr in movies_df['directors']:
            if isinstance(directors_arr, np.ndarray):
                if directors_arr.size > 0:
                    all_directors.extend(directors_arr.tolist())
        if all_directors:
            director_counts = pd.Series(all_directors).value_counts()
            stats['top_directors'] = np.array(director_counts.head(3).index, dtype=np.float32)
        else:
            stats['top_directors'] = np.array([], dtype=np.float32)
        
        all_writers = []
        for writers_arr in movies_df['writers']:
            if isinstance(writers_arr, np.ndarray):
                if writers_arr.size > 0:
                    all_writers.extend(writers_arr.tolist())
        if all_writers:
            writer_counts = pd.Series(all_writers).value_counts()
            stats['top_writers'] = np.array(writer_counts.head(3).index, dtype=np.float32)
        else:
            stats['top_writers'] = np.array([], dtype=np.float32)
        
        return stats

    # --------------------------------------------------------------------------
    # 4. Main user-profile creation
    # --------------------------------------------------------------------------
    user_profiles = []
    
    # Define your rating buckets (ratings are between 0 and 5)
    buckets = [(0, 1), (2, 3), (4, 5)]
    
    unique_users = ratings_df['userId'].unique()
    pbar = tqdm(unique_users, desc="Creating user profiles")

    for user_id in pbar:
        user_ratings = ratings_df[ratings_df['userId'] == user_id]
        total_ratings = len(user_ratings)
        
        # Basic info about the user's rating behavior.
        profile = {
            'userId': np.int32(user_id),
            'total_ratings': np.int32(total_ratings),
            'rating_avg': np.float32(user_ratings['rating'].mean()),
            'rating_std': np.float32(user_ratings['rating'].std()),
            'rating_proportions': {k: np.float32(v) for k, v in user_ratings['rating'].value_counts(normalize=True).to_dict().items()}
        }
        
        # Store references for bucket stats
        bucket_counts = {}
        bucket_stats_dict = {}
        
        # Process each rating bucket.
        for low, high in buckets:
            bucket_name = f'bucket_{low}_{high}'
            bucket_ratings = user_ratings[
                (user_ratings['rating'] >= low) & (user_ratings['rating'] <= high)
            ]
            
            # Find the movies in this bucket.
            bucket_movies = movie_features[
                movie_features['tconst'].isin(bucket_ratings['imdbId'])
            ]
            
            # Gather stats.
            bucket_stats = get_bucket_stats(bucket_movies)
            bucket_counts[bucket_name] = np.int32(bucket_stats.get('count', 0))
            bucket_stats_dict[bucket_name] = bucket_stats
            
            # Add each stat to the user profile with a prefix.
            for stat_name, stat_value in bucket_stats.items():
                profile[f'{bucket_name}_{stat_name}'] = stat_value
        
        # ----------------------------------------------------------------------
        # 5. Interaction features (ratios, differences, etc.) between buckets.
        # ----------------------------------------------------------------------
        for i, (low1, high1) in enumerate(buckets):
            for j, (low2, high2) in enumerate(buckets[i+1:], i+1):
                bucket1_name = f'bucket_{low1}_{high1}'
                bucket2_name = f'bucket_{low2}_{high2}'

                # Ratio of counts.
                ratio_name = f'ratio_{bucket1_name}_to_{bucket2_name}'
                denom = np.float32(bucket_counts[bucket2_name])
                profile[ratio_name] = np.float32(bucket_counts[bucket1_name] / denom if denom > 0 else 0)
                
                # Differences of average stats (year/runtime/rating).
                for stat in ['year_avg', 'runtime_avg', 'rating_avg']:
                    val1 = bucket_stats_dict[bucket1_name].get(stat)
                    val2 = bucket_stats_dict[bucket2_name].get(stat)
                    if val1 is not None and val2 is not None:
                        diff_name = f'{stat}_diff_{bucket1_name}_vs_{bucket2_name}'
                        profile[diff_name] = np.float32(val1 - val2)
        
        # Another example: ratio of high-ratings to low-ratings.
        # (Using bucket names that exist.)
        high_count = np.int32(bucket_counts.get('bucket_7_8', 0) + bucket_counts.get('bucket_9_10', 0))
        low_count = np.int32(bucket_counts.get('bucket_0_2', 0) + bucket_counts.get('bucket_3_4', 0))
        profile['high_vs_low_ratio'] = np.float32(high_count / (np.float32(low_count) + 1e-6))
        
        # ----------------------------------------------------------------------
        # 6. Overall stats for all movies the user rated.
        # ----------------------------------------------------------------------
        all_movies = movie_features[
            movie_features['tconst'].isin(user_ratings['imdbId'])
        ]
        overall_stats = get_bucket_stats(all_movies)
        for k, v in overall_stats.items():
            profile[f'overall_{k}'] = v
        
        user_profiles.append(profile)

    pbar.close()
    
    # Convert list of dicts into a DataFrame.
    return pd.DataFrame(user_profiles)

# ---------------------------------------------------------------------
# Example Usage in a Notebook (after you define `lens_data` and `movie_features`)
# ---------------------------------------------------------------------
enhanced_user_profiles2 = create_enhanced_user_profiles(lens_data, movie_features)
display(enhanced_user_profiles2.head())


Creating user profiles: 100%|█████████| 200948/200948 [1:02:49<00:00, 53.31it/s]


Unnamed: 0,userId,total_ratings,rating_avg,rating_std,rating_proportions,bucket_0_1_movie_ids,bucket_0_1_count,bucket_0_1_year_avg,bucket_0_1_runtime_avg,bucket_0_1_rating_avg,...,high_vs_low_ratio,overall_movie_ids,overall_count,overall_year_avg,overall_runtime_avg,overall_rating_avg,overall_numvotes_avg,overall_genre_proportions,overall_top_directors,overall_top_writers
0,1,141,3.531915,1.537871,"{5.0: 0.40425533, 4.0: 0.18439716, 1.0: 0.1843...","[56172, 59113, 71853, 79367, 79470, 88933, 908...",26.0,1989.230713,126.730766,7.511539,...,0.0,"[32143, 33467, 34583, 36868, 42192, 42546, 457...",141,1984.652466,122.609932,7.678013,316521.46875,"{'Drama': 0.3125, 'Comedy': 0.14772727, 'Roman...","[652.0, 109.0, 188.0]","[574.0, 84.0, 188.0]"
1,2,52,4.269231,1.122242,"{5.0: 0.59615386, 4.0: 0.21153846, 3.0: 0.1153...","[109040, 110148, 110912]",3.0,1994.0,121.0,7.766666,...,0.0,"[29583, 65421, 96895, 99653, 99785, 100405, 10...",52,1992.192261,110.67308,6.942307,344699.65625,"{'Drama': 0.23966943, 'Comedy': 0.19008264, 'R...","[434.0, 41484.0, 667.0]","[25642.0, 14992.0, 16818.0]"
2,3,147,3.588435,1.014789,"{4.0: 0.43537414, 3.0: 0.18367347, 5.0: 0.1292...","[60782, 88161, 102216, 102926, 107144, 109040,...",9.0,1989.444458,101.111115,6.677778,...,0.0,"[32910, 38650, 49833, 54033, 57076, 58150, 607...",147,1991.408203,119.258507,7.182313,405335.15625,"{'Adventure': 0.16497461, 'Action': 0.15228426...","[307.0, 109.0, 17524.0]","[16818.0, 25642.0, 21373.0]"
3,4,27,2.62963,1.043225,"{2.0: 0.4074074, 3.0: 0.25925925, 4.0: 0.18518...","[97162, 150377, 151137]",3.0,1995.666626,116.333336,6.566667,...,0.0,"[66206, 78767, 86190, 87469, 91530, 97162, 990...",27,1994.518555,109.0,6.744443,170791.265625,"{'Action': 0.14285715, 'Horror': 0.114285715, ...","[613.0, 34314.0, 39297.0]","[84.0, 17198.0, 150.0]"
4,5,33,3.272727,0.910794,"{3.0: 0.4848485, 4.0: 0.33333334, 2.0: 0.06060...","[110912, 111161]",2.0,1994.0,148.0,9.1,...,0.0,"[96895, 99348, 101414, 102926, 103639, 106220,...",33,1993.666626,124.63636,7.466667,638410.0625,"{'Drama': 0.1954023, 'Action': 0.1724138, 'Adv...","[139.0, 26914.0, 112.0]","[33956.0, 25642.0, 16818.0]"


In [44]:
total_cols = len(enhanced_user_profiles2.columns)
chunk_size = 18

for i in range(0, total_cols, chunk_size):
    # Get the slice of columns for this chunk
    cols_chunk = enhanced_user_profiles2.columns[i:i + chunk_size]
    print(f"\nColumns {i} to {min(i + chunk_size, total_cols)}:")
    print("--------------------")
    print(cols_chunk.tolist())
    print("\nSample data:")
    print(enhanced_user_profiles2[cols_chunk].head())
    print("\n" + "="*80 + "\n")  # Separator between chunks


Columns 0 to 18:
--------------------
['userId', 'total_ratings', 'rating_avg', 'rating_std', 'rating_proportions', 'bucket_0_1_movie_ids', 'bucket_0_1_count', 'bucket_0_1_year_avg', 'bucket_0_1_runtime_avg', 'bucket_0_1_rating_avg', 'bucket_0_1_numvotes_avg', 'bucket_0_1_genre_proportions', 'bucket_0_1_top_directors', 'bucket_0_1_top_writers', 'bucket_2_3_movie_ids', 'bucket_2_3_count', 'bucket_2_3_year_avg', 'bucket_2_3_runtime_avg']

Sample data:
   userId  total_ratings  rating_avg  rating_std  \
0       1            141    3.531915    1.537871   
1       2             52    4.269231    1.122242   
2       3            147    3.588435    1.014789   
3       4             27    2.629630    1.043225   
4       5             33    3.272727    0.910794   

                                  rating_proportions  \
0  {5.0: 0.40425533, 4.0: 0.18439716, 1.0: 0.1843...   
1  {5.0: 0.59615386, 4.0: 0.21153846, 3.0: 0.1153...   
2  {4.0: 0.43537414, 3.0: 0.18367347, 5.0: 0.1292...   
3  {2.0:

In [45]:
enhanced_user_profiles2.to_pickle('./Datasets/Features3.10/user_profiles_2.0.pkl')
#enhanced_user_profiles2.info()

In [96]:
#enhanced_user_profiles2 = pd.read_pickle('./Datasets/Features3.10/user_profiles_2.0.pkl')
enhanced_user_profiles2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200948 entries, 0 to 200947
Data columns (total 54 columns):
 #   Column                                     Non-Null Count   Dtype  
---  ------                                     --------------   -----  
 0   userId                                     200948 non-null  int32  
 1   total_ratings                              200948 non-null  int32  
 2   rating_avg                                 200948 non-null  float32
 3   rating_std                                 200948 non-null  float32
 4   rating_proportions                         200948 non-null  object 
 5   bucket_0_1_movie_ids                       128663 non-null  object 
 6   bucket_0_1_count                           128663 non-null  float64
 7   bucket_0_1_year_avg                        128663 non-null  float64
 8   bucket_0_1_runtime_avg                     128663 non-null  float64
 9   bucket_0_1_rating_avg                      128663 non-null  float64
 10  bucket_0

In [97]:
def flatten_dictionary_columns(df):
    """
    Detects and flattens dictionary columns in a DataFrame using the apply(pd.Series) approach.
    
    For each column where the first non-null value is a dictionary, this function:
      - Expands the dictionary into multiple columns (one for each key)
      - Fills missing values with 0
      - Prefixes the new column names with the original column name
      - Concatenates the new columns to the DataFrame and drops the original dictionary column
    
    Parameters:
        df (pd.DataFrame): The input DataFrame.
    
    Returns:
        pd.DataFrame: The DataFrame with dictionary columns flattened.
    """
    # Identify columns where the first non-null value is a dict
    dict_cols = []
    for col in df.columns:
        # Only check if the column has at least one non-null value
        non_null = df[col].dropna()
        if not non_null.empty and isinstance(non_null.iloc[0], dict):
            dict_cols.append(col)
    
    # Process each dictionary column
    for col in dict_cols:
        # Expand the dictionaries in the column into a DataFrame
        expanded = df[col].apply(lambda x: pd.Series(x) if isinstance(x, dict) else pd.Series())
        # Fill missing values with 0 and prefix column names to avoid collisions
        expanded = expanded.fillna(0).add_prefix(f"{col}_")
        # Concatenate the new columns with the original DataFrame
        df = pd.concat([df, expanded], axis=1)
        # Drop the original dictionary column
        df.drop(columns=[col], inplace=True)
        
    return df

In [98]:
# Apply to your dataframe
enhanced_user_profiles2 = flatten_dictionary_columns(enhanced_user_profiles2)

# Print results
print(enhanced_user_profiles2.info())
enhanced_user_profiles2


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200948 entries, 0 to 200947
Columns: 170 entries, userId to overall_genre_proportions_Game-Show
dtypes: float32(48), float64(107), int32(3), object(12)
memory usage: 221.5+ MB
None


Unnamed: 0,userId,total_ratings,rating_avg,rating_std,bucket_0_1_movie_ids,bucket_0_1_count,bucket_0_1_year_avg,bucket_0_1_runtime_avg,bucket_0_1_rating_avg,bucket_0_1_numvotes_avg,...,overall_genre_proportions_Western,overall_genre_proportions_Animation,overall_genre_proportions_Musical,overall_genre_proportions_Film-Noir,overall_genre_proportions_Short,overall_genre_proportions_News,overall_genre_proportions_Reality-TV,overall_genre_proportions_Adult,overall_genre_proportions_Talk-Show,overall_genre_proportions_Game-Show
0,1,141,3.531915,1.537871,"[56172, 59113, 71853, 79367, 79470, 88933, 908...",26.0,1989.230713,126.730766,7.511539,2.207891e+05,...,0.002841,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2,52,4.269231,1.122242,"[109040, 110148, 110912]",3.0,1994.000000,121.000000,7.766666,1.000805e+06,...,0.000000,0.066116,0.008264,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,3,147,3.588435,1.014789,"[60782, 88161, 102216, 102926, 107144, 109040,...",9.0,1989.444458,101.111115,6.677778,3.198134e+05,...,0.002538,0.032995,0.002538,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,4,27,2.629630,1.043225,"[97162, 150377, 151137]",3.0,1995.666626,116.333336,6.566667,7.187000e+04,...,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,5,33,3.272727,0.910794,"[110912, 111161]",2.0,1994.000000,148.000000,9.100000,2.656763e+06,...,0.011494,0.034483,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
200943,200944,298,3.947987,0.620752,,,,,,,...,0.001208,0.022947,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0
200944,200945,108,3.101852,1.901499,"[93058, 97814, 105236, 110912, 112431, 116367,...",37.0,2004.540527,124.405403,7.799999,8.952797e+05,...,0.003690,0.014760,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0
200945,200946,23,4.391304,0.583027,,,,,,,...,0.000000,0.048387,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0
200946,200947,61,3.950820,0.650032,,,,,,,...,0.012121,0.072727,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
e

In [99]:
# Get all overall genre proportion columns
overall_cols = [col for col in enhanced_user_profiles2.columns if col.startswith("overall_genre_proportions_")]
print(overall_cols)
# Compute the mean for each overall genre column and sort descending
overall_means = enhanced_user_profiles2[overall_cols].mean().sort_values(ascending=False)

# Select the top 10 overall genre columns
top10_overall = overall_means.head(10).index.tolist()
top10_genres = [col.replace("overall_genre_proportions_", "") for col in top10_overall]

top10_genres

['overall_genre_proportions_Drama', 'overall_genre_proportions_Comedy', 'overall_genre_proportions_Romance', 'overall_genre_proportions_Crime', 'overall_genre_proportions_Biography', 'overall_genre_proportions_War', 'overall_genre_proportions_Adventure', 'overall_genre_proportions_Sci-Fi', 'overall_genre_proportions_Action', 'overall_genre_proportions_Thriller', 'overall_genre_proportions_Mystery', 'overall_genre_proportions_Fantasy', 'overall_genre_proportions_History', 'overall_genre_proportions_Music', 'overall_genre_proportions_Horror', 'overall_genre_proportions_Family', 'overall_genre_proportions_Sport', 'overall_genre_proportions_Documentary', 'overall_genre_proportions_Western', 'overall_genre_proportions_Animation', 'overall_genre_proportions_Musical', 'overall_genre_proportions_Film-Noir', 'overall_genre_proportions_Short', 'overall_genre_proportions_News', 'overall_genre_proportions_Reality-TV', 'overall_genre_proportions_Adult', 'overall_genre_proportions_Talk-Show', 'overa

['Drama',
 'Comedy',
 'Adventure',
 'Action',
 'Crime',
 'Thriller',
 'Romance',
 'Sci-Fi',
 'Fantasy',
 'Mystery']

In [100]:
overall_other_cols = [col for col in overall_cols if col not in top10_overall]
overall_other_cols

['overall_genre_proportions_Biography',
 'overall_genre_proportions_War',
 'overall_genre_proportions_History',
 'overall_genre_proportions_Music',
 'overall_genre_proportions_Horror',
 'overall_genre_proportions_Family',
 'overall_genre_proportions_Sport',
 'overall_genre_proportions_Documentary',
 'overall_genre_proportions_Western',
 'overall_genre_proportions_Animation',
 'overall_genre_proportions_Musical',
 'overall_genre_proportions_Film-Noir',
 'overall_genre_proportions_Short',
 'overall_genre_proportions_News',
 'overall_genre_proportions_Reality-TV',
 'overall_genre_proportions_Adult',
 'overall_genre_proportions_Talk-Show',
 'overall_genre_proportions_Game-Show']

In [105]:
enhanced_user_profiles2['overall_other_genre'] = enhanced_user_profiles2[overall_other_cols].sum(axis=1)

enhanced_user_profiles2.drop(columns=overall_other_cols)


Unnamed: 0,userId,total_ratings,rating_avg,rating_std,bucket_0_1_movie_ids,bucket_0_1_count,bucket_0_1_year_avg,bucket_0_1_runtime_avg,bucket_0_1_rating_avg,bucket_0_1_numvotes_avg,...,overall_genre_proportions_Comedy,overall_genre_proportions_Romance,overall_genre_proportions_Crime,overall_genre_proportions_Adventure,overall_genre_proportions_Sci-Fi,overall_genre_proportions_Action,overall_genre_proportions_Thriller,overall_genre_proportions_Mystery,overall_genre_proportions_Fantasy,overall_other_genre
0,1,141,3.531915,1.537871,"[56172, 59113, 71853, 79367, 79470, 88933, 908...",26.0,1989.230713,126.730766,7.511539,2.207891e+05,...,0.147727,0.107955,0.059659,0.045455,0.036932,0.036932,0.031250,0.031250,0.028409,0.161932
1,2,52,4.269231,1.122242,"[109040, 110148, 110912]",3.0,1994.000000,121.000000,7.766666,1.000805e+06,...,0.190083,0.123967,0.041322,0.090909,0.000000,0.074380,0.033058,0.016529,0.041322,0.148760
2,3,147,3.588435,1.014789,"[60782, 88161, 102216, 102926, 107144, 109040,...",9.0,1989.444458,101.111115,6.677778,3.198134e+05,...,0.144670,0.060914,0.032995,0.164975,0.071066,0.152284,0.040609,0.012690,0.038071,0.137056
3,4,27,2.629630,1.043225,"[97162, 150377, 151137]",3.0,1995.666626,116.333336,6.566667,7.187000e+04,...,0.114286,0.028571,0.085714,0.114286,0.057143,0.142857,0.057143,0.085714,0.028571,0.185714
4,5,33,3.272727,0.910794,"[110912, 111161]",2.0,1994.000000,148.000000,9.100000,2.656763e+06,...,0.068966,0.034483,0.080460,0.149425,0.034483,0.172414,0.080460,0.034483,0.034483,0.114943
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
200943,200944,298,3.947987,0.620752,,,,,,,...,0.074879,0.012077,0.061594,0.170290,0.115942,0.240338,0.077295,0.030193,0.045894,0.065217
200944,200945,108,3.101852,1.901499,"[93058, 97814, 105236, 110912, 112431, 116367,...",37.0,2004.540527,124.405403,7.799999,8.952797e+05,...,0.121771,0.059041,0.088561,0.059041,0.055351,0.059041,0.066421,0.062731,0.025830,0.092251
200945,200946,23,4.391304,0.583027,,,,,,,...,0.064516,0.048387,0.064516,0.177419,0.032258,0.145161,0.096774,0.000000,0.032258,0.129032
200946,200947,61,3.950820,0.650032,,,,,,,...,0.090909,0.042424,0.060606,0.139394,0.048485,0.066667,0.048485,0.030303,0.054545,0.187879


In [106]:
enhanced_user_profiles2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200948 entries, 0 to 200947
Columns: 171 entries, userId to overall_other_genre
dtypes: float32(49), float64(107), int32(3), object(12)
memory usage: 222.3+ MB


In [107]:
buckets = ["bucket_0_1", "bucket_2_3", "bucket_4_5"]

for bucket in buckets:
    # Get bucket genre proportion columns for the current bucket
    bucket_cols = [col for col in enhanced_user_profiles2.columns if col.startswith(f"{bucket}_genre_proportions_")]
    
    # Decide which columns to keep based on overall top 10 genres (comparing genre names)
    keep_cols = []
    drop_cols = []
    for col in bucket_cols:
        # Remove bucket prefix to get the genre name
        genre = col.replace(f"{bucket}_genre_proportions_", "")
        if genre in top10_genres:
            keep_cols.append(col)
        else:
            drop_cols.append(col)
    
    # If there are any columns to drop, combine them into a new "other" column
    if drop_cols:
        other_col_name = f"{bucket}_other_genre"
        enhanced_user_profiles2[other_col_name] = enhanced_user_profiles2[drop_cols].sum(axis=1)
        # Drop the original non-top10 bucket columns
        enhanced_user_profiles2.drop(columns=drop_cols, inplace=True)

In [108]:
enhanced_user_profiles2

Unnamed: 0,userId,total_ratings,rating_avg,rating_std,bucket_0_1_movie_ids,bucket_0_1_count,bucket_0_1_year_avg,bucket_0_1_runtime_avg,bucket_0_1_rating_avg,bucket_0_1_numvotes_avg,...,overall_genre_proportions_Short,overall_genre_proportions_News,overall_genre_proportions_Reality-TV,overall_genre_proportions_Adult,overall_genre_proportions_Talk-Show,overall_genre_proportions_Game-Show,overall_other_genre,bucket_0_1_other_genre,bucket_2_3_other_genre,bucket_4_5_other_genre
0,1,141,3.531915,1.537871,"[56172, 59113, 71853, 79367, 79470, 88933, 908...",26.0,1989.230713,126.730766,7.511539,2.207891e+05,...,0.0,0.0,0.0,0.0,0.0,0.0,0.161932,0.203125,0.168831,0.146919
1,2,52,4.269231,1.122242,"[109040, 110148, 110912]",3.0,1994.000000,121.000000,7.766666,1.000805e+06,...,0.0,0.0,0.0,0.0,0.0,0.0,0.148760,0.166667,0.076923,0.156863
2,3,147,3.588435,1.014789,"[60782, 88161, 102216, 102926, 107144, 109040,...",9.0,1989.444458,101.111115,6.677778,3.198134e+05,...,0.0,0.0,0.0,0.0,0.0,0.0,0.137056,0.052632,0.069307,0.165254
3,4,27,2.629630,1.043225,"[97162, 150377, 151137]",3.0,1995.666626,116.333336,6.566667,7.187000e+04,...,0.0,0.0,0.0,0.0,0.0,0.0,0.185714,0.250000,0.163265,0.230769
4,5,33,3.272727,0.910794,"[110912, 111161]",2.0,1994.000000,148.000000,9.100000,2.656763e+06,...,0.0,0.0,0.0,0.0,0.0,0.0,0.114943,0.000000,0.148936,0.081081
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
200943,200944,298,3.947987,0.620752,,,,,,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.065217,0.000000,0.070707,0.058824
200944,200945,108,3.101852,1.901499,"[93058, 97814, 105236, 110912, 112431, 116367,...",37.0,2004.540527,124.405403,7.799999,8.952797e+05,...,0.0,0.0,0.0,0.0,0.0,0.0,0.092251,0.140000,0.090909,0.066667
200945,200946,23,4.391304,0.583027,,,,,,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.129032,0.000000,0.000000,0.133333
200946,200947,61,3.950820,0.650032,,,,,,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.187879,0.000000,0.241379,0.188119


In [109]:
enhanced_user_profiles2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200948 entries, 0 to 200947
Columns: 121 entries, userId to bucket_4_5_other_genre
dtypes: float32(49), float64(57), int32(3), object(12)
memory usage: 145.6+ MB


In [110]:
for col in enhanced_user_profiles2.select_dtypes(include=['int64']).columns:
    enhanced_user_profiles2[col] = enhanced_user_profiles2[col].astype(np.int32)

# Convert float64 columns to float32
for col in enhanced_user_profiles2.select_dtypes(include=['float64']).columns:
    enhanced_user_profiles2[col] = enhanced_user_profiles2[col].astype(np.float32)

In [111]:
enhanced_user_profiles2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200948 entries, 0 to 200947
Columns: 121 entries, userId to bucket_4_5_other_genre
dtypes: float32(106), int32(3), object(12)
memory usage: 102.0+ MB


In [112]:
enhanced_user_profiles2.to_pickle("./Datasets/Features3.10/flattened_users.pkl")

## Normalize Data

#### Load Dataframe

In [113]:
total_memory = enhanced_user_profiles2.memory_usage(deep=True).sum() / (1024**3)
print(f"Total memory usage: {total_memory} bytes")

Total memory usage: 0.3318304643034935 bytes


#### Normalization 

In [114]:
for column in enhanced_user_profiles2.columns:
    print(column)
    print(enhanced_user_profiles2[column].head(5))

userId
0    1
1    2
2    3
3    4
4    5
Name: userId, dtype: int32
total_ratings
0    141
1     52
2    147
3     27
4     33
Name: total_ratings, dtype: int32
rating_avg
0    3.531915
1    4.269231
2    3.588435
3    2.629630
4    3.272727
Name: rating_avg, dtype: float32
rating_std
0    1.537871
1    1.122242
2    1.014789
3    1.043225
4    0.910794
Name: rating_std, dtype: float32
bucket_0_1_movie_ids
0    [56172, 59113, 71853, 79367, 79470, 88933, 908...
1                             [109040, 110148, 110912]
2    [60782, 88161, 102216, 102926, 107144, 109040,...
3                              [97162, 150377, 151137]
4                                     [110912, 111161]
Name: bucket_0_1_movie_ids, dtype: object
bucket_0_1_count
0    26.0
1     3.0
2     9.0
3     3.0
4     2.0
Name: bucket_0_1_count, dtype: float32
bucket_0_1_year_avg
0    1989.230713
1    1994.000000
2    1989.444458
3    1995.666626
4    1994.000000
Name: bucket_0_1_year_avg, dtype: float32
bucket_0_1_runtime_

In [115]:
overall_columns = [col for col in enhanced_user_profiles2.columns if "overall" in col]
print(overall_columns)

['overall_movie_ids', 'overall_count', 'overall_year_avg', 'overall_runtime_avg', 'overall_rating_avg', 'overall_numvotes_avg', 'overall_top_directors', 'overall_top_writers', 'overall_genre_proportions_Drama', 'overall_genre_proportions_Comedy', 'overall_genre_proportions_Romance', 'overall_genre_proportions_Crime', 'overall_genre_proportions_Biography', 'overall_genre_proportions_War', 'overall_genre_proportions_Adventure', 'overall_genre_proportions_Sci-Fi', 'overall_genre_proportions_Action', 'overall_genre_proportions_Thriller', 'overall_genre_proportions_Mystery', 'overall_genre_proportions_Fantasy', 'overall_genre_proportions_History', 'overall_genre_proportions_Music', 'overall_genre_proportions_Horror', 'overall_genre_proportions_Family', 'overall_genre_proportions_Sport', 'overall_genre_proportions_Documentary', 'overall_genre_proportions_Western', 'overall_genre_proportions_Animation', 'overall_genre_proportions_Musical', 'overall_genre_proportions_Film-Noir', 'overall_genre

In [116]:
valid_overall_columns = [col for col in overall_columns if col in enhanced_user_profiles2.columns]
enhanced_user_profiles2.drop(valid_overall_columns, axis=1, inplace=True)

In [117]:
enhanced_user_profiles2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200948 entries, 0 to 200947
Data columns (total 84 columns):
 #   Column                                     Non-Null Count   Dtype  
---  ------                                     --------------   -----  
 0   userId                                     200948 non-null  int32  
 1   total_ratings                              200948 non-null  int32  
 2   rating_avg                                 200948 non-null  float32
 3   rating_std                                 200948 non-null  float32
 4   bucket_0_1_movie_ids                       128663 non-null  object 
 5   bucket_0_1_count                           128663 non-null  float32
 6   bucket_0_1_year_avg                        128663 non-null  float32
 7   bucket_0_1_runtime_avg                     128663 non-null  float32
 8   bucket_0_1_rating_avg                      128663 non-null  float32
 9   bucket_0_1_numvotes_avg                    128663 non-null  float32
 10  bucket_0

In [118]:
total_memory = enhanced_user_profiles2.memory_usage(deep=True).sum() / (1024**3)
print(f"Total memory usage: {total_memory} bytes")

Total memory usage: 0.23900534957647324 bytes


In [119]:
enhanced_user_profiles2.copy().to_pickle("./Datasets/Features3.10/flattened_users_smol.pkl")

In [66]:
enhanced_user_profiles2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200948 entries, 0 to 200947
Data columns (total 45 columns):
 #   Column                                     Non-Null Count   Dtype  
---  ------                                     --------------   -----  
 0   userId                                     200948 non-null  int32  
 1   total_ratings                              200948 non-null  int32  
 2   rating_avg                                 200948 non-null  float32
 3   rating_std                                 200948 non-null  float32
 4   rating_proportions                         200948 non-null  object 
 5   bucket_0_1_movie_ids                       128663 non-null  object 
 6   bucket_0_1_count                           128663 non-null  float32
 7   bucket_0_1_year_avg                        128663 non-null  float32
 8   bucket_0_1_runtime_avg                     128663 non-null  float32
 9   bucket_0_1_rating_avg                      128663 non-null  float32
 10  bucket_0

In [120]:
movie_features.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 83955 entries, 0 to 83954
Data columns (total 9 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   tconst          83955 non-null  int32  
 1   startYear       83954 non-null  float32
 2   runtimeMinutes  83504 non-null  float32
 3   genres          83955 non-null  object 
 4   tconst_numeric  83955 non-null  float32
 5   averageRating   83955 non-null  float32
 6   numVotes        83955 non-null  float32
 7   directors       83955 non-null  object 
 8   writers         83955 non-null  object 
dtypes: float32(5), int32(1), object(3)
memory usage: 3.8+ MB


In [123]:
movie_features["genres"]

'Documentary'

In [124]:
map_ids_to_contiguous_indices(movie_features, ["genres"])
movie_features["genres"]

0             [7.0, 22.0]
1        [3.0, 5.0, 20.0]
2            [22.0, 23.0]
3             [7.0, 22.0]
4             [7.0, 22.0]
               ...       
83950          [5.0, 8.0]
83951               [5.0]
83952               [7.0]
83953              [25.0]
83954         [8.0, 13.0]
Name: genres, Length: 83955, dtype: object

In [126]:
movie_features.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 83955 entries, 0 to 83954
Data columns (total 9 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   tconst          83955 non-null  int32  
 1   startYear       83954 non-null  float32
 2   runtimeMinutes  83504 non-null  float32
 3   genres          83955 non-null  object 
 4   tconst_numeric  83955 non-null  float32
 5   averageRating   83955 non-null  float32
 6   numVotes        83955 non-null  float32
 7   directors       83955 non-null  object 
 8   writers         83955 non-null  object 
dtypes: float32(5), int32(1), object(3)
memory usage: 3.8+ MB


In [128]:
movie_features.copy().to_pickle("./Datasets/features3.10/movie_feat_genre_.pkl")