In [14]:
import pandas as pd
import numpy as np
import gc
from tqdm import tqdm

In [2]:
user_profiles_df = pd.read_pickle("./Datasets/Features3.10/flattened_users_smol.pkl").copy()
movie_features_df = pd.read_pickle("./Datasets/Features3.10/movie_feat_genre_.pkl").copy()
ratings_df = pd.read_pickle("./Datasets/Features3.10/ratings.pkl").copy()


In [3]:
ratings_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 31972560 entries, 0 to 31972559
Data columns (total 3 columns):
 #   Column  Dtype  
---  ------  -----  
 0   userId  int64  
 1   rating  float32
 2   imdbId  int32  
dtypes: float32(1), int32(1), int64(1)
memory usage: 487.9 MB


In [4]:
ratings_df["rating"] = ratings_df["rating"].astype("float32")
ratings_df["userId"] = ratings_df["userId"].astype("int32")
ratings_df["imdbId"] = ratings_df["imdbId"].astype("int32")
ratings_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 31972560 entries, 0 to 31972559
Data columns (total 3 columns):
 #   Column  Dtype  
---  ------  -----  
 0   userId  int32  
 1   rating  float32
 2   imdbId  int32  
dtypes: float32(1), int32(2)
memory usage: 365.9 MB


In [5]:
movie_features_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 83955 entries, 0 to 83954
Data columns (total 9 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   tconst          83955 non-null  int32  
 1   startYear       83954 non-null  float32
 2   runtimeMinutes  83504 non-null  float32
 3   genres          83955 non-null  object 
 4   tconst_numeric  83955 non-null  float32
 5   averageRating   83955 non-null  float32
 6   numVotes        83955 non-null  float32
 7   directors       83955 non-null  object 
 8   writers         83955 non-null  object 
dtypes: float32(5), int32(1), object(3)
memory usage: 3.8+ MB


In [6]:
user_profiles_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200948 entries, 0 to 200947
Data columns (total 84 columns):
 #   Column                                     Non-Null Count   Dtype  
---  ------                                     --------------   -----  
 0   userId                                     200948 non-null  int32  
 1   total_ratings                              200948 non-null  int32  
 2   rating_avg                                 200948 non-null  float32
 3   rating_std                                 200948 non-null  float32
 4   bucket_0_1_movie_ids                       128663 non-null  object 
 5   bucket_0_1_count                           128663 non-null  float32
 6   bucket_0_1_year_avg                        128663 non-null  float32
 7   bucket_0_1_runtime_avg                     128663 non-null  float32
 8   bucket_0_1_rating_avg                      128663 non-null  float32
 9   bucket_0_1_numvotes_avg                    128663 non-null  float32
 10  bucket_0

### Combine All Features in One Jupyter Notebook to Prevent Memory Crashes

In [7]:
def create_combined_features_table(ratings_df, user_profiles_df, movie_features_df):
    print("Starting to create combined features table...")

    # Process in chunks
    chunk_size = 1000000  # Adjust based on your available memory
    combined_dfs = []
    
    for chunk_start in tqdm(range(0, len(ratings_df), chunk_size), desc = "chunk processing:" ):
        chunk_end = chunk_start + chunk_size
        ratings_chunk = ratings_df.iloc[chunk_start:chunk_end]
        
        # Merge chunk with user profiles
        combined_chunk = ratings_chunk.merge(user_profiles_df, on="userId", how="left")
        combined_chunk = combined_chunk.merge(movie_features_df, left_on="imdbId", right_on="tconst", how="left")
        
        combined_dfs.append(combined_chunk)
        gc.collect()  # Free memory after each chunk
    
    # Concatenate all chunks
    final_df = pd.concat(combined_dfs, ignore_index=True)
    return final_df

In [8]:
combined_features = create_combined_features_table(
    ratings_df=ratings_df,
    user_profiles_df=user_profiles_df,
    movie_features_df=movie_features_df
)

Starting to create combined features table...


chunk processing:: 100%|████████████████████████| 32/32 [00:41<00:00,  1.28s/it]


In [9]:
pd.to_pickle(combined_features.copy(), "./Datasets/Features3.10/combined_features_xsmall.pkl")

In [13]:
print(len(combined_features.columns))
print(combined_features.info())
for col in combined_features.columns:
    print(combined_features[col].dtype)
    #print(combined_data[col].isnull().sum())
    #|print(combined_features[col].describe())

95
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 31972560 entries, 0 to 31972559
Data columns (total 95 columns):
 #   Column                                     Dtype  
---  ------                                     -----  
 0   userId                                     int32  
 1   rating                                     float32
 2   imdbId                                     int32  
 3   total_ratings                              int32  
 4   rating_avg                                 float32
 5   rating_std                                 float32
 6   bucket_0_1_movie_ids                       object 
 7   bucket_0_1_count                           float32
 8   bucket_0_1_year_avg                        float32
 9   bucket_0_1_runtime_avg                     float32
 10  bucket_0_1_rating_avg                      float32
 11  bucket_0_1_numvotes_avg                    float32
 12  bucket_0_1_top_directors                   object 
 13  bucket_0_1_top_writers               

In [11]:
### Drop a few columns Train for two days.
### Only shot I have the resources for it
### Remove columns though with already high correlation with each other.
### Perhaps best approach
### Find unnecessary columns
len(combined_features.columns)

95

In [12]:
# Calculate deep memory usage of the DataFrame
total_memory_bytes = combined_features.memory_usage(deep=True).sum()

# Convert bytes to gigabytes
total_memory_gb = total_memory_bytes / (1024**3)

print(f"Total memory usage: {total_memory_gb:.4f} GB")

Total memory usage: 51.2854 GB


In [20]:
print(combined_features["imdbId"].max())
print(combined_features["imdbId"].min())
len(combined_features["imdbId"].unique())

28995566
1


83955

### Reload Combined_featuers

In [None]:
combined_features = pd.read_pickle("./Datasets/Features3.10/combined_features.pkl")

In [47]:
combined_features.columns

Index(['userId', 'movieId', 'rating', 'timestamp', 'imdbId', 'total_ratings',
       'rating_avg', 'rating_std', 'bucket_0_1_movie_ids', 'bucket_0_1_count',
       ...
       'tconst', 'primaryTitle', 'isAdult', 'startYear', 'runtimeMinutes',
       'genres', 'averageRating', 'numVotes', 'directors', 'writers'],
      dtype='object', length=204)

### Remove userid 
We won't need it

In [8]:
combined_features.drop(["userId"], inplace= True, axis = 1)

### Check movieId and Tconst and ImdbID  

In [9]:
combined_features.columns

Index(['movieId', 'rating', 'timestamp', 'imdbId', 'total_ratings',
       'rating_avg', 'rating_std', 'bucket_0_1_movie_ids', 'bucket_0_1_count',
       'bucket_0_1_adult_prop',
       ...
       'tconst', 'primaryTitle', 'isAdult', 'startYear', 'runtimeMinutes',
       'genres', 'averageRating', 'numVotes', 'directors', 'writers'],
      dtype='object', length=203)

In [10]:
print(combined_features["movieId"])
print(combined_features["imdbId"])
print(combined_features["tconst"])

0              17
1              25
2              29
3              30
4              32
            ...  
31972555    79702
31972556    79796
31972557    80350
31972558    80463
31972559    87304
Name: movieId, Length: 31972560, dtype: int64
0            114388
1            113627
2            112682
3            115012
4            114746
             ...   
31972555     446029
31972556    1020558
31972557    1666186
31972558    1285016
31972559    1532503
Name: imdbId, Length: 31972560, dtype: int32
0            114388
1            113627
2            112682
3            115012
4            114746
             ...   
31972555     446029
31972556    1020558
31972557    1666186
31972558    1285016
31972559    1532503
Name: tconst, Length: 31972560, dtype: int32


In [11]:
combined_features.drop(["movieId", "imdbId"], inplace =True, axis = 1)

##### Remove Adult Columns

In [12]:
adult_columns =combined_features.columns[combined_features.columns.str.contains('adult', case = False)]
adult_columns

Index(['bucket_0_1_adult_prop', 'bucket_2_3_adult_prop',
       'bucket_4_5_adult_prop', 'overall_adult_prop',
       'bucket_0_1_genre_proportions_Adult',
       'bucket_2_3_genre_proportions_Adult',
       'bucket_4_5_genre_proportions_Adult', 'overall_genre_proportions_Adult',
       'isAdult'],
      dtype='object')

In [13]:
combined_features.drop(adult_columns, inplace = True, axis = 1)
combined_features

Unnamed: 0,rating,timestamp,total_ratings,rating_avg,rating_std,bucket_0_1_movie_ids,bucket_0_1_count,bucket_0_1_year_avg,bucket_0_1_year_var,bucket_0_1_runtime_avg,...,overall_genre_proportions_War,tconst,primaryTitle,startYear,runtimeMinutes,genres,averageRating,numVotes,directors,writers
0,4.0,944249077,141,3.531915,1.537870,"[tt0056172, tt0059113, tt0071853, tt0079367, t...",26.0,1989.230769,103.464615,126.730769,...,0.045455,114388,Sense and Sensibility,1995,136,"[Drama, Romance]",7.7,130226,[nm0000487],"[nm0000807, nm0000668]"
1,1.0,944250228,141,3.531915,1.537870,"[tt0056172, tt0059113, tt0071853, tt0079367, t...",26.0,1989.230769,103.464615,126.730769,...,0.045455,113627,Leaving Las Vegas,1995,111,"[Drama, Romance]",7.5,139377,[nm0001214],"[nm3840714, nm0001214]"
2,2.0,943230976,141,3.531915,1.537870,"[tt0056172, tt0059113, tt0071853, tt0079367, t...",26.0,1989.230769,103.464615,126.730769,...,0.045455,112682,The City of Lost Children,1995,112,"[Adventure, Drama, Fantasy]",7.5,72979,"[nm0001988, nm0000466]","[nm0012496, nm0000466, nm0001988, nm0491011]"
3,5.0,944249077,141,3.531915,1.537870,"[tt0056172, tt0059113, tt0071853, tt0079367, t...",26.0,1989.230769,103.464615,126.730769,...,0.045455,115012,Shanghai Triad,1995,108,"[Crime, Drama, History]",7.1,6155,[nm0955443],"[nm0270819, nm0910841, nm0944481]"
4,5.0,943228858,141,3.531915,1.537870,"[tt0056172, tt0059113, tt0071853, tt0079367, t...",26.0,1989.230769,103.464615,126.730769,...,0.045455,114746,12 Monkeys,1995,129,"[Mystery, Sci-Fi, Thriller]",8.0,660616,[nm0000416],"[nm0003408, nm0672459, nm0672466]"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
31972555,4.5,1294412589,236,3.595339,1.243887,"[tt0063462, tt0080761, tt0087800, tt0113198, t...",21.0,1999.904762,117.890476,96.952381,...,0.001642,446029,Scott Pilgrim vs. the World,2010,112,"[Action, Comedy, Fantasy]",7.5,478444,[nm0942367],"[nm0045209, nm0942367, nm1854069]"
31972556,1.0,1287216292,236,3.595339,1.243887,"[tt0063462, tt0080761, tt0087800, tt0113198, t...",21.0,1999.904762,117.890476,96.952381,...,0.001642,1020558,Centurion,2010,97,"[Action, Drama, History]",6.3,87775,[nm0551076],[nm0551076]
31972557,0.5,1294412671,236,3.595339,1.243887,"[tt0063462, tt0080761, tt0087800, tt0113198, t...",21.0,1999.904762,117.890476,96.952381,...,0.001642,1666186,Vampires Suck,2010,82,"[Comedy, Horror]",3.4,52852,"[nm0294997, nm0783536]","[nm0294997, nm0783536]"
31972558,3.5,1350423800,236,3.595339,1.243887,"[tt0063462, tt0080761, tt0087800, tt0113198, t...",21.0,1999.904762,117.890476,96.952381,...,0.001642,1285016,The Social Network,2010,120,"[Biography, Drama]",7.8,783018,[nm0000399],"[nm0815070, nm0583826]"


### Drop primaryTitle and timestamp

In [14]:
combined_features.drop(["primaryTitle", "timestamp"], inplace = True, axis = 1)
combined_features

Unnamed: 0,rating,total_ratings,rating_avg,rating_std,bucket_0_1_movie_ids,bucket_0_1_count,bucket_0_1_year_avg,bucket_0_1_year_var,bucket_0_1_runtime_avg,bucket_0_1_runtime_var,...,overall_genre_proportions_Game-Show,overall_genre_proportions_War,tconst,startYear,runtimeMinutes,genres,averageRating,numVotes,directors,writers
0,4.0,141,3.531915,1.537870,"[tt0056172, tt0059113, tt0071853, tt0079367, t...",26.0,1989.230769,103.464615,126.730769,962.524615,...,0.0,0.045455,114388,1995,136,"[Drama, Romance]",7.7,130226,[nm0000487],"[nm0000807, nm0000668]"
1,1.0,141,3.531915,1.537870,"[tt0056172, tt0059113, tt0071853, tt0079367, t...",26.0,1989.230769,103.464615,126.730769,962.524615,...,0.0,0.045455,113627,1995,111,"[Drama, Romance]",7.5,139377,[nm0001214],"[nm3840714, nm0001214]"
2,2.0,141,3.531915,1.537870,"[tt0056172, tt0059113, tt0071853, tt0079367, t...",26.0,1989.230769,103.464615,126.730769,962.524615,...,0.0,0.045455,112682,1995,112,"[Adventure, Drama, Fantasy]",7.5,72979,"[nm0001988, nm0000466]","[nm0012496, nm0000466, nm0001988, nm0491011]"
3,5.0,141,3.531915,1.537870,"[tt0056172, tt0059113, tt0071853, tt0079367, t...",26.0,1989.230769,103.464615,126.730769,962.524615,...,0.0,0.045455,115012,1995,108,"[Crime, Drama, History]",7.1,6155,[nm0955443],"[nm0270819, nm0910841, nm0944481]"
4,5.0,141,3.531915,1.537870,"[tt0056172, tt0059113, tt0071853, tt0079367, t...",26.0,1989.230769,103.464615,126.730769,962.524615,...,0.0,0.045455,114746,1995,129,"[Mystery, Sci-Fi, Thriller]",8.0,660616,[nm0000416],"[nm0003408, nm0672459, nm0672466]"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
31972555,4.5,236,3.595339,1.243887,"[tt0063462, tt0080761, tt0087800, tt0113198, t...",21.0,1999.904762,117.890476,96.952381,235.647619,...,0.0,0.001642,446029,2010,112,"[Action, Comedy, Fantasy]",7.5,478444,[nm0942367],"[nm0045209, nm0942367, nm1854069]"
31972556,1.0,236,3.595339,1.243887,"[tt0063462, tt0080761, tt0087800, tt0113198, t...",21.0,1999.904762,117.890476,96.952381,235.647619,...,0.0,0.001642,1020558,2010,97,"[Action, Drama, History]",6.3,87775,[nm0551076],[nm0551076]
31972557,0.5,236,3.595339,1.243887,"[tt0063462, tt0080761, tt0087800, tt0113198, t...",21.0,1999.904762,117.890476,96.952381,235.647619,...,0.0,0.001642,1666186,2010,82,"[Comedy, Horror]",3.4,52852,"[nm0294997, nm0783536]","[nm0294997, nm0783536]"
31972558,3.5,236,3.595339,1.243887,"[tt0063462, tt0080761, tt0087800, tt0113198, t...",21.0,1999.904762,117.890476,96.952381,235.647619,...,0.0,0.001642,1285016,2010,120,"[Biography, Drama]",7.8,783018,[nm0000399],"[nm0815070, nm0583826]"


### Remove Genres That Appear Significantly Less

In [15]:
genre_columns = combined_features.columns[combined_features.columns.str.contains("genre")]
genre_columns

Index(['bucket_0_1_genre_proportions_Adventure',
       'bucket_0_1_genre_proportions_Thriller',
       'bucket_0_1_genre_proportions_News',
       'bucket_0_1_genre_proportions_Talk-Show',
       'bucket_0_1_genre_proportions_Romance',
       'bucket_0_1_genre_proportions_Western',
       'bucket_0_1_genre_proportions_Sport',
       'bucket_0_1_genre_proportions_Fantasy',
       'bucket_0_1_genre_proportions_Horror',
       'bucket_0_1_genre_proportions_Crime',
       ...
       'overall_genre_proportions_Reality-TV',
       'overall_genre_proportions_Action',
       'overall_genre_proportions_Documentary',
       'overall_genre_proportions_Animation',
       'overall_genre_proportions_Mystery', 'overall_genre_proportions_Music',
       'overall_genre_proportions_Musical',
       'overall_genre_proportions_Game-Show', 'overall_genre_proportions_War',
       'genres'],
      dtype='object', length=108)

#### Create Other Column For Genres

In [16]:
overall_genre_columns = genre_columns[genre_columns.str.contains("overall")]
overall_genre_columns

Index(['overall_genre_proportions_Adventure',
       'overall_genre_proportions_Thriller', 'overall_genre_proportions_News',
       'overall_genre_proportions_Talk-Show',
       'overall_genre_proportions_Romance',
       'overall_genre_proportions_Western', 'overall_genre_proportions_Sport',
       'overall_genre_proportions_Fantasy', 'overall_genre_proportions_Horror',
       'overall_genre_proportions_Crime', 'overall_genre_proportions_Sci-Fi',
       'overall_genre_proportions_Biography',
       'overall_genre_proportions_History', 'overall_genre_proportions_Family',
       'overall_genre_proportions_Film-Noir',
       'overall_genre_proportions_Comedy', 'overall_genre_proportions_Drama',
       'overall_genre_proportions_Short',
       'overall_genre_proportions_Reality-TV',
       'overall_genre_proportions_Action',
       'overall_genre_proportions_Documentary',
       'overall_genre_proportions_Animation',
       'overall_genre_proportions_Mystery', 'overall_genre_proportions_M

##### Find top 15 columns with highest average

In [17]:
# Get means of all columns and sort in descending order
means = combined_features[overall_genre_columns].mean().sort_values(ascending=False)

# Display top N means (e.g., top 10)
print(means.head(15))

overall_genre_proportions_Drama        0.192170
overall_genre_proportions_Comedy       0.135380
overall_genre_proportions_Action       0.107409
overall_genre_proportions_Adventure    0.106616
overall_genre_proportions_Crime        0.071704
overall_genre_proportions_Thriller     0.056908
overall_genre_proportions_Romance      0.053774
overall_genre_proportions_Sci-Fi       0.050607
overall_genre_proportions_Fantasy      0.039076
overall_genre_proportions_Mystery      0.036953
overall_genre_proportions_Horror       0.028189
overall_genre_proportions_Animation    0.025958
overall_genre_proportions_Family       0.023122
overall_genre_proportions_Biography    0.022253
overall_genre_proportions_War          0.009906
dtype: float64


In [19]:
top15 = means.head(15).keys()
top15

Index(['overall_genre_proportions_Drama', 'overall_genre_proportions_Comedy',
       'overall_genre_proportions_Action',
       'overall_genre_proportions_Adventure',
       'overall_genre_proportions_Crime', 'overall_genre_proportions_Thriller',
       'overall_genre_proportions_Romance', 'overall_genre_proportions_Sci-Fi',
       'overall_genre_proportions_Fantasy',
       'overall_genre_proportions_Mystery', 'overall_genre_proportions_Horror',
       'overall_genre_proportions_Animation',
       'overall_genre_proportions_Family',
       'overall_genre_proportions_Biography', 'overall_genre_proportions_War'],
      dtype='object')

In [20]:
top15genre = top15.str.replace("overall_genre_proportions_", "")
top15genre

Index(['Drama', 'Comedy', 'Action', 'Adventure', 'Crime', 'Thriller',
       'Romance', 'Sci-Fi', 'Fantasy', 'Mystery', 'Horror', 'Animation',
       'Family', 'Biography', 'War'],
      dtype='object')

In [21]:
bucket_prefixes = ['bucket_0_1', 'bucket_2_3', 'bucket_4_5', 'overall']
allothers = []  # To collect columns that should be dropped

for bucket in bucket_prefixes:
    other = []
    curr_genres = [col for col in genre_columns if col.startswith(bucket)]
    for curr_genre in curr_genres:
        if curr_genre.split("_")[-1] not in top15genre:
            other.append(curr_genre)
    combined_features[bucket + "_genre_proportions_Other"] = combined_features[other].sum(axis=1)
    allothers.extend(other)

print(combined_features.shape)  # Check the shape of the updated dataframe
print(f"Number of columns that can be dropped: {len(allothers)}")

# Now you can drop these columns if needed:


(31972560, 194)
Number of columns that can be dropped: 47


In [22]:
combined_features.drop(columns=allothers, inplace=True)

In [23]:
combined_features.shape

(31972560, 147)

In [93]:
for col in combined_features.columns:
    print(col)
    print(combined_features[col].dtype)
    #print(combined_data[col].isnull().sum())
    #|print(combined_features[col].describe())


rating
float64
total_ratings
int64
rating_avg
float64
rating_std
float64
bucket_0_1_movie_ids
object
bucket_0_1_count
float64
bucket_0_1_year_avg
float64
bucket_0_1_year_var
float64
bucket_0_1_runtime_avg
float64
bucket_0_1_runtime_var
float64
bucket_0_1_rating_avg
float64
bucket_0_1_rating_var
float64
bucket_0_1_numvotes_avg
float64
bucket_0_1_numvotes_var
float64
bucket_0_1_top_directors
object
bucket_0_1_top_writers
object
bucket_2_3_movie_ids
object
bucket_2_3_count
float64
bucket_2_3_year_avg
float64
bucket_2_3_year_var
float64
bucket_2_3_runtime_avg
float64
bucket_2_3_runtime_var
float64
bucket_2_3_rating_avg
float64
bucket_2_3_rating_var
float64
bucket_2_3_numvotes_avg
float64
bucket_2_3_numvotes_var
float64
bucket_2_3_top_directors
object
bucket_2_3_top_writers
object
bucket_4_5_movie_ids
object
bucket_4_5_count
float64
bucket_4_5_year_avg
float64
bucket_4_5_year_var
float64
bucket_4_5_runtime_avg
float64
bucket_4_5_runtime_var
float64
bucket_4_5_rating_avg
float64
bucket_4_5_r

In [24]:
combined_features["tconst_str"] = "tt" + combined_features["tconst"].astype(str)
combined_features["tconst_str"]

0            tt114388
1            tt113627
2            tt112682
3            tt115012
4            tt114746
              ...    
31972555     tt446029
31972556    tt1020558
31972557    tt1666186
31972558    tt1285016
31972559    tt1532503
Name: tconst_str, Length: 31972560, dtype: object

In [25]:
combined_features.drop(columns=["tconst"], inplace=True)

In [26]:
combined_features.to_pickle("./Datasets/Features3.10/combined_features2.0med.pkl")

In [102]:
combined_features.columns

Index(['rating', 'total_ratings', 'rating_avg', 'rating_std',
       'bucket_0_1_movie_ids', 'bucket_0_1_count', 'bucket_0_1_year_avg',
       'bucket_0_1_year_var', 'bucket_0_1_runtime_avg',
       'bucket_0_1_runtime_var',
       ...
       'genres', 'averageRating', 'numVotes', 'directors', 'writers',
       'bucket_0_1_genre_proportions_Other',
       'bucket_2_3_genre_proportions_Other',
       'bucket_4_5_genre_proportions_Other', 'overall_genre_proportions_Other',
       'tconst_str'],
      dtype='object', length=147)

In [106]:
for column in combined_features.columns:
    print(column)
    print(combined_features[column][0])
    print(combined_features[column].dtype)

rating
4.0
float64
total_ratings
141
int64
rating_avg
3.5319148936170213
float64
rating_std
1.5378704733185982
float64
bucket_0_1_movie_ids
['tt0056172', 'tt0059113', 'tt0071853', 'tt0079367', 'tt0079470', 'tt0088933', 'tt0090830', 'tt0092699', 'tt0093389', 'tt0093779', 'tt0095953', 'tt0097441', 'tt0104257', 'tt0109045', 'tt0112740', 'tt0112818', 'tt0113627', 'tt0117318', 'tt0117631', 'tt0118799', 'tt0119822', 'tt0120663', 'tt0120716', 'tt0128853', 'tt0163187', 'tt0166896']
object
bucket_0_1_count
26.0
float64
bucket_0_1_year_avg
1989.2307692307693
float64
bucket_0_1_year_var
103.46461538461534
float64
bucket_0_1_runtime_avg
126.73076923076923
float64
bucket_0_1_runtime_var
962.5246153846153
float64
bucket_0_1_rating_avg
7.511538461538461
float64
bucket_0_1_rating_var
0.39786153846153843
float64
bucket_0_1_numvotes_avg
220789.07692307694
float64
bucket_0_1_numvotes_var
41145849087.273834
float64
bucket_0_1_top_directors
['nm0000180', 'nm0001402', 'nm0000985']
object
bucket_0_1_top_writ

### Do Further Feature Reduction

In [27]:
var_columns = combined_features.columns[combined_features.columns.str.contains("var")]
var_columns

Index(['bucket_0_1_year_var', 'bucket_0_1_runtime_var',
       'bucket_0_1_rating_var', 'bucket_0_1_numvotes_var',
       'bucket_2_3_year_var', 'bucket_2_3_runtime_var',
       'bucket_2_3_rating_var', 'bucket_2_3_numvotes_var',
       'bucket_4_5_year_var', 'bucket_4_5_runtime_var',
       'bucket_4_5_rating_var', 'bucket_4_5_numvotes_var', 'overall_year_var',
       'overall_runtime_var', 'overall_rating_var', 'overall_numvotes_var'],
      dtype='object')

In [28]:
combined_features.drop(var_columns, inplace = True, axis = 1)
combined_features

Unnamed: 0,rating,total_ratings,rating_avg,rating_std,bucket_0_1_movie_ids,bucket_0_1_count,bucket_0_1_year_avg,bucket_0_1_runtime_avg,bucket_0_1_rating_avg,bucket_0_1_numvotes_avg,...,genres,averageRating,numVotes,directors,writers,bucket_0_1_genre_proportions_Other,bucket_2_3_genre_proportions_Other,bucket_4_5_genre_proportions_Other,overall_genre_proportions_Other,tconst_str
0,4.0,141,3.531915,1.537870,"[tt0056172, tt0059113, tt0071853, tt0079367, t...",26.0,1989.230769,126.730769,7.511538,220789.076923,...,"[Drama, Romance]",7.7,130226,[nm0000487],"[nm0000807, nm0000668]",0.062500,0.038961,0.047393,0.048295,tt114388
1,1.0,141,3.531915,1.537870,"[tt0056172, tt0059113, tt0071853, tt0079367, t...",26.0,1989.230769,126.730769,7.511538,220789.076923,...,"[Drama, Romance]",7.5,139377,[nm0001214],"[nm3840714, nm0001214]",0.062500,0.038961,0.047393,0.048295,tt113627
2,2.0,141,3.531915,1.537870,"[tt0056172, tt0059113, tt0071853, tt0079367, t...",26.0,1989.230769,126.730769,7.511538,220789.076923,...,"[Adventure, Drama, Fantasy]",7.5,72979,"[nm0001988, nm0000466]","[nm0012496, nm0000466, nm0001988, nm0491011]",0.062500,0.038961,0.047393,0.048295,tt112682
3,5.0,141,3.531915,1.537870,"[tt0056172, tt0059113, tt0071853, tt0079367, t...",26.0,1989.230769,126.730769,7.511538,220789.076923,...,"[Crime, Drama, History]",7.1,6155,[nm0955443],"[nm0270819, nm0910841, nm0944481]",0.062500,0.038961,0.047393,0.048295,tt115012
4,5.0,141,3.531915,1.537870,"[tt0056172, tt0059113, tt0071853, tt0079367, t...",26.0,1989.230769,126.730769,7.511538,220789.076923,...,"[Mystery, Sci-Fi, Thriller]",8.0,660616,[nm0000416],"[nm0003408, nm0672459, nm0672466]",0.062500,0.038961,0.047393,0.048295,tt114746
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
31972555,4.5,236,3.595339,1.243887,"[tt0063462, tt0080761, tt0087800, tt0113198, t...",21.0,1999.904762,96.952381,5.447619,126668.428571,...,"[Action, Comedy, Fantasy]",7.5,478444,[nm0942367],"[nm0045209, nm0942367, nm1854069]",0.074074,0.021583,0.034582,0.032841,tt446029
31972556,1.0,236,3.595339,1.243887,"[tt0063462, tt0080761, tt0087800, tt0113198, t...",21.0,1999.904762,96.952381,5.447619,126668.428571,...,"[Action, Drama, History]",6.3,87775,[nm0551076],[nm0551076],0.074074,0.021583,0.034582,0.032841,tt1020558
31972557,0.5,236,3.595339,1.243887,"[tt0063462, tt0080761, tt0087800, tt0113198, t...",21.0,1999.904762,96.952381,5.447619,126668.428571,...,"[Comedy, Horror]",3.4,52852,"[nm0294997, nm0783536]","[nm0294997, nm0783536]",0.074074,0.021583,0.034582,0.032841,tt1666186
31972558,3.5,236,3.595339,1.243887,"[tt0063462, tt0080761, tt0087800, tt0113198, t...",21.0,1999.904762,96.952381,5.447619,126668.428571,...,"[Biography, Drama]",7.8,783018,[nm0000399],"[nm0815070, nm0583826]",0.074074,0.021583,0.034582,0.032841,tt1285016


In [29]:
combined_features.to_pickle("./Datasets/Features3.10/combined_features2.0small.pkl")

In [32]:
combined_features.to_parquet("./Datasets/Features3.10/combined_features2.0small.parquet")

: 

### Convert Into Vaex compatible format

In [31]:
for col in combined_features.columns:
    print(col)
    print(combined_features[col].dtype)
    print(combined_features[col][0])

rating
float64
4.0
total_ratings
int64
141
rating_avg
float64
3.5319148936170213
rating_std
float64
1.5378704733185982
bucket_0_1_movie_ids
object
['tt0056172', 'tt0059113', 'tt0071853', 'tt0079367', 'tt0079470', 'tt0088933', 'tt0090830', 'tt0092699', 'tt0093389', 'tt0093779', 'tt0095953', 'tt0097441', 'tt0104257', 'tt0109045', 'tt0112740', 'tt0112818', 'tt0113627', 'tt0117318', 'tt0117631', 'tt0118799', 'tt0119822', 'tt0120663', 'tt0120716', 'tt0128853', 'tt0163187', 'tt0166896']
bucket_0_1_count
float64
26.0
bucket_0_1_year_avg
float64
1989.2307692307693
bucket_0_1_runtime_avg
float64
126.73076923076923
bucket_0_1_rating_avg
float64
7.511538461538461
bucket_0_1_numvotes_avg
float64
220789.07692307694
bucket_0_1_top_directors
object
['nm0000180', 'nm0001402', 'nm0000985']
bucket_0_1_top_writers
object
['nm0004122', 'nm0001385', 'nm0001589']
bucket_2_3_movie_ids
object
['tt0034583', 'tt0042546', 'tt0045793', 'tt0051459', 'tt0061512', 'tt0064115', 'tt0085244', 'tt0086190', 'tt0087995', 

### Load in Pkl

In [114]:
filtered = pd.read_pickle("./Datasets/Features/combined_features2.0med.pkl")


In [115]:
drop_proportion = 0.5
# Get original dataset size
original_size = len(filtered)
print(f"Original dataset: {original_size} rows and {len(df.columns)} columns")

# Calculate how many rows to drop
rows_to_drop = int(original_size * (1 - drop_proportion))

# Generate indices to drop (random selection)
import numpy as np
drop_indices = np.random.choice(filtered.index, size=rows_to_drop, replace=False)

# Drop rows in-place to avoid creating a full copy
filtered.drop(drop_indices, inplace=True)
    

Original dataset: 31972560 rows and 186 columns


In [116]:
len(filtered)


15986280

In [117]:
filtered.to_pickle("./Datasets/Features/combined_features2.0small.pkl")

In [122]:
filtered["bucket_0_1_movie_ids"].dtype

dtype('O')

In [124]:
filtered["bucket_0_1_movie_ids"][0]

list

In [131]:
empty_values = filtered["bucket_0_1_movie_ids"].isna()

In [136]:
object_columns = filtered.select_dtypes(include="object").columns


In [137]:
object_columns

Index(['bucket_0_1_movie_ids', 'bucket_0_1_top_directors',
       'bucket_0_1_top_writers', 'bucket_2_3_movie_ids',
       'bucket_2_3_top_directors', 'bucket_2_3_top_writers',
       'bucket_4_5_movie_ids', 'bucket_4_5_top_directors',
       'bucket_4_5_top_writers', 'overall_movie_ids', 'overall_top_directors',
       'overall_top_writers', 'startYear', 'runtimeMinutes', 'genres',
       'directors', 'writers', 'tconst_str'],
      dtype='object')

### Verify that Object Columns are Consistent

In [140]:
filtered["runtimeMinutes"]

0           136
1           111
2           112
4           129
6           122
           ... 
31972550    109
31972553     98
31972556     97
31972557     82
31972559    105
Name: runtimeMinutes, Length: 15986280, dtype: object

In [141]:
filtered["runtimeMinutes"] = pd.to_numeric(filtered["runtimeMinutes"], errors='coerce').astype("Int64")

### Divide Columns into Types 

In [None]:
### How would I do this
### Divide into object columns
    ### Divide into strings
    ### Divide into 
### Divide into Non-object

In [143]:
for col in object_columns:
    if col not in filtered.columns:
        print(f"Column '{col}' not found in DataFrame.")
        continue
    
    # Get unique types in the column (excluding NaNs)
    unique_types = set(filtered[col].dropna().map(type))
#    if len(unique_types) == 1:
#        print(f"✅ Column '{col}' is consistent. Type: {list(unique_types)[0].__name__}")
    if len(unique_types) > 1:
        print(f"❌ Column '{col}' is inconsistent. Found types: {unique_types}")
    else:
        print(unique_types)

{<class 'list'>}
{<class 'list'>}
{<class 'list'>}
{<class 'list'>}
{<class 'list'>}
{<class 'list'>}
{<class 'list'>}
{<class 'list'>}
{<class 'list'>}
{<class 'list'>}
{<class 'list'>}
{<class 'list'>}
{<class 'str'>}
{<class 'int'>}
{<class 'list'>}
{<class 'list'>}
{<class 'list'>}
{<class 'str'>}


In [144]:
list_columns = []
string_columns = []

for col in object_columns:
    if col not in filtered.columns:
        print(f"Column '{col}' not found in DataFrame.")
        continue
    
    # Get unique types in the column (excluding NaNs)
    unique_types = set(filtered[col].dropna().map(type))
    
    # Categorize based on type
    if len(unique_types) == 1:  # Only process consistent type columns
        col_type = list(unique_types)[0]
        if col_type == list:
            list_columns.append(col)
        elif col_type == str:
            string_columns.append(col)
    else:
        print(f"❌ Column '{col}' is inconsistent. Found types: {unique_types}")

print("\nList columns:", list_columns)
print("\nString columns:", string_columns)


List columns: ['bucket_0_1_movie_ids', 'bucket_0_1_top_directors', 'bucket_0_1_top_writers', 'bucket_2_3_movie_ids', 'bucket_2_3_top_directors', 'bucket_2_3_top_writers', 'bucket_4_5_movie_ids', 'bucket_4_5_top_directors', 'bucket_4_5_top_writers', 'overall_movie_ids', 'overall_top_directors', 'overall_top_writers', 'genres', 'directors', 'writers']

String columns: ['startYear', 'tconst_str']


### Handle StartYear

In [146]:
filtered["startYear"] = pd.to_numeric(filtered["startYear"], errors='coerce').astype("Int64")
filtered["startYear"]

0           1995
1           1995
2           1995
4           1995
6           1995
            ... 
31972550    2009
31972553    2010
31972556    2010
31972557    2010
31972559    2010
Name: startYear, Length: 15986280, dtype: Int64

### To Parquet

In [147]:
filtered.to_parquet("./Datasets/Features/combined_features2.0small.parquet")

: 

### Define Schema for Parquet
Do this if there is an inconsistency