## Import Libraries

In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, HistGradientBoostingRegressor
from scipy.stats.mstats import winsorize
from datetime import datetime
import re
import string
from nltk.corpus import stopwords
import itertools
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.sparse import csr_matrix, hstack
from sklearn.cluster import KMeans

## Functions - Data Preprocessing

In [2]:
def extract_key_values(df, col):
    """
    Extracts and flattens values from a dictionary-like structure in a DataFrame column.
    :param df: DataFrame containing the column to explode.
    :param col: Column name to be processed.
    :return: List of flattened values.
    """
    return [name for item_list in df[col].dropna().map(eval).tolist() for item in item_list for name in item['name']]

def top_k_replace(df, col, k, val):
    """
    Replaces all but the top k elements in a DataFrame column with a specified value.
    :param df: DataFrame containing the column.
    :param col: Column name to be processed.
    :param k: Number of top elements to keep.
    :param val: Value to replace other elements with.
    """
    top = df[col].value_counts().nlargest(k).index
    df[col] = df[col].where(df[col].isin(top), other=val)

def zero_replace(df, col, val):
    """
    Replaces zero values in a DataFrame column with a specified value.
    :param df: DataFrame containing the column.
    :param col: Column name where zeros are to be replaced.
    :param val: Value to replace zeros with.
    """
    df[col] = df[col].replace(0, val)

def extract_key_values_list(df, col):
    """
    Similar to extract_key_values but returns a 2-dimensional list of values.
    :param df: DataFrame containing the column to explode.
    :param col: Column name to be processed.
    :return: 2D list of values.
    """
    return df[col].dropna().map(lambda x: [item['name'] for item in eval(x)]).tolist()

# splits columns into left and right columns given delimiter
# input: df->dataframe, col->string of column name, lcol->left column name, rcol->right column string name, delim->delimiter 
# output: updated dataframe with the left and right columns included
def column_split(df, col, lcol, rcol, delim):
    ls_lcol = []
    ls_rcol = []
    for i in df[col]:
        sep = i.split(delim)
        lval = np.int_(sep[0])
        rval = np.int_(sep[1])
        ls_lcol.append(lval)
        ls_rcol.append(rval)
    df[lcol] = ls_lcol
    df[rcol] = ls_rcol
    return df
## merges train and metadata into a single dataset, ditto with test
def get_datasets(m):
    train = pd.read_csv("train.csv")
    train = column_split(train, 'userId_movieId', 'user_id','id', '_')
    train = train.merge(m, how = 'left', on = 'id')
    train.fillna(False, inplace = True)
    
    test = pd.read_csv("test.csv")
    test = column_split(test, "userId_movieId", "user_id", "id", "_")
    test = test.merge(m, how = 'left', on = 'id')
    test.fillna(False, inplace = True)
    return (test, train)

## Natural Language Processing

In [3]:
def preprocess(text):
    text = text.lower()
    text=text.strip()
    text=re.compile('<.*?>').sub('', text)
    text = re.compile('[%s]' % re.escape(string.punctuation)).sub(' ', text)
    text = re.sub('\s+', ' ', text)
    text = re.sub(r'\[[0-9]*\]',' ',text)
    text=re.sub(r'[^\w\s]', '', str(text).lower().strip())
    text = re.sub(r'\d',' ',text)
    text = re.sub(r'\s+',' ',text)
    return text
# stopword removal
def stopword(string):
    a= [i for i in string.split() if i not in stopwords.words('english')]
    return ' '.join(a)


## collaborative filtering functions
# returns list of once one-hot encoded column names based on given regular expression
def get_column_names(df, regexpr):
    col_names = []
    for col in df.columns:
        x = re.findall(regexpr, col)
        if x:
            col_names.append(x[0])
    return col_names

# input: df->dataframe, col->name of dataframe, col_names->list of column names
# output: dataframe of averages based on col_names
# goal: constructs dataframe of name col with columns col_names with data from the mean of the ratings partitioned by col_names
def get_ratings(df, col, col_names):
    ratings = {}
    for col_name in col_names:
        avg_rating_name = df[df[col_name] == True]['rating'].mean()
        if np.isnan(avg_rating_name):
            avg_rating_name = 0 # 0 cuz no data
        ratings[col_name] = avg_rating_name
    return pd.DataFrame(pd.Series(ratings, name = col)).astype(float)


# merges data (again) with the collaborative filtering dataframe
def pref_merge(df, pref_df):
    df = df.merge(pref_df, how = 'left', on = 'user_id')
    df.fillna(False, inplace = True)
    return df


## modeling functions
## evaluates performance
def model_evaluate(truth, pred, model_name):
    print("PERFORMANCE OF {0}".format(model_name))
    rmse = np.sqrt(mean_squared_error(truth, pred))
    print("RMSE on testing set = ", rmse)
    print("(mean normalised) RMSE on testing set = ", rmse / truth.mean())
    print("(minmax normalised) RMSE on testing set = ", rmse / (truth.max() - truth.min()))
    print("\n")
    return rmse

## runs all the models
def run_models(test, train):
    X_train, X_test, y_train, y_test = train_test_split(
        train.drop('rating',axis=1, inplace = False),
        train['rating'],
        test_size=1/6.0,
        random_state=0)
    
    
    
    ## 'vanilla' models refer to models with all default hyperparameters
    # models with names consisting of just the type of model used refers to models with hyperparameters found through grid search
    # 'bayesian' models refer to models with hyperparameters found through bayesian optimisation (or at least close to it)
    models = [LinearRegression(),
              DecisionTreeRegressor(max_depth = 10, max_leaf_nodes = 64, min_samples_leaf = 4, min_samples_split = 64, 
                                    splitter = 'best'),
              DecisionTreeRegressor(max_depth = 28, max_leaf_nodes = 64, min_samples_leaf = 21, min_samples_split = 96,
                                    splitter = 'best'),
              RandomForestRegressor(n_estimators = 100, bootstrap = True, max_samples = 1000,
                                    max_depth = 10, max_leaf_nodes = 64,
                                    min_samples_leaf = 4, min_samples_split = 64),
              RandomForestRegressor(n_estimators = 139, bootstrap = True, max_samples = 1906,
                                    max_depth = 19, max_leaf_nodes = 95,
                                    min_samples_leaf = 3, min_samples_split = 13),
              HistGradientBoostingRegressor(),
              HistGradientBoostingRegressor(learning_rate = 0.125, max_iter = 100, max_bins = 255,
                                            max_leaf_nodes = 63, min_samples_leaf = 20, max_depth = 10),
              
              # note that this is not exactly optimal, but it gets close.
              HistGradientBoostingRegressor(learning_rate = 0.10877288751758378, max_iter = 258, max_bins = 255,
                                            max_leaf_nodes = 50, min_samples_leaf = 47, max_depth = 18)
             ]
    model_names = ["LINEAR REGRESSION",
                    "DECISION TREE",
                    "BAYESIAN DECISION TREE",
                    "RANDOM FOREST",
                    "BAYESIAN RANDOM FOREST",
                    "VANILLA GRADIENT BOOSTING",
                    "GRADIENT BOOSTING",
                    "BAYESIAN GRADIENT BOOSTING"]
    
    (min_rmse, mindex) = ((2**63) -1, 0)
    
    for i in range(len(models)):
        models[i].fit(X_train, y_train)
        pred = models[i].predict(X_test)
        rmse = model_evaluate(y_test, pred, model_names[i])
        if (rmse < min_rmse):
            (min_rmse, mindex) = (rmse, i)
    print("BEST PERFORMING MODEL:\n\t{0} WITH RMSE {1}".format(model_names[mindex], min_rmse))       
    return (models[mindex].predict(test), min_rmse)

## Content Filtering

In [4]:
## clean setup each time
metadata = pd.read_csv("movies_metadata.csv")

### CLEAN METADATA ###
# 1. remove duplicates
metadata.drop_duplicates(subset = ['id'], inplace = True)

# 2. map removal, top k replace
k = 10
for col in ['spoken_languages', 'production_companies', 'production_countries']:
    metadata[col] = pd.DataFrame(extract_key_values(metadata, col))
    misc = 'Other' 
    top_k_replace(metadata, col, k, misc)
misc = 'Foreign'
top_k_replace(metadata, 'original_language', k, misc)
        
# 3. zero replace
columns = ['runtime', 'vote_count', 'vote_average']
for col in columns:
    zero_replace(metadata, col, metadata[col].median() // 1)
    
# 4. making revenue and budget be based off of thresholds
winsorize(metadata['budget'], limits=[0.01, 0.05])
winsorize(metadata['revenue'], limits=[0.2875, 0.05])
    
# 5. one-hot encoding, key-value extraction
g = extract_key_values_list(metadata, 'genres')
genres = np.unique(np.array(extract_key_values(metadata, 'genres')))
for genre in genres:
    metadata['is_' + genre] = [genre in l for l in g]
col = ['original_language', 'production_companies', 'production_countries', 'spoken_languages']
metadata = pd.get_dummies(data = metadata, columns = col)
    
# 6. updating year
# magic string date "2001-08-03" is the median date (50th percentile) with untreated NaN
dates = pd.DataFrame([datetime.strptime("2001-08-03" if (type(date) != str) else date, "%Y-%m-%d") for date in metadata['release_date']])[0]
metadata['year'] = pd.DataFrame([date.year for date in dates])[0]
metadata['month'] = pd.DataFrame([date.month for date in dates])[0]
metadata['decade'] = pd.DataFrame([(date.year // 10) * 10 for date in dates])[0]

# 7. make booleans integers
metadata.fillna(False, inplace = True)
boolies = list(metadata.select_dtypes(bool).columns)
metadata[boolies] = metadata[boolies].astype(bool)
    
# 8. data preprocessing of the 'overview' column
text = (metadata['overview'].astype(str)).apply(lambda x: stopword(preprocess(x)))
tfidf = TfidfVectorizer(max_features = 20000, ngram_range = (1, 5))

# add text data to metadata (now numeric), get cluster labels
tfidf.fit(text)
kmeans = KMeans(n_clusters = 9, init = 'k-means++', random_state=42, n_init = 10).fit(csr_matrix(hstack([tfidf.transform(text)])))
label = kmeans.labels_
metadata['label'] = label

# 9. columns to be removed
to_remove = ['belongs_to_collection', 'homepage', 'imdb_id', 'original_title', 'overview',
             'poster_path','status','tagline','title','video', 'release_date', 'genres']
metadata.drop(labels = to_remove, axis = 1, inplace = True)

  metadata.fillna(False, inplace = True)


## Collaborative Filtering

In [5]:
## ratings from all users based on genre and production company
(test, train) = get_datasets(metadata)
genre_avg = 'genre_averages'
production_avg = 'production_averages'
genres = get_column_names(train, "^is_.*")
prefer_genres = ["prefer_" + genre[len("is_"):] for genre in genres]
productions = get_column_names(train, "^production_companies_.*")
prefer_productions = [production[len("production_companies_"):] + "_pref" for production in productions]

## dataframe of genre and production averages across entire dataset
genre_avg_rating = get_ratings(train, genre_avg, genres)
production_avg_rating = get_ratings(train, production_avg, productions)

  train.fillna(False, inplace = True)
  test.fillna(False, inplace = True)


In [6]:
## per user
user_genres_pref_df = None
user_production_pref_df = None
user_ids = np.unique(train['user_id'])

## loop to get movie information from movie id, which has different rating
for i in user_ids:
    df = train[train['user_id'] == i]
    user_genre_avg_rating = get_ratings(df, genre_avg, genres)
    user_production_avg_rating = get_ratings(df, production_avg, productions)
    
    # copying over genre row
    genre_prefs = pd.Series(
    np.where(user_genre_avg_rating[genre_avg] > 0,
             user_genre_avg_rating[genre_avg],
             genre_avg_rating[genre_avg]), index=prefer_genres
    )
    genre_prefs_list = pd.DataFrame([genre_prefs for i in range(df.shape[0])])
    
    # add to new updated dataframe
    genre_prefs_dict = {g: genre_prefs_list.get(g, 0) for g in prefer_genres}
    genre_df = pd.DataFrame(genre_prefs_dict, index=[0])
    genre_df['user_id'] = i
    
    # add to new dataframe for all users
    user_genres_pref_df = pd.concat([user_genres_pref_df, genre_df])

    # same with productions
    production_prefs = pd.Series(
    np.where(user_production_avg_rating[production_avg] > 0,
             user_production_avg_rating[production_avg],
             production_avg_rating[production_avg]),
    index=prefer_productions
    )
    production_prefs_list = pd.DataFrame([production_prefs for i in range(df.shape[0])])

    production_prefs_dict = {p: production_prefs_list.get(p, 0) for p in prefer_productions}
    production_df = pd.DataFrame(production_prefs_dict, index=[0])
    production_df['user_id'] = i
    
    # add to new dataframe for all users
    user_production_pref_df = pd.concat([user_production_pref_df, production_df])
    
## merge with the metadata (test/train partitioned)
train = pref_merge(pref_merge(train, user_genres_pref_df), user_production_pref_df)
test = pref_merge(pref_merge(test, user_genres_pref_df), user_production_pref_df)

## Run all the models

In [7]:
(best_pred, rmse) = run_models(test, train)

  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():


PERFORMANCE OF LINEAR REGRESSION
RMSE on testing set =  0.17826377617103392
(mean normalised) RMSE on testing set =  0.25234820992834733
(minmax normalised) RMSE on testing set =  0.1980708624122599




  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():


PERFORMANCE OF DECISION TREE
RMSE on testing set =  0.1789412023265318
(mean normalised) RMSE on testing set =  0.2533071668256505
(minmax normalised) RMSE on testing set =  0.19882355814059088




  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():


PERFORMANCE OF BAYESIAN DECISION TREE
RMSE on testing set =  0.17894208030150094
(mean normalised) RMSE on testing set =  0.25330840967720786
(minmax normalised) RMSE on testing set =  0.19882453366833436




  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():


PERFORMANCE OF RANDOM FOREST
RMSE on testing set =  0.17783907137567267
(mean normalised) RMSE on testing set =  0.2517470025649704
(minmax normalised) RMSE on testing set =  0.19759896819519185




  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():


PERFORMANCE OF BAYESIAN RANDOM FOREST
RMSE on testing set =  0.17612335902628973
(mean normalised) RMSE on testing set =  0.2493182593316657
(minmax normalised) RMSE on testing set =  0.1956926211403219




  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():


PERFORMANCE OF VANILLA GRADIENT BOOSTING
RMSE on testing set =  0.17225578134079417
(mean normalised) RMSE on testing set =  0.24384335956988107
(minmax normalised) RMSE on testing set =  0.1913953126008824




  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():


PERFORMANCE OF GRADIENT BOOSTING
RMSE on testing set =  0.17110115952634514
(mean normalised) RMSE on testing set =  0.24220888982914762
(minmax normalised) RMSE on testing set =  0.1901123994737168




  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():


PERFORMANCE OF BAYESIAN GRADIENT BOOSTING
RMSE on testing set =  0.1697073628206302
(mean normalised) RMSE on testing set =  0.24023584678447582
(minmax normalised) RMSE on testing set =  0.18856373646736688


BEST PERFORMING MODEL:
	BAYESIAN GRADIENT BOOSTING WITH RMSE 0.1697073628206302


  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():


## Submission

In [9]:
def prepare_submission(test_df, predictions):
    """
    Prepares the submission DataFrame.
    :param test_df: DataFrame containing the test data.
    :param predictions: Predictions to be included in the submission.
    :return: Submission DataFrame.
    """
    if not np.any(pd.isnull(predictions)):
        submission = pd.DataFrame(test_df[['userId_movieId']])
        submission['rating'] = predictions
        print(f"Submission shape: {submission.shape}, Best RMSE: {rmse}")
        return submission
    else:
        print("Predictions contain null values. Submission not created.")
        return None

submission = prepare_submission(test, best_pred)


Submission shape: (30002, 2), Best RMSE: 0.1697073628206302


In [10]:
submission

Unnamed: 0,userId_movieId,rating
0,469_2124,0.663765
1,439_3753,0.761249
2,522_1682,0.938248
3,429_1217,0.917605
4,71_1210,0.755397
...,...,...
29997,305_2599,0.696555
29998,22_2109,0.703634
29999,534_2947,0.810808
30000,558_4085,0.718926
