In [1]:
from pathlib import Path

import numpy as np
import pandas as pd
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from xgboost import XGBRegressor

In [2]:
# Load Data
def load_data(data_dir='.'):
    """Read the rating splits and the movie/user side tables into DataFrames.

    Parameters
    ----------
    data_dir : str or path-like, default '.'
        Directory that holds the seven ``.dat`` files. The default keeps the
        previous behaviour of reading from the current working directory.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(train, test, movie_genres, movie_directors, movie_actors,
        movie_tags, user_taggedmovies)``, in that order.

    Raises
    ------
    FileNotFoundError
        If one of the expected files is not present in ``data_dir``.
    """
    # (file name, field separator), listed in the order callers unpack them.
    # The rating splits and the user tag log are space-delimited; the movie
    # side tables are tab-delimited.
    sources = (
        ('train.dat', ' '),
        ('test.dat', ' '),
        ('movie_genres.dat', '\t'),
        ('movie_directors.dat', '\t'),
        ('movie_actors.dat', '\t'),
        ('movie_tags.dat', '\t'),
        ('user_taggedmovies.dat', ' '),
    )
    base = Path(data_dir)
    # latin1 is used for every file, as in the original per-file calls.
    return tuple(
        pd.read_csv(base / file_name, sep=separator, encoding='latin1')
        for file_name, separator in sources
    )


In [3]:
# Preprocessing: Merge Data and Create Features
def preprocess_data(train, test, movie_genres, movie_directors, movie_actors, movie_tags, user_taggedmovies):
    """Attach movie-level and user-level side features to the rating frames.

    Movie features (keyed on ``movieID``):
      * ``genre``        - all genres of the movie joined with commas
      * ``directorID``   - one director per movie (first listed)
      * ``tagWeight``    - mean ``tagWeight`` over the movie's tags
      * ``actorRanking`` - mean ``ranking`` over the movie's actors

    User feature (keyed on ``userID``):
      * ``uniqueTags``   - number of distinct ``tagID`` values the user applied

    Every join is a left join from the rating frame, so ``train`` and ``test``
    keep exactly their input rows and order; movies or users with no side
    information get NaN. Only movies present in ``movie_genres`` receive any
    movie-level feature, because that table seeds the feature set.

    Parameters
    ----------
    train, test : pandas.DataFrame
        Rating frames with at least ``userID`` and ``movieID`` columns.
    movie_genres, movie_directors, movie_actors, movie_tags, user_taggedmovies : pandas.DataFrame
        Side tables providing the columns named above.

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        New ``train`` and ``test`` frames with the feature columns appended.
        The inputs are not modified.
    """
    # One row per movie: genres collapsed into a single comma-separated string.
    movie_features = movie_genres.groupby('movieID')['genre'].agg(','.join).reset_index()

    # Keep a single director per movie. Without this, a movie listed with
    # several directors would yield several feature rows and the left joins
    # below would duplicate rating rows, so predictions would no longer line
    # up 1:1 with the rows of `test`.
    directors = movie_directors[['movieID', 'directorID']].drop_duplicates(subset='movieID', keep='first')
    movie_features = movie_features.merge(directors, on='movieID', how='left')

    mean_tag_weight = movie_tags.groupby('movieID')['tagWeight'].mean().reset_index()
    movie_features = movie_features.merge(mean_tag_weight, on='movieID', how='left')

    # Add actor ranking (average for each movie).
    # NOTE(review): the frame printed after this step in the notebook shows
    # NaN for actorRanking in every displayed row - confirm that
    # movie_actors' movieID / ranking columns actually line up with the
    # rating tables.
    actor_ranking = movie_actors.groupby('movieID')['ranking'].mean().reset_index()
    actor_ranking.columns = ['movieID', 'actorRanking']
    movie_features = movie_features.merge(actor_ranking, on='movieID', how='left')

    # User-level features: tags assigned to movies by users.
    user_features = user_taggedmovies.groupby('userID')['tagID'].nunique().reset_index()
    user_features.columns = ['userID', 'uniqueTags']

    # validate='many_to_one' makes pandas raise MergeError if a feature table
    # ever stops being unique on its key, instead of silently adding rows.
    train = train.merge(movie_features, on='movieID', how='left', validate='many_to_one')
    test = test.merge(movie_features, on='movieID', how='left', validate='many_to_one')
    train = train.merge(user_features, on='userID', how='left', validate='many_to_one')
    test = test.merge(user_features, on='userID', how='left', validate='many_to_one')

    return train, test

In [4]:
def feature_engineering(train, test):
    """Add per-user and per-movie mean-rating features to both frames.

    ``userAvgRating`` and ``movieAvgRating`` are computed from ``train`` only
    and left-joined onto ``train`` and ``test``. Users or movies in ``test``
    that never appear in ``train`` receive the global mean training rating.

    NOTE(review): each training row's own rating contributes to its
    ``userAvgRating`` / ``movieAvgRating``, so any validation split carved out
    of the returned ``train`` frame sees target-derived features and will
    score optimistically. Consider out-of-fold means if an honest validation
    estimate is needed.

    Parameters
    ----------
    train : pandas.DataFrame
        Must contain ``userID``, ``movieID`` and ``rating``.
    test : pandas.DataFrame
        Must contain ``userID`` and ``movieID``.

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        New ``train`` and ``test`` frames with the two columns appended.
        The inputs are not modified.
    """
    # Fallback for users / movies that are unseen in training.
    global_avg_rating = train['rating'].mean()

    # Average user and movie ratings.
    user_avg_rating = train.groupby('userID')['rating'].mean().reset_index()
    user_avg_rating.columns = ['userID', 'userAvgRating']
    movie_avg_rating = train.groupby('movieID')['rating'].mean().reset_index()
    movie_avg_rating.columns = ['movieID', 'movieAvgRating']

    # Merge these features into train and test.
    train = train.merge(user_avg_rating, on='userID', how='left')
    train = train.merge(movie_avg_rating, on='movieID', how='left')
    test = test.merge(user_avg_rating, on='userID', how='left')
    test = test.merge(movie_avg_rating, on='movieID', how='left')

    # Assign the filled column back instead of calling
    # `test[col].fillna(..., inplace=True)`: that chained form is deprecated
    # in pandas 2.x and leaves the frame untouched under Copy-on-Write.
    test['userAvgRating'] = test['userAvgRating'].fillna(global_avg_rating)
    test['movieAvgRating'] = test['movieAvgRating'].fillna(global_avg_rating)

    return train, test


In [53]:
# Model Training and Prediction
def train_and_predict(train, test):
    """Fit an XGBoost regressor on the numeric features and predict ``test``.

    The features are every numeric column of ``train`` except ``rating``;
    non-numeric columns are ignored. 20% of ``train`` is held out (fixed seed)
    to report a validation RMSE, which is printed. The model that produces
    the test predictions is the one fitted on the remaining 80%.

    Parameters
    ----------
    train : pandas.DataFrame
        Feature frame that includes the ``rating`` target.
    test : pandas.DataFrame
        Frame to score; must contain every feature column used for training.
        Extra columns are ignored.

    Returns
    -------
    numpy.ndarray
        One prediction per row of ``test``, in row order, limited to the
        rating range observed in the fitted training split.

    Raises
    ------
    ValueError
        If ``test`` lacks one of the training feature columns.
    """
    # Prepare features and target.
    X = train.drop(['rating'], axis=1).select_dtypes(include=np.number)
    y = train['rating']

    # Select the test features by the training column list rather than by
    # dtype, so an extra numeric column in `test` (or a different column
    # order) cannot produce a feature set that disagrees with the fitted one.
    missing = [col for col in X.columns if col not in test.columns]
    if missing:
        raise ValueError(f'test is missing feature columns: {missing}')
    test_X = test[X.columns]

    # Train-test split for evaluation.
    X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

    # Train the model.
    model = XGBRegressor(objective='reg:squarederror', n_estimators=200, max_depth=6, learning_rate=0.5, reg_alpha=1, reg_lambda=100, random_state=42)
    model.fit(X_train, y_train)

    # A regressor can overshoot the rating scale; pulling predictions back
    # into the observed range can only reduce squared error for in-range
    # targets. Plain Python floats keep the prediction dtype unchanged.
    lowest, highest = float(y_train.min()), float(y_train.max())

    # Validate the model.
    val_preds = np.clip(model.predict(X_val), lowest, highest)
    rmse = np.sqrt(mean_squared_error(y_val, val_preds))
    print(f'Validation RMSE: {rmse}')

    # Predict on the test set.
    test_preds = np.clip(model.predict(test_X), lowest, highest)
    return test_preds

In [6]:
# Read the two rating splits and the five side tables from disk.
train, test, movie_genres, movie_directors, movie_actors, movie_tags, user_taggedmovies = load_data()

In [7]:
# Join the movie-level and user-level side features onto both rating frames.
# `train` / `test` are rebound here, so the raw frames are only recoverable
# by re-running the load cell.
train, test = preprocess_data(train, test, movie_genres, movie_directors, movie_actors, movie_tags, user_taggedmovies)

In [8]:
# Display the training frame after the side-feature joins.
train

Unnamed: 0,userID,movieID,rating,genre,directorID,tagWeight,actorRanking,uniqueTags
0,75,3,1.0,"Comedy,Romance",donald_petrie,1.166667,,1
1,75,32,4.5,"Sci-Fi,Thriller",siddharth_randeria,2.588235,,1
2,75,110,4.0,"Action,Drama,War",mel_gibson,1.854545,,1
3,75,163,4.0,"Action,Romance,Thriller",robert_rodriguez,1.583333,,1
4,75,165,4.5,"Action,Crime,Thriller",john_mctiernan,2.125000,,1
...,...,...,...,...,...,...,...,...
641694,71534,42900,4.0,"Comedy,Crime,Drama,Thriller",roman_polanski,1.000000,,30
641695,71534,44555,4.0,Drama,florian_henckeldonnersmarck,1.600000,,30
641696,71534,46578,4.0,"Comedy,Drama",1167865-jonathan_dayton,1.848485,,30
641697,71534,61075,5.0,"Drama,Romance",isabel_coixet,1.000000,,30


In [9]:
# Add the userAvgRating / movieAvgRating columns.
# This cell rebinds `train` / `test`, so it is not safe to run twice in a row:
# a second run would merge the average columns onto frames that already have
# them. Re-run from the load cell instead.
train, test = feature_engineering(train, test)

In [10]:
# Display the training frame with the two average-rating columns appended.
train

Unnamed: 0,userID,movieID,rating,genre,directorID,tagWeight,actorRanking,uniqueTags,userAvgRating,movieAvgRating
0,75,3,1.0,"Comedy,Romance",donald_petrie,1.166667,,1,3.464286,2.851955
1,75,32,4.5,"Sci-Fi,Thriller",siddharth_randeria,2.588235,,1,3.464286,4.012994
2,75,110,4.0,"Action,Drama,War",mel_gibson,1.854545,,1,3.464286,3.843983
3,75,163,4.0,"Action,Romance,Thriller",robert_rodriguez,1.583333,,1,3.464286,3.362500
4,75,165,4.5,"Action,Crime,Thriller",john_mctiernan,2.125000,,1,3.464286,3.371262
...,...,...,...,...,...,...,...,...,...,...
641694,71534,42900,4.0,"Comedy,Crime,Drama,Thriller",roman_polanski,1.000000,,30,4.148438,4.035714
641695,71534,44555,4.0,Drama,florian_henckeldonnersmarck,1.600000,,30,4.148438,4.236842
641696,71534,46578,4.0,"Comedy,Drama",1167865-jonathan_dayton,1.848485,,30,4.148438,3.971681
641697,71534,61075,5.0,"Drama,Romance",isabel_coixet,1.000000,,30,4.148438,3.681818


In [54]:
# Fit the model (prints the validation RMSE) and score the test frame.
test_preds = train_and_predict(train, test)

Validation RMSE: 0.7818326753747543


In [55]:
# Save predictions
# One prediction per line, in the order of `test_preds`.
output_path = 'example_entry.dat'
with open(output_path, 'w') as f:
    f.writelines(f"{pred}\n" for pred in test_preds)
# Report the path that was actually written; the message previously named a
# different file ('predictions.dat') from the one opened above.
print(f"Predictions saved to '{output_path}'.")


Predictions saved to 'predictions.dat'.
