### Refactor

### Single latent factor

In [None]:
class Hybrid(Model):
    """Latent-factor score model trained with batch gradient descent.

    Each season score is approximated as
    ``baseline + skater_factors @ event_factors``. After ``fit`` the learned
    values are available as ``skater_scores`` and ``event_scores`` (DataFrames
    with one column per factor, sorted by factor 0 in descending order) and
    ``baseline`` (a scalar).
    """

    def __init__(self, alpha, n_factors, lambda_reg=0):
        """Store the hyper-parameters.

        Args:
            alpha: Gradient-descent step size.
            n_factors: Number of latent factors per skater and per event.
            lambda_reg: Coefficient of the penalty term added to the skater
                and event gradients (the baseline is not penalised).
        """
        self.lambda_reg = lambda_reg
        self.alpha = alpha
        self.n_factors = n_factors

    def predict_season_scores(self, season_scores):
        """Return the fitted model's prediction for every row of ``season_scores``.

        Args:
            season_scores: DataFrame with ``name`` and ``event`` columns whose
                (name, event) pairs are looked up in the fitted score table.

        Returns:
            1-D array of predicted scores in the row order of ``season_scores``.
        """
        predicted_score_table = pd.DataFrame(
            self.skater_scores.values @ self.event_scores.values.T + self.baseline,
            index=self.skater_scores.index,
            columns=self.event_scores.index,
        )

        # Stack to a (name, event)-indexed Series so each row can be looked up by pair.
        predicted_score_stacked = predicted_score_table.stack()
        season_skater_event_index = season_scores.set_index(['name', 'event']).index
        return predicted_score_stacked.loc[season_skater_event_index].values

    def fit(self, season_scores, n_iter, seed=42, verbose=False):
        """Learn the baseline, skater factors and event factors from season scores.

        Args:
            season_scores: DataFrame with ``name``, ``event`` and ``score`` columns.
            n_iter: Number of gradient-descent iterations.
            seed: Seed for the random initialisation of all parameters.
            verbose: When true, print the final RMSE and how much it changed
                during the last iteration.
        """
        season_pivot = pd.pivot_table(season_scores[['name', 'event', 'score']],
                                      values='score', index='name', columns='event')
        skater_names = list(season_pivot.index)
        event_names = list(season_pivot.columns)
        # Skater x event matrix; NaN where a skater has no score for an event.
        true_scores = season_pivot.values

        random_state = np.random.RandomState(seed=seed)
        self.skater_scores = random_state.random_sample((len(skater_names), self.n_factors))
        self.event_scores = random_state.random_sample((self.n_factors, len(event_names)))
        self.baseline = random_state.random_sample()

        for i in range(n_iter):
            predicted_scores = self.skater_scores @ self.event_scores + self.baseline
            diff = predicted_scores - true_scores

            # `diff` is not recomputed between factors, so this sum is the same
            # for every k; nansum skips the missing skater/event pairs.
            baseline_gradient = np.nansum(diff)

            for k in range(self.n_factors):
                # Fancy indexing copies, so both gradients below are computed
                # from the values held before this factor's update.
                skater_score_k = self.skater_scores[:, [k]]
                event_score_k = self.event_scores[[k], :]

                skater_gradients = (np.nansum(diff * event_score_k, axis=1, keepdims=True)
                                    + self.lambda_reg * skater_score_k)
                event_gradients = (np.nansum(diff * skater_score_k, axis=0, keepdims=True)
                                   + self.lambda_reg * event_score_k)

                # NOTE(review): the baseline takes one step per factor, i.e.
                # n_factors steps per iteration with the same gradient. Kept
                # as-is to preserve fitted values; confirm this is intended.
                self.baseline = self.baseline - self.alpha * baseline_gradient
                self.skater_scores[:, [k]] = skater_score_k - self.alpha * skater_gradients
                self.event_scores[[k], :] = event_score_k - self.alpha * event_gradients

            # Report the RMSE change over the final iteration, only on request.
            if verbose and i == (n_iter - 1):
                rmse_old = np.sqrt(np.nanmean(diff**2))
                diff = self.skater_scores @ self.event_scores + self.baseline - true_scores
                rmse_new = np.sqrt(np.nanmean(diff**2))
                print(f'Alpha: {self.alpha}, Lambda: {self.lambda_reg}, Iter: {n_iter}, Last RMSE: {round(rmse_new, 2)}, Delta RMSE: {round(rmse_new - rmse_old, 10)}')

        self.skater_scores = pd.DataFrame(self.skater_scores, index=skater_names)
        # Transpose so events are rows, mirroring the skater table's layout.
        self.event_scores = pd.DataFrame(self.event_scores.T, index=event_names)

        self.skater_scores.sort_values(by=0, ascending=False, inplace=True)
        self.event_scores.sort_values(by=0, ascending=False, inplace=True)

    def evaluate_rmse_over_years(self, years, season_df, world_df, **kwargs):
        """Refit the model on each year and collect the RMSE on that year's season scores.

        Args:
            years: Years to evaluate.
            season_df, world_df: Passed through to ``get_yearly_scores``.
            **kwargs: Forwarded to ``fit`` (e.g. ``n_iter``, ``seed``).

        Returns:
            DataFrame with ``year`` and ``rmse`` columns, sorted by year.
        """
        rmses = []
        for year in years:
            # Only the season scores are needed here; world scores are ignored.
            season_scores, _ = get_yearly_scores(year, season_df, world_df)
            self.fit(season_scores, **kwargs)
            # evaluate_rmse is not defined in this class; presumably inherited from Model.
            rmses.append(self.evaluate_rmse(season_scores))
        return pd.DataFrame({'year': years, 'rmse': rmses}).sort_values(by='year')

Train over all years in training set

In [None]:
# Single-factor hybrid model (lambda_reg=10), evaluated on every training year.
hybrid = Hybrid(lambda_reg=10, n_factors=1, alpha=0.001)
hybrid_train_eval = hybrid.evaluate_over_years(
    train_years, season_train, world_train, seed=42, n_iter=1000
)
hybrid_train_eval

Regularization

In [None]:
# Sweep the regularisation strength; print the mean 'rmse' and 'tau' over the training years.
for lambda_reg in (0, 0.1, 1, 2, 5, 10):
    hybrid = Hybrid(alpha=0.001, n_factors=1, lambda_reg=lambda_reg)
    hybrid_train_eval = hybrid.evaluate_over_years(
        train_years, season_train, world_train, seed=42, n_iter=1000
    )
    print(
        lambda_reg,
        hybrid_train_eval['rmse'].mean(),
        hybrid_train_eval['tau'].mean(),
    )

In [None]:
# Single-factor hybrid model with lambda_reg=1.
hybrid = Hybrid(lambda_reg=1, n_factors=1, alpha=0.001)

# NOTE(review): `season_scores` is not assigned in this cell, so this fit uses
# whatever the name is bound to when the cell runs -- confirm it is intended.
hybrid.fit(season_scores, seed=42, n_iter=1000)

hybrid_train_eval = hybrid.evaluate_over_years(
    train_years, season_train, world_train, seed=42, n_iter=1000
)
hybrid_train_eval

In [None]:
# Compare lambda_reg=0 and lambda_reg=1 on the test years via mean 'rmse' and 'tau'.
for lambda_reg in (0, 1):
    hybrid = Hybrid(alpha=0.001, n_factors=1, lambda_reg=lambda_reg)
    hybrid_test_eval = hybrid.evaluate_over_years(
        test_years, season_test, world_test, seed=42, n_iter=1000
    )
    print(
        lambda_reg,
        hybrid_test_eval['rmse'].mean(),
        hybrid_test_eval['tau'].mean(),
    )

In [None]:
# Mean 'tau' of the AverageScore model over the test years, for comparison.
avg = AverageScore()
avg_test_eval = avg.evaluate_over_years(
    test_years,
    season_test,
    world_test,
)
avg_test_eval['tau'].mean()

### Multiple latent factors

In [None]:
# Split the training years into two folds: five years drawn without
# replacement (seeded, so reproducible) and all remaining years.
random_state = np.random.RandomState(seed=42)
f1_years = list(
    random_state.choice(train_years, size=5, replace=False)
)
f2_years = [
    year
    for year in train_years
    if year not in f1_years
]

In [None]:
def get_season_and_world_scores(scores, years=None):
    """Split a combined score table into per-year season and world scores.

    Args:
        scores: DataFrame with ``year``, ``event``, ``name`` and ``score``
            columns. Rows whose event is ``'WR'`` are treated as world scores;
            every other row is a season score.
        years: Iterable of years to extract. Defaults to 2005 through 2019.

    Returns:
        Tuple ``(all_season_scores, all_world_scores)`` of dicts keyed by year.
        Season values are DataFrame copies of the matching rows; world values
        are Series of scores indexed by skater name.
    """
    if years is None:
        years = range(2005, 2020)

    is_world = scores['event'] == 'WR'
    all_season_scores = {}
    all_world_scores = {}

    for year in years:
        in_year = scores['year'] == year
        all_season_scores[year] = scores.loc[in_year & ~is_world].copy()

        world_rows = scores.loc[in_year & is_world, ['name', 'score']]
        # Select the column explicitly instead of squeeze(): squeeze() would
        # collapse a single-skater year to a bare scalar rather than a Series.
        all_world_scores[year] = world_rows.set_index('name')['score']

    return all_season_scores, all_world_scores

In [None]:
# Per-year dicts of season scores and world scores built from male_scores.
all_season_scores, all_world_scores = get_season_and_world_scores(male_scores)

In [None]:
# For every training year: fit a 5-factor hybrid model, standardise each
# latent factor across skaters, keep the skaters that appear in the world
# scores (in world-score index order), and record the factor differences of
# every skater pair.
all_normalized_scores = {}
all_pair_diffs = {}

for year in train_years:
    season_scores = all_season_scores[year]
    world_scores = all_world_scores[year]

    hybrid = Hybrid(lambda_reg=10, n_factors=5, alpha=0.001)
    hybrid.fit(season_scores, seed=42, n_iter=1000)
    hybrid_scores = hybrid.skater_scores

    # Column-wise z-score of the latent factors.
    factor_means = hybrid_scores.mean(axis=0)
    factor_stds = hybrid_scores.std(axis=0)
    normalized_scores = (hybrid_scores - factor_means) / factor_stds
    # Skaters missing from the fitted model become NaN rows and are dropped.
    normalized_scores = normalized_scores.reindex(world_scores.index).dropna()
    all_normalized_scores[year] = normalized_scores

    pair_diffs = np.array([
        first - second
        for first, second in combinations(normalized_scores.values, 2)
    ])
    all_pair_diffs[year] = pair_diffs

In [None]:
# Stack the pairwise factor differences of the f2 years into one design matrix.
# np.vstack needs a real sequence: passing a generator is deprecated and is
# rejected by current NumPy releases.
X_train = np.vstack([all_pair_diffs[year] for year in f2_years])
# Every pair carries the same label, 1.
y_train = np.full(len(X_train), 1)
n_coefs = X_train.shape[1]

In [None]:
# Fit the logistic model on the pairwise differences, starting every coefficient at 0.5.
log = BatchLogistic(
    theta=np.full(n_coefs, 0.5),
    alpha=0.001,
    lambda_reg=10,
)
log.fit(X_train, y_train, n_iter=1000)
log.theta

In [None]:
# Kendall tau on the f1 years: weight each skater's normalised factors by the
# fitted coefficients and compare the resulting ranking with the world ranking.
f1_taus = []
for year in f1_years:
    print(year)
    normalized_scores = all_normalized_scores[year]
    combined_scores = pd.Series(
        normalized_scores @ log.theta,
        index=normalized_scores.index,
    )
    combined_ranking, world_ranking = return_ranking(
        combined_scores, all_world_scores[year]
    )
    f1_taus.append(calculate_kendall_tau(combined_ranking, world_ranking))

print(np.mean(f1_taus))

In [None]:
# Same evaluation on the f2 years (the years the logistic model was fitted on).
f2_taus = []
for year in f2_years:
    print(year)
    normalized_scores = all_normalized_scores[year]
    combined_scores = pd.Series(
        normalized_scores @ log.theta,
        index=normalized_scores.index,
    )
    combined_ranking, world_ranking = return_ranking(
        combined_scores, all_world_scores[year]
    )
    f2_taus.append(calculate_kendall_tau(combined_ranking, world_ranking))

print(np.mean(f2_taus))

### Refactoring

In [None]:
class HybridLog:
    """Two-stage ranker: Hybrid latent factors combined by a logistic model.

    ``fit_hybrid`` fits one ``Hybrid`` model per training year and stores the
    pairwise differences of the normalised skater factors. ``fit_log`` fits a
    ``BatchLogistic`` model on those differences. ``predict`` weights a
    season's normalised factors by the learned coefficients.
    """

    def __init__(self, n_factors, hybrid_lambda,
                 hybrid_alpha=0.001, hybrid_iter=1000, hybrid_seed=42,
                 log_alpha=0.001, log_iter=1000):
        """Store the hyper-parameters of both stages.

        Args:
            n_factors: Number of latent factors of the Hybrid model.
            hybrid_lambda: ``lambda_reg`` passed to Hybrid.
            hybrid_alpha: ``alpha`` passed to Hybrid.
            hybrid_iter: ``n_iter`` passed to ``Hybrid.fit``.
            hybrid_seed: ``seed`` passed to ``Hybrid.fit``.
            log_alpha: ``alpha`` passed to BatchLogistic.
            log_iter: ``n_iter`` passed to ``BatchLogistic.fit``.
        """
        self.n_factors = n_factors
        self.hybrid_lambda = hybrid_lambda
        self.hybrid_alpha = hybrid_alpha
        self.hybrid_iter = hybrid_iter
        self.hybrid_seed = hybrid_seed
        self.log_alpha = log_alpha
        self.log_iter = log_iter

    def _normalized_skater_factors(self, season_scores):
        """Fit a fresh Hybrid model on one season and return its column-standardised skater factors."""
        hybrid = Hybrid(alpha=self.hybrid_alpha, n_factors=self.n_factors, lambda_reg=self.hybrid_lambda)
        hybrid.fit(season_scores, n_iter=self.hybrid_iter, seed=self.hybrid_seed)
        hybrid_skater_scores = hybrid.skater_scores
        return (hybrid_skater_scores - hybrid_skater_scores.mean(axis=0)) / hybrid_skater_scores.std(axis=0)

    def fit_hybrid(self, season_df, world_df, train_years):
        """Build the logistic-regression training set from the given years.

        Sets ``self.X_train`` (stacked pairwise factor differences) and
        ``self.y_train`` (all ones, one label per pair).

        Args:
            season_df, world_df: Passed through to ``get_yearly_scores``.
            train_years: Years whose skaters contribute training pairs.
        """
        # Train hybrid model on each training year to get latent factor values
        all_pair_diffs = {}
        for year in train_years:
            season_scores, world_scores = get_yearly_scores(year, season_df, world_df)
            normalized_scores = self._normalized_skater_factors(season_scores)
            # Keep only skaters with a world score, in world-score index order.
            normalized_scores = normalized_scores.reindex(world_scores.index).dropna()

            pair_diffs = np.array([skater1 - skater2
                                   for skater1, skater2 in combinations(normalized_scores.values, 2)])
            # With fewer than two skaters the array is 1-D and empty; force the
            # (n_pairs, n_factors) shape so such a year stacks as zero rows.
            all_pair_diffs[year] = pair_diffs.reshape(-1, self.n_factors)

        # Train logistic regression on pairwise differences of latent factor values.
        # A list is passed because np.vstack no longer accepts a generator.
        self.X_train = np.vstack([all_pair_diffs[year] for year in train_years])
        self.y_train = np.full(len(self.X_train), 1)

    def fit_log(self, log_lambda):
        """Fit the logistic model on the stored training set and keep its coefficients.

        Requires ``fit_hybrid`` to have been called first. Sets ``self.log_coefs``.

        Args:
            log_lambda: ``lambda_reg`` passed to BatchLogistic.
        """
        log = BatchLogistic(theta=np.full(self.n_factors, 0.5), alpha=self.log_alpha,
                            lambda_reg=log_lambda)
        log.fit(self.X_train, self.y_train, n_iter=self.log_iter)
        self.log_coefs = log.theta

    def predict(self, season_scores):
        """Score every skater of a season with the learned factor weights.

        Requires ``fit_log`` to have been called first.

        Args:
            season_scores: Season score table accepted by ``Hybrid.fit``.

        Returns:
            Series of combined scores indexed by skater, sorted descending.
        """
        normalized_scores = self._normalized_skater_factors(season_scores)
        combined_scores = pd.Series(normalized_scores @ self.log_coefs, index=normalized_scores.index)
        combined_scores.sort_values(ascending=False, inplace=True)
        return combined_scores


In [None]:
def average_kendall_tau(hybridlog, years, season_df, world_df):
    """Return the mean Kendall tau of ``hybridlog``'s predicted rankings over ``years``.

    For each year the season scores are passed to ``hybridlog.predict`` and the
    resulting ranking is compared with that year's world ranking.
    """
    def tau_for(year):
        season_scores, world_scores = get_yearly_scores(year, season_df, world_df)
        predicted_scores = hybridlog.predict(season_scores)
        predicted_ranking, world_ranking = return_ranking(predicted_scores, world_scores)
        return calculate_kendall_tau(predicted_ranking, world_ranking, verbose=False)

    return np.mean([tau_for(year) for year in years])


def get_tau_train_val(season_df, world_df, train_years, val_years,
                      hybridlog, log_lambda):
    """Fit the logistic stage with ``log_lambda`` and score both year sets.

    Returns:
        Tuple ``(avg_tau_train, avg_tau_val)``: the average Kendall tau over
        ``train_years`` and over ``val_years``, in that order.
    """
    hybridlog.fit_log(log_lambda)
    return tuple(
        average_kendall_tau(hybridlog, years, season_df, world_df)
        for years in (train_years, val_years)
    )

In [None]:
# Two-fold grid search over (n_factor, hybrid_lambda, log_lambda).
# One entry is appended to every list per grid point, so all columns of the
# results table have the same length.
n_factors = []
hybrid_lambdas = []
log_lambdas = []
f1_tau_trains = []
f1_tau_vals = []
f2_tau_trains = []
f2_tau_vals = []

n_iter = 1000
for n_factor in [1]:
    for hybrid_lambda in [10]:
        # Train hybrid models on each fold
        hybridlog1 = HybridLog(n_factors=n_factor, hybrid_lambda=hybrid_lambda,
                               hybrid_alpha=0.001, hybrid_iter=n_iter, hybrid_seed=42,
                               log_alpha=0.001, log_iter=n_iter)
        hybridlog1.fit_hybrid(season_male, world_male, f1_years)

        hybridlog2 = HybridLog(n_factors=n_factor, hybrid_lambda=hybrid_lambda,
                               hybrid_alpha=0.001, hybrid_iter=n_iter, hybrid_seed=42,
                               log_alpha=0.001, log_iter=n_iter)
        hybridlog2.fit_hybrid(season_male, world_male, f2_years)

        # Train log models on each fold and evaluate kendall tau.
        # This loop must sit inside the hybrid_lambda loop so every log_lambda
        # is evaluated against the hybrid models fitted for this grid point.
        for log_lambda in [10]:
            print(f'n_factor: {n_factor}, hybrid_lambda: {hybrid_lambda}, log_lambda: {log_lambda}')
            n_factors.append(n_factor)
            hybrid_lambdas.append(hybrid_lambda)
            log_lambdas.append(log_lambda)

            # Each fold trains on its own years and validates on the other fold's.
            f1_tau_train, f1_tau_val = get_tau_train_val(season_male, world_male,
                                                         f1_years, f2_years,
                                                         hybridlog1, log_lambda)
            f2_tau_train, f2_tau_val = get_tau_train_val(season_male, world_male,
                                                         f2_years, f1_years,
                                                         hybridlog2, log_lambda)
            print(f'f1_train: {f1_tau_train}, f1_val: {f1_tau_val}, f2_train: {f2_tau_train}, f2_val: {f2_tau_val}')
            f1_tau_trains.append(f1_tau_train)
            f1_tau_vals.append(f1_tau_val)
            f2_tau_trains.append(f2_tau_train)
            f2_tau_vals.append(f2_tau_val)

In [None]:
# One row per grid point: hyper-parameters plus train/validation tau of both folds.
comparisons = pd.DataFrame({
    'n_factor': n_factors,
    'hybrid_lambda': hybrid_lambdas,
    'log_lambda': log_lambdas,
    'f1_tau_train': f1_tau_trains,
    'f1_tau_val': f1_tau_vals,
    'f2_tau_train': f2_tau_trains,
    'f2_tau_val': f2_tau_vals,
})
comparisons

In [None]:
# Mean 'tau' of hybrid_train_eval restricted to the years in f2_years.
hybrid_train_eval.loc[hybrid_train_eval['year'].isin(f2_years), 'tau'].mean()