diff --git a/constants.py b/constants.py index 1610592..e5ac4c1 100644 --- a/constants.py +++ b/constants.py @@ -1,4 +1,4 @@ -TOP_N = 50 +TOP_N = 100 BASE_YEAR = 2013 # The logic should handle most trades properly, but in cases where there are diff --git a/main.py b/main.py index fbcf4fe..2becd21 100644 --- a/main.py +++ b/main.py @@ -5,13 +5,12 @@ from constants import ID from constants import TOP_N from constants import SPECIAL_CASE_TRADES -from evaluation import compare_predictions from evaluation import pos_rank_row_to_str from evaluation import position_ranking_lists from parser import load_files from prediction import construct_feature_matrix from prediction import cross_validate -from prediction import predict_scores +from prediction import predict_current_year logging.getLogger().setLevel(logging.ERROR) @@ -46,6 +45,9 @@ def id_to_useful_name(id): return (any_year['Name'], any_year['Tm'], any_year['FantasyFantPos']) + current_players = set(id for id in id2year2stats if BASE_YEAR - 1 in + id2year2stats[id]) + matrix, identifiers, features = construct_feature_matrix(id2year2stats) id2name = {ident[ID]: id_to_useful_name(ident[ID]) for ident in identifiers} @@ -53,13 +55,10 @@ def id_to_useful_name(id): from sklearn import linear_model from sklearn import ensemble from sklearn import svm - #model = linear_model.LinearRegression() - #model = linear_model.Lasso(max_iter=100000) - model = ensemble.RandomForestRegressor() - #model = ensemble.GradientBoostingRegressor() seed = randint(0, 2**32 - 1) for model in [linear_model.LinearRegression(), + linear_model.Ridge(), ensemble.RandomForestRegressor(), ensemble.ExtraTreesRegressor(), ensemble.AdaBoostRegressor(), @@ -69,20 +68,21 @@ def id_to_useful_name(id): ]: print str(model).split('(')[0] cross_validate(matrix, identifiers, features, id2name, model, - n_folds=5, seed=seed) + n_folds=10, seed=seed) print - return - past_scores, past_predictions, current_predictions, current_ids = \ - predict_scores(matrix, identifiers, features, model) + model = ensemble.RandomForestRegressor() + current_predictions, current_ids = \ + predict_current_year(matrix, identifiers, features, id2name, model) + + current_predictions, current_ids = zip( + *[(pred, ident) for pred, ident + in zip(current_predictions, current_ids) + if ident[ID] in current_players]) - past_ranks = position_ranking_lists(identifiers, past_scores, id2name) - past_predicted_ranks = position_ranking_lists( - identifiers, past_predictions, id2name) current_predicted_ranks = position_ranking_lists( current_ids, current_predictions, id2name) - compare_predictions(past_ranks, past_predicted_ranks) dump_predictions(current_predicted_ranks) return diff --git a/prediction.py b/prediction.py index 212802b..310f449 100644 --- a/prediction.py +++ b/prediction.py @@ -5,7 +5,6 @@ from numpy import array from numpy import empty -from numpy import mean from numpy import nan from sklearn.cross_validation import KFold from sklearn.preprocessing import Imputer @@ -218,7 +217,6 @@ def get_features_objective(_matrix): y = _matrix[:, objective_index] return X, y - taus_by_deltapos = {} accum_test_identifiers = [] accum_test_scores = [] accum_test_preds = [] @@ -226,12 +224,11 @@ def get_features_objective(_matrix): enumerate(KFold(n=matrix.shape[0], n_folds=n_folds, shuffle=True, random_state=seed)): imputer = Imputer() - scaler = StandardScaler() + scaler = StandardScaler() # Need to standardize for eg SVR train_matrix = matrix[train_index, :] test_matrix = matrix[test_index, :] imputer.fit(train_matrix) - train_imputed = imputer.transform(train_matrix) - train_imputed = scaler.fit_transform(train_imputed) + train_imputed = scaler.fit_transform(imputer.transform(train_matrix)) test_imputed = scaler.transform(imputer.transform(test_matrix)) X_train, y_train = get_features_objective(train_imputed) @@ -249,25 +246,34 @@ def get_features_objective(_matrix): pos_ranks_pred = position_ranking_lists( accum_test_identifiers, accum_test_preds, id2name) taus = compute_taus(pos_ranks_true, pos_ranks_pred) - for deltapos, tauvals in taus.iteritems(): - taus_by_deltapos.setdefault(deltapos, []).append(tauvals) - for deltapos in sorted(taus_by_deltapos, key=lambda x: (x[1], x[0])): - print deltapos, taus_by_deltapos[deltapos] + for deltapos in sorted(taus, key=lambda x: (x[1], x[0])): + print deltapos, taus[deltapos] return -predict_scores = None - def predict_current_year(matrix, identifiers, features, id2name, model): imputed_matrix = Imputer().fit_transform(matrix) + #scaled_matrix = StandardScaler().fit_transform(imputed_matrix) + scaled_matrix = imputed_matrix + feature_cols = [idx for idx, (feat, delta) in enumerate(features) + if delta != 0] + objective_index = features.index(('fantasy_points', 0)) + + def get_features_objective(_matrix): + X = _matrix[:, feature_cols] + y = _matrix[:, objective_index] + return X, y + + X_train, y_train = get_features_objective(scaled_matrix) + model.fit(X_train, y_train) # Now, take the delta=1 rows (containing all our data) and make delta=0 # rows by incrementing the delta indices for tracked stats and incrementing # age. This will be used with the trained model to predict this year. delta_1_indices = [idx for idx, ident in enumerate(identifiers) if ident[DELTA] == 1] - delta_1_rows = imputed_matrix[delta_1_indices, :] + delta_1_rows = scaled_matrix[delta_1_indices, :] delta_1_dicts = (dict(zip(features, row)) for row in delta_1_rows) def shift_delta(feature_dict): @@ -296,4 +302,4 @@ def shift_delta(feature_dict): current_year_idents.append(copy(identifiers[idx])) current_year_idents[-1][DELTA] = 0 - return Y, y_pred, current_year_predictions, current_year_idents + return current_year_predictions, current_year_idents