Disable feature scaling for current-year prediction and dump the predicted rankings

ihaque · Aug 25, 2013 · 167bff4 · 167bff4
1 parent c2fc7d4
commit 167bff4
Show file tree

Hide file tree

Showing 3 changed files with 34 additions and 28 deletions.
diff --git a/constants.py b/constants.py
@@ -1,4 +1,4 @@
-TOP_N = 50
+TOP_N = 100
 BASE_YEAR = 2013
 
 # The logic should handle most trades properly, but in cases where there are

diff --git a/main.py b/main.py
@@ -5,13 +5,12 @@
 from constants import ID
 from constants import TOP_N
 from constants import SPECIAL_CASE_TRADES
-from evaluation import compare_predictions
 from evaluation import pos_rank_row_to_str
 from evaluation import position_ranking_lists
 from parser import load_files
 from prediction import construct_feature_matrix
 from prediction import cross_validate
-from prediction import predict_scores
+from prediction import predict_current_year
 
 
 logging.getLogger().setLevel(logging.ERROR)
@@ -46,20 +45,20 @@ def id_to_useful_name(id):
         return (any_year['Name'], any_year['Tm'],
                 any_year['FantasyFantPos'])
 
+    current_players = set(id for id in id2year2stats if BASE_YEAR - 1 in
+                          id2year2stats[id])
+
     matrix, identifiers, features = construct_feature_matrix(id2year2stats)
     id2name = {ident[ID]: id_to_useful_name(ident[ID]) for ident in
                identifiers}
 
     from sklearn import linear_model
     from sklearn import ensemble
     from sklearn import svm
-    #model = linear_model.LinearRegression()
-    #model = linear_model.Lasso(max_iter=100000)
-    model = ensemble.RandomForestRegressor()
-    #model = ensemble.GradientBoostingRegressor()
 
     seed = randint(0, 2**32 - 1)
     for model in [linear_model.LinearRegression(),
+                  linear_model.Ridge(),
                   ensemble.RandomForestRegressor(),
                   ensemble.ExtraTreesRegressor(),
                   ensemble.AdaBoostRegressor(),
@@ -69,20 +68,21 @@ def id_to_useful_name(id):
                   ]:
         print str(model).split('(')[0]
         cross_validate(matrix, identifiers, features, id2name, model,
-                       n_folds=5, seed=seed)
+                       n_folds=10, seed=seed)
         print
 
-    return
-    past_scores, past_predictions, current_predictions, current_ids = \
-        predict_scores(matrix, identifiers, features, model)
+    model = ensemble.RandomForestRegressor()
+    current_predictions, current_ids = \
+        predict_current_year(matrix, identifiers, features, id2name, model)
+
+    current_predictions, current_ids = zip(
+        *[(pred, ident) for pred, ident
+          in zip(current_predictions, current_ids)
+          if ident[ID] in current_players])
 
-    past_ranks = position_ranking_lists(identifiers, past_scores, id2name)
-    past_predicted_ranks = position_ranking_lists(
-        identifiers, past_predictions, id2name)
     current_predicted_ranks = position_ranking_lists(
         current_ids, current_predictions, id2name)
 
-    compare_predictions(past_ranks, past_predicted_ranks)
     dump_predictions(current_predicted_ranks)
 
     return

diff --git a/prediction.py b/prediction.py
@@ -5,7 +5,6 @@
 
 from numpy import array
 from numpy import empty
-from numpy import mean
 from numpy import nan
 from sklearn.cross_validation import KFold
 from sklearn.preprocessing import Imputer
@@ -218,20 +217,18 @@ def get_features_objective(_matrix):
         y = _matrix[:, objective_index]
         return X, y
 
-    taus_by_deltapos = {}
     accum_test_identifiers = []
     accum_test_scores = []
     accum_test_preds = []
     for fold, (train_index, test_index) in \
             enumerate(KFold(n=matrix.shape[0], n_folds=n_folds, shuffle=True,
                             random_state=seed)):
         imputer = Imputer()
-        scaler = StandardScaler()
+        scaler = StandardScaler()  # Need to standardize for eg SVR
         train_matrix = matrix[train_index, :]
         test_matrix = matrix[test_index, :]
         imputer.fit(train_matrix)
-        train_imputed = imputer.transform(train_matrix)
-        train_imputed = scaler.fit_transform(train_imputed)
+        train_imputed = scaler.fit_transform(imputer.transform(train_matrix))
         test_imputed = scaler.transform(imputer.transform(test_matrix))
 
         X_train, y_train = get_features_objective(train_imputed)
@@ -249,25 +246,34 @@ def get_features_objective(_matrix):
     pos_ranks_pred = position_ranking_lists(
         accum_test_identifiers, accum_test_preds, id2name)
     taus = compute_taus(pos_ranks_true, pos_ranks_pred)
-    for deltapos, tauvals in taus.iteritems():
-        taus_by_deltapos.setdefault(deltapos, []).append(tauvals)
-    for deltapos in sorted(taus_by_deltapos, key=lambda x: (x[1], x[0])):
-        print deltapos, taus_by_deltapos[deltapos]
+    for deltapos in sorted(taus, key=lambda x: (x[1], x[0])):
+        print deltapos, taus[deltapos]
 
     return
 
 
-predict_scores = None
-
 def predict_current_year(matrix, identifiers, features, id2name, model):
     imputed_matrix = Imputer().fit_transform(matrix)
+    #scaled_matrix = StandardScaler().fit_transform(imputed_matrix)
+    scaled_matrix = imputed_matrix
+    feature_cols = [idx for idx, (feat, delta) in enumerate(features)
+                    if delta != 0]
+    objective_index = features.index(('fantasy_points', 0))
+
+    def get_features_objective(_matrix):
+        X = _matrix[:, feature_cols]
+        y = _matrix[:, objective_index]
+        return X, y
+
+    X_train, y_train = get_features_objective(scaled_matrix)
+    model.fit(X_train, y_train)
 
     # Now, take the delta=1 rows (containing all our data) and make delta=0
     # rows by incrementing the delta indices for tracked stats and incrementing
     # age. This will be used with the trained model to predict this year.
     delta_1_indices = [idx for idx, ident in enumerate(identifiers)
                        if ident[DELTA] == 1]
-    delta_1_rows = imputed_matrix[delta_1_indices, :]
+    delta_1_rows = scaled_matrix[delta_1_indices, :]
     delta_1_dicts = (dict(zip(features, row)) for row in delta_1_rows)
 
     def shift_delta(feature_dict):
@@ -296,4 +302,4 @@ def shift_delta(feature_dict):
         current_year_idents.append(copy(identifiers[idx]))
         current_year_idents[-1][DELTA] = 0
 
-    return Y, y_pred, current_year_predictions, current_year_idents
+    return current_year_predictions, current_year_idents