# Modeling

In [294]:
#Import Libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import collections
from sklearn import naive_bayes
from sklearn.model_selection import train_test_split
from ast import literal_eval
from scipy.sparse import csr_matrix
from sklearn.svm import LinearSVR

In [295]:
data = pd.read_excel('../data/news_and_scores.xlsx')
data.head()

Unnamed: 0,Player,Team,Position,Year,Week,Points,SeasonPoints,AvgSeasonPoints,AvgWkPoints,Diff_from_Avg,...,date,headline,name,news,team,news_clean,news_unigrams,news_bigrams,orig_unigrams,orig_bigrams
0,David Johnson,ARI,RB,2016,10,27.52,323.05,20.190625,21.536667,7.329375,...,2016-11-13 07:43:00,David Johnson rushed 19 times for 55 yards and...,David Johnson,\n Even in what could be classifie...,Cardinals,Even could classified underwhelming game self ...,"['even', 'could', 'classifi', 'underwhelm', 'g...","['both came', 'even could', 'rb play', 'big ca...","['even', 'could', 'classifi', 'underwhelm', 'g...","['35 rush', 'both came', ""cardin '"", 'midway t..."
1,David Johnson,ARI,RB,2016,13,25.62,323.05,20.190625,21.536667,5.429375,...,2016-12-04 07:44:00,David Johnson rushed 18 times for 84 yards and...,David Johnson,\n There is not much left to say a...,Cardinals,There much left say self self A dominant force...,"['much', 'left', 'say', 'self', 'self', 'domin...","['a domin', 'rb everi', 'the fact', 'there muc...","['much', 'left', 'say', 'david', 'johnson', '....","['a domin', 'rb1 everi', 'the fact', 'about da..."
2,David Johnson,ARI,RB,2016,14,25.23,323.05,20.190625,21.536667,5.039375,...,2016-12-11 05:45:00,David Johnson rushed 20 times for 80 yards and...,David Johnson,\n It was not the best performance...,Cardinals,It best performance self lost fumble early gam...,"['best', 'perform', 'self', 'lost', 'fumbl', '...","['dolphin defens', 'he also', 'it best', 'also...","['best', 'perform', 'johnson', ',', 'lost', 'f...","['- two', '100 total', 'dolphin defens', 'he a..."
3,David Johnson,ARI,RB,2016,15,25.78,323.05,20.190625,21.536667,5.589375,...,2016-12-18 08:18:00,David Johnson rushed 12 times for 53 yards and...,David Johnson,\n Johnson was somewhat quiet in t...,Cardinals,self somewhat quiet first half gaining yards ...,"['self', 'somewhat', 'quiet', 'first', 'half',...","['christma eve', 'eve toptwo', 'it might', 'ke...","['johnson', 'somewhat', 'quiet', 'first', 'hal...","['14 game', 'christma eve', 'it might', 'kerwy..."
4,David Johnson,ARI,RB,2016,12,24.09,323.05,20.190625,21.536667,3.899375,...,2016-11-28 02:24:00,David Johnson dislocated his finger in Sunday'...,David Johnson,\n Johnson checked out a few times...,Cardinals,self checked times second half Sunday ultimate...,"['self', 'check', 'time', 'second', 'half', 's...","['hell like', 'rb redskin', 'sunday ultim', 'c...","['johnson', 'check', 'time', 'second', 'half',...","[""' ll"", ', but', '- three', '161 yard', '21 t..."


In [297]:
data.columns

Index(['Player', 'Team', 'Position', 'Year', 'Week', 'Points', 'SeasonPoints',
       'AvgSeasonPoints', 'AvgWkPoints', 'Diff_from_Avg', 'Diff_from_WkAvg',
       'max_date', 'min_date', 'date', 'headline', 'name', 'news', 'team',
       'news_clean', 'news_unigrams', 'news_bigrams', 'orig_unigrams',
       'orig_bigrams'],
      dtype='object')

### Encode vocabulary

In [296]:
class Vocabulary(object):
    START_TOKEN = "<s>"
    END_TOKEN = "</s>"
    UNK_TOKEN = "<unk>"

    def __init__(self, tokens, size=None):
        self.unigram_counts = collections.Counter(tokens)
        self.num_unigrams = sum(self.unigram_counts.values())
        # leave space for "<s>", "</s>", and "<unk>"
        top_counts = self.unigram_counts.most_common(None if size is None else (size - 3))
        vocab = ([self.START_TOKEN, self.END_TOKEN, self.UNK_TOKEN] +
                 [w for w,c in top_counts])

        # Assign an id to each word, by frequency
        self.id_to_word = dict(enumerate(vocab))
        self.word_to_id = {v:k for k,v in self.id_to_word.items()}
        self.size = len(self.id_to_word)
        if size is not None:
            assert(self.size <= size)

        # For convenience
        self.wordset = set(self.word_to_id.keys())

        # Store special IDs
        self.START_ID = self.word_to_id[self.START_TOKEN]
        self.END_ID = self.word_to_id[self.END_TOKEN]
        self.UNK_ID = self.word_to_id[self.UNK_TOKEN]

    def words_to_ids(self, words):
        return [self.word_to_id.get(w, self.UNK_ID) for w in words]

    def ids_to_words(self, ids):
        return [self.id_to_word[i] for i in ids]

    def sentence_to_ids(self, words):
        return [self.START_ID] + self.words_to_ids(words) + [self.END_ID]

    def ordered_words(self):
        """Return a list of words, ordered by id."""
        return self.ids_to_words(range(self.size))

In [274]:
token_feed_unigram = (word for news in data['news_unigrams'] for word in literal_eval(news))
token_feed_bigram = (word for news in data['news_bigrams'] for word in literal_eval(news))

vocab_unigram = Vocabulary(token_feed_unigram)
vocab_bigram = Vocabulary(token_feed_bigram)

In [275]:
vocab_unigram.size

7978

In [276]:
vocab_bigram.size

118140

In [277]:
for word, count in vocab_unigram.unigram_counts.most_common(10):
    print("\"%s\": %d" % (word, count))

"self": 15193
"week": 7082
"yard": 4353
"game": 4334
"ownteam": 3069
"play": 3062
"touchdown": 2504
"first": 1822
"get": 1742
"pass": 1671


In [278]:
encoded_unigram = [collections.Counter(vocab_unigram.words_to_ids(x)) for x in data['news_unigrams']]
encoded_bigram = [collections.Counter(vocab_bigram.words_to_ids(x)) for x in data['news_bigrams']]

In [279]:
row = []
col = []
val = []
for x, count in enumerate(encoded_unigram):
    for y, z in count.items():
        row.append(x)
        col.append(y)
        val.append(z)
row = np.array(row)
col = np.array(col)
val = np.array(val)

In [280]:
encoded_unigram_sparse = csr_matrix((val, (row, col)), shape=(len(encoded_unigram), vocab_unigram.size))

In [281]:
row = []
col = []
val = []
for x, count in enumerate(encoded_bigram):
    for y, z in count.items():
        row.append(x)
        col.append(y)
        val.append(z)
row = np.array(row)
col = np.array(col)
val = np.array(val)

In [282]:
encoded_bigram_sparse = csr_matrix((val, (row, col)), shape=(len(encoded_bigram), vocab_bigram.size))

In [None]:
# encoded_unigram = pd.DataFrame([collections.Counter(vocab_unigram.words_to_ids(x)) for x in data['news_unigrams']], 
#                        columns = range(vocab_unigram.size))
# encoded_unigram.fillna(value= 0, inplace=True)
# encoded_bigram = pd.DataFrame([collections.Counter(vocab_bigram.words_to_ids(x)) for x in data['news_bigrams']], 
#                        columns = range(vocab_bigram.size))
# encoded_bigram.fillna(value= 0, inplace=True)

In [None]:
# encoded_unigram.fillna(value= 0, inplace=True)
# encoded_bigram.fillna(value= 0, inplace=True)

### Split into train and test

In [228]:
X_train_uni_points, X_test_uni_points, y_train_uni_points, y_test_uni_points = \
        train_test_split(encoded_unigram_sparse, data['Points'], test_size = 0.25)
X_train_bi_points, X_test_bi_points, y_train_bi_points, y_test_bi_points = \
        train_test_split(encoded_bigram_sparse, data['Points'], test_size = 0.25)

### SVM

In [229]:
#Raw unigrams score was 6.612167 points

#### Unigrams against points with preprocessing

In [230]:
SVM = LinearSVR(max_iter=20000, random_state = 42)
SVM.fit(X_train_uni_points, y_train_uni_points)

LinearSVR(C=1.0, dual=True, epsilon=0.0, fit_intercept=True,
     intercept_scaling=1.0, loss='epsilon_insensitive', max_iter=20000,
     random_state=42, tol=0.0001, verbose=0)

In [231]:
np.mean(np.square(SVM.predict(X_test_uni_points) - np.array(y_test_uni_points, dtype = np.float32)))**.5

6.8545310196534768

#### Bigrams against points with preprocessing

In [232]:
SVM = LinearSVR(max_iter=20000, random_state = 42)
SVM.fit(X_train_bi_points, y_train_bi_points)

LinearSVR(C=1.0, dual=True, epsilon=0.0, fit_intercept=True,
     intercept_scaling=1.0, loss='epsilon_insensitive', max_iter=20000,
     random_state=42, tol=0.0001, verbose=0)

In [233]:
np.mean(np.square(SVM.predict(X_test_bi_points) - np.array(y_test_bi_points, dtype = np.float32)))**.5

7.3142876885748294

#### Unigrams against avg points with preprocessing

In [234]:
X_train_uni_avg, X_test_uni_avg, y_train_uni_avg, y_test_uni_avg = \
        train_test_split(encoded_unigram_sparse, data['Diff_from_Avg'], test_size = 0.25)
X_train_bi_avg, X_test_bi_avg, y_train_bi_avg, y_test_bi_avg = \
        train_test_split(encoded_bigram_sparse, data['Diff_from_Avg'], test_size = 0.25)

In [235]:
SVM = LinearSVR(max_iter=20000, random_state = 42)
SVM.fit(X_train_uni_avg, y_train_uni_avg)

LinearSVR(C=1.0, dual=True, epsilon=0.0, fit_intercept=True,
     intercept_scaling=1.0, loss='epsilon_insensitive', max_iter=20000,
     random_state=42, tol=0.0001, verbose=0)

In [236]:
np.mean(np.square(SVM.predict(X_test_uni_avg) - np.array(y_test_uni_avg, dtype = np.float32)))**.5

5.9864342353352633

#### Bigrams against avg points with preprocessing

In [237]:
SVM = LinearSVR(max_iter=20000, random_state = 42)
SVM.fit(X_train_bi_avg, y_train_bi_avg)

LinearSVR(C=1.0, dual=True, epsilon=0.0, fit_intercept=True,
     intercept_scaling=1.0, loss='epsilon_insensitive', max_iter=20000,
     random_state=42, tol=0.0001, verbose=0)

In [238]:
np.mean(np.square(SVM.predict(X_test_bi_avg) - np.array(y_test_bi_avg, dtype = np.float32)))**.5

5.9645152896373732

#### Unigrams against weekly avg points with preprocessing

In [239]:
data.columns

Index(['Player', 'Team', 'Position', 'Year', 'Week', 'Points', 'SeasonPoints',
       'AvgSeasonPoints', 'AvgWkPoints', 'Diff_from_Avg', 'Diff_from_WkAvg',
       'max_date', 'min_date', 'date', 'headline', 'name', 'news', 'team',
       'news_clean', 'news_unigrams', 'news_bigrams', 'Ratio_from_Avg',
       'Ratio_from_WkAvg'],
      dtype='object')

In [240]:
X_train_uni_wavg, X_test_uni_wavg, y_train_uni_wavg, y_test_uni_wavg = train_test_split(
            encoded_unigram_sparse, data['Diff_from_WkAvg'], test_size = 0.25)
X_train_bi_wavg, X_test_bi_wavg, y_train_bi_wavg, y_test_bi_wavg = train_test_split(
            encoded_bigram_sparse, data['Diff_from_WkAvg'], test_size = 0.25)

In [241]:
SVM = LinearSVR(max_iter=20000, random_state = 42)
SVM.fit(X_train_uni_wavg, y_train_uni_wavg)

LinearSVR(C=1.0, dual=True, epsilon=0.0, fit_intercept=True,
     intercept_scaling=1.0, loss='epsilon_insensitive', max_iter=20000,
     random_state=42, tol=0.0001, verbose=0)

In [242]:
np.mean(np.square(SVM.predict(X_test_uni_wavg) - np.array(y_test_uni_wavg, dtype = np.float32)))**.5

5.5213611770729187

#### Bigrams against weekly avg points with preprocessing

In [243]:
SVM = LinearSVR(max_iter=10000, random_state = 42)
SVM.fit(X_train_bi_wavg, y_train_bi_wavg)

LinearSVR(C=1.0, dual=True, epsilon=0.0, fit_intercept=True,
     intercept_scaling=1.0, loss='epsilon_insensitive', max_iter=10000,
     random_state=42, tol=0.0001, verbose=0)

In [244]:
np.mean(np.square(SVM.predict(X_test_bi_wavg) - np.array(y_test_bi_wavg, dtype = np.float32)))**.5

5.6534714273003077

### Ratio from season average as a predictor

In [245]:
data['Ratio_from_Avg'] = data['Points'] / data['AvgSeasonPoints']
data['Ratio_from_WkAvg'] = data['Points'] / data['AvgWkPoints']

#### Unigrams

In [246]:
X_train_uni_ratio, X_test_uni_ratio, y_train_uni_ratio, y_test_uni_ratio = \
        train_test_split(encoded_unigram_sparse, data['Ratio_from_Avg'], test_size = 0.5)
X_train_bi_ratio, X_test_bi_ratio, y_train_bi_ratio, y_test_bi_ratio = \
        train_test_split(encoded_bigram_sparse, data['Ratio_from_Avg'], test_size = 0.5)

In [247]:
SVM = LinearSVR(max_iter=20000, random_state = 42)
SVM.fit(X_train_uni_ratio, y_train_uni_ratio)

LinearSVR(C=1.0, dual=True, epsilon=0.0, fit_intercept=True,
     intercept_scaling=1.0, loss='epsilon_insensitive', max_iter=20000,
     random_state=42, tol=0.0001, verbose=0)

In [248]:
np.mean(np.square(SVM.predict(X_test_uni_ratio) - np.array(y_test_uni_ratio, dtype = np.float32)))**.5

2.0704991428840449

#### Bigrams

In [249]:
SVM = LinearSVR(max_iter=20000, random_state = 42)
SVM.fit(X_train_bi_ratio, y_train_bi_ratio)

LinearSVR(C=1.0, dual=True, epsilon=0.0, fit_intercept=True,
     intercept_scaling=1.0, loss='epsilon_insensitive', max_iter=20000,
     random_state=42, tol=0.0001, verbose=0)

In [250]:
np.mean(np.square(SVM.predict(X_test_bi_ratio) - np.array(y_test_bi_ratio, dtype = np.float32)))**.5

2.0523903426339332

### Ratio from Weekly Average as a predictor

In [251]:
X_train_uni_wkratio, X_test_uni_wkratio, y_train_uni_wkratio, y_test_uni_wkratio = \
        train_test_split(encoded_unigram_sparse, data['Ratio_from_WkAvg'], test_size = 0.25)
X_train_bi_wkratio, X_test_bi_wkratio, y_train_bi_wkratio, y_test_bi_wkratio = \
        train_test_split(encoded_bigram_sparse, data['Ratio_from_WkAvg'], test_size = 0.25)

#### Unigrams

In [252]:
SVM = LinearSVR(max_iter=20000, random_state = 42)
SVM.fit(X_train_uni_wkratio, y_train_uni_wkratio)

LinearSVR(C=1.0, dual=True, epsilon=0.0, fit_intercept=True,
     intercept_scaling=1.0, loss='epsilon_insensitive', max_iter=20000,
     random_state=42, tol=0.0001, verbose=0)

In [253]:
np.mean(np.square(SVM.predict(X_test_uni_wkratio) - np.array(y_test_uni_wkratio, dtype = np.float32)))**.5

0.7262377173674246

#### Bigrams

In [254]:
SVM = LinearSVR(max_iter=20000, random_state = 42)
SVM.fit(X_train_bi_wkratio, y_train_bi_wkratio)

LinearSVR(C=1.0, dual=True, epsilon=0.0, fit_intercept=True,
     intercept_scaling=1.0, loss='epsilon_insensitive', max_iter=20000,
     random_state=42, tol=0.0001, verbose=0)

In [255]:
np.mean(np.square(SVM.predict(X_test_bi_wkratio) - np.array(y_test_bi_wkratio, dtype = np.float32)))**.5

0.74001396145068854

In [256]:
SVM.predict(X_test_bi_wkratio)

array([ 0.8775351,  0.8775351,  0.8775351, ...,  0.8775351,  0.8775351,
        0.8775351])

### Encode n-grams as present (instead of count)

In [283]:
row = []
col = []
val = []
for x, count in enumerate(encoded_unigram):
    for y, z in count.items():
        row.append(x)
        col.append(y)
        val.append(1)
row = np.array(row)
col = np.array(col)
val = np.array(val)

In [284]:
encoded_unigram_sparse = csr_matrix((val, (row, col)), shape=(len(encoded_unigram), vocab_unigram.size))

In [285]:
row = []
col = []
val = []
for x, count in enumerate(encoded_bigram):
    for y, z in count.items():
        row.append(x)
        col.append(y)
        val.append(1)
row = np.array(row)
col = np.array(col)
val = np.array(val)

In [286]:
encoded_bigram_sparse = csr_matrix((val, (row, col)), shape=(len(encoded_bigram), vocab_bigram.size))

In [287]:
encoded_unigram_sparse.toarray()

array([[0, 0, 1, ..., 0, 0, 0],
       [0, 0, 1, ..., 0, 0, 0],
       [0, 0, 1, ..., 0, 0, 0],
       ..., 
       [0, 0, 1, ..., 0, 0, 0],
       [0, 0, 1, ..., 0, 0, 0],
       [0, 0, 1, ..., 0, 0, 0]], dtype=int32)

Rerun above code to replicate with boolean indicators instead of counts

### Cross Validation

In [261]:
from sklearn.model_selection import GridSearchCV

In [288]:
SVM = LinearSVR()
params = {}
grid = GridSearchCV(SVM, params)
grid.fit(encoded_unigram_sparse, data['Diff_from_WkAvg'])

GridSearchCV(cv=None, error_score='raise',
       estimator=LinearSVR(C=1.0, dual=True, epsilon=0.0, fit_intercept=True,
     intercept_scaling=1.0, loss='epsilon_insensitive', max_iter=1000,
     random_state=None, tol=0.0001, verbose=0),
       fit_params=None, iid=True, n_jobs=1, param_grid={},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [289]:
SVM = LinearSVR()
params = {'C': [1e3, 5e3, 1e4, 5e4, 1e5],
        'epsilon': [0.0001, 0.0005, 0.001, 0.005, 0.01, 0.1], }
grid = GridSearchCV(SVM, params, cv=5)
grid.fit(encoded_unigram_sparse, data['Diff_from_WkAvg'])

GridSearchCV(cv=5, error_score='raise',
       estimator=LinearSVR(C=1.0, dual=True, epsilon=0.0, fit_intercept=True,
     intercept_scaling=1.0, loss='epsilon_insensitive', max_iter=1000,
     random_state=None, tol=0.0001, verbose=0),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'C': [1000.0, 5000.0, 10000.0, 50000.0, 100000.0], 'epsilon': [0.0001, 0.0005, 0.001, 0.005, 0.01, 0.1]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [290]:
grid.best_score_

-0.31786175116647236

In [291]:
SVM = LinearSVR(max_iter=5000)
params = {'C': [1e3, 5e3, 1e4, 5e4, 1e5],
        'epsilon': [0.0001, 0.0005, 0.001, 0.005, 0.01, 0.1], }
grid2 = GridSearchCV(SVM, params, cv=5)
grid2.fit(encoded_bigram_sparse, data['Diff_from_WkAvg'])

GridSearchCV(cv=5, error_score='raise',
       estimator=LinearSVR(C=1.0, dual=True, epsilon=0.0, fit_intercept=True,
     intercept_scaling=1.0, loss='epsilon_insensitive', max_iter=5000,
     random_state=None, tol=0.0001, verbose=0),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'C': [1000.0, 5000.0, 10000.0, 50000.0, 100000.0], 'epsilon': [0.0001, 0.0005, 0.001, 0.005, 0.01, 0.1]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [292]:
grid2.best_score_

-0.015749675526160627

In [293]:
grid2.best_params_

{'C': 1000.0, 'epsilon': 0.0005}

# Final Models

In [298]:
#tokenize
token_feed_unigram = (word for news in data['news_unigrams'] for word in literal_eval(news))
token_feed_bigram = (word for news in data['news_bigrams'] for word in literal_eval(news))
vocab_unigram = Vocabulary(token_feed_unigram)
vocab_bigram = Vocabulary(token_feed_bigram)

token_feed_unigram_orig = (word for news in data['orig_unigrams'] for word in literal_eval(news))
token_feed_bigram_orig = (word for news in data['orig_bigrams'] for word in literal_eval(news))
vocab_unigram_orig = Vocabulary(token_feed_unigram_orig)
vocab_bigram_orig = Vocabulary(token_feed_bigram_orig)

In [299]:
#encoding
encoded_unigram = [collections.Counter(vocab_unigram.words_to_ids(x)) for x in data['news_unigrams']]
encoded_bigram = [collections.Counter(vocab_bigram.words_to_ids(x)) for x in data['news_bigrams']]
encoded_unigram_orig = [collections.Counter(vocab_unigram.words_to_ids(x)) for x in data['orig_unigrams']]
encoded_bigram_orig = [collections.Counter(vocab_bigram.words_to_ids(x)) for x in data['orig_bigrams']]

In [300]:
#Sparse matrix
#Processed Unigram
row = []
col = []
val = []
for x, count in enumerate(encoded_unigram):
    for y, z in count.items():
        row.append(x)
        col.append(y)
        val.append(z)
row = np.array(row)
col = np.array(col)
val = np.array(val)
encoded_unigram_sparse = csr_matrix((val, (row, col)), shape=(len(encoded_unigram), vocab_unigram.size))
#Processed Bigram
row = []
col = []
val = []
for x, count in enumerate(encoded_bigram):
    for y, z in count.items():
        row.append(x)
        col.append(y)
        val.append(z)
row = np.array(row)
col = np.array(col)
val = np.array(val)
encoded_bigram_sparse = csr_matrix((val, (row, col)), shape=(len(encoded_bigram), vocab_bigram.size))

#Original Unigram
row = []
col = []
val = []
for x, count in enumerate(encoded_unigram_orig):
    for y, z in count.items():
        row.append(x)
        col.append(y)
        val.append(z)
row = np.array(row)
col = np.array(col)
val = np.array(val)
encoded_unigram_orig_sparse = csr_matrix((val, (row, col)), shape=(len(encoded_unigram_orig), vocab_unigram_orig.size))

#Original Bigram
row = []
col = []
val = []
for x, count in enumerate(encoded_bigram_orig):
    for y, z in count.items():
        row.append(x)
        col.append(y)
        val.append(z)
row = np.array(row)
col = np.array(col)
val = np.array(val)
encoded_bigram_orig_sparse = csr_matrix((val, (row, col)), shape=(len(encoded_bigram_orig), vocab_bigram_orig.size))

In [314]:
#Unigram Counts No preprocessing
SVM = LinearSVR(max_iter=5000)
params = {'C': [1e3, 5e3, 1e4, 5e4, 1e5],
        'epsilon': [0.0001, 0.0005, 0.001, 0.005, 0.01, 0.1], }
grid = GridSearchCV(SVM, params, cv=5)
grid.fit(encoded_unigram_orig_sparse, data['Diff_from_WkAvg'])

GridSearchCV(cv=5, error_score='raise',
       estimator=LinearSVR(C=1.0, dual=True, epsilon=0.0, fit_intercept=True,
     intercept_scaling=1.0, loss='epsilon_insensitive', max_iter=5000,
     random_state=None, tol=0.0001, verbose=0),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'C': [1000.0, 5000.0, 10000.0, 50000.0, 100000.0], 'epsilon': [0.0001, 0.0005, 0.001, 0.005, 0.01, 0.1]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [311]:
#Brigram Counts No Preprocessing
SVM = LinearSVR(max_iter=5000)
params = {'C': [1e3, 5e3, 1e4, 5e4, 1e5],
        'epsilon': [0.0001, 0.0005, 0.001, 0.005, 0.01, 0.1], }
grid2 = GridSearchCV(SVM, params, cv=5)
grid2.fit(encoded_bigram_orig_sparse, data['Diff_from_WkAvg'])

GridSearchCV(cv=5, error_score='raise',
       estimator=LinearSVR(C=1.0, dual=True, epsilon=0.0, fit_intercept=True,
     intercept_scaling=1.0, loss='epsilon_insensitive', max_iter=5000,
     random_state=None, tol=0.0001, verbose=0),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'C': [1000.0, 5000.0, 10000.0, 50000.0, 100000.0], 'epsilon': [0.0001, 0.0005, 0.001, 0.005, 0.01, 0.1]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [312]:
#Unigram Counts With preprocessing
SVM = LinearSVR(max_iter=5000)
params = {'C': [1e3, 5e3, 1e4, 5e4, 1e5],
        'epsilon': [0.0001, 0.0005, 0.001, 0.005, 0.01, 0.1], }
grid3 = GridSearchCV(SVM, params, cv=5)
grid3.fit(encoded_unigram_sparse, data['Diff_from_WkAvg'])

GridSearchCV(cv=5, error_score='raise',
       estimator=LinearSVR(C=1.0, dual=True, epsilon=0.0, fit_intercept=True,
     intercept_scaling=1.0, loss='epsilon_insensitive', max_iter=5000,
     random_state=None, tol=0.0001, verbose=0),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'C': [1000.0, 5000.0, 10000.0, 50000.0, 100000.0], 'epsilon': [0.0001, 0.0005, 0.001, 0.005, 0.01, 0.1]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [313]:
#Brigram Counts With Preprocessing
SVM = LinearSVR(max_iter=5000)
params = {'C': [1e3, 5e3, 1e4, 5e4, 1e5],
        'epsilon': [0.0001, 0.0005, 0.001, 0.005, 0.01, 0.1], }
grid4 = GridSearchCV(SVM, params, cv=5)
grid4.fit(encoded_bigram_sparse, data['Diff_from_WkAvg'])

GridSearchCV(cv=5, error_score='raise',
       estimator=LinearSVR(C=1.0, dual=True, epsilon=0.0, fit_intercept=True,
     intercept_scaling=1.0, loss='epsilon_insensitive', max_iter=5000,
     random_state=None, tol=0.0001, verbose=0),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'C': [1000.0, 5000.0, 10000.0, 50000.0, 100000.0], 'epsilon': [0.0001, 0.0005, 0.001, 0.005, 0.01, 0.1]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [306]:
#Reencode all for presence instead of counts
#Processed Unigram
row = []
col = []
val = []
for x, count in enumerate(encoded_unigram):
    for y, z in count.items():
        row.append(x)
        col.append(y)
        val.append(1)
row = np.array(row)
col = np.array(col)
val = np.array(val)
encoded_unigram_sparse = csr_matrix((val, (row, col)), shape=(len(encoded_unigram), vocab_unigram.size))
#Processed Bigram
row = []
col = []
val = []
for x, count in enumerate(encoded_bigram):
    for y, z in count.items():
        row.append(x)
        col.append(y)
        val.append(1)
row = np.array(row)
col = np.array(col)
val = np.array(val)
encoded_bigram_sparse = csr_matrix((val, (row, col)), shape=(len(encoded_bigram), vocab_bigram.size))

#Original Unigram
row = []
col = []
val = []
for x, count in enumerate(encoded_unigram_orig):
    for y, z in count.items():
        row.append(x)
        col.append(y)
        val.append(1)
row = np.array(row)
col = np.array(col)
val = np.array(val)
encoded_unigram_orig_sparse = csr_matrix((val, (row, col)), shape=(len(encoded_unigram_orig), vocab_unigram_orig.size))

#Original Bigram
row = []
col = []
val = []
for x, count in enumerate(encoded_bigram_orig):
    for y, z in count.items():
        row.append(x)
        col.append(y)
        val.append(1)
row = np.array(row)
col = np.array(col)
val = np.array(val)
encoded_bigram_orig_sparse = csr_matrix((val, (row, col)), shape=(len(encoded_bigram_orig), vocab_bigram_orig.size))

In [315]:
#Unigram Presence No preprocessing
SVM = LinearSVR(max_iter=5000)
params = {'C': [1e3, 5e3, 1e4, 5e4, 1e5],
        'epsilon': [0.0001, 0.0005, 0.001, 0.005, 0.01, 0.1], }
grid5 = GridSearchCV(SVM, params, cv=5)
grid5.fit(encoded_unigram_orig_sparse, data['Diff_from_WkAvg'])

GridSearchCV(cv=5, error_score='raise',
       estimator=LinearSVR(C=1.0, dual=True, epsilon=0.0, fit_intercept=True,
     intercept_scaling=1.0, loss='epsilon_insensitive', max_iter=5000,
     random_state=None, tol=0.0001, verbose=0),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'C': [1000.0, 5000.0, 10000.0, 50000.0, 100000.0], 'epsilon': [0.0001, 0.0005, 0.001, 0.005, 0.01, 0.1]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [316]:
#Brigram Presence No Preprocessing
SVM = LinearSVR(max_iter=5000)
params = {'C': [1e3, 5e3, 1e4, 5e4, 1e5],
        'epsilon': [0.0001, 0.0005, 0.001, 0.005, 0.01, 0.1], }
grid6 = GridSearchCV(SVM, params, cv=5)
grid6.fit(encoded_bigram_orig_sparse, data['Diff_from_WkAvg'])

GridSearchCV(cv=5, error_score='raise',
       estimator=LinearSVR(C=1.0, dual=True, epsilon=0.0, fit_intercept=True,
     intercept_scaling=1.0, loss='epsilon_insensitive', max_iter=5000,
     random_state=None, tol=0.0001, verbose=0),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'C': [1000.0, 5000.0, 10000.0, 50000.0, 100000.0], 'epsilon': [0.0001, 0.0005, 0.001, 0.005, 0.01, 0.1]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [317]:
#Unigram Presence With preprocessing
SVM = LinearSVR(max_iter=5000)
params = {'C': [1e3, 5e3, 1e4, 5e4, 1e5],
        'epsilon': [0.0001, 0.0005, 0.001, 0.005, 0.01, 0.1], }
grid7 = GridSearchCV(SVM, params, cv=5)
grid7.fit(encoded_unigram_sparse, data['Diff_from_WkAvg'])

GridSearchCV(cv=5, error_score='raise',
       estimator=LinearSVR(C=1.0, dual=True, epsilon=0.0, fit_intercept=True,
     intercept_scaling=1.0, loss='epsilon_insensitive', max_iter=5000,
     random_state=None, tol=0.0001, verbose=0),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'C': [1000.0, 5000.0, 10000.0, 50000.0, 100000.0], 'epsilon': [0.0001, 0.0005, 0.001, 0.005, 0.01, 0.1]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [318]:
#Brigram Presence With Preprocessing
SVM = LinearSVR(max_iter=5000)
params = {'C': [1e3, 5e3, 1e4, 5e4, 1e5],
        'epsilon': [0.0001, 0.0005, 0.001, 0.005, 0.01, 0.1], }
grid8 = GridSearchCV(SVM, params, cv=5)
grid8.fit(encoded_bigram_sparse, data['Diff_from_WkAvg'])

GridSearchCV(cv=5, error_score='raise',
       estimator=LinearSVR(C=1.0, dual=True, epsilon=0.0, fit_intercept=True,
     intercept_scaling=1.0, loss='epsilon_insensitive', max_iter=5000,
     random_state=None, tol=0.0001, verbose=0),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'C': [1000.0, 5000.0, 10000.0, 50000.0, 100000.0], 'epsilon': [0.0001, 0.0005, 0.001, 0.005, 0.01, 0.1]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [325]:
np.mean(np.square(grid.predict(encoded_unigram_orig_sparse) - np.array(data['Diff_from_WkAvg'], dtype=np.float32)))

2.3514815422224595

In [326]:
np.mean(np.square(grid2.predict(encoded_bigram_orig_sparse) - np.array(data['Diff_from_WkAvg'], dtype=np.float32)))

2.2581432159521442

In [327]:
np.mean(np.square(grid3.predict(encoded_unigram_sparse) - np.array(data['Diff_from_WkAvg'], dtype=np.float32)))

2.7073729211807604

In [328]:
np.mean(np.square(grid4.predict(encoded_bigram_sparse) - np.array(data['Diff_from_WkAvg'], dtype=np.float32)))

2.2604581541548598

In [329]:
np.mean(np.square(grid5.predict(encoded_unigram_orig_sparse) - np.array(data['Diff_from_WkAvg'], dtype=np.float32)))

2.315361947454214

In [330]:
np.mean(np.square(grid6.predict(encoded_bigram_orig_sparse) - np.array(data['Diff_from_WkAvg'], dtype=np.float32)))

2.2609544423649139

In [331]:
np.mean(np.square(grid7.predict(encoded_unigram_sparse) - np.array(data['Diff_from_WkAvg'], dtype=np.float32)))

2.5472200281412976

In [332]:
np.mean(np.square(grid8.predict(encoded_bigram_sparse) - np.array(data['Diff_from_WkAvg'], dtype=np.float32)))

2.2689219898415121

### Naive Bayes

### CNN