# Modeling

In [66]:
#Import Libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import collections
from sklearn import naive_bayes
from sklearn.model_selection import train_test_split
from ast import literal_eval
from scipy.sparse import csr_matrix
from sklearn.svm import LinearSVR

In [2]:
data = pd.read_excel('../data/news_and_scores.xlsx')
data.head()

Unnamed: 0,Player,Team,Position,Year,Week,Points,SeasonPoints,AvgSeasonPoints,AvgWkPoints,Diff_from_Avg,...,max_date,min_date,date,headline,name,news,team,news_clean,news_unigrams,news_bigrams
0,Drew Brees,NO,QB,2016,1,37.7,338.5,19.911765,21.15625,17.788235,...,2016-09-15,2016-09-08,2016-09-11 04:33:00,Drew Brees completed 28-of-42 passes for 424 y...,Drew Brees,\n Aside from an early lost fumble...,Saints,Aside early lost fumble self played quarterbac...,"['asid', 'earli', 'lost', 'fumbl', 'self', 'pl...","['asid earli', 'the highlight', 'bumbl defens'..."
1,Drew Brees,NO,QB,2016,1,37.7,338.5,19.911765,21.15625,17.788235,...,2016-09-15,2016-09-08,2016-09-08 11:56:00,Drew Brees got a $30 million signing bonus on ...,Drew Brees,\n That bonus will be spread out a...,Saints,That bonus spread across next five years ownte...,"['bonu', 'spread', 'across', 'next', 'five', '...","['that bonu', 'the entir', 'across next', 'bas..."
2,Drew Brees,NO,QB,2016,2,14.5,338.5,19.911765,21.15625,-5.411765,...,2016-09-22,2016-09-15,2016-09-18 05:32:00,Drew Brees completed 29-of-44 passes for 263 y...,Drew Brees,\n Brees got off to a very slow st...,Saints,self got slow start averaging five yards per a...,"['self', 'got', 'slow', 'start', 'averag', 'fi...","['atlanta next', 'giant put', 'he didnt', 'it ..."
3,Drew Brees,NO,QB,2016,3,25.9,338.5,19.911765,21.15625,5.988235,...,2016-09-29,2016-09-22,2016-09-27 12:01:00,Drew Brees completed 36-of-54 passes for 376 y...,Drew Brees,"\n Unfortunately for the Saints, t...",Saints,Unfortunately ownteam defense couldnt stop Fal...,"['unfortun', 'ownteam', 'defens', 'couldnt', '...","['cobi fleener', 'diego week', 'fleener two', ..."
4,Drew Brees,NO,QB,2016,4,11.8,338.5,19.911765,21.15625,-8.111765,...,2016-10-06,2016-09-29,2016-10-02 08:25:00,Drew Brees completed 23-of-36 passes for 206 y...,Drew Brees,\n It was not a vintage performanc...,Saints,It vintage performance self He struggled accur...,"['vintag', 'perform', 'self', 'struggl', 'accu...","['charger gift', 'it vintag', 'panther home', ..."


### Encode vocabulary

In [3]:
class Vocabulary(object):
    START_TOKEN = "<s>"
    END_TOKEN = "</s>"
    UNK_TOKEN = "<unk>"

    def __init__(self, tokens, size=None):
        self.unigram_counts = collections.Counter(tokens)
        self.num_unigrams = sum(self.unigram_counts.values())
        # leave space for "<s>", "</s>", and "<unk>"
        top_counts = self.unigram_counts.most_common(None if size is None else (size - 3))
        vocab = ([self.START_TOKEN, self.END_TOKEN, self.UNK_TOKEN] +
                 [w for w,c in top_counts])

        # Assign an id to each word, by frequency
        self.id_to_word = dict(enumerate(vocab))
        self.word_to_id = {v:k for k,v in self.id_to_word.items()}
        self.size = len(self.id_to_word)
        if size is not None:
            assert(self.size <= size)

        # For convenience
        self.wordset = set(self.word_to_id.keys())

        # Store special IDs
        self.START_ID = self.word_to_id[self.START_TOKEN]
        self.END_ID = self.word_to_id[self.END_TOKEN]
        self.UNK_ID = self.word_to_id[self.UNK_TOKEN]

    def words_to_ids(self, words):
        return [self.word_to_id.get(w, self.UNK_ID) for w in words]

    def ids_to_words(self, ids):
        return [self.id_to_word[i] for i in ids]

    def sentence_to_ids(self, words):
        return [self.START_ID] + self.words_to_ids(words) + [self.END_ID]

    def ordered_words(self):
        """Return a list of words, ordered by id."""
        return self.ids_to_words(range(self.size))

In [4]:
token_feed_unigram = (word for news in data['news_unigrams'] for word in literal_eval(news))
token_feed_bigram = (word for news in data['news_bigrams'] for word in literal_eval(news))

vocab_unigram = Vocabulary(token_feed_unigram)
vocab_bigram = Vocabulary(token_feed_bigram)

In [5]:
vocab_unigram.size

9009

In [6]:
vocab_bigram.size

156134

In [7]:
for word, count in vocab_unigram.unigram_counts.most_common(10):
    print("\"%s\": %d" % (word, count))

"self": 22735
"week": 10076
"yard": 6303
"game": 6273
"play": 4594
"ownteam": 4360
"touchdown": 3528
"first": 2665
"get": 2491
"back": 2469


In [58]:
encoded_unigram = [collections.Counter(vocab_unigram.words_to_ids(x)) for x in data['news_unigrams']]
encoded_bigram = [collections.Counter(vocab_bigram.words_to_ids(x)) for x in data['news_bigrams']]

In [56]:
row = []
col = []
val = []
for x, count in enumerate(encoded_unigram):
    for y, z in count.items():
        row.append(x)
        col.append(y)
        val.append(z)
row = np.array(row)
col = np.array(col)
val = np.array(val)

In [57]:
encoded_unigram_sparse = csr_matrix((val, (row, col)), shape=(len(encoded_unigram), vocab_unigram.size))

In [59]:
row = []
col = []
val = []
for x, count in enumerate(encoded_bigram):
    for y, z in count.items():
        row.append(x)
        col.append(y)
        val.append(z)
row = np.array(row)
col = np.array(col)
val = np.array(val)

In [60]:
encoded_bigram_sparse = csr_matrix((val, (row, col)), shape=(len(encoded_bigram), vocab_bigram.size))

In [None]:
# encoded_unigram = pd.DataFrame([collections.Counter(vocab_unigram.words_to_ids(x)) for x in data['news_unigrams']], 
#                        columns = range(vocab_unigram.size))
# encoded_unigram.fillna(value= 0, inplace=True)
# encoded_bigram = pd.DataFrame([collections.Counter(vocab_bigram.words_to_ids(x)) for x in data['news_bigrams']], 
#                        columns = range(vocab_bigram.size))
# encoded_bigram.fillna(value= 0, inplace=True)

In [None]:
# encoded_unigram.fillna(value= 0, inplace=True)
# encoded_bigram.fillna(value= 0, inplace=True)

### Split into train and test

In [228]:
X_train_uni_points, X_test_uni_points, y_train_uni_points, y_test_uni_points = \
        train_test_split(encoded_unigram_sparse, data['Points'], test_size = 0.25)
X_train_bi_points, X_test_bi_points, y_train_bi_points, y_test_bi_points = \
        train_test_split(encoded_bigram_sparse, data['Points'], test_size = 0.25)

### SVM

In [229]:
#Raw unigrams score was 6.612167 points

#### Unigrams against points with preprocessing

In [230]:
SVM = LinearSVR(max_iter=20000, random_state = 42)
SVM.fit(X_train_uni_points, y_train_uni_points)

LinearSVR(C=1.0, dual=True, epsilon=0.0, fit_intercept=True,
     intercept_scaling=1.0, loss='epsilon_insensitive', max_iter=20000,
     random_state=42, tol=0.0001, verbose=0)

In [231]:
np.mean(np.square(SVM.predict(X_test_uni_points) - np.array(y_test_uni_points, dtype = np.float32)))**.5

6.8545310196534768

#### Bigrams against points with preprocessing

In [232]:
SVM = LinearSVR(max_iter=20000, random_state = 42)
SVM.fit(X_train_bi_points, y_train_bi_points)

LinearSVR(C=1.0, dual=True, epsilon=0.0, fit_intercept=True,
     intercept_scaling=1.0, loss='epsilon_insensitive', max_iter=20000,
     random_state=42, tol=0.0001, verbose=0)

In [233]:
np.mean(np.square(SVM.predict(X_test_bi_points) - np.array(y_test_bi_points, dtype = np.float32)))**.5

7.3142876885748294

#### Unigrams against avg points with preprocessing

In [234]:
X_train_uni_avg, X_test_uni_avg, y_train_uni_avg, y_test_uni_avg = \
        train_test_split(encoded_unigram_sparse, data['Diff_from_Avg'], test_size = 0.25)
X_train_bi_avg, X_test_bi_avg, y_train_bi_avg, y_test_bi_avg = \
        train_test_split(encoded_bigram_sparse, data['Diff_from_Avg'], test_size = 0.25)

In [235]:
SVM = LinearSVR(max_iter=20000, random_state = 42)
SVM.fit(X_train_uni_avg, y_train_uni_avg)

LinearSVR(C=1.0, dual=True, epsilon=0.0, fit_intercept=True,
     intercept_scaling=1.0, loss='epsilon_insensitive', max_iter=20000,
     random_state=42, tol=0.0001, verbose=0)

In [236]:
np.mean(np.square(SVM.predict(X_test_uni_avg) - np.array(y_test_uni_avg, dtype = np.float32)))**.5

5.9864342353352633

#### Bigrams against avg points with preprocessing

In [237]:
SVM = LinearSVR(max_iter=20000, random_state = 42)
SVM.fit(X_train_bi_avg, y_train_bi_avg)

LinearSVR(C=1.0, dual=True, epsilon=0.0, fit_intercept=True,
     intercept_scaling=1.0, loss='epsilon_insensitive', max_iter=20000,
     random_state=42, tol=0.0001, verbose=0)

In [238]:
np.mean(np.square(SVM.predict(X_test_bi_avg) - np.array(y_test_bi_avg, dtype = np.float32)))**.5

5.9645152896373732

#### Unigrams against weekly avg points with preprocessing

In [239]:
data.columns

Index(['Player', 'Team', 'Position', 'Year', 'Week', 'Points', 'SeasonPoints',
       'AvgSeasonPoints', 'AvgWkPoints', 'Diff_from_Avg', 'Diff_from_WkAvg',
       'max_date', 'min_date', 'date', 'headline', 'name', 'news', 'team',
       'news_clean', 'news_unigrams', 'news_bigrams', 'Ratio_from_Avg',
       'Ratio_from_WkAvg'],
      dtype='object')

In [240]:
X_train_uni_wavg, X_test_uni_wavg, y_train_uni_wavg, y_test_uni_wavg = train_test_split(
            encoded_unigram_sparse, data['Diff_from_WkAvg'], test_size = 0.25)
X_train_bi_wavg, X_test_bi_wavg, y_train_bi_wavg, y_test_bi_wavg = train_test_split(
            encoded_bigram_sparse, data['Diff_from_WkAvg'], test_size = 0.25)

In [241]:
SVM = LinearSVR(max_iter=20000, random_state = 42)
SVM.fit(X_train_uni_wavg, y_train_uni_wavg)

LinearSVR(C=1.0, dual=True, epsilon=0.0, fit_intercept=True,
     intercept_scaling=1.0, loss='epsilon_insensitive', max_iter=20000,
     random_state=42, tol=0.0001, verbose=0)

In [242]:
np.mean(np.square(SVM.predict(X_test_uni_wavg) - np.array(y_test_uni_wavg, dtype = np.float32)))**.5

5.5213611770729187

#### Bigrams against weekly avg points with preprocessing

In [243]:
SVM = LinearSVR(max_iter=10000, random_state = 42)
SVM.fit(X_train_bi_wavg, y_train_bi_wavg)

LinearSVR(C=1.0, dual=True, epsilon=0.0, fit_intercept=True,
     intercept_scaling=1.0, loss='epsilon_insensitive', max_iter=10000,
     random_state=42, tol=0.0001, verbose=0)

In [244]:
np.mean(np.square(SVM.predict(X_test_bi_wavg) - np.array(y_test_bi_wavg, dtype = np.float32)))**.5

5.6534714273003077

### Ratio from season average as a predictor

In [245]:
data['Ratio_from_Avg'] = data['Points'] / data['AvgSeasonPoints']
data['Ratio_from_WkAvg'] = data['Points'] / data['AvgWkPoints']

#### Unigrams

In [246]:
X_train_uni_ratio, X_test_uni_ratio, y_train_uni_ratio, y_test_uni_ratio = \
        train_test_split(encoded_unigram_sparse, data['Ratio_from_Avg'], test_size = 0.5)
X_train_bi_ratio, X_test_bi_ratio, y_train_bi_ratio, y_test_bi_ratio = \
        train_test_split(encoded_bigram_sparse, data['Ratio_from_Avg'], test_size = 0.5)

In [247]:
SVM = LinearSVR(max_iter=20000, random_state = 42)
SVM.fit(X_train_uni_ratio, y_train_uni_ratio)

LinearSVR(C=1.0, dual=True, epsilon=0.0, fit_intercept=True,
     intercept_scaling=1.0, loss='epsilon_insensitive', max_iter=20000,
     random_state=42, tol=0.0001, verbose=0)

In [248]:
np.mean(np.square(SVM.predict(X_test_uni_ratio) - np.array(y_test_uni_ratio, dtype = np.float32)))**.5

2.0704991428840449

#### Bigrams

In [249]:
SVM = LinearSVR(max_iter=20000, random_state = 42)
SVM.fit(X_train_bi_ratio, y_train_bi_ratio)

LinearSVR(C=1.0, dual=True, epsilon=0.0, fit_intercept=True,
     intercept_scaling=1.0, loss='epsilon_insensitive', max_iter=20000,
     random_state=42, tol=0.0001, verbose=0)

In [250]:
np.mean(np.square(SVM.predict(X_test_bi_ratio) - np.array(y_test_bi_ratio, dtype = np.float32)))**.5

2.0523903426339332

### Ratio from Weekly Average as a predictor

In [251]:
X_train_uni_wkratio, X_test_uni_wkratio, y_train_uni_wkratio, y_test_uni_wkratio = \
        train_test_split(encoded_unigram_sparse, data['Ratio_from_WkAvg'], test_size = 0.25)
X_train_bi_wkratio, X_test_bi_wkratio, y_train_bi_wkratio, y_test_bi_wkratio = \
        train_test_split(encoded_bigram_sparse, data['Ratio_from_WkAvg'], test_size = 0.25)

#### Unigrams

In [252]:
SVM = LinearSVR(max_iter=20000, random_state = 42)
SVM.fit(X_train_uni_wkratio, y_train_uni_wkratio)

LinearSVR(C=1.0, dual=True, epsilon=0.0, fit_intercept=True,
     intercept_scaling=1.0, loss='epsilon_insensitive', max_iter=20000,
     random_state=42, tol=0.0001, verbose=0)

In [253]:
np.mean(np.square(SVM.predict(X_test_uni_wkratio) - np.array(y_test_uni_wkratio, dtype = np.float32)))**.5

0.7262377173674246

#### Bigrams

In [254]:
SVM = LinearSVR(max_iter=20000, random_state = 42)
SVM.fit(X_train_bi_wkratio, y_train_bi_wkratio)

LinearSVR(C=1.0, dual=True, epsilon=0.0, fit_intercept=True,
     intercept_scaling=1.0, loss='epsilon_insensitive', max_iter=20000,
     random_state=42, tol=0.0001, verbose=0)

In [255]:
np.mean(np.square(SVM.predict(X_test_bi_wkratio) - np.array(y_test_bi_wkratio, dtype = np.float32)))**.5

0.74001396145068854

In [256]:
SVM.predict(X_test_bi_wkratio)

array([ 0.8775351,  0.8775351,  0.8775351, ...,  0.8775351,  0.8775351,
        0.8775351])

### Encode n-grams as present (instead of count)

In [223]:
row = []
col = []
val = []
for x, count in enumerate(encoded_unigram):
    for y, z in count.items():
        row.append(x)
        col.append(y)
        val.append(1)
row = np.array(row)
col = np.array(col)
val = np.array(val)

In [224]:
encoded_unigram_sparse = csr_matrix((val, (row, col)), shape=(len(encoded_unigram), vocab_unigram.size))

In [225]:
row = []
col = []
val = []
for x, count in enumerate(encoded_bigram):
    for y, z in count.items():
        row.append(x)
        col.append(y)
        val.append(1)
row = np.array(row)
col = np.array(col)
val = np.array(val)

In [226]:
encoded_bigram_sparse = csr_matrix((val, (row, col)), shape=(len(encoded_bigram), vocab_bigram.size))

In [227]:
encoded_unigram_sparse.toarray()

array([[0, 0, 1, ..., 0, 0, 0],
       [0, 0, 1, ..., 0, 0, 0],
       [0, 0, 1, ..., 0, 0, 0],
       ..., 
       [0, 0, 1, ..., 0, 0, 0],
       [0, 0, 1, ..., 0, 0, 0],
       [0, 0, 1, ..., 0, 0, 0]], dtype=int32)

Rerun above code to replicate with boolean indicators instead of counts

### Cross Validation

2777

[ 0.8775351  0.8775351  0.8775351 ...,  0.8775351  0.8775351  0.8775351]


### Naive Bayes

### CNN