# Merging and Preprocessing

### Load Libraries

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn import naive_bayes
from datetime import timedelta
import re
from collections import defaultdict
from sklearn.model_selection import train_test_split
from sklearn.svm import LinearSVR
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import WordPunctTokenizer
from nltk.collocations import BigramCollocationFinder
from nltk.metrics import BigramAssocMeasures

nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Jason\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

### Player News

In [3]:
news = pd.read_excel('../data/news_data.xlsx')
news.head()

Unnamed: 0,date,headline,name,news,team
0,2017-12-10 10:40:00,Bucs DT Gerald McCoy fears he suffered a torn ...,Gerald McCoy,\n McCoy has torn his biceps twice...,Buccaneers
1,2017-12-10 09:28:00,Coach Todd Bowles announced Josh McCown suffer...,Josh McCown,\n McCown was banged up in this on...,Jets
2,2017-12-10 09:20:00,Keelan Cole caught all three of his targets fo...,Keelan Cole,\n Cole reeled in a 75-yard touchd...,Jaguars
3,2017-12-10 09:15:00,Dede Westbrook caught 5-of-8 targets for 81 ya...,Dede Westbrook,\n Westbrook's first NFL touchdown...,Jaguars
4,2017-12-10 09:10:00,Marqise Lee caught 5-of-6 targets for 65 yards...,Marqise Lee,\n Lee has turned into a solid flo...,Jaguars


In [4]:
news.tail()

Unnamed: 0,date,headline,name,news,team
22476,2016-10-20 09:31:00,"Ladarius Green (ankle, PUP) said there is ""no ...",Ladarius Green,"\n Signed to a four-year, $20 mill...",Free Agent
22477,2016-10-20 09:28:00,Gerald McCoy (calf) practiced on a limited bas...,Gerald McCoy,\n It is a step in the right direc...,Buccaneers
22478,2016-10-20 09:25:00,Jordan Reed said he hid the concussion he suff...,Jordan Reed,"\n ""I kind of kept it to myself,"" ...",Redskins
22479,2016-10-20 09:13:00,Colts coach Chuck Pagano said DE Henry Anderso...,Henry Anderson,\n Anderson played through his kne...,Colts
22480,2016-10-20 09:10:00,"With Doug Martin (hamstring) out, Jacquizz Rod...",Jacquizz Rodgers,\n Rodgers was the unquestioned wo...,Buccaneers


### Preprocessing Player News

In [5]:
cities = {
    'Bills':'Buffalo',
    'Dolphins':'Miami',
    'Patriots':'New England',
    'Jets':'New York',
    'Ravens':'Baltimore',
    'Bengals':'Cincinnati',
    'Browns':'Cleveland',
    'Steelers':'Pittsburgh',
    'Texans':'Houston',
    'Colts':'Indianapolis',
    'Jaguars':'Jacksonville',
    'Titans':'Tennessee',
    'Broncos':'Denver',
    'Chiefs':'Kansas City',
    'Chargers':'Los Angeles',
    'Raiders':'Oakland',
    'Cowboys':'Dallas',
    'Giants':'New York',
    'Eagles':'Philadelphia',
    'Redskins':'Washington',
    'Bears':'Chicago',
    'Lions':'Detroit',
    'Packers':'Green Bay',
    'Vikings':'Minnesota',
    'Falcons':'Atlanta',
    'Panthers':'Carolina',
    'Saints':'New Orleans',
    'Buccaneers':'Tampa Bay',
    'Cardinals':'Arizona',
    'Rams':'Los Angeles',
    '49ers':'San Francisco',
    'Seahawks':'Seattle'  
}

default_cities = defaultdict(str)
for city in cities.keys():
    default_cities[city] = cities[city]

In [6]:
regex = re.compile('[^a-zA-Z]')
to_display = ['name','news','news_clean','news_unigrams','news_bigrams']
stopWords = set(stopwords.words('english'))

def process_text(row):
    '''
    if the news mentions that player -> self
    if the news mentions the player's team -> ownteam
    if the news mentions the player's city -> owncity
    '''
    parsed_sentence = str()
    names = row['name'].split()
    for word in row['news'].split():
        word = regex.sub('',word)
        if word in names:
            parsed_sentence = parsed_sentence+'self '
        elif word ==row['team']:
            parsed_sentence = parsed_sentence+'ownteam '
        elif word in default_cities[row['team']].split():
            parsed_sentence = parsed_sentence+'owncity '
        elif word in stopWords:
            pass
        else:
            parsed_sentence = parsed_sentence+word+' '
    return parsed_sentence

def get_bigrams(myString):
    tokenizer = WordPunctTokenizer()
    tokens = tokenizer.tokenize(myString)
    stemmer = PorterStemmer()
    bigram_finder = BigramCollocationFinder.from_words(tokens)
    bigram_tuples = bigram_finder.nbest(BigramAssocMeasures.chi_sq, 500)
    bigrams = []
    for bigram_tuple in bigram_tuples:
        x = "%s %s" % bigram_tuple
        bigrams.append(x)
        
    result = [' '.join([stemmer.stem(w).lower() for w in x.split()]) for x in bigrams]
    return result

def get_unigrams(myString):
    tokenizer = WordPunctTokenizer()
    tokens = tokenizer.tokenize(myString)
    stemmer = PorterStemmer()
    result = [stemmer.stem(w).lower() for w in tokens if w.lower() not in stopWords]
    return result

news['news_clean'] = news.apply(process_text, axis = 1)
news['news_unigrams'] = news['news_clean'].apply(lambda x: get_unigrams(x))
news['news_bigrams'] = news['news_clean'].apply(lambda x: get_bigrams(x))
news[to_display].head()

Unnamed: 0,name,news,news_clean,news_unigrams,news_bigrams
0,Gerald McCoy,\n McCoy has torn his biceps twice...,self torn biceps twice already pro career tear...,"[self, torn, bicep, twice, alreadi, pro, caree...","[falcon panther, he know, panther saint, saint..."
1,Josh McCown,\n McCown was banged up in this on...,self banged one injuring back second quarter h...,"[self, bang, one, injur, back, second, quarter...","[bryce petti, hell undergo, it tough, mccown g..."
2,Keelan Cole,\n Cole reeled in a 75-yard touchd...,self reeled yard touchdown beautiful catch thr...,"[self, reel, yard, touchdown, beauti, catch, t...","[blake bortl, bortl give, it second, wr head, ..."
3,Dede Westbrook,\n Westbrook's first NFL touchdown...,Westbrooks first NFL touchdown came yard corne...,"[westbrook, first, nfl, touchdown, came, yard,...","[hi eight, nfl touchdown, texan defens, came y..."
4,Marqise Lee,\n Lee has turned into a solid flo...,self turned solid floor play fantasy five catc...,"[self, turn, solid, floor, play, fantasi, five...","[after catch, wr option, week meet, with week,..."


### Player Scores and Averages

In [7]:
#Match up scores with articles
scores = pd.read_csv('../data/fantasy_points_data.csv', parse_dates= True)
scores.head()

Unnamed: 0,Player,Team,Position,Year,Week,Points
0,Drew Brees,NO,QB,2016,1,37.7
1,Andrew Luck,IND,QB,2016,1,37.4
2,Alex Smith,KC,QB,2016,1,33.7
3,Jameis Winston,TB,QB,2016,1,30.4
4,Matthew Stafford,DET,QB,2016,1,29.5


In [8]:
def yearly_average(row):
    if row['Year']==2016:
        return row['Points']/weeks_in_2016
    elif row['Year']==2017:
        return row['Points']/weeks_in_2017

In [9]:
#Computing average over all weeks in year
weeks_in_2016 = scores[scores.Year==2016]['Week'].max()
weeks_in_2017 = scores[scores.Year==2017]['Week'].max()
avg_scores = scores.groupby(['Player','Year'], as_index=False)['Points'].sum()
avg_scores['Avg_Points']= avg_scores.apply(yearly_average,axis=1)
avg_scores.columns = ['Player', 'Year', 'SeasonPoints', 'AvgSeasonPoints']

In [10]:
#Computing averages over all weeks *played* in a year
wkavg_scores = scores.groupby(['Player', 'Year'], as_index=False)['Points'].mean()
wkavg_scores.columns = ['Player', 'Year', 'AvgWkPoints']
wkavg_scores.head()

Unnamed: 0,Player,Year,AvgWkPoints
0,A.J. Derby,2016,3.425
1,A.J. Derby,2017,4.05
2,A.J. Green,2016,13.377778
3,A.J. Green,2017,10.692308
4,AJ McCarron,2017,1.9


In [11]:
scores_combined = scores.merge(avg_scores, 
                               how='inner', 
                               on=['Player','Year'])
scores_combined = scores_combined.merge(wkavg_scores,
                                  how='inner',
                                  on=['Player', 'Year'])
scores_combined['Diff_from_Avg'] = scores_combined['Points'] - scores_combined['AvgSeasonPoints']
scores_combined['Diff_from_WkAvg'] = scores_combined['Points'] - scores_combined['AvgWkPoints'] 
scores_combined.head()

Unnamed: 0,Player,Team,Position,Year,Week,Points,SeasonPoints,AvgSeasonPoints,AvgWkPoints,Diff_from_Avg,Diff_from_WkAvg
0,Drew Brees,NO,QB,2016,1,37.7,338.5,19.911765,21.15625,17.788235,16.54375
1,Drew Brees,NO,QB,2016,2,14.5,338.5,19.911765,21.15625,-5.411765,-6.65625
2,Drew Brees,NO,QB,2016,3,25.9,338.5,19.911765,21.15625,5.988235,4.74375
3,Drew Brees,NO,QB,2016,4,11.8,338.5,19.911765,21.15625,-8.111765,-9.35625
4,Drew Brees,NO,QB,2016,6,32.6,338.5,19.911765,21.15625,12.688235,11.44375


### Player predicted scores

In [13]:
#Pull in predicted scores
preds2016 = pd.read_csv('../data/full_2016_projections.csv', parse_dates= True)
preds2017 = pd.read_csv('../data/full_2017_projections.csv', parse_dates = True)
preds2016['Year'] = 2016
preds2017['Year'] = 2017
preds = pd.concat([preds2016, preds2017])

In [14]:
preds.head()

Unnamed: 0,Player,Pos,Week,Team,Opp,Pass Yds,TD,Int,Rush Yds,TD.1,Rec,Yds,TD.2,Fantasy Points,Year
0,David Johnson,RB,10,ARI,SF,0.0,0.0,0.0,112.4,1.1,4.2,40.8,0.3,27.52,2016
1,Cam Newton,QB,2,CAR,SF,243.5,1.8,0.6,41.6,0.6,0.0,0.0,0.0,23.1,2016
2,Aaron Rodgers,QB,9,GB,IND,295.6,2.2,0.8,20.4,0.4,0.0,0.0,0.0,22.86,2016
3,Matt Ryan,QB,9,ATL,TB,323.1,2.6,0.8,9.3,0.1,0.0,0.0,0.0,22.65,2016
4,Cam Newton,QB,4,CAR,ATL,243.9,1.8,0.8,41.4,0.6,0.0,0.0,0.0,22.5,2016


In [15]:
#Restructuring predicted points to mirror actual score data
preds = preds[['Player', 'Team', 'Pos', 'Year', 'Week', 'Fantasy Points']]
preds.columns = ['Player', 'Team', 'Position', 'Year', 'Week', 'Points']

In [16]:
#Replacing scores with predicted scores
scores = preds.copy(deep=True)

In [17]:
#Computing average over all weeks in year
weeks_in_2016 = scores[scores.Year==2016]['Week'].max()
weeks_in_2017 = scores[scores.Year==2017]['Week'].max()
avg_scores = scores.groupby(['Player','Year'], as_index=False)['Points'].sum()
avg_scores['Avg_Points']= avg_scores.apply(yearly_average,axis=1)
avg_scores.columns = ['Player', 'Year', 'SeasonPoints', 'AvgSeasonPoints']

In [18]:
#Computing averages over all weeks *played* in a year
wkavg_scores = scores.groupby(['Player', 'Year'], as_index=False)['Points'].mean()
wkavg_scores.columns = ['Player', 'Year', 'AvgWkPoints']
wkavg_scores.head()

Unnamed: 0,Player,Year,AvgWkPoints
0,AJ Green,2016,19.012
1,AJ Green,2017,16.262143
2,Aaron Jones,2017,9.964
3,Aaron Rodgers,2016,19.733333
4,Aaron Rodgers,2017,19.687143


In [19]:
scores_combined = scores.merge(avg_scores, 
                               how='inner', 
                               on=['Player','Year'])
scores_combined = scores_combined.merge(wkavg_scores,
                                  how='inner',
                                  on=['Player', 'Year'])
scores_combined['Diff_from_Avg'] = scores_combined['Points'] - scores_combined['AvgSeasonPoints']
scores_combined['Diff_from_WkAvg'] = scores_combined['Points'] - scores_combined['AvgWkPoints'] 
scores_combined.head()

Unnamed: 0,Player,Team,Position,Year,Week,Points,SeasonPoints,AvgSeasonPoints,AvgWkPoints,Diff_from_Avg,Diff_from_WkAvg
0,David Johnson,ARI,RB,2016,10,27.52,323.05,20.190625,21.536667,7.329375,5.983333
1,David Johnson,ARI,RB,2016,13,25.62,323.05,20.190625,21.536667,5.429375,4.083333
2,David Johnson,ARI,RB,2016,14,25.23,323.05,20.190625,21.536667,5.039375,3.693333
3,David Johnson,ARI,RB,2016,15,25.78,323.05,20.190625,21.536667,5.589375,4.243333
4,David Johnson,ARI,RB,2016,12,24.09,323.05,20.190625,21.536667,3.899375,2.553333


### Merging

In [20]:
start_of_season = {2016:pd.datetime.strptime('2016-09-08', '%Y-%m-%d'),
                  2017:pd.datetime.strptime('2017-09-08', '%Y-%m-%d')}

In [21]:
#Add in the first and last date for a given playing week
scores_combined['max_date'] = scores_combined.apply(lambda x: start_of_season[x['Year']] + timedelta(weeks = x['Week']), axis=1)
scores_combined['min_date'] = scores_combined.apply(lambda x: start_of_season[x['Year']] + timedelta(weeks = x['Week'] - 1), axis=1)

In [22]:
scores_combined.dtypes

Player                     object
Team                       object
Position                   object
Year                        int64
Week                        int64
Points                    float64
SeasonPoints              float64
AvgSeasonPoints           float64
AvgWkPoints               float64
Diff_from_Avg             float64
Diff_from_WkAvg           float64
max_date           datetime64[ns]
min_date           datetime64[ns]
dtype: object

In [23]:
news.dtypes

date             datetime64[ns]
headline                 object
name                     object
news                     object
team                     object
news_clean               object
news_unigrams            object
news_bigrams             object
dtype: object

In [24]:
#Merge news with scores and filter out scores that are not within a playing week
combined = scores_combined.merge(news, how='inner', left_on='Player', right_on='name')
combined = combined[(combined['date'] <= combined['max_date']) & 
                    (combined['date'] >= combined['min_date'])]

In [25]:
combined.shape

(6912, 21)

In [26]:
mismatched_scores = scores_combined.merge(news, how='left', left_on='Player', right_on='name')
mismatched_scores = mismatched_scores[(mismatched_scores['date'] <= mismatched_scores['max_date']) 
                                      & (mismatched_scores['date'] >= mismatched_scores['min_date']) 
                                      #& (mismatched_scores['name'] == np.nan)]
                                     ]
mismatched_news = scores_combined.merge(news, how='right', left_on='Player', right_on='name')
#mismatched_news = mismatched_news[mismatched_news['Player'] == np.nan]

In [27]:
#Number of players with no scores
len(news[np.invert(np.in1d(news.name, scores_combined.Player))].name.value_counts())

1638

In [28]:
#Count of news articles with no scores
np.sum(news[np.invert(np.in1d(news.name, scores_combined.Player))].name.value_counts())

9171

In [29]:
news[np.invert(np.in1d(news.name, scores_combined.Player))].name.value_counts()

DeVante Parker       105
Odell Beckham         88
C.J. Prosise          85
T.Y. Hilton           73
A.J. Green            67
Tony Romo             59
Markus Wheaton        55
T.J. Yeldon           49
Luke Kuechly          49
J.J. Nelson           47
Breshad Perriman      44
Kendall Wright        44
C.J. Fiedorowicz      44
Steve Smith Sr.       43
Victor Cruz           42
Tyron Smith           40
C.J. Anderson         40
Vontae Davis          39
John Ross             39
Mike Williams         37
Ryan Kalil            37
Terron Armstead       35
Eli Rogers            34
Braxton Miller        34
Jesse James           33
Ben Watson            33
George Kittle         33
Virgil Green          33
Vance McDonald        33
Jimmy Smith           32
                    ... 
Steven Terrell         1
Ian Williams           1
Isaiah Battle          1
LaMarr Woodley         1
Jacques McClendon      1
Byron Bell             1
Josh Martin            1
Joseph Randle          1
Corey Peters           1


In [30]:
mismatched_scores.head()

Unnamed: 0,Player,Team,Position,Year,Week,Points,SeasonPoints,AvgSeasonPoints,AvgWkPoints,Diff_from_Avg,...,max_date,min_date,date,headline,name,news,team,news_clean,news_unigrams,news_bigrams
40,David Johnson,ARI,RB,2016,10,27.52,323.05,20.190625,21.536667,7.329375,...,2016-11-17,2016-11-10,2016-11-13 07:43:00,David Johnson rushed 19 times for 55 yards and...,David Johnson,\n Even in what could be classifie...,Cardinals,Even could classified underwhelming game self ...,"[even, could, classifi, underwhelm, game, self...","[both came, even could, rb play, big catch, ca..."
79,David Johnson,ARI,RB,2016,13,25.62,323.05,20.190625,21.536667,5.429375,...,2016-12-08,2016-12-01,2016-12-04 07:44:00,David Johnson rushed 18 times for 84 yards and...,David Johnson,\n There is not much left to say a...,Cardinals,There much left say self self A dominant force...,"[much, left, say, self, self, domin, forc, gro...","[a domin, rb everi, the fact, there much, almo..."
121,David Johnson,ARI,RB,2016,14,25.23,323.05,20.190625,21.536667,5.039375,...,2016-12-15,2016-12-08,2016-12-11 05:45:00,David Johnson rushed 20 times for 80 yards and...,David Johnson,\n It was not the best performance...,Cardinals,It best performance self lost fumble early gam...,"[best, perform, self, lost, fumbl, earli, game...","[dolphin defens, he also, it best, also sever,..."
163,David Johnson,ARI,RB,2016,15,25.78,323.05,20.190625,21.536667,5.589375,...,2016-12-22,2016-12-15,2016-12-18 08:18:00,David Johnson rushed 12 times for 53 yards and...,David Johnson,\n Johnson was somewhat quiet in t...,Cardinals,self somewhat quiet first half gaining yards ...,"[self, somewhat, quiet, first, half, gain, yar...","[christma eve, eve toptwo, it might, kerwynn w..."
209,David Johnson,ARI,RB,2016,12,24.09,323.05,20.190625,21.536667,3.899375,...,2016-12-01,2016-11-24,2016-11-28 02:24:00,David Johnson dislocated his finger in Sunday'...,David Johnson,\n Johnson checked out a few times...,Cardinals,self checked times second half Sunday ultimate...,"[self, check, time, second, half, sunday, ulti...","[hell like, rb redskin, sunday ultim, check ti..."


### Output

In [49]:
#combined.to_csv('../data/news_and_scores.csv', index=False)
combined.to_excel('../data/news_and_scores.xlsx', index=False)

In [31]:
mismatched_scores.to_csv('../data/mismatched_scores.csv', index=False)
mismatched_news.to_csv('../data/mismatched_news.csv', index=False)

In [32]:
mismatched_scores.shape

(6912, 21)

In [33]:
mismatched_news.shape

(217010, 21)