In [1]:
import os
import pickle
import re

import numpy as np
import pandas as pd
import requests
from nltk import tokenize, stem, corpus
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer, TfidfTransformer
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB

In [2]:
# Load the baseball Master.csv table (one row per player) from the raw data folder.
df = pd.read_csv('../data/raw/baseball/Master.csv')

In [225]:
df.shape

(18846, 25)

In [8]:
# Build a "First Last" display name used later as the Wikipedia search term.
# String concatenation propagates NaN, so a player missing either name part
# ends up with a NaN nameFull.
df['nameFull'] = df['nameFirst'] + ' ' + df['nameLast']

In [224]:
df[df['nameFull'].notnull()].shape

(18807, 25)

In [10]:
# Draw 100 players to fetch and label by hand.
# - Rows without a full name are dropped first: a NaN name cannot be used as a
#   search term.
# - random_state pins the draw so that re-running the notebook selects the same
#   players (the hand-made labels are keyed by playerID).
sample = (
    df[['playerID', 'nameFull']]
    .dropna(subset=['nameFull'])
    .sample(n=100, random_state=42)
)

In [11]:
sample.head()

Unnamed: 0,playerID,nameFull
7049,harrera01,Ray Harrell
17950,weisswa01,Walt Weiss
14272,rigglji99,Jim Riggleman
1930,brottto01,Tony Brottem
524,austiri01,Rick Austin


In [19]:
# Endpoint of the MediaWiki API on English Wikipedia; used for every request below.
query_article_url = 'https://en.wikipedia.org/w/api.php'

# Template parameters for a search request (action=query, list=search) returning JSON.
# The search term ('srsearch') is deliberately not part of the template: later
# cells set it for each name they look up.
search_params = {
    'action' : 'query',
    'format' : 'json',
    'list' : 'search',
    #'srsearch' : term,
    # Ask for at most 5 search results per query.
    'srlimit' : 5
    #'srprop' : 'snippet|categorysnippet|sectionsnippet|titlesnippet'
}

In [212]:
# Spot-check the search API with a single name.
# NOTE: this writes the term into the shared search_params dict, so the
# template keeps this 'srsearch' value until another cell replaces it.
search_params['srsearch'] = 'Whitey Alpermann'
results = requests.get(query_article_url, params=search_params).json()

In [213]:
results

{'batchcomplete': '',
 'query': {'search': [],
  'searchinfo': {'suggestion': 'whitey alperman',
   'suggestionsnippet': 'whitey <em>alperman</em>',
   'totalhits': 0}}}

In [22]:
# Template parameters for fetching page content by page id
# (action=query, prop=revisions, rvprop=content), returned as JSON.
# 'pageids' holds several ids joined with '|'; the ids below are only a
# hard-coded example and are replaced per player by the fetch loop.
get_article_params = {
    'action' : 'query',
    'format' : 'json',
    'prop' : 'revisions',
    'pageids' : '36494510|11173690|156002',
    'rvprop' : 'content'
}

# Example request: inspect the raw response structure
# (query -> pages -> <pageid> -> revisions[0]['*'] holds the wikitext).
requests.get(query_article_url, params=get_article_params).json()

{'batchcomplete': '',
 'query': {'pages': {'11173690': {'ns': 0,
    'pageid': 11173690,
    'revisions': [{'*': '{{Infobox NFL biography\n|name=Graham Harrell\n|image=Graham Harrell1.jpg\n|image_size=225\n|caption=Harrell with the [[Green Bay Packers|Packers]] in [[2011 Green Bay Packers season|2011]]\n|current_team=North Texas Mean Green\n|number=<!-- 6 -->\n|position=[[Offensive Coordinator]] & [[Quarterback]]s coach<!--[[Quarterback]]-->\n|birth_date={{birth date and age|1985|5|22}}\n|birth_place=[[Brownwood, Texas]]  <ref>{{cite web|last=Harrell|first=Graham|title=Texas, Birth Index, 1903-1997|url=https://familysearch.org/pal:/MM9.1.1/V697-2CK|work=FamilySearch.org|accessdate=13 December 2013}}</ref><ref>{{cite web|last=Harrell|first=Graham|title=Player Stats|url=http://www.nfl.com/player/grahamharrell/71321/profile|work=NFL.com|accessdate=13 December 2013}}</ref>\n|death_date= \n|death_place= \n|height_ft = 6\n|height_in = 2\n|weight_lbs = 215\n|high_school = [[Ennis High School|

In [51]:
%%time
# For every sampled player: search Wikipedia for the player's name, then fetch
# the wikitext of each search hit. Each fetched page becomes one record with
# the keys 'id', 'name', 'pageid', 'title' and 'text'.
article_data = []
for player_id, name in zip(sample['playerID'], sample['nameFull']):
    # A player with a missing first or last name has a NaN nameFull; there is
    # nothing meaningful to search for.
    if pd.isna(name):
        continue

    # Build per-request parameter dicts instead of writing into the shared
    # templates, so the templates stay unchanged no matter which cells ran.
    search_response = requests.get(
        query_article_url,
        params=dict(search_params, srsearch=name),
        timeout=30,  # without a timeout a stalled connection blocks forever
    )
    search_response.raise_for_status()
    hits = search_response.json().get('query', {}).get('search', [])
    if not hits:
        # A search can legitimately return zero hits (e.g. a misspelled name).
        # Requesting an empty pageids list would give a response without a
        # 'pages' entry, so move on to the next player.
        continue

    pageids = '|'.join(str(hit['pageid']) for hit in hits)
    article_response = requests.get(
        query_article_url,
        params=dict(get_article_params, pageids=pageids),
        timeout=30,
    )
    article_response.raise_for_status()
    pages = article_response.json().get('query', {}).get('pages', {})

    for pageid, page in pages.items():
        revisions = page.get('revisions')
        if not revisions:
            # The page came back without content; nothing to store.
            continue
        article_data.append({
            'id' : player_id,
            'name' : name,
            'pageid' : pageid,
            'title' : page['title'],
            'text' : revisions[0]['*']
        })

Wall time: 54.6 s


In [52]:
# One row per fetched article; the keys of the collected dicts become the columns.
article_df = pd.DataFrame(article_data)

In [54]:
article_df.shape

(500, 5)

In [None]:
# Persist the fetched articles; a later cell reloads this file to build the labeled set.
# NOTE(review): writing after a fresh fetch replaces the articles that the
# hand-made labels refer to — confirm before overwriting an existing file.
article_df.to_csv('../data/processed/sample_articles.csv', index=False)

In [None]:
article_df.head()

In [56]:
# Write the (id, pageid) template that is labeled by hand afterwards.
# The same file is read back later with the manually added 'label' column, so
# it must never be overwritten once it exists: doing so would silently discard
# the manual labels.
label_path = '../data/processed/sample_articles_label.csv'
if os.path.exists(label_path):
    print('Keeping existing {}; delete it to regenerate the unlabeled template.'.format(label_path))
else:
    article_df[['id', 'pageid']].to_csv(label_path, index=False)

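### After manually labeling the data (a `label` column was added by hand to `sample_articles_label.csv`; 1 appears to mark the article about the sampled player, 0 any other article)...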

In [4]:
# Reload the saved article sample and the hand-labeled file. dtype='object'
# reads every column (ids, page ids, labels) as strings, so the merge keys of
# both frames have the same type.
# NOTE(review): the articles CSV is written elsewhere in this notebook with
# to_csv's default encoding but decoded here as ISO-8859-1 — confirm the file's
# actual encoding, otherwise non-ASCII characters in the text may be mis-decoded.
article_df = pd.read_csv('../data/processed/sample_articles.csv', encoding="ISO-8859-1", dtype='object')
article_label_df = pd.read_csv('../data/processed/sample_articles_label.csv', dtype='object')

In [97]:
article_label_df.head()

Unnamed: 0,id,pageid,label
0,andrest01,55909,0
1,andrest01,99602,0
2,andrest01,116749,0
3,andrest01,7946185,0
4,andrest01,12839983,1


In [5]:
# Attach the manual label to each article's text. merge() performs an inner join
# by default, so articles without a matching (id, pageid) row in the label file
# are dropped. Only the model input ('text') and target ('label') are kept.
labeled_set = article_df.merge(article_label_df, on=['id','pageid'])[['text','label']]

In [6]:
# The same article can be returned for several players; keep each
# (text, label) pair once. Reassigning instead of using inplace=True avoids
# mutating a column-sliced frame in place and keeps the cell safe to re-run.
labeled_set = labeled_set.drop_duplicates()

In [7]:
labeled_set.shape

(491, 2)

In [116]:
labeled_set.head()

Unnamed: 0,text,label
0,{{about|the band|the band's eponymous album|3 ...,0
1,{{Infobox NFL biography\n|name=Graham Harrell\...,0
2,{{Infobox baseball biography\n|name=Ray Harrel...,1
3,{{for|the type of radio station|border blaster...,0
4,{{Use mdy dates|date=November 2017}}\n{{Infobo...,0


In [8]:
# English stop words and a Snowball stemmer, created once and shared by every
# process_text call.
stopwords = set(corpus.stopwords.words('english'))
stemmer = stem.snowball.EnglishStemmer()


def process_text(text):
    """Turn raw article text into a list of stemmed content tokens.

    Steps: lowercase, replace every non-word character with a space, tokenize,
    keep purely alphabetic tokens that are not English stop words, then stem.

    Stop words are removed *before* stemming because the stop-word list holds
    unstemmed words: filtering afterwards lets stop words whose stem differs
    ("because" -> "becaus", "very" -> "veri") slip through and drops content
    words whose stem happens to equal a stop word ("owned" -> "own").

    Parameters
    ----------
    text : str
        Raw text. Any non-string value (e.g. NaN for a missing article body)
        yields an empty token list instead of raising.

    Returns
    -------
    list of str
        Stemmed tokens in their original order.
    """
    if not isinstance(text, str):
        return []

    cleaned = re.sub(r'\W', ' ', text.lower())
    return [
        stemmer.stem(token)
        for token in tokenize.word_tokenize(cleaned)
        if token.isalpha() and token not in stopwords
    ]

In [199]:
def train_and_score(data, vectorizer=None, random_state=None):
    """Fit a Naive Bayes text classifier on one random split and return its ROC AUC.

    Parameters
    ----------
    data : DataFrame
        Must contain a 'text' column (raw article text) and a 'label' column
        whose values can be cast to int (0 / 1).
    vectorizer : object, optional
        Unfitted text vectorizer offering fit_transform / transform, e.g. a
        CountVectorizer or a differently configured TfidfVectorizer. It is
        fitted on the training fold only. Defaults to the TF-IDF setup used
        for the final model (process_text tokenizer, max_df=0.95, min_df=0.01).
    random_state : int, optional
        Seed for the train/test split; None keeps the split random.

    Returns
    -------
    float
        ROC AUC of the predicted probability of label 1 on the held-out fold.
    """
    # Cast once so that training and scoring use the same label type
    # (the labels are read from CSV as strings).
    labels = data['label'].astype('int')

    # Stratify so that both folds contain both classes; ROC AUC is undefined
    # on a test fold with a single class.
    X_train, X_test, y_train, y_test = train_test_split(
        data['text'], labels, stratify=labels, random_state=random_state
    )

    if vectorizer is None:
        vectorizer = TfidfVectorizer(tokenizer=process_text, max_df=0.95, min_df=0.01)
    train_matrix = vectorizer.fit_transform(X_train)

    nb_model = MultinomialNB().fit(train_matrix, y_train)

    # classes_ is sorted, so with 0/1 labels column 1 is the probability of label 1.
    pred_values = nb_model.predict_proba(vectorizer.transform(X_test))[:, 1]

    return roc_auc_score(y_test, pred_values)

In [200]:
# Evaluate on 10 separate random train/test splits; the cells below average the
# resulting ROC AUC values to smooth out split-to-split variation.
score = [train_and_score(labeled_set) for n in range(10)]

In [191]:
sum(score)/len(score) #tfidf no min/max df

0.94817975956915956

In [194]:
sum(score)/len(score) #tfidf max_df=0.95 min_df=0.01

0.97230283014643815

In [201]:
sum(score)/len(score) #tf max_df=0.95 min_df=0.01

0.94395389778094008

In [9]:
def train(data):
    """Fit the final classifier on every row of ``data`` (no hold-out split).

    ``data`` needs a 'text' column with the raw article text and a 'label'
    column that can be cast to int.

    Returns a ``(model, vectorizer)`` tuple: the fitted MultinomialNB and the
    fitted TfidfVectorizer that has to be applied to new text before predicting.
    """
    documents = data['text']
    targets = data['label'].astype('int')

    vectorizer = TfidfVectorizer(tokenizer=process_text, max_df=0.95, min_df=0.01)
    features = vectorizer.fit_transform(documents)

    classifier = MultinomialNB()
    classifier.fit(features, targets)

    return classifier, vectorizer

In [11]:
# Fit the final model and vectorizer on the complete labeled set.
nb, tfidf = train(labeled_set)

In [12]:
# Serialize the fitted Naive Bayes classifier for later use.
with open('../models/baseball_clf_nb_model.p', 'wb') as model_file:
    pickle.dump(nb, model_file)

In [13]:
# Serialize the fitted TF-IDF vectorizer next to the model.
# The vectorizer was built with tokenizer=process_text and pickle stores
# functions by reference, so process_text must be available under the same
# name wherever this file is loaded.
with open('../models/baseball_tfidf_vect.p', 'wb') as vectorizer_file:
    pickle.dump(tfidf, vectorizer_file)