In [None]:
import requests
import pandas as pd
import numpy as np
import json
import time
import seaborn as sns
import textdistance
import config #personal file containing SQL database information
import mysql.connector
import random
from IPython.display import Image  
import pydotplus
from copy import deepcopy
import itertools
from imblearn.over_sampling import SMOTE
import matplotlib.pyplot as plt
from bs4 import BeautifulSoup as BS
%matplotlib inline


import selenium
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

from sklearn.tree import export_graphviz
from sklearn.tree import DecisionTreeClassifier
try:
    from sklearn.externals.six import StringIO
except ImportError:  # newer scikit-learn releases no longer ship `externals.six`
    from io import StringIO
from sklearn.metrics import roc_curve,auc
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV, cross_val_score 
from sklearn.metrics import classification_report,confusion_matrix
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.svm import SVC

In [None]:
%%capture
from tqdm import tqdm_notebook as tqdm
tqdm().pandas()

In [None]:
#lets us see all columns in dataframes (None removes pandas' column limit when a frame is displayed)
pd.set_option('display.max_columns', None)

In [None]:
#Open the chrome window with selenium
#draft_results() below reads this module-level `driver` instead of taking it as an argument
driver = webdriver.Chrome()

In [None]:
#set the years of player data we want to scrape
#range(2000,2020) covers the 2000 through 2019 drafts (the upper bound is exclusive)
years = list(range(2000,2020))

In [None]:
def draft_results(years):
    '''
    Grab NFL draft data for the range of years specified.

    Loads the nfl.com draft-history page of every season with the module-level
    selenium ``driver`` and parses the text of the draft table.

    Parameters
    ----------
    years : iterable of int
        Seasons to scrape.

    Returns
    -------
    list of dict
        One dict per pick with the keys 'pick', 'team', 'name', 'position',
        'college', 'round' and 'year'.
    '''
    #first word of the team locations that are written as two words
    two_word_locations = ['San','New','Green','Tampa','Los','Kansas','St.']
    drafts = []

    for year in tqdm(years):
        #short random pause so pages are not requested back to back
        time.sleep(random.choice([x/10 for x in range(3,8)]))
        driver.get(f'http://www.nfl.com/draft/history/fulldraft?season={year}&round=round1#round1')

        #find_element(By.CLASS_NAME, ...) replaces find_element_by_class_name,
        #which was removed in Selenium 4
        table = driver.find_element(By.CLASS_NAME, "draft-history-table")

        #drop the rows that dont contain player data
        lines = [line for line in table.text.split('\n') if '#' not in line]

        #Every 'ROUND' header after the first marks where a later round begins.
        #A header found at list index idx, being the k-th header after the first
        #(k counted from 0), is preceded by k+1 header rows, so the pick that
        #follows it is pick number idx - k (picks are assumed to be numbered
        #consecutively from 1, as the original fixed-size version also assumed).
        header_rows = [idx for idx, line in enumerate(lines) if 'ROUND' in line]
        round_starts = [idx - k for k, idx in enumerate(header_rows[1:])]

        #turn the player strings into lists
        rows = [line.split() for line in lines if 'ROUND' not in line]

        for row in rows:
            #if the team location contains 2 words, drop the first, then drop
            #the (remaining) location word of every team
            if row[1] in two_word_locations:
                row.pop(1)
            row.pop(1)

            #if the college name is multiple words, combine them into one field
            #(the layout is pick, team, first name, last name, position, college...)
            if len(row) > 6:
                row[5:] = [' '.join(row[5:])]

            pick = int(row[0])
            drafts.append({
                'pick': pick,
                'team': row[1],
                'name': row[2] + ' ' + row[3],
                'position': row[4],
                'college': row[5],
                #round = 1 + number of later rounds whose first pick is <= this pick
                'round': 1 + sum(pick >= first_pick for first_pick in round_starts),
                'year': year,
            })
    return drafts

In [None]:
#scrape every season in `years`; draft_results sleeps before each page load, so this takes a while
drafts = draft_results(years)

In [None]:
#we only want the offensive positions, so we create a subset of our data:
#keep the picks whose 'position' is one of these four codes
off = ['WR','RB','QB','TE']
off_draft = [x for x in drafts if x['position'] in off]

In [None]:
#`df` is the offensive draft-pick table; the college stats are merged with it further down
df = pd.DataFrame(off_draft)
#to_csv is called without index=False, so the row index is written as an extra unnamed column
df.to_csv('players.csv')

### Now grab each player's college statistics

In [None]:
#opens another Chrome window and rebinds `driver` (the earlier window is not quit first);
#college_stats() below reads this module-level `driver`
driver = webdriver.Chrome()

In [None]:
def player_urls(years):
    '''
    Collect the href of every link matched by the '.right a' selector on the
    pro-football-reference draft page of each season in `years`.

    Returns the hrefs as one flat list, in page order, season after season.
    '''
    urls = []
    for season in tqdm(years):
        #random pause between requests
        time.sleep(random.choice([tenths/10 for tenths in range(4,9)]))
        page = requests.get(f'https://www.pro-football-reference.com/years/{season}/draft.htm')
        parsed = BS(page.content, 'html.parser')
        urls.extend(anchor['href'] for anchor in parsed.select('.right a'))
    return urls

In [None]:
#collect the player page links for every season (the function is named player_urls)
urls = player_urls(years)

In [None]:
def college_stats(urls):
    '''
    Visit every player page with selenium and collect the raw page text.

    Uses the module-level ``driver``. For each page the 'info' box is read and
    then the stat tables that match the player's position.

    Parameters
    ----------
    urls : iterable of str
        Player page URLs.

    Returns
    -------
    list
        One entry per successfully scraped player, shaped
        ``[[info_lines, table_lines, ...]]`` so that ``entry[0][0]`` is the
        list of info-box lines and ``entry[0][1]`` the first stat table.
        Punters, kickers and pages that fail to load or parse are skipped.
    '''
    #element ids of the tables to read for each offensive position;
    #any other position (except P and K) falls back to the 'defense' table
    tables_by_position = {
        'QB': ['passing', 'rushing'],
        'WR': ['receiving'],
        'TE': ['receiving'],
        'RB': ['rushing'],
    }

    stats = []
    for url in tqdm(urls):
        try:
            time.sleep(random.choice([x/10 for x in range(4,11)]))
            driver.get(url)
            #find_element(By.ID, ...) replaces find_element_by_id (removed in Selenium 4)
            player = driver.find_element(By.ID, 'info').text.split('\n')
            position = [x for x in player if 'Position' in x][0].split()[1]

            #punters and kickers are not collected
            if position in ('P', 'K'):
                continue

            table_ids = tables_by_position.get(position, ['defense'])
            tables = [driver.find_element(By.ID, table_id).text.split('\n')
                      for table_id in table_ids]
        except Exception:
            #best effort: a page that is missing the expected elements is skipped.
            #Exception (not a bare except) so Ctrl-C still stops the scrape.
            continue

        stats.append([[player] + tables])
    return stats

In [None]:
#visit every collected url with selenium; slow, since college_stats sleeps before each page
stats = college_stats(urls)

In [None]:
#split all the players by their positions, we will only work with wr and rb but all are available
wr, qb, rb, de, te = [], [], [], [], []

#checked in this order; the first tag contained in the position string wins,
#anything that matches none of them goes to `de`
buckets = [('QB', qb), ('RB', rb), ('WR', wr), ('TE', te)]

for player in tqdm(stats):
    position = [x for x in player[0][0] if 'Position' in x]
    position = position[0].split()[1]
    for tag, bucket in buckets:
        if tag in position:
            bucket.append(player)
            break
    else:
        de.append(player)

In [None]:
def clean_wr(wr):
    '''
    Wide receiver parsing.

    Turns the raw text scraped for each wide receiver into one flat dict per
    player. Players whose last season row cannot be parsed are returned in a
    separate error list instead of being lost.

    Parameters
    ----------
    wr : list
        Scraped players; each entry is indexed as ``entry[0][0]`` (lines of
        the info box, the first line being the player name) and
        ``entry[0][1]`` (lines of the stats table).

    Returns
    -------
    tuple (wr_stats, errors)
        ``wr_stats`` is a list of dicts holding name/school/position/height/
        weight/class plus one key per stat column; ``errors`` is a list of the
        partially filled dicts of players that could not be parsed.
    '''
    wr_stats = []
    errors = []
    wr_col = None  #stat column names, built once from the first player's table header

    #wr player info parsing
    for player in tqdm(wr):
        info = player[0][0]
        d = {'name': info[0]} if info else {}
        for row in info:
            if 'School' in row:
                d['school'] = row.split(':')[1].strip()
            if 'Position' in row:
                d['position'] = row.split()[1]
            if 'lb' in row:
                d['height'] = row.split(' ')[0][:-1]
                d['weight'] = row.split(' ')[1]

        #only create a row if the position is only WR
        if d.get('position') != 'WR':
            continue

        if wr_col is None:
            #column names are whatever follows 'Class' in the header row; the
            #repeated names at positions 3-5 and 6-9 get a suffix to tell them
            #apart, and the last four columns are not used
            wr_col = wr[0][0][1][1].split('Class')[1].split()
            for i in range(3,6):
                wr_col[i] +='_rec'
            for i in range(6,10):
                wr_col[i] +='_rush'
            wr_col = wr_col[:-4]

        #season rows start with the year (optionally flagged with '*'); use the last one
        seasons = [x for x in player[0][1] if x.startswith(('20', '*', '199'))]

        #everything after the class label is the stat line; a row without one of
        #these labels is treated as unparseable rather than stored character by character
        fields = None
        if seasons:
            for label in ('SO', 'JR', 'SR'):
                marker = f' {label} '
                if marker in seasons[-1]:
                    fields = seasons[-1].split(marker)[1].split()
                    d['class'] = label
                    break

        if fields is None or len(fields) < len(wr_col):
            errors.append(d)
            continue

        d.update(zip(wr_col, fields))
        wr_stats.append(d)
    return wr_stats, errors

In [None]:
#parse the WR pages; `errors` collects the players clean_wr could not map onto the stat columns
wr_stats, errors = clean_wr(wr)
wr_df = pd.DataFrame(wr_stats)

In [None]:
#drop 1 player that was scraped incorrectly by script and clean weight data
#(wr_df is rebuilt from wr_stats here, repeating the assignment of the previous cell)
wr_df = pd.DataFrame(wr_stats)
#keep only the first three characters of the weight text
#(assumes a three-digit number followed by 'lb' - TODO confirm; a missing value becomes the string 'nan')
wr_df['weight'] = wr_df['weight'].apply(lambda x: str(x)[:3])
wr_df = wr_df[wr_df['name'] != 'Keary Colbert']

In [None]:
#clean the height variable and convert it to inches
#str() turns a missing value into 'nan'; the x[0] != 'n' test maps those back to NaN,
#every other value is split on '-' into feet and inches
#NOTE(review): not re-runnable - once converted, the values no longer contain '-' and the split fails
wr_df.height = wr_df['height'].apply(lambda x: str(x))
wr_df.height = wr_df.height.apply(lambda x: int(x.split('-')[0])*12 + int(x.split('-')[1]) if x[0] != 'n' else np.nan)

In [None]:
#convert remaining strings to numbers; errors='coerce' turns anything unparseable into NaN

cols = ['Att_rush', 'Avg_rec', 'Avg_rush', 'G', 'Rec', 'TD_rec',
       'TD_rush', 'Yds_rec', 'Yds_rush', 'height','weight']

#axis=1 applies pd.to_numeric one row at a time
wr_df[cols] = wr_df[cols].apply(pd.to_numeric, errors='coerce', axis=1)

In [None]:
#split missing data for later scraping:
#rows with a numeric Att_rush are kept, rows where it is NaN are set aside
full_wr = wr_df[~wr_df.Att_rush.isna()]
missing_wr = wr_df[wr_df.Att_rush.isna()]

In [None]:
#merge together the previous dataframe with draft data and this player stat dataframe
#NOTE(review): the only join key is the player name, so two different players sharing a
#name would be matched to each other - confirm this is acceptable
final_wr = pd.merge(full_wr,df,on='name')

#### We are missing some data for a few players, so we need to fill in those NaNs from another online source

In [None]:
#names of the merged WRs that still have no height; this rebinds `missing_wr`,
#replacing the DataFrame assigned earlier with a plain list of names
missing_wr = list(final_wr[final_wr.height.isna()].name)

In [None]:
def missing_ht_wt(missing_names):
    '''
    Get the missing player height and weight data.

    Opens its own Chrome window, types every name into the search box of
    pro-football-reference.com and reads the height and weight text from the
    player page that comes up. The window is closed again before returning.

    Parameters
    ----------
    missing_names : iterable of str
        Player names to look up.

    Returns
    -------
    list of dict
        One ``{'name', 'weight', 'height'}`` dict (raw page text) for every
        player that could be found; players that could not be are left out.
    '''
    #a known player page, used as the starting point and to recover after a failed lookup
    start_page = 'https://www.pro-football-reference.com/players/I/IgleJu00.htm'
    search_box = '//*[@id="header"]/div[3]/form/div/div/input[2]'
    first_result = '//*[@id="players"]/div[1]/div[1]/a'

    def pause(low, high):
        #sleep a random number of tenths of a second in [low, high)
        time.sleep(random.choice([x/10 for x in range(low, high)]))

    filled_missing = []
    driver = webdriver.Chrome()
    try:
        driver.get(start_page)
        for name in tqdm(missing_names):
            pause(4, 9)
            try:
                #find_element(By.XPATH, ...) replaces find_element_by_xpath (removed in Selenium 4)
                search_bar = driver.find_element(By.XPATH, search_box)
                search_bar.send_keys(name)
                search_bar.send_keys(Keys.ENTER)

                #landing on a results page means the name was ambiguous: open the first hit
                if 'search' in driver.current_url:
                    link = driver.find_element(By.XPATH, first_result).get_attribute('href')
                    if link:
                        driver.get(link)

                height = driver.find_element(By.XPATH, '//*[@id="meta"]/div/p[3]/span[1]').text
                weight = driver.find_elements(By.XPATH, '//*[@id="meta"]/div/p[3]/span[2]')[0].text
            except Exception:
                #lookup failed: go back to the start page and skip this player, so
                #the start page's own height/weight is never stored under this name
                pause(2, 6)
                driver.get(start_page)
                continue

            filled_missing.append({'name': name, 'weight': weight, 'height': height})
    finally:
        #always close the browser this function opened
        driver.quit()
    return filled_missing

In [None]:
#look the names up online; rebinds `missing_wr` to the list of dicts returned by missing_ht_wt
missing_wr = missing_ht_wt(missing_wr)

In [None]:
def clean_ht_wt(missing):
    '''
    Clean weight and convert height to inches.

    Works in place on the dicts in `missing` and also returns the list.
    A weight string containing 'lb' becomes the integer in front of 'lb';
    a 'feet-inches' height string becomes total inches. Values that are
    already numbers (e.g. on a second run) or that have neither marker are
    left untouched.

    Parameters
    ----------
    missing : list of dict
        Dicts with 'weight' and 'height' entries.

    Returns
    -------
    list of dict
        The same list object, modified in place.
    '''
    for player in missing:
        weight = player['weight']
        if isinstance(weight, str) and 'lb' in weight:
            #take whatever precedes 'lb' instead of a fixed three characters
            player['weight'] = int(weight.split('lb')[0].strip())

        height = player['height']
        if isinstance(height, str) and '-' in height:
            feet, inches = height.split('-')[:2]
            player['height'] = int(feet)*12 + int(inches)
    return missing

In [None]:
#convert the scraped weight/height text to numbers (clean_ht_wt edits the dicts in place and returns the list)
missing_wr = clean_ht_wt(missing_wr)

In [None]:
def fillin_ht_wt(missing,dframe):
    '''
    Write looked-up heights and weights into `dframe`, in place.

    Only entries whose 'weight' is exactly an int are used; for those, every
    row of `dframe` with the same 'name' gets the entry's height and weight.
    Nothing is returned.
    '''
    for entry in tqdm(missing):
        if type(entry['weight']) != int:
            continue
        same_name = dframe['name'] == entry['name']
        dframe.loc[same_name, 'height'] = int(entry['height'])
        dframe.loc[same_name, 'weight'] = int(entry['weight'])

In [None]:
#write the looked-up values into final_wr (modified in place; the function returns nothing)
fillin_ht_wt(missing_wr, final_wr)

In [None]:
#drop every row that still has a missing value in any column (changes final_wr in place)
final_wr.dropna(inplace=True)

In [None]:
#save the cleaned WR table; the index is written too, as an unnamed first column
final_wr.to_csv('final_wr.csv')

#### Running Back Statistics Parsing

In [None]:
def clean_rb(rb):
    '''
    Running Back parsing.

    Turns the raw text scraped for each running back into one flat dict per
    player. Players whose last season row cannot be parsed are returned in a
    separate error list instead of being lost.

    Parameters
    ----------
    rb : list
        Scraped players; each entry is indexed as ``entry[0][0]`` (lines of
        the info box, the first line being the player name) and
        ``entry[0][1]`` (lines of the stats table).

    Returns
    -------
    tuple (rb_stats, errors)
        ``rb_stats`` is a list of dicts holding name/school/position/height/
        weight/class plus one key per stat column; ``errors`` is a list of the
        partially filled dicts of players that could not be parsed.
    '''
    rb_stats = []
    errors = []
    rb_col = None  #stat column names, built once from the first player's table header

    #rb player info parsing
    for player in tqdm(rb):
        info = player[0][0]
        d = {'name': info[0]} if info else {}
        for row in info:
            if 'School' in row:
                d['school'] = row.split(':')[1].strip()
            if 'Position' in row:
                d['position'] = row.split()[1]
            if 'lb' in row:
                d['height'] = row.split(' ')[0][:-1]
                d['weight'] = row.split(' ')[1]

        #only create a row if the position is only RB
        if d.get('position') != 'RB':
            continue

        if rb_col is None:
            #column names are whatever follows 'Class' in the header row; the
            #repeated names at positions 3-5 and 6-9 get a suffix to tell them
            #apart, and the last four columns are not used.
            #NOTE(review): the suffixes are the same as in the WR parser
            #(positions 3-5 '_rec', 6-9 '_rush'). The later numeric-conversion
            #cell expects 'Att' and 'Rec_rush', which suggests the RB table lists
            #rushing first, i.e. the labels may be swapped here - confirm before
            #relying on the column names.
            rb_col = rb[0][0][1][1].split('Class')[1].split()
            for i in range(3,6):
                rb_col[i] +='_rec'
            for i in range(6,10):
                rb_col[i] +='_rush'
            rb_col = rb_col[:-4]

        #season rows start with the year (optionally flagged with '*'); use the last one
        seasons = [x for x in player[0][1] if x.startswith(('20', '*', '199'))]

        #everything after the class label is the stat line; a row without one of
        #these labels is treated as unparseable rather than stored character by character
        fields = None
        if seasons:
            for label in ('SO', 'JR', 'SR'):
                marker = f' {label} '
                if marker in seasons[-1]:
                    fields = seasons[-1].split(marker)[1].split()
                    d['class'] = label
                    break

        if fields is None or len(fields) < len(rb_col):
            errors.append(d)
            continue

        d.update(zip(rb_col, fields))
        rb_stats.append(d)
    return rb_stats, errors

In [None]:
#parse the RB pages; this rebinds `errors`, replacing the list returned for the wide receivers
rb_stats, errors = clean_rb(rb)

In [None]:
#convert to dataframe then search for missing data and clean
rb_df = pd.DataFrame(rb_stats)
#names of the running backs whose height is missing
missing_rb = list(rb_df[rb_df['height'].isna()]['name'])

#scrape, clean and fill the missing rb data (same helpers as used for the wide receivers)
filled_rb = missing_ht_wt(missing_rb)
filled_rb = clean_ht_wt(filled_rb)

In [None]:
#fill in the missing values in rb_df (modified in place; the function returns nothing)
fillin_ht_wt(filled_rb,rb_df)

In [None]:
#merge with draft data
#NOTE(review): joined on the player name only, so players who share a name are cross-matched - confirm
final_rb = pd.merge(rb_df,df,on='name')

In [None]:
#clean height column
#presumably one mis-scraped row whose height field holds the text 'Kolb' - TODO confirm
final_rb = final_rb[final_rb['height'] !='Kolb']
#str() turns a missing height into the string 'nan'
final_rb.height = final_rb['height'].apply(lambda x: str(x))
#'feet-inches' strings become total inches; values without a '-' pass through unchanged
final_rb.height = final_rb.height.apply(lambda x: int(x.split('-')[0])*12 + int(x.split('-')[1]) if '-' in x else x)
#drop the rows whose height was missing
final_rb = final_rb[final_rb['height'] != 'nan']

In [None]:
#convert to integers 
final_rb['height'] = final_rb['height'].apply(lambda x: int(x))
#weights still stored as text keep only their first three characters; numeric weights are left as they are
final_rb['weight'] = final_rb['weight'].apply(lambda x: int(x[:3]) if type(x) == str  else x)

In [None]:
#convert continuous columns to numbers; errors='coerce' turns anything unparseable into NaN
cols = ['Att', 'Avg_rec', 'Avg_rush', 'G', 'Rec_rush', 'TD_rec',
       'TD_rush', 'height','weight']

#axis=1 applies pd.to_numeric one row at a time
final_rb[cols] = final_rb[cols].apply(pd.to_numeric, errors='coerce', axis=1)

In [None]:
#save the cleaned RB table; the index is written too, as an unnamed first column
final_rb.to_csv('final_rb.csv')

# Add in combine data

In [None]:
#load in the nfl combine csv
#NOTE(review): this rebinds `df`, which until now held the draft picks - re-running an earlier
#merge cell after this one would join against the combine data instead
df = pd.read_csv('combine_data_since_2000_PROCESSED_2018-04-26.csv')
#rename all columns by position (the csv must have exactly these 16 columns)
df.columns = ['name', 'Pos', 'Ht', 'Wt', 'Forty', 'Vertical', 'BenchReps',
       'BroadJump', 'Cone', 'Shuttle', 'Year', 'Pfr_ID', 'AV', 'Team', 'Round',
       'Pick']

In [None]:
#rest of the columns are mainly Null, so only keep the following ('name' is the merge key)
df = df.loc[:,['name','Forty','Vertical','BroadJump']]

In [None]:
#merge together the combine and game data
#this rebinds `rb` and `wr`: they were lists of scraped players, from here on they are DataFrames
rb = pd.merge(final_rb,df,on='name')
wr = pd.merge(final_wr,df,on='name')

In [None]:
#drop duplicate and multicollinear columns
#errors='ignore' because not every listed column is present in every run:
#'Pos' was already removed from the combine subset before the merge, and
#'Unnamed: 0' only appears when a frame was reloaded from csv. Returning a new
#frame instead of dropping in place also keeps the cell re-runnable.
redundant_cols = ['Unnamed: 0','Pos','Yds_rec','Yds_rush','position_x']
rb = rb.drop(columns=redundant_cols, errors='ignore')
wr = wr.drop(columns=redundant_cols, errors='ignore')

In [None]:
#stack the wr and rb tables, then remove the combine columns that are mostly
#null together with the identifying columns that are not used as features
final_players = pd.concat([wr,rb],axis=0).drop(
    ['Vertical','BroadJump','name','pick','team'],axis=1)

##### Now to acquire each player's NCAA conference

In [None]:
#scrape the colleges that correspond to each NCAA conference
driver = webdriver.Chrome()
driver.get('https://www.espn.com/college-football/teams')

#find_elements(By.CLASS_NAME, ...) replaces find_elements_by_class_name, which
#was removed in Selenium 4

#conference headings; the first 'headline' element is skipped
divisions = [heading.text
             for heading in driver.find_elements(By.CLASS_NAME, 'headline')[1:]]

#one list of team names per 'mt7' block
teams = [[team.text for team in block.find_elements(By.CLASS_NAME, 'h5')]
         for block in driver.find_elements(By.CLASS_NAME, 'mt7')]

#pair headings and blocks in page order
team_div_dict = dict(zip(divisions,teams))

#school -> conference lookup; the last word of each scraped team name is
#dropped to form the key
d = {}
for conference, members in team_div_dict.items():
    for team in members:
        d[' '.join(team.split()[:-1])] = conference

In [None]:
def clean_location_helper(strng):
    '''
    Return the scraped school name closest to `strng`.

    Every key of the module-level school -> conference dict ``d`` is compared
    with `strng` by normalized Jaro-Winkler distance and the key with the
    smallest distance is returned (the first such key on ties). The comparison
    is case-insensitive, because callers pass lower-cased college names while
    the scraped keys keep their original capitalisation; the returned key is
    unchanged, so it can still be looked up in ``d``.
    '''
    target = strng.lower()
    return min(
        d,
        key=lambda school: textdistance.jaro_winkler.normalized_distance(school.lower(), target),
    )


def clean_location():
    '''
    Replace every value of final_players['college'] with the closest scraped
    school name.

    The distinct college names are lower-cased and matched once each through
    clean_location_helper; the lower-cased column is then mapped through that
    lookup. Works on the module-level `final_players` frame and returns nothing.
    '''
    distinct_lower = [college.lower() for college in final_players['college'].unique()]
    nearest_school = {college: clean_location_helper(college) for college in distinct_lower}
    final_players.college = final_players.college.str.lower().map(nearest_school)

In [None]:
#clean the schools and then map each school to a division
clean_location()
#after clean_location() every college is a key of `d`, so d[x] gives its conference;
#the 'college' column holds conference names from here on
final_players['college'] = final_players['college'].apply(lambda x: d[x])

In [None]:
#bin the year column and drop unnecessary columns and others due to multicollinearity
#NOTE(review): pd.cut intervals are open on the left by default, so year 2000 falls outside the first bin - confirm intended
final_players.loc[:,'years'] = pd.cut(final_players.year,[2000,2006,2011,2016,2020])
#NOTE(review): 'years' is dropped again right here, while the SQL table below has a 'years' column - confirm which columns should survive
final_players.drop(['school','position','Rec','G','college','years'],axis=1,inplace=True)

In [None]:
#write the final dataframe to csv (the index is written too, as an unnamed first column)
final_players.to_csv('final_players.csv')

In [None]:
#connect to SQL Database and create a table
#connection details come from the personal `config` module
DB_NAME = 'NFL_DRAFT'
cnx = mysql.connector.connect(
    host = config.host,
    user = config.user,
    passwd = config.passwd,
    database = DB_NAME)

try:
    cursor = cnx.cursor()

    #create table; IF NOT EXISTS keeps the cell re-runnable once the table is there
    query = """ CREATE TABLE IF NOT EXISTS players (
            id INT NOT NULL AUTO_INCREMENT,
            Att_rush    FLOAT(6,3),
            Avg_rec     FLOAT(6,3),
            Avg_rush    FLOAT(6,3),
            G           FLOAT(6,3),
            Rec         FLOAT(6,3),
            TD_rec      FLOAT(6,3),
            TD_rush     FLOAT(6,3),
            class       VARCHAR(15),
            height      FLOAT(6,3),
            weight      FLOAT(6,3),
            college     VARCHAR(50),
            position    VARCHAR(5),
            round       INT(5),
            years       VARCHAR(25),
            Forty       FLOAT(6,3),
            PRIMARY KEY (id))"""
    cursor.execute(query)
    cnx.commit()
    cursor.close()
finally:
    #always release the connection, also when the statement fails
    cnx.close()

In [None]:
#one dict per row, for the insert below
#NOTE(review): in a top-to-bottom run `df` is the combine subset at this point
#(name, Forty, Vertical, BroadJump), which lacks the columns the insert cell reads -
#confirm which frame was meant
df_dict = df.to_dict('records')

In [None]:
#one tuple per row, in the column order of the INSERT statement below
tuple_lst = [
    (float(dic['Att_rush']), float(dic['Avg_rec']), float(dic['Avg_rush']), float(dic['G']),
     float(dic['Rec']), float(dic['TD_rec']), float(dic['TD_rush']), str(dic['class']),
     float(dic['height']), float(dic['weight']), str(dic['college']), str(dic['position']),
     int(dic['round']), str(dic['years']), float(dic['Forty']))
    for dic in df_dict
]

#setup SQL database connection and update database
cnx = mysql.connector.connect(
    host = config.host,
    user = config.user,
    passwd = config.passwd,
    database = DB_NAME
)
try:
    cursor = cnx.cursor()

    #create INSERT statement (values are passed as parameters, not formatted into the SQL)
    stmt = '''INSERT INTO players (Att_rush, Avg_rec, Avg_rush, G, Rec, TD_rec, TD_rush,
                            class, height, weight, college, position, round, years, Forty)
            VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)'''
    cursor.executemany(stmt, tuple_lst)

    #commit insert
    cnx.commit()
    cursor.close()
finally:
    #always release the connection, also when the insert fails
    cnx.close()

# Modeling

In [None]:
#if you reset your notebook you can load it from csv here
#final_players = pd.read_csv('final_players.csv')

#or you can load it from SQL database here

# cnx = mysql.connector.connect(
#     host = config.host,
#     user = config.user,
#     passwd = config.passwd,
#     database = DB_NAME)

# cursor = cnx.cursor()

# cursor.execute('''SELECT * FROM players''')

# final_players = pd.DataFrame(cursor.fetchall())
# final_players.columns = [x[0].lower() for x in cursor.description]

In [None]:
def confusion_matrix_graph(cnf_matrix, class_names=None):
    '''
    Graphs the confusion matrix in a cleaner format.

    Parameters
    ----------
    cnf_matrix : 2-D array
        Confusion matrix, rows = true labels, columns = predicted labels.
    class_names : sequence, optional
        Tick labels, one per class, in the row/column order of the matrix.
        When omitted, the sorted distinct values of the module-level target
        `y` are used; sorting gives the labels a fixed order instead of the
        arbitrary iteration order of a set.
    '''
    if class_names is None:
        class_names = sorted(set(y))

    plt.imshow(cnf_matrix,  cmap=plt.cm.Blues) #Create the basic matrix.

    #Add title and Axis Labels
    plt.title('Confusion Matrix')
    plt.ylabel('True label')
    plt.xlabel('Predicted label')

    #Add appropriate Axis Scales
    tick_marks = np.arange(len(class_names))
    plt.xticks(tick_marks, class_names, rotation=45)
    plt.yticks(tick_marks, class_names)

    #Add Labels to Each Cell
    thresh = cnf_matrix.max() / 2. #cells above half of the maximum get white text, the others black
    #Here we iterate through the confusion matrix and append labels to our visualization.
    for i, j in itertools.product(range(cnf_matrix.shape[0]), range(cnf_matrix.shape[1])):
        plt.text(j, i, cnf_matrix[i, j],
                 horizontalalignment="center",
                 color="white" if cnf_matrix[i, j] > thresh else "black")

    #Add a Side Bar Legend Showing Colors
    plt.colorbar()

In [None]:
#look at distribution of rounds each position was drafted in
#x is passed by keyword: current seaborn no longer accepts it positionally
#NOTE(review): in a top-to-bottom run `df` is not the player table here - confirm it has been reloaded
sns.countplot(x='round',data=df,hue='position')
plt.title('Round Drafted by Position');

In [None]:
#look at the scatter plots of all continuous variables against the draft round
plt.style.use('ggplot')
cols = ['Att_rush', 'Avg_rec', 'Avg_rush', 'Rec', 'TD_rec', 'TD_rush','height','weight','Forty']
for col in cols:
    #x and y are passed by keyword: current seaborn no longer accepts them positionally
    sns.lmplot(x=col, y='round',data=df,hue='position')
    plt.show()

In [None]:
#create dummy variables
#drop_first=True leaves out the first level of every encoded column
#NOTE(review): `df` is overwritten with its encoded version, so the un-encoded frame is gone after this cell
df = pd.get_dummies(df,drop_first=True)

In [None]:
#features: everything except the draft round
X = df.drop(['round'],axis=1)
#target: 1 for players drafted in round 1 or 2, 0 for everyone else
y = df['round'].apply(lambda x: 1 if x < 3 else 0)

#hold out 20% of the rows for testing; random_state fixes the split
x_train, x_test,y_train, y_test = train_test_split(X,y,test_size=.2,random_state=2)

In [None]:
#use smote to balance the class sizes (training data only)
#random_state makes the synthetic samples reproducible; fit_resample is the
#current name of the method formerly called fit_sample
smt = SMOTE(random_state=2)
x_train, y_train = smt.fit_resample(x_train, y_train)

###### Random Forest

In [None]:
#baseline random forest: 100 trees, fixed seed, fitted on the SMOTE-balanced training data
clf = RandomForestClassifier(n_estimators=100,random_state=2)
clf.fit(x_train,y_train)

In [None]:
#predict on the held-out rows and show the classification report and confusion matrix
y_pred = clf.predict(x_test)
print(classification_report(y_test,y_pred))
confusion_matrix_graph(confusion_matrix(y_test,y_pred))

In [None]:
#area under ROC curve
#scored on the predicted probability of class 1: hard 0/1 predictions give the
#curve a single threshold and understate what the ranking can do
fpr,tpr,thresh = roc_curve(y_test,clf.predict_proba(x_test)[:, 1])
auc(fpr,tpr)

In [None]:
#baseline before the grid search: mean 4-fold cross-validated score of a default
#100-tree forest on the full X, y (no SMOTE, no fixed seed)
val = cross_val_score(RandomForestClassifier(n_estimators=100),X,y,cv=4)
val.mean()

In [None]:
#hyper-parameter values tried by the grid search below (every combination is fitted)
rf_param_grid = {
    "criterion": ["gini", "entropy"],
    "max_depth": [None, 2, 3, 4, 5, 6, 8, 10, 12, 14, 16],
    "max_features": [None,4,5,6,9,10],
    "min_samples_split": [2, 5, 10],
    "min_samples_leaf" : [1, 2, 3, 5, 6],
    "n_estimators" : [10, 30, 100]
}


In [None]:
#run the grid search and print out the best parameters to use
start = time.time()
rf_grid_search = GridSearchCV(clf,rf_param_grid,cv=3,verbose=1)
#search on the training data only, so the rows held out in x_test never
#influence the choice of hyper-parameters
rf_grid_search.fit(x_train, y_train)


#best_score_ is the mean cross-validated score of the best combination, not a test-set score
print("Mean Cross-Validated Accuracy: {:.4}%".format(rf_grid_search.best_score_ * 100))
print("Total Runtime for Grid Search on Random Forest Classifier: {:.4} seconds".format(time.time() - start))
print("")
print("Optimal Parameters: {}".format(rf_grid_search.best_params_))

In [None]:
#then fit according to the output
#the values are typed in by hand rather than read from rf_grid_search.best_params_
#NOTE(review): n_estimators=1000 is not one of the values in rf_param_grid - confirm it is intended
clf = RandomForestClassifier(criterion='gini',max_depth=16,max_features=6,min_samples_leaf=6,min_samples_split=2,n_estimators=1000,random_state=2)
clf.fit(x_train,y_train)
y_pred = clf.predict(x_test)
print(classification_report(y_test,y_pred))
confusion_matrix_graph(confusion_matrix(y_test,y_pred))

In [None]:
#area under ROC curve of the tuned forest, scored on the predicted probability
#of class 1 rather than on hard 0/1 predictions
fpr,tpr,thresh = roc_curve(y_test,clf.predict_proba(x_test)[:, 1])
auc(fpr,tpr)

In [None]:
# graphs the ROC Curve of the tuned random forest
# column 1 of predict_proba is the predicted probability of the positive class
rf_probs = clf.predict_proba(x_test)[:, 1]

fpr, tpr, thresholds = roc_curve(y_test, rf_probs)

sns.set_style("darkgrid", {"axes.facecolor": ".9"})

print('AUC: {}'.format(auc(fpr, tpr)))
plt.figure(figsize=(10,8))
lw = 2
plt.plot(fpr, tpr, color='mediumorchid',
         lw=lw, label='ROC curve')
# dashed diagonal = a classifier that guesses at random
plt.plot([0, 1], [0, 1], color='lightskyblue', lw=lw, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
# ticks every 0.05 on both axes
plt.yticks([i/20.0 for i in range(21)])
plt.xticks([i/20.0 for i in range(21)])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic (ROC) Curve')
plt.legend(loc="lower right")
plt.show()

In [None]:
def plot_feature_importances(model, feature_names=None):
    '''
    Plots the fitted model's feature importances as a horizontal bar chart.

    Parameters
    ----------
    model : fitted estimator exposing ``feature_importances_``
    feature_names : sequence, optional
        One label per feature, in the column order the model was trained on.
        Defaults to the columns of the module-level feature frame `X`.

    The number of bars is taken from the model itself, so the plot stays
    consistent with the model even if the module-level training data has been
    rebuilt in the meantime.
    '''
    importances = model.feature_importances_
    if feature_names is None:
        feature_names = X.columns.values
    n_features = len(importances)
    plt.figure(figsize=(10,10))
    plt.barh(range(n_features), importances, align='center')
    plt.yticks(np.arange(n_features), feature_names)
    plt.xlabel("Feature importance")
    plt.ylabel("Feature")
plot_feature_importances(clf)

###### Decision Tree

In [None]:
#single decision tree with default settings; this rebinds `clf`, replacing the random forest
#NOTE(review): no random_state is set here, unlike the other models - results may vary between runs
clf = DecisionTreeClassifier()
clf.fit(x_train,y_train)

#evaluate on the held-out rows
y_pred = clf.predict(x_test)
print(classification_report(y_test,y_pred))
confusion_matrix_graph(confusion_matrix(y_test,y_pred))

In [None]:
#print out the decision tree
#with out_file=None export_graphviz returns the DOT source as a string, so no
#in-memory file object is needed
dot_data = export_graphviz(clf, out_file=None, filled=True, rounded=True, special_characters=True)
graph = pydotplus.graph_from_dot_data(dot_data)
Image(graph.create_png())

In [None]:
#acquire the area under ROC, scored on the tree's predicted probability of
#class 1 rather than on hard 0/1 predictions
fpr,tpr,thresh = roc_curve(y_test,clf.predict_proba(x_test)[:, 1])
auc(fpr,tpr)

###### Gradient Boosted Trees

In [None]:
#imported here rather than in the import cell at the top of the notebook
from sklearn.ensemble import GradientBoostingClassifier
#500 boosting stages with a small learning rate; fixed seed
gbc = GradientBoostingClassifier(n_estimators=500,learning_rate=0.05,random_state=2)
gbc.fit(x_train,y_train)
#evaluate on the held-out rows
y_pred = gbc.predict(x_test)
print(classification_report(y_test,y_pred))
confusion_matrix_graph(confusion_matrix(y_test,y_pred))

In [None]:
#area under ROC curve of the boosted model, scored on the predicted probability
#of class 1 rather than on hard 0/1 predictions
fpr,tpr,thresh = roc_curve(y_test,gbc.predict_proba(x_test)[:, 1])
auc(fpr,tpr)

##### Models that need to use Interaction

In [None]:
#create interaction variables to allow WR and RB to have different slopes
#only use this for logistic regression as RF already takes this into account
#`cols` is the list of continuous columns defined in the scatter-plot cell;
#'position_WR' is presumably the dummy column created by get_dummies - TODO confirm
df_interact = deepcopy(df)
for col in cols:
    df_interact[col + '* position_WR'] = df[col]*df['position_WR']

In [None]:
#same target and split as before, now on the frame with interaction columns.
#This rebinds X, y and the train/test variables used by the tree models above.
X = df_interact.drop(['round','position_WR'],axis=1)
#target: 1 for players drafted in round 1 or 2, 0 for everyone else
y = df_interact['round'].apply(lambda x: 1 if x < 3 else 0)

x_train, x_test,y_train, y_test = train_test_split(X,y,test_size=.2,random_state=2)

In [None]:
#use smote to balance the class sizes (training data only)
#random_state makes the synthetic samples reproducible; fit_resample is the
#current name of the method formerly called fit_sample
smt = SMOTE(random_state=2)
x_train, y_train = smt.fit_resample(x_train, y_train)

###### Logistic Regression

In [None]:
#smoted with interactions
#imported here rather than in the import cell at the top of the notebook
from sklearn.linear_model import LogisticRegression
#C is the inverse regularization strength, so C = 1e12 effectively switches regularization off
log_reg = LogisticRegression(C = 1e12, solver='liblinear')
log_reg.fit(x_train,y_train)
#evaluate on the held-out rows
y_pred = log_reg.predict(x_test)
print(classification_report(y_test,y_pred))
confusion_matrix_graph(confusion_matrix(y_test,y_pred))

In [None]:
#ROC AUC of the logistic regression, scored on the predicted probability of
#class 1 rather than on hard 0/1 predictions
fpr,tpr,thresh = roc_curve(y_test,log_reg.predict_proba(x_test)[:, 1])
auc(fpr,tpr)

In [None]:
# Graph ROC curve of the logistic regression
# (the variable is still called rf_probs, but it holds the logistic regression's
# predicted probability of the positive class)
rf_probs = log_reg.predict_proba(x_test)[:, 1]

fpr, tpr, thresholds = roc_curve(y_test, rf_probs)

sns.set_style("darkgrid", {"axes.facecolor": ".9"})

print('AUC: {}'.format(auc(fpr, tpr)))
plt.figure(figsize=(10,8))
lw = 2
plt.plot(fpr, tpr, color='mediumorchid',
         lw=lw, label='ROC curve')
# dashed diagonal = a classifier that guesses at random
plt.plot([0, 1], [0, 1], color='lightskyblue', lw=lw, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
# ticks every 0.05 on both axes
plt.yticks([i/20.0 for i in range(21)])
plt.xticks([i/20.0 for i in range(21)])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic (ROC) Curve')
plt.legend(loc="lower right")
plt.show()

###### SVM

In [None]:
#support vector classifier with an RBF kernel, trained on the SMOTE-balanced interaction features
#NOTE(review): no feature scaling step is visible before this fit - confirm whether one is needed
svc = SVC(C=1,kernel='rbf')
svc.fit(x_train,y_train)
y_pred = svc.predict(x_test)

In [None]:
#report and confusion matrix for the SVM predictions made in the previous cell
print(classification_report(y_test,y_pred))
confusion_matrix_graph(confusion_matrix(y_test,y_pred))