# Import Libraries

In [None]:
# General libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import pickle
import re
import time
import os 
import datetime

# Webscraping
import requests
from bs4 import BeautifulSoup 
from selenium import webdriver
from selenium.webdriver.common.keys import Keys 

# Cleaning & Modelling function made specifically
from scraping_funcs import *
from modelling_funcs import *

# Modelling
import patsy
import scipy.stats as stats

import statsmodels.api as sm
import statsmodels.formula.api as smf

from sklearn import preprocessing
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.model_selection import train_test_split, KFold
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import r2_score

%matplotlib inline

---

# Obtain Data

## England Statistics Data
- Source: ESPN Stats Guru

In [None]:
# Page indices 0-5 handed to get_stats (six values).
# NOTE(review): the original note said the site has 5 pages - confirm whether index 0 is a real page.
page_nums = list(range(6))
rugby_stats = get_stats(page_nums)

# Cache the scraped statistics so later runs can skip the scrape
with open('rugby_stats.pickle', 'wb') as stats_file:
    pickle.dump(rugby_stats, stats_file)

---

## World Rankings Data
- Source: World Rugby

In [None]:
# Ranking years to scrape: 2004 through 2019 inclusive
first_year, last_year = 2004, 2019
my_years = list(range(first_year, last_year + 1))

In [None]:
dfs = []

# Loop-invariant setup: chromedriver location and the rankings page
chromedriver = "/Applications/chromedriver"
os.environ["webdriver.chrome.driver"] = chromedriver
rankings_url = "https://www.world.rugby/rankings/mru#!"

for year in my_years:

    # A fresh browser per year, so every iteration starts from the same page state
    driver = webdriver.Chrome(chromedriver)
    try:
        # Open webpage
        driver.get(rankings_url)

        # Close cookies pop-up
        driver.find_element_by_class_name("js-cookie-accept-button").click()
        time.sleep(2)

        # Open the year drop-down
        driver.find_element_by_xpath('//div[@data-date-drop="year"]').click()
        time.sleep(2)

        # Pick the requested year from the list of years
        driver.find_element_by_xpath('//li[@value={}]'.format(year)).click()
        time.sleep(2)

        # Expand 'Full Rankings', then wait like the other clicks so the
        # table is in the page source before it is captured
        driver.find_element_by_class_name("showMore").click()
        time.sleep(2)

        # Use BS to parse html
        soup = BeautifulSoup(driver.page_source, 'html.parser')
    finally:
        # quit() shuts down the browser and its chromedriver process, even
        # when one of the steps above raised
        driver.quit()

    # Create dataframe of World Rankings from the full-rankings table
    rankings_df = pd.read_html(str(soup.find(class_="fullRankings")))[0]

    # Clean dataframe using the project helper and collect it
    clean = pd.DataFrame(clean_dataframe(rankings_df))
    dfs.append(clean)

# Return 1 dataframe for all World Ranking Data & save for later
final_rankings_df = pd.concat(dfs)
with open('final_rankings_df.pickle', 'wb') as to_write:
    pickle.dump(final_rankings_df, to_write)

---

## Coach Data 
- Source: Wikipedia

In [None]:
url = "https://en.wikipedia.org/wiki/England_national_rugby_union_team"
# Bound the wait and fail on an HTTP error instead of parsing an error page
response = requests.get(url, timeout=30)
response.raise_for_status()
page = response.text
soup = BeautifulSoup(page, "lxml")

# Locate correct table & headers (the second sortable wikitable on the page)
tables = soup.find_all("table", class_="sortable wikitable")
coach_table = tables[1]
headers_list = header_list(coach_table)

# Locate data for each row in table & clean list
td_values = coach_table.find_all('td')
list_vars = clean_list([cell.text for cell in td_values])


def records(list_vars, n=7):
    """Split list_vars into consecutive chunks of n items (the last may be shorter)."""
    return [list_vars[i:i+n] for i in range(0, len(list_vars), n)]


# Group every 7 cell values into one row
coaches_list = records(tuple(list_vars))

# Create dataframe of Coaches from header_list and coaches_list
coaches_df = pd.DataFrame(coaches_list, columns=headers_list)
coaches_df = clean_coaches(coaches_df)
with open('coaches_df.pickle', 'wb') as to_write:
    pickle.dump(coaches_df, to_write)

---

# Assumptions & Alterations

## Assumptions
- As World Rankings data for 2000-2003 is unavailable, I'll make the assumption that each ranking prior to 2004 is the same as the 2004 ranking.

In [None]:
# Remap 'Year' with change_year so it can be used as the merge key with the
# (partly assumed) World Rankings, then store it as an integer
remapped_year = rugby_stats["Year"].apply(change_year)
rugby_stats["Year"] = remapped_year.astype(int)

In [None]:
# Map each year to England's ranking position.
# The pairing is positional: it relies on the England rows appearing in the
# same order as my_years, one row per year.
is_england = final_rankings_df['Countries'] == "England"
england_positions = list(final_rankings_df.loc[is_england, 'Position'])
england_rank = dict(zip(my_years, england_positions))

In [None]:
# Add an 'EnglandWorldRank' column to the rankings df - dependent on the year of each row
def eng_rank(year):
    """Return England's ranking position for `year`, looked up in england_rank."""
    rank_for_year = england_rank[year]
    return rank_for_year

england_rank_by_row = final_rankings_df["Year"].apply(eng_rank)
final_rankings_df["EnglandWorldRank"] = england_rank_by_row.astype(int)

---

## Merge Statistics and World Ranking Dataframes

In [None]:
# Join match statistics to the rankings on year and country, then discard
# 'Year', which only served as a merge key
stats_rank = pd.merge(rugby_stats, final_rankings_df, on=["Year", "Countries"])
stats_rank.drop(columns=['Year'], inplace=True)
stats_rank.head()

---

## Add Coach Win Percentage Data

- Merge Coach Win Percentage Data dependent on Match Date in range of each coach's tenure.

In [None]:
# Attach coach data to each match according to the coach's tenure.
# A constant key cross-joins every match with every coach; the query then
# keeps only pairs whose match date lies inside that coach's tenure.
# NOTE(review): both bounds are inclusive, so a match played on a date that is
# one coach's 'end' and another's 'start' would appear twice.
stats_rank = stats_rank.assign(key=1)
coaches_df = coaches_df.assign(key=1)
rugby = pd.merge(stats_rank, coaches_df, on='key').drop('key',axis=1)
# .copy() detaches the filtered rows, so the later in-place drop and column
# assignments act on an independent frame rather than a flagged slice
rugby = rugby.query('MatchDate >= start and MatchDate <= end').copy()

---

## Sort Columns & Shift Statistics

In [None]:
# Remove helper columns that are no longer needed after the coach merge
unused_columns = ['index', 'Name', 'start', 'end']
rugby.drop(columns=unused_columns, inplace=True)

In [None]:
# Split date into Month and Year column
def date_to_month(date):
    """Return the month number of a 'YYYY-MM-DD' date string."""
    parsed = datetime.datetime.strptime(date, "%Y-%m-%d")
    return parsed.month

def date_to_year(date):
    """Return the four-digit year of a 'YYYY-MM-DD' date string."""
    parsed = datetime.datetime.strptime(date, "%Y-%m-%d")
    return parsed.year

# Convert the match dates to text once, then derive both calendar parts from it
match_date_text = rugby["MatchDate"].astype(str)
rugby["Month"] = match_date_text.apply(date_to_month)
rugby["Year"] = match_date_text.apply(date_to_year)

In [None]:
# Order matches chronologically so "previous game" features line up
rugby = rugby.sort_values('MatchDate').reset_index(drop=True)

# Running mean of England's score, shifted one row so each match only sees
# the average of the matches before it
running_mean_score = rugby['For'].expanding().mean().round(2)
rugby["Average"] = running_mean_score.shift(1)

# Previous-game values for points conceded, tries and conversions
prev_game_sources = ["Aga", "Tries", "Conv"]
prev_game_columns = ["ConceededPrevGame", "TriesScoredPrevGame", "ConversionsPrevGame"]
rugby[prev_game_columns] = rugby[prev_game_sources].shift(1)

# Rows with missing values (at least the first match, which has no previous
# game) are removed before casting to int
rugby = rugby.dropna()
int_columns = ["Position", "ConceededPrevGame", "TriesScoredPrevGame", "ConversionsPrevGame", "Winpercent"]
rugby[int_columns] = rugby[int_columns].astype(int)

# Same-match outcomes and the raw date are dropped; only lagged values remain
rugby.drop(columns=['Aga', 'Tries', 'Conv', 'MatchDate'], inplace=True)

# Rename Columns
new_names = {
    'For': 'Score',
    'Countries': 'Opponent',
    'Position': 'OpponentWorldRanking',
    'Average': 'AverageScorePriorToMatch',
}
rugby.rename(columns=new_names, inplace=True)

# Final dataframe to work with
with open('rugby.pickle', 'wb') as rugby_file:
    pickle.dump(rugby, rugby_file)

---

# Modelling

## Feature Assessment

In [None]:
# Assess the features compared to each other - check correlation to reduce multi-collinearity.
# Only numeric columns are correlated: rugby still holds non-numeric columns
# such as 'Opponent' and 'Where', which DataFrame.corr() rejects in pandas >= 2.0.
numeric_features = rugby.select_dtypes(include="number")
plt.figure(figsize=(8,6))
sns.heatmap(numeric_features.corr(), cmap="seismic", annot=True, vmin=-1, vmax=1);

In [None]:
# Pairwise scatter plots of the features, with KDE curves on the diagonal
sns.pairplot(rugby, diag_kind="kde");

- **Initial Observations:**
- Opponent World Rank - biggest positive influencer on score.
- Tries prev Game and Conversions prev Game very highly correlated
- Score, Opponent World Ranking, Tries/Conv/Conceeded Prev Game all right skewed.

In [None]:
# Regress previous-game conversions on previous-game tries to see how much
# the two features overlap.
# NOTE(review): no constant column is added to X, so this is a fit through
# the origin - confirm that is intended when reading the reported r2.
X = rugby[['TriesScoredPrevGame']]
y = rugby[['ConversionsPrevGame']]

model = sm.OLS(endog=y, exog=X)
fit = model.fit()
fit.summary()

- r2 of 0.936 and p-val of 0
- Remove to reduce multicollinearity

In [None]:
# A conversion can only be attempted after a try has been scored, so the two
# features are likely to capture similar data - keep tries, drop conversions
rugby.drop(columns=['ConversionsPrevGame'], inplace=True)

---

## Regression 1 - Linear Regression (all features)

In [None]:
# One-hot encode 'Opponent' and 'Where' (first level of each dropped as the
# baseline), then remove the original text columns
opponent_where_dummies = pd.get_dummies(rugby[['Opponent', 'Where']], drop_first=True)
rugby1 = rugby.join(opponent_where_dummies)
rugby1.drop(columns=['Opponent', 'Where'], inplace=True)

In [None]:
# Target is 'Score'; every other column of rugby1 is a feature
y = rugby1["Score"]
X = rugby1.drop(columns="Score")

# 80% train / 20% hold-out, seeded for reproducibility
X_train, X_hold_out, y_train, y_hold_out = train_test_split(
    X, y, test_size=.2, random_state=27
)

# Shuffled 5-fold splitter with the same seed
kf = KFold(n_splits=5, shuffle=True, random_state=27)

In [None]:
# Regression diagnostics on the training split (reg_plot is a star-imported
# project helper; presumably residual and Q-Q plots - confirm in its source)
reg_plot(X_train, y_train)

- Residuals follow QQ plot, however outliers may be affecting residual plot

In [None]:
# Linear regression on rugby1 with "Score" as the target (lin_reg is a
# star-imported project helper).
# NOTE(review): it receives the full frame, not X_train - confirm it performs its own split.
lin_reg(rugby1, "Score")

- **RESULT**: Overfitting
- **ACTION**: Try Ridge and LASSO Regressions to find most important features

In [None]:
# Feature names (everything except the target), passed to the helper with the training split
columns = rugby1.drop(columns="Score").columns.tolist()
ridge_reg(X_train, y_train, columns)

In [None]:
# LASSO on the training split (lasso_reg is a star-imported project helper;
# `columns` presumably labels the coefficients - confirm in its source)
lasso_reg(X_train, y_train, columns)

- **RESULT:** Both models still overfitting, LASSO zeroed EnglandWorldRank, TriesScoredPrevGame, ConceededPrevGame and a lot of the opponents.
- **ACTION:** Remove these columns and try Linear Regression again. 

---

## Regression 2 - Linear Regression - dropped features determined by LASSO

In [None]:
# Same one-hot encoding as before, then remove the features LASSO zeroed
# along with the original text columns
lasso_zeroed = [
    'EnglandWorldRank', 'ConceededPrevGame', 'TriesScoredPrevGame',
    'Opponent_Australia', 'Opponent_France', 'Opponent_Japan',
    'Opponent_NewZealand', 'Opponent_Samoa', 'Opponent_Scotland',
    'Opponent_SouthAfrica', 'Opponent_Tonga', 'Opponent_USA',
]
rugby2 = rugby.join(pd.get_dummies(rugby[['Opponent', 'Where']], drop_first=True))
rugby2.drop(columns=lasso_zeroed + ['Where', 'Opponent'], inplace=True)

In [None]:
# Target is 'Score'; every other column of rugby2 is a feature
y = rugby2["Score"]
X = rugby2.drop(columns="Score")

# 80% train / 20% hold-out, seeded for reproducibility
X_train, X_hold_out, y_train, y_hold_out = train_test_split(
    X, y, test_size=.2, random_state=27
)

# Shuffled 5-fold splitter with the same seed
kf = KFold(n_splits=5, shuffle=True, random_state=27)

In [None]:
# Regression diagnostics on the rugby2 training split (star-imported project
# helper; presumably residual and Q-Q plots - confirm in its source)
reg_plot(X_train, y_train)

- Residuals a little more random and QQ residuals following QQ plot

In [None]:
# Linear regression on the reduced feature set, target "Score".
# NOTE(review): the helper receives the full frame - confirm it performs its own split.
lin_reg(rugby2, "Score")

- **RESULT**: Decreased difference in r2 values, but model still overfitting.
- **ACTION**: Try LASSO and Ridge again.

In [None]:
# Feature names for rugby2 (everything except the target)
columns = rugby2.drop(columns="Score").columns.tolist()

In [None]:
# LASSO on the rugby2 training split (star-imported project helper)
lasso_reg(X_train, y_train, columns)

In [None]:
# Ridge on the rugby2 training split (star-imported project helper)
ridge_reg(X_train, y_train, columns)

- **RESULT**: Both models still overfitting, Lasso has zeroed some more opponents.
- **ACTION**: Simplify even more - remove all the opponents.

---

## Regression 3 - Simplified Linear Regression

In [None]:
# Keep only the venue dummies (no opponent dummies) and remove the ranking,
# running-average and original text columns
where_dummies = pd.get_dummies(rugby['Where'], drop_first=True)
rugby3 = rugby.join(where_dummies)
rugby3.drop(columns=['EnglandWorldRank', 'AverageScorePriorToMatch', 'Opponent', 'Where'], inplace=True)

In [None]:
# Target is 'Score'; every other column of rugby3 is a feature
y = rugby3["Score"]
X = rugby3.drop(columns="Score")

# 80% train / 20% hold-out, seeded for reproducibility
X_train, X_hold_out, y_train, y_hold_out = train_test_split(
    X, y, test_size=.2, random_state=27
)

# Shuffled 5-fold splitter with the same seed
kf = KFold(n_splits=5, shuffle=True, random_state=27)

In [None]:
# Regression diagnostics on the rugby3 training split (star-imported project
# helper; presumably residual and Q-Q plots - confirm in its source)
reg_plot(X_train, y_train)

In [None]:
# Linear regression on the simplified feature set, target 'Score'.
# NOTE(review): the helper receives the full frame - confirm it performs its own split.
lin_reg(rugby3, 'Score')

- **RESULT:** Better generalising but still 1.2 between r2s
- **ACTION:** Ridge and LASSO again.

In [None]:
# Feature names for rugby3 (everything except the target)
columns = rugby3.drop(columns="Score").columns.tolist()

In [None]:
# Ridge on the rugby3 training split (star-imported project helper)
ridge_reg(X_train, y_train, columns)

In [None]:
# LASSO on the rugby3 training split (star-imported project helper)
lasso_reg(X_train, y_train, columns)

- **RESULT:** Still overfitting. 
- **ACTION:** Go back and transform score.

---

## Regression 4 - Transform score using sqrt

In [None]:
# Work on an independent copy: the following cells add and drop columns in
# place, which would otherwise also alter rugby3 (same object) and break
# re-running the Regression 3 cells
rugby4 = rugby3.copy()

In [None]:
# Swap the raw score for its square root as the modelling target, then check
# the shape of the transformed distribution
sqrt_of_score = np.sqrt(rugby4["Score"])
rugby4["sqrtscore"] = sqrt_of_score
rugby4.drop(columns="Score", inplace=True)
plt.hist(rugby4["sqrtscore"]);

In [None]:
# Target is 'sqrtscore'; every other column of rugby4 is a feature
y = rugby4["sqrtscore"]
X = rugby4.drop(columns="sqrtscore")

# 80% train / 20% hold-out, seeded for reproducibility
X_train, X_hold_out, y_train, y_hold_out = train_test_split(
    X, y, test_size=.2, random_state=27
)

# Shuffled 5-fold splitter with the same seed
kf = KFold(n_splits=5, shuffle=True, random_state=27)

In [None]:
# Regression diagnostics on the rugby4 training split (star-imported project
# helper; presumably residual and Q-Q plots - confirm in its source)
reg_plot(X_train, y_train)

In [None]:
# Linear regression with the square-root score as the target.
# NOTE(review): the helper receives the full frame - confirm it performs its own split.
lin_reg(rugby4, 'sqrtscore')

- **RESULT:** r2 difference 0.94 but still overfitting.
- **ACTION**: LASSO and Ridge

In [None]:
# Feature names for rugby4 (everything except the target)
columns = rugby4.drop(columns="sqrtscore").columns.tolist()

In [None]:
# Ridge on the rugby4 training split (star-imported project helper)
ridge_reg(X_train, y_train, columns)

In [None]:
# LASSO on the rugby4 training split (star-imported project helper)
lasso_reg(X_train, y_train, columns)

- **RESULT:** LASSO zeroed ConceededPrevGame and TriesScoredPrevGame - but less overfitting 0.5 difference in r2
- **ACTION:** Remove ConceededPrevGame and TriesScoredPrevGame

---

## Regression 5  - Simplified features and score sqrted

In [None]:
# Independent copy: the in-place column drop that follows must not also
# remove those columns from rugby4
rugby5 = rugby4.copy()

In [None]:
# Remove the two previous-game features that LASSO zeroed
zeroed_by_lasso = ["TriesScoredPrevGame", "ConceededPrevGame"]
rugby5.drop(columns=zeroed_by_lasso, inplace=True)

In [None]:
# Full dataset: target is 'sqrtscore', every other column of rugby5 is a feature
y = rugby5["sqrtscore"]
X = rugby5.drop(columns="sqrtscore")

# 80% train / 20% hold-out, seeded for reproducibility
X_train, X_hold_out, y_train, y_hold_out = train_test_split(
    X, y, test_size=.2, random_state=27
)

# Shuffled 5-fold splitter with the same seed
kf = KFold(n_splits=5, shuffle=True, random_state=27)

In [None]:
# Regression diagnostics on the rugby5 training split (star-imported project
# helper; presumably residual and Q-Q plots - confirm in its source)
reg_plot(X_train, y_train)

- Little change in residual plot and QQ plot from rugby4

In [None]:
# Linear regression on the final feature set, target 'sqrtscore'.
# NOTE(review): the helper receives the full frame - confirm it performs its own split.
lin_reg(rugby5, 'sqrtscore')

- Slight change in difference between r2 - 0.88 - still overfitting

In [None]:
# Feature names for rugby5 (everything except the target); reused when the
# final model's coefficients are listed
columns = rugby5.drop(columns="sqrtscore").columns.tolist()

In [None]:
# Ridge on the rugby5 training split (star-imported project helper)
ridge_reg(X_train, y_train, columns)

In [None]:
# LASSO on the rugby5 training split (star-imported project helper)
lasso_reg(X_train, y_train, columns)

- **RESULT:** LASSO no longer zeroed any more features and r2's remained the same as rugby4. Ridge model still overfits. LASSO gives best alpha as 0.1
- **ACTION:** Use the LASSO model (alpha = 0.1) as the final model - run it on the test data


---

## Run Model on Test Data

In [None]:
# Standardise features: scaling statistics are learned from the training
# split only and then applied to both the training and hold-out splits
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_hold_out_scaled = scaler.transform(X_hold_out)

In [None]:
# Fit the final LASSO (alpha = 0.1) on the scaled training data and list each
# feature next to its coefficient, rounded to 2 decimals
lasso_model = Lasso(alpha=0.1).fit(X_train_scaled, y_train)
rounded_coefs = np.round(lasso_model.coef_, 2)
list(zip(columns, rounded_coefs))

In [None]:
# Hold-out predictions, still on the square-root scale of the target
pred = lasso_model.predict(X_hold_out_scaled)

In [None]:
# r2 on the original score scale: both actuals and predictions are squared to undo the sqrt
r2_score(y_hold_out**2, pred**2)

In [None]:
# Square both sides to return to the original score scale
y_hold2 = np.square(y_hold_out)
pred2 = np.square(pred)

# RMSE: root of the mean squared difference between actual and predicted scores
error = y_hold2 - pred2
sumerr = np.sum(np.square(error))
mean = sumerr / len(y_hold_out)
print(np.round(np.sqrt(mean), 2))

In [None]:
# Scatter of predicted vs actual scores (original scale), with a red y = x
# reference line from 0 to 70, saved to model.png
text_style = {"size": 14}
diagonal = np.linspace(0, 70, 10)

plt.figure(figsize=(9,6))
plt.scatter(pred**2, y_hold_out**2, alpha=0.6);
plt.ylabel('Actual Score', **text_style)
plt.xlabel('Predicted Score', **text_style)
plt.title('Actual Score of Match against Model Predicted Score', **text_style)
plt.plot(diagonal, diagonal, color='r')
plt.savefig('model.png');

---

# Make Prediction

In [None]:
# Create dataframe from upcoming match data & set predictor.
# The coach win percentage key must be 'Winpercent', the column name the model
# was trained on; 'sqrtscore' is only a placeholder for the unknown target.
englanddict = {'OpponentWorldRanking': 2, 'Winpercent': 80, 'Month' : 2,'Year' : 2019, 'Home': 0, 'sqrtscore': '?'}
# Put the feature columns in the training order (target last). Selecting by
# name raises a KeyError naming any training feature missing from the dict,
# instead of silently feeding the scaler mislabelled or misordered columns.
predictdf = pd.DataFrame([englanddict])[[*X_train.columns, 'sqrtscore']]

In [None]:
# Separate the feature columns from the (placeholder) target column
y_predictor = predictdf["sqrtscore"]
X_predictor = predictdf.drop(columns="sqrtscore")

In [None]:
# Re-learn the scaling from the training split, then apply it to the single
# upcoming-match row
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_pred_scaled = scaler.transform(X_predictor)

In [None]:
# predict() returns a one-element array; take element 0 explicitly, because
# calling float() on the array itself is deprecated in recent NumPy
pred2 = float(lasso_model.predict(X_pred_scaled)[0])
# The model predicts sqrt(score), so square to get the score
score = pred2**2
print(f"Predicted Score: {score}")

---

# Presentation

In [None]:
# Histogram of England's match scores, saved to scorehist.png
label_style = {"size": 14}

plt.figure(figsize=(9,6))
plt.hist(rugby["Score"]);
plt.title("England Score Distribution since 2000", **label_style)
plt.xlabel("England Score", **label_style)
plt.ylabel("Number of Games with Score", **label_style);
plt.savefig("scorehist.png")

---