In [1]:
from bs4 import BeautifulSoup
import requests
import pandas as pd

Scrape [UGA football's all-time record against every opponent](https://www.sicemdawgs.com/uga-football-records/vs-all-opponents/) from `https://www.sicemdawgs.com/uga-football-records/vs-all-opponents/` and use it to answer two questions:

- What is the first team that UGA played against?
- What are the most challenging opponents?

In [2]:
# Download the records page. The timeout stops the cell from hanging forever
# if the site does not answer.
response = requests.get(
    "https://www.sicemdawgs.com/uga-football-records/vs-all-opponents/",
    timeout=30,
)
# Fail loudly on a 4xx/5xx answer instead of parsing an error page further down.
response.raise_for_status()

In [3]:
# Show the HTTP status of the download; 200 means the page was fetched successfully.
response.status_code

200

In [11]:
# Name the parser explicitly: the generic 'html' feature lets Beautiful Soup
# choose whichever HTML parser happens to be installed, so results could
# differ between machines. 'html.parser' ships with Python.
soup = BeautifulSoup(response.text, 'html.parser')

In [12]:
# Collect every <tr> element in the parsed page. This includes the header row
# and footnote rows, which are not opponent records.
rows = soup.find_all('tr')

In [27]:
def process_row(row):
    """Turn one ``<tr>`` of the records table into a dict of column name -> text.

    Parameters
    ----------
    row : object exposing ``find_all('td')`` (e.g. a BeautifulSoup ``Tag``)
        The table row to read. Each returned cell must expose ``.text``.

    Returns
    -------
    dict or None
        Keys are "Opponent", "First Game", "Last Game", "G", "W", "L" and "T";
        values are the whitespace-stripped cell texts (still strings).
        ``None`` is returned, after printing the offending row, when the row
        has fewer than seven ``<td>`` cells -- e.g. a header row made of
        ``<th>`` cells or a single-cell footnote row.
    """
    columns = ("Opponent", "First Game", "Last Game", "G", "W", "L", "T")
    cells = row.find_all('td')
    # Check the row shape explicitly instead of catching every exception:
    # short rows are expected, while any other error is a real bug that
    # should surface rather than be reported as "cannot process".
    if len(cells) < len(columns):
        print("Cannot process row " + str(row))
        return None
    # zip stops after the seventh cell, so extra trailing cells are ignored.
    return {column: cell.text.strip() for column, cell in zip(columns, cells)}
    

In [61]:
# Parse every table row; rows that are not opponent records come back as None.
data = list(map(process_row, rows))

Cannot process row <tr><th class="schgry1">Opponent</th><th class="schgry2">First Game</th><th class="schgry2">Last Game</th><th class="schgry2">G</th><th class="schgry2">W</th><th class="schgry2">L</th><th class="schgry2">T</th></tr>
Cannot process row <tr><td class="ros2" colspan="7">**Georgia discredits losses to Georgia Tech in 1943 (48-0) and 1944 (44-0) due to WWII.</td></tr>
Cannot process row <tr><td class="ros2" colspan="7">**Formerly known as Oklahoma A&amp;M.</td></tr>


In [62]:
# Keep only the rows that were parsed into a record (drop the None placeholders).
data = [record for record in data if record is not None]

In [63]:
# One DataFrame row per parsed record; the dict keys become the column names.
df = pd.DataFrame(data)

In [64]:
# Inspect the column types: everything is "object" because the scraped cell
# values are still text.
df.dtypes

First Game    object
G             object
L             object
Last Game     object
Opponent      object
T             object
W             object
dtype: object

In [65]:
# Put the columns in table order and index the frame by opponent name.
# Building from `data` (rather than re-slicing `df`) keeps this cell safe to
# re-run: selecting "Opponent" from a frame that already uses it as the index
# would raise a KeyError.
df = pd.DataFrame(
    data,
    columns=["Opponent", "First Game", "Last Game", "G", "W", "L", "T"],
).set_index("Opponent")

In [66]:
# Check the types again: "Opponent" is now the index and the remaining
# columns are still "object" (text), so they need converting before any maths.
df.dtypes

First Game    object
Last Game     object
G             object
W             object
L             object
T             object
dtype: object

In [68]:
# Convert every column to a numeric dtype. Column-wise `apply` replaces the
# deprecated element-wise `applymap` and converts a whole column per call.
df = df.apply(pd.to_numeric)

In [69]:
# Preview the first rows of the converted frame.
df.head()

Unnamed: 0_level_0,First Game,Last Game,G,W,L,T
Opponent,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Alabama,1895,2018,69,25,40,4
Alabama Presbyterian,1911,1913,2,2,0,0
Appalachian State,2013,2017,2,2,0,0
Arizona,1985,1985,1,0,0,1
Arizona State,2008,2009,2,2,0,0


In [75]:
# What is the first team that UGA played against?
# Read the earliest year straight from the column instead of sorting the whole
# frame and relying on "First Game" sitting at column position 0.
first_year = df['First Game'].min()
# More than one opponent can share that year, so show every matching row.
df[df['First Game'] == first_year]

Unnamed: 0_level_0,First Game,Last Game,G,W,L,T
Opponent,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Auburn,1892,2018,123,59,54,8
Mercer,1892,1941,22,22,0,0


In [76]:
# Names of the opponents whose first game against UGA was in the earliest year.
df.index[df['First Game'] == first_year].tolist()

['Auburn', 'Mercer']

In [77]:
# What are the most challenging opponents?
# "Challenging" is taken to mean hard to beat: the share of decided games
# (wins + losses) that UGA lost. Ties are left out of the ratio, so an opponent
# with no wins and no losses (only ties) gets NaN from the 0 / 0 division.
df['difficulty'] = df['L'].div(df['W'] + df['L'])
df.sort_values(by='difficulty', ascending=False)

Unnamed: 0_level_0,First Game,Last Game,G,W,L,T,difficulty
Opponent,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Holy Cross,1937,1939,3,0,3,0,1.000000
Harvard,1921,1921,1,0,1,0,1.000000
Rice,1936,1936,1,0,1,0,1.000000
Chicago,1922,1922,1,0,1,0,1.000000
Navy,1916,1957,2,0,2,0,1.000000
Southern California,1931,1960,3,0,3,0,1.000000
Stanford,1978,1978,1,0,1,0,1.000000
Syracuse,1989,1989,1,0,1,0,1.000000
"Miami, Ohio",1974,1974,1,0,1,0,1.000000
Cumberland,1905,1905,1,0,1,0,1.000000


In [78]:
# Rank by number of games played (most first); difficulty only orders
# opponents that have the same number of games.
df.sort_values(by=['G', 'difficulty'], ascending=[False, False])

Unnamed: 0_level_0,First Game,Last Game,G,W,L,T,difficulty
Opponent,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Auburn,1892,2018,123,59,54,8,0.477876
Georgia Tech,1893,2018,111,67,39,5,0.367925
Florida,1904,2018,97,52,43,2,0.452632
Vanderbilt,1893,2018,79,57,20,2,0.259740
Kentucky,1939,2018,72,59,11,2,0.157143
South Carolina,1894,2018,71,51,18,2,0.260870
Alabama,1895,2018,69,25,40,4,0.615385
Clemson,1897,2014,64,42,18,4,0.300000
Tennessee,1899,2018,48,23,23,2,0.500000
Ole Miss,1940,2016,46,32,13,1,0.288889


In [79]:
# Alternative ranking: weight the difficulty ratio by the number of games played,
# so opponents faced many times rank above one-off losses.
df['challenging'] = df['difficulty'].mul(df['G'])
df.sort_values(by='challenging', ascending=False)

Unnamed: 0_level_0,First Game,Last Game,G,W,L,T,difficulty,challenging
Opponent,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Auburn,1892,2018,123,59,54,8,0.477876,58.778761
Florida,1904,2018,97,52,43,2,0.452632,43.905263
Alabama,1895,2018,69,25,40,4,0.615385,42.461538
Georgia Tech,1893,2018,111,67,39,5,0.367925,40.839623
Tennessee,1899,2018,48,23,23,2,0.500000,24.000000
Vanderbilt,1893,2018,79,57,20,2,0.259740,20.519481
Clemson,1897,2014,64,42,18,4,0.300000,19.200000
South Carolina,1894,2018,71,51,18,2,0.260870,18.521739
LSU,1928,2018,31,13,17,1,0.566667,17.566667
Ole Miss,1940,2016,46,32,13,1,0.288889,13.288889
