## Initialize notebook

In [1]:
import pandas as pd
import re
import requests

## Define custom functions

In [2]:
def create_name_position_key(name: str, team: str, position: str) -> str:
    """Build a (hopefully) unique lookup key for a player or a team defense.

    Args:
        name: Player name; parts are separated by commas. The first part
            supplies the last name and the final part supplies the first name.
        team: Team code; only used when ``position`` is ``"D"``.
        position: Position code. ``"D"`` is handled specially.

    Returns:
        ``"<team>-dst"`` when ``position`` is ``"D"``; otherwise
        ``"<first>-<last>-<position lowercased>"``.
    """
    if position != "D":
        # Normalise every comma-separated part: lowercase, trim, then drop
        # every character that is not a-z or a space (periods, hyphens, ...).
        cleaned_parts = [
            re.sub("[^a-z ]+", "", part.lower().strip()) for part in name.split(",")
        ]
        # Only the first word of the leading part is kept as the last name,
        # so anything after the first space (e.g. a suffix) is discarded.
        surname = cleaned_parts[0].partition(" ")[0]
        given_name = cleaned_parts[-1]
        return "-".join((given_name, surname, position.lower()))

    # Defenses are keyed by team rather than by a person's name.
    return f"{team}-dst"

## Get data

In [3]:
url = "http://rotoguru1.com/cgi-bin/fstats.cgi"
# Query-string options sent to the stats script. Their meanings are defined by
# the remote site; "outcsv" presumably selects the delimited text output that
# the parsing cell expects -- TODO confirm against the site.
params = {
    "pos": 0,
    "sort": 5,
    "game": "p",
    "colA": 0,
    "daypt": 0,
    "xavg": 1,
    "inact": 0,
    "maxprc": 99999,
    "outcsv": 1,
}
# requests has no default timeout, so bound the wait; otherwise a stalled
# server would hang the notebook indefinitely.
r = requests.get(url, params=params, timeout=30)
# Stop here on an HTTP error instead of parsing an error page downstream.
r.raise_for_status()
r.status_code

200

## Convert raw data into dataframe

In [4]:
# split the response body on newlines
split_newline = r.text.split("\n")

# if the first 4 characters in a line are digits, then it's player data (GID);
# every other line is dropped
players = [line for line in split_newline if line[:4].isdigit()]

print("number of players scraped:", len(players))

# Fail early with a clear message: with no rows the dataframe below would have
# no columns and the first column access would raise an opaque KeyError.
if not players:
    raise ValueError(
        "no player rows found in the response; "
        "check the request parameters and the response format"
    )

# column names, in the order the fields appear in each ";"-delimited row
headers = [
    "gid",
    "position",
    "name",
    "team",
    "opponent",
    "home/away",
    "salary",
    "salary_change",
    "total_points",
    "games_played",
    "points_per_game",
    "points_per_game_per_salary",
    "points_per_game_alt",
    "bye_week",
    "ytd_salary_high/low",
]

# combine the headers with each row of data
data = []
for row in players:
    split_row = row.split(";")
    d = dict(zip(headers, split_row))
    # Build the key BEFORE renaming the position: the helper special-cases "D".
    d["name_position_key"] = create_name_position_key(
        d["name"], d["team"], d["position"]
    )
    if d["position"] == "D":
        d["position"] = "DST"
    data.append(d)

# convert to a dataframe
df_data = pd.DataFrame(data)

# check that the name_position_key is unique, reporting any colliding keys
duplicate_keys = df_data.loc[
    df_data["name_position_key"].duplicated(keep=False), "name_position_key"
]
assert (
    duplicate_keys.empty
), f"name_position_key is not unique: {sorted(set(duplicate_keys))}"

# every field is scraped as text; cast the numeric columns in one pass
df_data = df_data.astype(
    {
        "salary": int,
        "salary_change": int,
        "points_per_game": float,
        "points_per_game_per_salary": float,
    }
)

df_data.head()

number of players scraped: 562


Unnamed: 0,gid,position,name,team,opponent,home/away,salary,salary_change,total_points,games_played,points_per_game,points_per_game_per_salary,points_per_game_alt,bye_week,ytd_salary_high/low,name_position_key
0,1537,QB,"Murray, Kyler",ari,sea,A,8500,500,282.4,9,31.38,3.69,31.38,8,H,kyler-murray-qb
1,1501,QB,"Prescott, Dak",dal,min,A,4000,0,151.64,5,30.33,7.58,30.33,10,,dak-prescott-qb
2,5536,RB,"McCaffrey, Christian",car,det,H,9400,400,90.3,3,30.1,3.2,30.1,13,,christian-mccaffrey-rb
3,1412,QB,"Wilson, Russell",sea,ari,H,7400,-300,266.06,9,29.56,3.99,29.56,6,,russell-wilson-qb
4,5559,RB,"Cook, Dalvin",min,dal,H,9000,100,229.3,8,28.66,3.18,28.66,7,H,dalvin-cook-rb


## Save data

In [5]:
# Write the table to a CSV file in the working directory; index=False keeps
# the dataframe's row index out of the file.
output_filename = "historical_and_salary_data_week11.csv"
df_data.to_csv(path_or_buf=output_filename, index=False)