-
Notifications
You must be signed in to change notification settings - Fork 0
/
roto_guru.py
93 lines (77 loc) · 2.51 KB
/
roto_guru.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
import re
from pathlib import Path

import pandas as pd
import requests
def create_name_position_key(name: str, team: str, position: str) -> str:
    """Build a best-effort unique lookup key for a player.

    Defenses (``position == "D"``) are keyed by team alone as
    ``"<team>-dst"``; ``team`` is used exactly as given.

    Every other player is keyed as ``"<first>-<last>-<position>"``:

    * ``name`` is split on commas; each part is lowercased, stripped of
      surrounding whitespace, and then purged of every character that is
      not ``a``-``z`` or a space.
    * ``<last>`` is the first space-separated word of the part before the
      first comma (so any further words in that part are dropped).
    * ``<first>`` is the whole part after the last comma.  When ``name``
      contains no comma, both pieces come from the same single part.
    * ``<position>`` is ``position`` lowercased.
    """
    if position == "D":
        return f"{team}-dst"

    cleaned_parts = []
    for part in name.split(","):
        normalized = part.lower().strip()
        # Remove punctuation, digits, hyphens, etc. -- only letters and
        # spaces survive.
        cleaned_parts.append(re.sub("[^a-z ]+", "", normalized))

    last_name = cleaned_parts[0].split(" ")[0]
    first_name = cleaned_parts[-1]
    return "-".join((first_name, last_name, position.lower()))
def get_data(timeout: float = 30.0) -> str:
    """Download the raw player listing from rotoguru1.com.

    Args:
        timeout: Seconds to wait for the server before giving up.  Without
            a timeout ``requests`` would block indefinitely on a stalled
            connection.

    Returns:
        The response body as text.

    Raises:
        requests.HTTPError: If the server answers with any status other
            than 200.
        requests.RequestException: On connection failures or timeouts
            (raised by ``requests.get`` itself).
    """
    url = "http://rotoguru1.com/cgi-bin/fstats.cgi"
    # Query parameters are sent verbatim; their meaning is defined by the
    # remote CGI script.
    # NOTE(review): "outcsv" presumably selects the delimited-text output
    # that convert_raw_data_to_df parses -- confirm against the site.
    params = {
        "pos": 0,
        "sort": 5,
        "game": "p",
        "colA": 0,
        "daypt": 0,
        "xavg": 1,
        "inact": 0,
        "maxprc": 99999,
        "outcsv": 1,
    }
    r = requests.get(url, params=params, timeout=timeout)
    # Explicit check instead of `assert`, which is stripped under `python -O`.
    if r.status_code != 200:
        raise requests.HTTPError(
            f"unexpected status {r.status_code} from {url}", response=r
        )
    return r.text
def convert_raw_data_to_df(raw: str) -> pd.DataFrame:
    """Parse the raw semicolon-delimited player listing into a DataFrame.

    A line counts as player data when its first four characters are all
    digits (the GID); every other line is ignored.  Fields are mapped
    positionally onto the column names in ``headers``, and a
    ``name_position_key`` column is added via ``create_name_position_key``.
    The position ``"D"`` is rewritten to ``"DST"``.

    Args:
        raw: Response text as returned by ``get_data``.

    Returns:
        One row per player.  ``salary`` and ``salary_change`` are converted
        to ``int``; ``points_per_game`` and ``points_per_game_per_salary``
        to ``float``; all other columns keep their string values.  When
        ``raw`` contains no player lines, an empty frame with the full set
        of columns is returned.

    Raises:
        ValueError: If two players end up with the same
            ``name_position_key``.
    """
    # splitlines() copes with "\r\n" endings, so no stray "\r" is left on
    # the last field of each row.
    players = [line for line in raw.splitlines() if line[:4].isdigit()]
    print("number of players scraped:", len(players))

    headers = [
        "gid",
        "position",
        "name",
        "team",
        "opponent",
        "home/away",
        "salary",
        "salary_change",
        "total_points",
        "games_played",
        "points_per_game",
        "points_per_game_per_salary",
        "points_per_game_alt",
        "bye_week",
        "ytd_salary_high/low",
    ]

    data = []
    for row in players:
        d = dict(zip(headers, row.split(";")))
        # The key must be computed before "D" is rewritten to "DST",
        # because the key builder special-cases the raw "D" position.
        d["name_position_key"] = create_name_position_key(
            d["name"], d["team"], d["position"]
        )
        if d["position"] == "D":
            d["position"] = "DST"
        data.append(d)

    # Passing the columns explicitly keeps the schema stable even when no
    # player rows were found (a bare DataFrame([]) would have no columns).
    df = pd.DataFrame(data, columns=[*headers, "name_position_key"])

    # Explicit check instead of `assert`, which is stripped under
    # `python -O` and gives no hint about which keys collided.
    duplicated = df.loc[
        df["name_position_key"].duplicated(keep=False), "name_position_key"
    ]
    if not duplicated.empty:
        raise ValueError(
            "name_position_key is not unique: "
            f"{sorted(duplicated.unique())}"
        )

    df["salary"] = df["salary"].astype(int)
    df["salary_change"] = df["salary_change"].astype(int)
    df["points_per_game"] = df["points_per_game"].astype(float)
    df["points_per_game_per_salary"] = df["points_per_game_per_salary"].astype(float)
    return df
def scrape_and_save_data(week: int) -> pd.DataFrame:
    """Scrape the current player listing and save it as a CSV file.

    The data is fetched with ``get_data``, parsed with
    ``convert_raw_data_to_df`` and written (without the index) to
    ``data/roto_guru_data_week{week}.csv`` relative to the current working
    directory.

    Args:
        week: Week number used in the output file name.

    Returns:
        The parsed DataFrame that was written to disk.
    """
    raw = get_data()
    df = convert_raw_data_to_df(raw)

    out_path = Path("data") / f"roto_guru_data_week{week}.csv"
    # Create the output directory up front: to_csv fails when it is
    # missing, which would throw away a scrape that already succeeded.
    out_path.parent.mkdir(parents=True, exist_ok=True)
    df.to_csv(out_path, index=False)
    return df