In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.linear_model import LinearRegression, LogisticRegression
from matplotlib import pyplot as plt

In [None]:
# Seed for the row shuffle. Fixing it makes the shuffled order -- and therefore
# any train/test split taken by slicing the shuffled rows -- reproducible
# across kernel restarts.
SAMPLE_SEED = 0
# Number of row positions to shuffle.
# NOTE(review): presumably the row count of the frame that SAMPLE_ORDER is used
# to index later in the notebook -- confirm it still matches the data.
N_SAMPLES = 530827
# A seeded random permutation of 0..N_SAMPLES-1, kept as a plain Python list.
SAMPLE_ORDER = np.random.default_rng(SAMPLE_SEED).permutation(N_SAMPLES).tolist()

In [None]:
# Load the point-by-point charting data; the file is decoded with the
# 'unicode_escape' codec.
points = pd.read_csv(
    "../data/flowchart-data/charting-m-points.csv",
    encoding="unicode_escape",
)

In [None]:
# Show the loaded frame as the cell output for a quick visual check.
points

### plan

Go through each match and record, for every individual point, who won it and who was serving. Build a separate table from this, and then derive features from it (e.g. the outcomes of the last 10 points, etc.).

We should also keep a column recording who was in the match.

In [None]:
# Keep only rows whose match_id string compares greater than "1991", and only
# the three columns needed below. The explicit .copy() makes `trimmed` an
# independent frame, so the column assignments that follow do not write into a
# slice of `points` (which triggers pandas' SettingWithCopyWarning).
trimmed = points.loc[
    points["match_id"] > "1991", ["match_id", "PtWinner", "isSvrWinner"]
].copy()
# True when PtWinner is 1.
trimmed["p1Win"] = trimmed["PtWinner"] == 1
# Convert the server-won flag to a boolean in place (column position unchanged).
trimmed["isSvrWinner"] = trimmed["isSvrWinner"] == 1
# Player 1 served exactly when "player 1 won" and "server won" agree.
trimmed["p1Served"] = trimmed["p1Win"] == trimmed["isSvrWinner"]
# PtWinner is no longer needed once p1Win exists. Resulting column order:
# match_id, isSvrWinner, p1Win, p1Served.
trimmed = trimmed.drop(columns="PtWinner")

In [None]:
# One dict per point (row), in row order; mutated in place by the feature loop.
points_dict = trimmed.to_dict("records")
# Preview only the first few records: echoing the whole list would write every
# record into the notebook output.
points_dict[:5]

In [None]:
# Walk the points in order and attach to each record the state of its match
# *before* that point is played:
#   - four running counters (who won x who served), and
#   - the outcomes (p1Win) of the five most recent points on each player's
#     serve, with None where fewer than five have been seen.
# Counters and histories are reset whenever match_id changes.
current_match = ""
p1Win_p1Served = p1Win_p2Served = p2Win_p1Served = p2Win_p2Served = 0
last5_p1Served, last5_p2Served = [], []

for p in points_dict:
  if p["match_id"] != current_match:
    # First point of a new match: start from a clean slate.
    current_match = p["match_id"]
    p1Win_p1Served = p1Win_p2Served = p2Win_p1Served = p2Win_p2Served = 0
    last5_p1Served, last5_p2Served = [None] * 5, [None] * 5

  # Snapshot the pre-point state onto the record. Key insertion order is
  # counters first, then p1Served1..5, then p2Served1..5.
  p.update({
    "p1Win_p1Served": p1Win_p1Served,
    "p1Win_p2Served": p1Win_p2Served,
    "p2Win_p1Served": p2Win_p1Served,
    "p2Win_p2Served": p2Win_p2Served,
  })
  for back in range(1, 6):
    # back == 1 is the most recent point served by player 1.
    p["p1Served" + str(back)] = last5_p1Served[-back]
  for back in range(1, 6):
    p["p2Served" + str(back)] = last5_p2Served[-back]

  # Fold the current point into the state used for the next record.
  p1_won = p["p1Win"]
  if p["p1Served"]:
    last5_p1Served = last5_p1Served[1:] + [p1_won]
    p1Win_p1Served += p1_won
    p2Win_p1Served += not p1_won
  else:
    last5_p2Served = last5_p2Served[1:] + [p1_won]
    p1Win_p2Served += p1_won
    p2Win_p2Served += not p1_won

In [None]:
# Turn the enriched per-point records back into a frame (one row per record,
# one column per key) and show it.
with_last5 = pd.DataFrame(points_dict)
with_last5

In [None]:
# Drop every row that has a missing value in any column (dropna's defaults are
# axis=0, how="any"); this removes points whose last-5 history still has None.
with_last5 = with_last5.dropna()

In [None]:
# Per-match maximum of the four running counters.
# GroupBy.max's first positional parameter is `numeric_only`, so the column
# list that used to be passed to .max(...) was interpreted as that flag rather
# than as a column selection. Select the columns explicitly instead.
serve_agg = with_last5.groupby("match_id")[
    ["p1Win_p1Served", "p2Win_p1Served", "p1Win_p2Served", "p2Win_p2Served"]
].max()

In [None]:
# Total each counter over all matches. The resulting Series is indexed by the
# counter names, in the order listed here.
counter_columns = ["p1Win_p1Served", "p2Win_p1Served", "p1Win_p2Served", "p2Win_p2Served"]
serve_sums = serve_agg.loc[:, counter_columns].sum()

The serving player wins about 64% of points.

In [None]:
# Share of counted points won by the server: player 1 winning on their own
# serve plus player 2 winning on theirs, over all counted points.
# Label-based lookups replace serve_sums[0] / serve_sums[3]: integer keys on a
# string-labelled Series rely on a deprecated positional fallback.
(serve_sums["p1Win_p1Served"] + serve_sums["p2Win_p2Served"]) / serve_sums.sum()

In [None]:
# Replace the four raw counters with two smoothed rates of player 1 winning:
#   pct_p1Served -- on player 1's serve, with 32 wins / 50 points added
#   pct_p2Served -- on player 2's serve, with 18 wins / 50 points added
# The two new columns are appended at the end, then the counters are removed.
win_pct = with_last5.assign(
    pct_p1Served=lambda d: (d["p1Win_p1Served"] + 32)
    / (d["p1Win_p1Served"] + d["p2Win_p1Served"] + 50.0),
    pct_p2Served=lambda d: (d["p1Win_p2Served"] + 18)
    / (d["p1Win_p2Served"] + d["p2Win_p2Served"] + 50.0),
).drop(columns=["p1Win_p1Served", "p2Win_p1Served", "p1Win_p2Served", "p2Win_p2Served"])

In [None]:
# Drop the first column, convert the rest to a float array, and reorder the
# rows by SAMPLE_ORDER.
# NOTE(review): everything downstream indexes this array by column position,
# so it depends on win_pct's column order -- confirm before adding columns.
feature_values = win_pct.iloc[:, 1:].to_numpy("float")
win_pct_np = feature_values[SAMPLE_ORDER]

In [None]:
# Positional column picks from the shuffled array: the target (column 1), all
# columns from 2 onwards, and the three-column subset 2, 13, 14.
no_seq_columns = [2, 13, 14]
y, x_seq, x_no_seq = (
    win_pct_np[:, 1],
    win_pct_np[:, 2:],
    win_pct_np[:, no_seq_columns],
)

In [None]:
# Re-express rows from the server's point of view. For rows whose column 2 is 0
# (presumably "player 2 served" -- verify against win_pct's column order) every
# value v becomes 1 - v, and the column blocks 3-7 <-> 8-12 and 13 <-> 14 swap
# places. Rows whose column 2 is non-zero are left unchanged.
server_wins = win_pct_np.copy()
flip_rows = win_pct_np[:, 2] == 0
swapped_columns = [0, 1, 2, *range(8, 13), *range(3, 8), 14, 13]
# server_wins is still an exact copy here, so reading the source rows from
# win_pct_np gives the same values.
server_wins[flip_rows] = 1 - win_pct_np[flip_rows][:, swapped_columns]
server_wins

In [None]:
# Target is column 1 of the server-perspective array; features are columns 3+.
y = server_wins[:, 1]
X = server_wins[:, 3:]
# Per-feature means, shown as a quick sanity check.
X.mean(axis=0)

Each value below is a mean squared error on the held-out rows (lower is better). The first two use all the features (logistic, then linear). The middle two use only each player's win % so far on points they've served (logistic, then linear). The last two use only the outcomes of the last 5 points each player has served (linear, then logistic).

In [None]:
# The first N_TRAIN rows are the training split; the rest are held out.
N_TRAIN = 400000

# Three feature sets, each fit with a linear and a logistic model:
#   all columns of X, columns 10+ ("no_seq"), and columns 0-9 ("only_last5").
linear_seq = LinearRegression().fit(X[:N_TRAIN], y[:N_TRAIN])
linear_no_seq = LinearRegression().fit(X[:N_TRAIN, 10:], y[:N_TRAIN])
linear_only_last5 = LinearRegression().fit(X[:N_TRAIN, :10], y[:N_TRAIN])
log_seq = LogisticRegression().fit(X[:N_TRAIN], y[:N_TRAIN])
log_no_seq = LogisticRegression().fit(X[:N_TRAIN, 10:], y[:N_TRAIN])
log_only_last5 = LogisticRegression().fit(X[:N_TRAIN, :10], y[:N_TRAIN])


def _held_out_mse(predicted):
  """Return the mean squared error of `predicted` against y[N_TRAIN:]."""
  return np.mean(np.square(y[N_TRAIN:] - predicted))


# Held-out mean squared errors, in the same order as before:
# (log all, linear all, log no_seq, linear no_seq, linear last5, log last5).
(
  _held_out_mse(log_seq.predict_proba(X[N_TRAIN:])[:, 1]),
  _held_out_mse(linear_seq.predict(X[N_TRAIN:])),
  _held_out_mse(log_no_seq.predict_proba(X[N_TRAIN:, 10:])[:, 1]),
  _held_out_mse(linear_no_seq.predict(X[N_TRAIN:, 10:])),
  _held_out_mse(linear_only_last5.predict(X[N_TRAIN:, :10])),
  # Score the predicted probability of class 1, like the other logistic
  # entries; .predict() returns hard class labels, which made this entry a
  # misclassification rate rather than a comparable squared error.
  _held_out_mse(log_only_last5.predict_proba(X[N_TRAIN:, :10])[:, 1]),
)
  

For comparison: the mean squared error if you only know who serves, i.e. if you always predict the server's win rate from the training rows.

In [None]:
# Squared error of predicting the constant train_rate for every held-out row:
# rows with y == 1 contribute (1 - train_rate)^2, rows with y == 0 contribute
# train_rate^2, weighted by how often each occurs in the held-out rows.
train_rate = np.mean(y[:400000])
test_rate = np.mean(y[400000:])
np.square((1 - train_rate)) * test_rate + np.square(train_rate) * (1 - test_rate)

In [None]:
# Histogram of the logistic model's predicted probability of class 1.
# - All rows are plotted (previously only the first 100), matching the other
#   prediction histograms in this notebook.
# - density=True normalises the bars so the y-axis agrees with its "Density"
#   label; plt.hist draws raw counts by default.
plt.hist(log_seq.predict_proba(X)[:, 1], density=True)
plt.xlabel("Prediction")
plt.ylabel("Density")
plt.title("Prediction Densities for Logistic Regression with Sequential Data")
plt.savefig("../plots/logseq.png")

In [None]:
# Histogram of the all-features linear model's predictions.
# stat="density" normalises the bars so the y-axis agrees with its "Density"
# label; histplot draws raw counts by default.
p = sns.histplot(linear_seq.predict(X), stat="density")
p.set(xlabel = 'Prediction', ylabel = 'Density', title="Probability Densities for Linear Regression with Sequential Data")

In [None]:
# linear_no_seq was fit on X[:, 10:], so it must be given those same columns.
# X[:, 0:2] has the same width and therefore ran without error, but fed the
# model two unrelated features.
sns.histplot(linear_no_seq.predict(X[:, 10:]))

In [None]:
# Histogram of the last-5-only linear model's predictions, using the same
# first ten columns of X that the model was fit on.
last5_predictions = linear_only_last5.predict(X[:, 0:10])
sns.histplot(last5_predictions)

In [None]:
# predict_proba returns one column per class; plot only the class-1
# probability (as the other logistic cells do) instead of overlaying both
# complementary columns in one histogram.
sns.histplot(log_only_last5.predict_proba(X[:, 0:10])[:, 1])

In [None]:
# Print the all-features linear model's coefficients with 5 digits of
# precision and very small values shown as zero.
coef_text = np.array2string(linear_seq.coef_, precision=5, suppress_small=True)
print(coef_text)

### TA meeting feedback

in presentation: list our variables both mathematically and in language

add more features and see if that boosts performance

try adding a normalized "number of points played so far in the match" feature

try LASSO or ridge regression?