In [1]:
import pandas as pd
# GAME_ID HOME_TEAM_ID VISITOR_TEAM_ID HOME_TEAM_WINS SEASON


# Load the NBA dataset into a DataFrame
nba_df = pd.read_csv('games.csv')

# Find the 15 teams that have played the most number of matches
all_teams = set(nba_df['HOME_TEAM_ID'].unique()) | set(nba_df['VISITOR_TEAM_ID'].unique())
team_match_counts = {}

for team_id in all_teams:
    home_matches = nba_df[nba_df['HOME_TEAM_ID'] == team_id]
    away_matches = nba_df[nba_df['VISITOR_TEAM_ID'] == team_id]
    total_matches = len(home_matches) + len(away_matches)
    team_match_counts[team_id] = total_matches

top_teams = sorted(team_match_counts, key=team_match_counts.get, reverse=True)[:15]

# Filter matches involving the top 15 teams
filtered_matches = nba_df[(nba_df['HOME_TEAM_ID'].isin(top_teams)) & (nba_df['VISITOR_TEAM_ID'].isin(top_teams))]

# Sort the filtered matches by season from most recent to old
filtered_matches = filtered_matches.sort_values(by='SEASON', ascending=False)

# Create a new DataFrame with the desired columns
# new_df = filtered_matches[['GAME_ID', 'HOME_TEAM_ID', 'VISITOR_TEAM_ID', 'HOME_TEAM_WINS', 'SEASON']]

# Rename columns for clarity
# new_df = new_df.rename(columns={
#     'HOME_TEAM_ID': 'Home_team_id',
#     'VISITOR_TEAM_ID': 'Away_team_id',
#     'HOME_TEAM_WINS': 'Winner'
# })

# Save the new DataFrame to a CSV file
# new_df.to_csv('filtered_nba_matches.csv', index=False)


In [2]:
print(top_teams)

[1610612748, 1610612738, 1610612759, 1610612747, 1610612744, 1610612742, 1610612739, 1610612745, 1610612743, 1610612754, 1610612737, 1610612760, 1610612765, 1610612746, 1610612749]


In [26]:
years = [year for year in range(2018, 2022)]
prob_matrices = {}
count_matrices = {}

print(str(filtered_matches['SEASON']))

2        2022
329      2022
381      2022
379      2022
372      2022
         ... 
18315    2003
18307    2003
18306    2003
18304    2003
19017    2003
Name: SEASON, Length: 6707, dtype: int64


In [38]:
# Calculate the number of times j defeats i for each year
# i = Host vs j =  away => p_host,away = + 1 if away defeats host
total = 0
total1 = 0
total2 = 0
cumulative_prob_matrix = pd.DataFrame(0, index=top_teams, columns=top_teams)
cumulative_count_matrix = pd.DataFrame(0, index=top_teams, columns=top_teams)
nested_list = pd.DataFrame(columns=top_teams, index=top_teams)

# Fill the DataFrame with empty lists
for team1 in top_teams:
    for team2 in top_teams:
        nested_list.at[team1, team2] = []


for year in years:
    matches = filtered_matches[filtered_matches['SEASON'] == year]
    prob_matrix = pd.DataFrame(0, index=top_teams, columns=top_teams)
    count_matrix = pd.DataFrame(0, index=top_teams, columns=top_teams)

    print(matches['GAME_ID'])

    for _,match in matches.iterrows():
        # print(str(match['Match Date']), str(match['Host_Country']), str(match['Team 1']), str(match['Team 2']), str(match['Winner']))
        total += 1
        count_matrix.at[match['HOME_TEAM_ID'], match['VISITOR_TEAM_ID']] += 1
        # print(match['Winner'])
        if match['HOME_TEAM_WINS']:
            total1 += 1
            prob_matrix.at[match['HOME_TEAM_ID'], match['VISITOR_TEAM_ID']] += 1
            nested_list.at[match['HOME_TEAM_ID'], match['VISITOR_TEAM_ID']].append((top_teams.index(match['VISITOR_TEAM_ID']), top_teams.index(match['HOME_TEAM_ID'])))
        else:
            total2 += 1
            nested_list.at[match['HOME_TEAM_ID'], match['VISITOR_TEAM_ID']].append((top_teams.index(match['HOME_TEAM_ID']), top_teams.index(match['VISITOR_TEAM_ID'])))
    cumulative_prob_matrix += prob_matrix
    cumulative_count_matrix += count_matrix

    print(count_matrix.values.sum(), prob_matrix.values.sum())
    prob_matrices[year] = prob_matrix
    count_matrices[year] = count_matrix
    # print(prob_matrix)
# Display the year-wise probability matrix

# print(cumulative_prob_matrix)
print(cumulative_count_matrix)
#print(np.sum(cumulative_prob_matrix, 'all'))

20545    21800416
20546    21800417
20542    21800426
20513    21800454
20539    21800423
           ...   
20049    21800917
20046    21800915
20040    21800925
20036    21800934
20026    21800940
Name: GAME_ID, Length: 347, dtype: int64
347 207
3976    21900387
3979    21900374
3980    21900373
3938    21900417
3987    21900368
          ...   
3513    21900846
3509    21900842
3501    21900849
3497    21900860
3491    21900869
Name: GAME_ID, Length: 328, dtype: int64
328 178
2777    22000424
2797    22000397
2791    22000407
2789    22000419
2787    22000416
          ...   
2366    22000756
2363    22000767
2362    22000766
2356    22000774
2355    22000321
Name: GAME_ID, Length: 292, dtype: int64
292 153
1486    22100381
1482    22100389
1456    22100407
1479    22100386
1476    22100383
          ...   
956     22100917
955     22100916
1014    22100856
962     22100908
967     22100897
Name: GAME_ID, Length: 348, dtype: int64
348 196
            1610612748  1610612738  161061275

In [28]:
print(cumulative_count_matrix.values.sum())
print(total,total1, total2)
print(cumulative_prob_matrix.values.sum())
print(len(top_teams))

1315
1315 734 581
734
15


In [2]:
import numpy as np

T = np.zeros(len(years))
yy = 0
for year in years:
  Xy = prob_matrices[year].values
  Cy = count_matrices[year].values
  Shat = np.where(Cy != 0, Xy / Cy, 0)/len(top_teams)

  print(Shat)
  for i in range(len(top_teams)):
    Shat[i, i] = 1 - np.sum(Shat[i, :])
  eigenvalues, eigenvectors = np.linalg.eig(Shat.T)
  pihat = eigenvectors[:, np.argmax(eigenvalues)]
  # pihat, D = np.linalg.eig(Shat.values.T)
  pihat = pihat / np.sum(pihat)
  
  print(pihat,prob_matrices[year])
  for i in range(len(top_teams)):
    for j in range(len(top_teams)):
        if Cy[i,j] > 1:
          T[yy] = T[yy] + (pihat[i] + pihat[j])**2*Xy[i,j]*(Xy[i,j] - 1)/(Cy[i,j]*(Cy[i,j]-1)) + pihat[j]**2 - 2*(pihat[i] + pihat[j])*pihat[j]*Xy[i,j]/Cy[i,j]
  yy += 1




NameError: name 'years' is not defined

In [29]:
import numpy as np
T = 0
yy = 0
Xy = cumulative_prob_matrix
Cy = cumulative_count_matrix

Shat = np.where(Cy != 0, Xy / Cy, 0)/len(top_teams)
print(Xy, Cy,Shat)

for i in range(len(top_teams)):
  Shat[i, i] = 1 - np.sum(Shat[i, :])
eigenvalues, eigenvectors = np.linalg.eig(Shat.T)
pihat = eigenvectors[:, np.argmax(eigenvalues)]
# pihat, D = np.linalg.eig(Shat.values.T)
pihat = pihat / np.sum(pihat)
print('hello')
print(pihat)
for i0 in range(len(top_teams)):
  for j0 in range(len(top_teams)):
    i = top_teams[i0]
    j = top_teams[j0]
    if Cy.at[i,j] > 1:
       T = T + (pihat[i0] + pihat[j0])**2*Xy.at[i,j]*(Xy.at[i,j] - 1)/(Cy.at[i,j]*(Cy.at[i,j]-1)) + pihat[j0]**2 - 2*(pihat[i0] + pihat[j0])*pihat[j0]*Xy.at[i,j]/Cy.at[i,j]





            1610612748  1610612738  1610612759  1610612747  1610612744  \
1610612748           0           6           5           3           3   
1610612738           6           0           1           2           3   
1610612759           2           3           0           3           4   
1610612747           5           2           3           0           7   
1610612744           3           2           2           5           0   
1610612742           1           3           5           3           6   
1610612739           2           4           2           0           0   
1610612745           2           3           5           4           4   
1610612743           4           2           9           4           4   
1610612754           3           5           1           3           0   
1610612737           7           3           2           2           3   
1610612760           1           0           4           2           2   
1610612765           2           3    

In [30]:
def compute_test_statistic(pihat,Z,K):
    n = len(pihat)
    T = 0
    # print(Z)
    for i in range(n):
        for j in range(n):
            pij_hat = pihat[i] + pihat[j]
            if K[i,j] > 1 and i!=j:
                T += ((pij_hat)**2 * (Z[i, j] * (Z[i, j] - 1))) / (K[i,j] * (K[i,j] - 1))  +    pihat[j]**2    -    2 * pihat[j] * pij_hat * (Z[i,j] / K[i,j])

    return T

In [31]:
def compute_stationary_distribution(S):
    eigenvalues, eigenvectors = np.linalg.eig(S.T)
    dominant_eigenvalue_index = np.argmax(np.abs(eigenvalues))
    dominant_eigenvalue = eigenvalues[dominant_eigenvalue_index]
    stationary_dist = np.real_if_close(np.abs(eigenvectors[:, dominant_eigenvalue_index]))

    # Normalize the stationary distribution
    stationary_dist /= np.sum(stationary_dist)

    return stationary_dist

In [32]:
def permute_entries(Z, K):
    n = len(Z)
    new_Z = [[[] for _ in range(n)] for _ in range(n)]  # Initialize new_Z as a nested list

    for i in range(n):
        for j in range(n):
            if i != j:
                # Combine Z[i][j] and Z[j][i] into a single list
                combined_entries = Z[i][j] + Z[j][i]
                
                # Shuffle the elements in the combined list
                random.shuffle(combined_entries)

                # Assign the first K[i][j] elements to Z[i][j]
                new_Z[i][j] = combined_entries[:K[i][j]]

                # Assign the last K[j][i] elements to Z[j][i]
                new_Z[j][i] = combined_entries[-K[j][i]:]

    return new_Z


In [33]:
def construct_Z_empirical_and_Kij(Zlist):
    n = len(Zlist)
    Z_empirical = [[0] * n for _ in range(n)]
    Kij = [[0] * n for _ in range(n)]

    for i in range(n):
        for j in range(n):
            if i!= j:
                count = sum(entry == (j, i) for entry in Zlist[i][j])
                Z_empirical[i][j] = count
                Kij[i][j] = len(Zlist[i][j])
        Kij[i][i] = 2
    return Z_empirical, Kij

In [34]:
def compute_Tfromlist(Z):
    Zemp , K = construct_Z_empirical_and_Kij(Z)
    n = len(K)
    Zemp =np.array(Zemp)
    K = np.array(K)
    # print(Zemp)
    # print(K)
    # print(np.sum(K, axis=1))
    Shat = np.where((K == 0), 0, Zemp / (K * n))
    for i in range(n):
        Shat[i, i] = 1 - np.sum(Shat[i, :])

    pihat = compute_stationary_distribution(Shat)
    T0 = compute_test_statistic(pihat,Zemp,K)

    return T0

In [35]:
import copy

def cycle_completion(pi, Zlist):
    n = len(pi)
    i = np.random.choice(range(n))
    Zlist2 = copy.deepcopy(Zlist)
    cycle = [i]
    lll = 0
    success =  False
    while lll < 10*n:
        j = np.random.choice(range(n))
        lll += 1
        # print(i,j,Zlist2) 
        if j != i and  len(Zlist2[i][j]) > 0:
            entry_num = np.random.choice(range(len(Zlist2[i][j])))
            entry = Zlist2[i][j][entry_num]
            #print(entry,entry[::-1], j,i)
            if entry == (j, i):
                Zlist2[i][j].remove(entry)
                Zlist2[j][i].append(entry[::-1])
                i = j
                lll = 0
                cycle.append(j)
                if i == cycle[0]:
                    success = True
                    break
    if success == False:
         print('I could not commplete the cycle')
            #   print(entry,entry[::-1], j,i)
    return Zlist2, success

In [37]:
import random
n = 15 

permuted_T1 = []
permuted_T2 = []
newZ = cumulative_prob_matrix.values
newK = cumulative_count_matrix.values
Zlist = nested_list.values
T0 = compute_Tfromlist(Zlist)


for u in range(300):
    permZ = permute_entries(Zlist,newK)
    
    permuted_T1.append(compute_Tfromlist(permZ))
    for _ in range(40):
        status = False
        while not status: # sometimes the chain is not irreducible thats why
            dummy, status = cycle_completion(np.ones(n)/n, permZ)
    permZ = dummy
    permuted_T2.append(compute_Tfromlist(permZ))
    print(n*permuted_T1[-1],n*permuted_T2[-1])
# Sort permuted test statistics
sorted_T1 = np.sort(permuted_T1)
sorted_T2 = np.sort(permuted_T2)

# Calculate the 95% value
percentile_index = int(0.95 * len(permuted_T1))
T1_95th = sorted_T1[percentile_index]
T2_95th = sorted_T2[percentile_index]
print(n*T0,n*T1_95th, n*T2_95th)




-0.5487401197422471 -0.4658716555282629
-0.8881583714437897 -0.9106370049145499
-0.04160798874475766 0.03427774263889064
-0.42503422461177154 -0.45708791044180147
-0.7689439066031556 -0.7737714085058373
-0.21349656461184038 -0.06482932959653392
0.2119727790242215 0.1695945079420331
0.0207108865410006 -0.05435860832142044
-0.31061864299341485 -0.3233986706734455
-0.6754334870432335 -0.6175284151274392
-1.0534654766431424 -0.6979808830644757
-0.46997310241419243 -0.6252770142473808
-0.199485921231917 0.17552067768109786
-0.4670692269334076 -0.5364430424643722
-0.295787990400422 -0.2367407109867677
-0.6214411594033883 -0.6883802837018939
-0.41658254290675667 -0.2621806848921269
-0.5547264873049597 -0.5765785343570102
-0.9273022966843057 -0.8848817757807869
-0.6287923819875914 -0.6958288793957912
-0.5687752150672681 -0.6094555148369519
-0.35567096461028247 -0.3264004416726314
-0.10679852616754813 -0.1270896944208159
-0.615511962156126 -0.44724893034211494
-0.8379118459717412 -0.82875264884

-0.9574876958939995 -1.0028668472346522
-0.6549798092551417 -0.5217411250064808
-0.36711228135898794 -0.3922767177388552
-0.19804170636089077 -0.1335955174057667
-0.8571459626861606 -0.9121075512815165
-0.4249861808824394 -0.4249861808824394
-0.88766090898136 -0.793530357251969
-0.34423073240849844 -0.41324037696338317
-0.6809831370452369 -0.736590284804501
-0.7519817745569849 -0.7830499192274424
-0.4310042839770818 -0.4177677075484458
-0.7216887879018876 -0.7114471564304724
-0.23585516057282005 -0.26439910868793465
-1.1486983043789296 -1.0869441105605846
-0.622240568750591 -0.5888982251314884
-0.955304167385493 -0.9764426073393241
-0.7419308922444474 -0.7419308922444474
0.0404951256138883 0.08949857119259629
-0.6818863539877069 -0.5604983925820086
-0.3181334956306713 -0.3603628361488497
-0.5116153562943793 -0.7181621528211394
-0.49089862858219946 -0.5899459589613685
-0.2905403528221739 -0.2881119112003921
0.09378338701790478 0.2655671487585072
-0.4686679222428124 -0.47064454523171684


In [3]:
np.arange(10, 101, 15)

array([ 10,  25,  40,  55,  70,  85, 100])

In [4]:
a = [-0.014135029428783835, -0.01035334896443867, -0.019754493929506213, -0.0211277497556467, -0.010871098459750546, -0.01733415353471417, -0.020074245825828285, -0.011187088603470362, -0.00973494533792751, -0.013863765963976942, -0.021605773473217176, -0.004048517632813772, -0.013197326039448214, -0.007634512764557639, -0.01297837958452312, -0.016965143947566583, -0.009151435715771095, -0.016137993478072696, -0.018691295895308387, -0.011293953032500938, -0.011349785165692601, -0.01059676473639074, -0.014398930142051663, -0.010649126180015256, -0.014588406356499481, -0.015510487149642113, -0.014387841868063427, -0.02007671890156827, -0.012142984618324332, -0.017679774454733642, -0.01804212414828632, -0.017209358935761525, -0.012957712157555078, -0.01648465907723356, -0.016398744351662897, -0.008944908307521498, -0.014670652470735072, -0.017482694781960774, -0.013204843187454643, -0.00791482098903216, -0.016092786134264524, -0.010982906776447048, -0.020378515726874484, -0.015831960903200947, -0.0151120677183842, -0.017753384289324364, -0.02076081523403781, -0.014885845102372394, -0.015832740561163156, -0.015967002352225063, -0.011808486146165864, -0.01227012411099735, -0.006692330664098672, -0.01787728577577222, -0.01286512032137645, -0.019946283029483328, -0.01442681802650387, -0.008370631121279322, -0.020064241202914813, -0.006470609707310915, -0.02174378421687933, -0.008986000607894357, -0.010793313596878901, -0.015425385290896846, -0.008881488775572814, -0.016179999338008007, -0.019475379614770114, -0.016827449669873786, -0.009301888908754555, -0.009799711910672956, -0.012579029340689924, -0.009738843576478407, -0.014146329611988034, -0.01379057817476268, -0.008102507836644022, -0.022451451752081533, -0.008451691271673629, -0.01534769352022086, -0.008671329585756981, -0.01493100046579851, -0.018637156706730454, -0.013954412285973808, -0.015192124524630113, -0.01941966478737369, -0.015337698264636544, -0.009928794531901451, -0.01664519717927959, -0.01080885074118369, -0.017450184402762126, -0.012879531302655227, -0.018946467961225046, -0.01308256797319917, -0.012318554947109669, -0.01723309802714146, -0.014817119572594383, -0.022200282293712365, -0.015482837090716685, -0.016906163878163703, -0.017555987787186686, -0.00995377630922472]
np.sort(a)*35*8

array([-6.28640649, -6.21607904, -6.08825958, -6.04961657, -5.91576993,
       -5.81302827, -5.7059844 , -5.62148129, -5.62078883, -5.61798754,
       -5.58495925, -5.5312583 , -5.45310629, -5.43750614, -5.30501103,
       -5.23356285, -5.21840388, -5.05179476, -5.00564002, -4.9709476 ,
       -4.95033685, -4.91567658, -4.89515454, -4.88605163, -4.85356299,
       -4.82526745, -4.8186205 , -4.75024031, -4.73372589, -4.71168591,
       -4.66065521, -4.61570454, -4.59164842, -4.53039981, -4.51863817,
       -4.50598012, -4.47076066, -4.43316736, -4.43294905, -4.3429364 ,
       -4.33519439, -4.31910788, -4.29735419, -4.29455551, -4.25379487,
       -4.23137896, -4.18068013, -4.16803663, -4.14879348, -4.10778269,
       -4.08475378, -4.03950905, -4.03170044, -4.02859572, -3.96097229,
       -3.95780824, -3.90723544, -3.88185447, -3.86136189, -3.69735609,
       -3.69525129, -3.66311903, -3.63394628, -3.6281594 , -3.60626876,
       -3.60223369, -3.52212822, -3.44919539, -3.43563475, -3.40