### Timeseries cluster analysis

In [1]:
# Initial imports
import pandas as pd

## Initial settings
# Use the fully qualified option name: the bare 'max_columns' pattern is
# ambiguous on pandas versions that also define 'styler.render.max_columns'
# and raises OptionError there.
pd.set_option('display.max_columns', 60)

In [2]:
# Read the clustered NBA stats CSV into a DataFrame and preview the first rows
CLUSTERED_DATASET_CSV = "Resources/clustered_dataset.csv"
clusters_df = pd.read_csv(CLUSTERED_DATASET_CSV)
clusters_df.head()

Unnamed: 0,Player,year_born,Cluster 1,VORP 1,WS 1,Cluster 2,VORP 2,WS 2,Cluster 3,VORP 3,WS 3,Cluster 4,VORP 4,WS 4,Cluster 5,VORP 5,WS 5,Cluster 6,VORP 6,WS 6,Cluster 7,VORP 7,WS 7,Cluster 8,VORP 8,WS 8,Cluster 9,VORP 9,WS 9,Cluster 10,...,Cluster 12,VORP 12,WS 12,Cluster 13,VORP 13,WS 13,Cluster 14,VORP 14,WS 14,Cluster 15,VORP 15,WS 15,Cluster 16,VORP 16,WS 16,Cluster 17,VORP 17,WS 17,Cluster 18,VORP 18,WS 18,Cluster 19,VORP 19,WS 19,Cluster 20,VORP 20,WS 20,Cluster 21,VORP 21,WS 21
0,Tariq Abdul-Wahad,1974,,,,,,,,,,,,,,,,4.0,0.1,0.4,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
1,Shareef Abdur-Rahim,1976,,,,,,,,,,,,,,,,,,,8.0,2.8,9.7,8.0,1.6,8.1,3.0,1.3,5.4,3.0,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
2,Courtney Alexander,1977,,,,,,,7.0,-1.2,1.1,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
3,Malik Allen,1978,,,,3.0,-1.1,0.9,6.0,-0.4,0.7,0.0,0.0,1.1,0.0,-0.3,1.0,6.0,-0.5,1.0,7.0,-0.5,1.0,6.0,-0.4,0.3,6.0,-0.3,0.1,6.0,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
4,Ray Allen,1975,,,,,,,,,,,,,,,,,,,1.0,4.2,9.1,1.0,3.3,6.9,1.0,4.4,10.7,1.0,...,2.0,3.4,9.7,2.0,4.3,11.1,2.0,2.6,7.9,2.0,4.1,10.0,7.0,1.8,4.7,2.0,1.1,5.4,2.0,1.0,4.1,,,,,,,,,


### DataFrame Verification

In [5]:
# Take the fifth row (zero-based position 4) as a Series for manual inspection
example = clusters_df.iloc[4]
example

Player        Ray Allen
year_born          1975
Cluster 1           NaN
VORP 1              NaN
WS 1                NaN
                ...    
VORP 20             NaN
WS 20               NaN
Cluster 21          NaN
VORP 21             NaN
WS 21               NaN
Name: 4, Length: 65, dtype: object

In [6]:
# Check whether the selected row's "Cluster 10" entry is missing (NaN)
pd.isna(example.loc["Cluster 10"])

False

In [7]:
# Show the raw "Cluster 1" value of the selected row
example.loc["Cluster 1"]

nan

In [5]:
def collect_cluster_transitions(df):
    """Collect cluster changes between consecutive seasons, split by VORP trend.

    For every row of ``df`` and every pair of consecutive season numbers
    ``n`` and ``n + 1`` that both have a ``"Cluster <n>"`` column, a transition
    is recorded when both cluster values are present and differ.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with ``"Cluster <n>"`` and ``"VORP <n>"`` columns.

    Returns
    -------
    tuple[list[tuple[int, int]], list[tuple[int, int]]]
        ``(improvements, regressions)``. Each entry is a
        ``(cluster_before, cluster_after)`` pair. A pair is a regression when
        VORP strictly decreased; otherwise (increase or tie) it is an
        improvement. Pairs with a missing VORP on either side are skipped
        because they cannot be classified.
    """
    prefix = "Cluster "
    # Derive the season numbers from the column names instead of hard-coding 21
    seasons = sorted(
        int(col[len(prefix):])
        for col in df.columns
        if isinstance(col, str)
        and col.startswith(prefix)
        and col[len(prefix):].isdigit()
    )
    known_seasons = set(seasons)

    improved, regressed = [], []

    # Loop across players (rows), then across consecutive season pairs
    for _, row in df.iterrows():
        for season in seasons:
            following = season + 1
            if following not in known_seasons:
                continue

            cluster_before = row[f"Cluster {season}"]
            cluster_after = row[f"Cluster {following}"]
            # Only genuine changes between two known clusters are of interest
            if pd.isna(cluster_before) or pd.isna(cluster_after):
                continue
            if cluster_before == cluster_after:
                continue

            vorp_before = row[f"VORP {season}"]
            vorp_after = row[f"VORP {following}"]
            # A NaN comparison is always False and would otherwise be
            # silently counted as an improvement
            if pd.isna(vorp_before) or pd.isna(vorp_after):
                continue

            transition = (int(cluster_before), int(cluster_after))
            if vorp_before > vorp_after:
                regressed.append(transition)
            else:
                improved.append(transition)

    return improved, regressed


improvements, regressions = collect_cluster_transitions(clusters_df)

In [6]:
# Display every (cluster_before, cluster_after) pair recorded as an improvement
improvements

[(3, 6),
 (6, 0),
 (6, 7),
 (7, 6),
 (1, 2),
 (5, 1),
 (5, 2),
 (0, 3),
 (0, 3),
 (5, 7),
 (4, 5),
 (4, 5),
 (7, 4),
 (4, 6),
 (0, 3),
 (7, 2),
 (6, 0),
 (0, 6),
 (5, 4),
 (4, 5),
 (5, 1),
 (5, 2),
 (3, 8),
 (3, 6),
 (0, 3),
 (0, 6),
 (6, 0),
 (3, 8),
 (3, 8),
 (3, 8),
 (8, 3),
 (4, 5),
 (3, 6),
 (6, 0),
 (0, 8),
 (6, 3),
 (0, 6),
 (0, 7),
 (5, 2),
 (4, 7),
 (6, 7),
 (0, 3),
 (3, 0),
 (0, 3),
 (3, 6),
 (5, 4),
 (4, 7),
 (3, 2),
 (2, 1),
 (7, 2),
 (7, 2),
 (7, 6),
 (0, 8),
 (3, 8),
 (4, 7),
 (6, 7),
 (4, 7),
 (2, 1),
 (7, 2),
 (7, 2),
 (5, 1),
 (0, 8),
 (3, 8),
 (0, 3),
 (3, 8),
 (4, 6),
 (4, 5),
 (6, 0),
 (0, 3),
 (0, 3),
 (0, 6),
 (0, 6),
 (6, 0),
 (0, 6),
 (5, 1),
 (5, 1),
 (2, 1),
 (5, 2),
 (0, 7),
 (7, 3),
 (3, 7),
 (6, 7),
 (7, 6),
 (8, 3),
 (3, 8),
 (3, 8),
 (7, 5),
 (5, 1),
 (5, 1),
 (4, 6),
 (5, 4),
 (4, 5),
 (6, 3),
 (4, 5),
 (6, 3),
 (3, 6),
 (0, 6),
 (4, 5),
 (4, 5),
 (7, 2),
 (2, 1),
 (7, 2),
 (5, 4),
 (0, 6),
 (0, 6),
 (0, 3),
 (0, 3),
 (5, 2),
 (4, 7),
 (6, 0),
 (6, 0),
 

In [7]:
# Display every (cluster_before, cluster_after) pair recorded as a regression
regressions

[(8, 3),
 (0, 6),
 (2, 7),
 (7, 2),
 (1, 5),
 (2, 5),
 (3, 0),
 (3, 0),
 (3, 0),
 (2, 5),
 (5, 7),
 (7, 4),
 (5, 4),
 (2, 5),
 (7, 4),
 (1, 4),
 (5, 4),
 (5, 4),
 (5, 4),
 (5, 7),
 (5, 4),
 (4, 7),
 (3, 6),
 (5, 2),
 (2, 7),
 (2, 7),
 (5, 7),
 (7, 4),
 (3, 6),
 (2, 7),
 (7, 2),
 (2, 7),
 (7, 2),
 (1, 5),
 (2, 5),
 (5, 2),
 (2, 4),
 (1, 7),
 (7, 4),
 (8, 3),
 (3, 0),
 (0, 6),
 (8, 3),
 (8, 3),
 (2, 7),
 (0, 6),
 (6, 4),
 (4, 6),
 (0, 4),
 (5, 4),
 (0, 6),
 (3, 0),
 (8, 3),
 (3, 0),
 (0, 6),
 (7, 4),
 (3, 0),
 (7, 5),
 (2, 5),
 (5, 4),
 (3, 0),
 (8, 3),
 (4, 5),
 (5, 2),
 (7, 4),
 (7, 4),
 (1, 3),
 (1, 7),
 (7, 2),
 (2, 7),
 (7, 4),
 (2, 7),
 (2, 7),
 (8, 3),
 (8, 3),
 (3, 6),
 (0, 6),
 (2, 5),
 (5, 4),
 (7, 6),
 (7, 4),
 (7, 4),
 (5, 4),
 (1, 2),
 (2, 7),
 (2, 7),
 (1, 5),
 (1, 5),
 (3, 6),
 (3, 0),
 (8, 3),
 (8, 3),
 (3, 0),
 (8, 3),
 (5, 4),
 (5, 4),
 (3, 0),
 (0, 6),
 (3, 0),
 (3, 0),
 (0, 6),
 (3, 0),
 (6, 0),
 (1, 2),
 (2, 1),
 (1, 5),
 (1, 2),
 (1, 5),
 (2, 5),
 (7, 6),
 (3, 8),
 

In [8]:
# For each combination, make a dictionary: keys are the tuples and values are the aggregated counts of those tuples, then convert to a DataFrame for each regression and counter

## Variation in each cycle

In [9]:
from collections import Counter

In [10]:
# Tally how many times each (cluster_before, cluster_after) pair occurs among the improvements
Counter(improvements)

Counter({(3, 6): 11,
         (6, 0): 65,
         (6, 7): 32,
         (7, 6): 12,
         (1, 2): 10,
         (5, 1): 63,
         (5, 2): 30,
         (0, 3): 118,
         (5, 7): 26,
         (4, 5): 70,
         (7, 4): 28,
         (4, 6): 10,
         (7, 2): 119,
         (0, 6): 38,
         (5, 4): 30,
         (3, 8): 86,
         (8, 3): 17,
         (0, 8): 16,
         (6, 3): 26,
         (0, 7): 9,
         (4, 7): 69,
         (3, 0): 47,
         (3, 2): 23,
         (2, 1): 39,
         (7, 3): 11,
         (3, 7): 8,
         (7, 5): 26,
         (6, 4): 11,
         (2, 7): 28,
         (7, 8): 2,
         (8, 1): 5,
         (1, 8): 3,
         (6, 2): 3,
         (1, 5): 11,
         (7, 1): 5,
         (4, 2): 9,
         (2, 5): 6,
         (2, 8): 6,
         (3, 5): 1,
         (5, 8): 1,
         (4, 0): 3,
         (7, 0): 7,
         (3, 1): 2,
         (5, 6): 3,
         (8, 2): 3,
         (5, 3): 3,
         (6, 5): 1,
         (0, 2): 4,
         (