In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
import mlflow

In [2]:
# Read the raw Kobe dataset CSV (path is relative to the notebook's folder)
# and preview its first rows as the cell output.
raw_csv_path = './../Data/Raw/kobe_dataset.csv'
kobe_data = pd.read_csv(raw_csv_path)
kobe_data.head()

Unnamed: 0,action_type,combined_shot_type,game_event_id,game_id,lat,loc_x,loc_y,lon,minutes_remaining,period,...,shot_type,shot_zone_area,shot_zone_basic,shot_zone_range,team_id,team_name,game_date,matchup,opponent,shot_id
0,Jump Shot,Jump Shot,10,20000012,33.9723,167,72,-118.1028,10,1,...,2PT Field Goal,Right Side(R),Mid-Range,16-24 ft.,1610612747,Los Angeles Lakers,2000-10-31,LAL @ POR,POR,1
1,Jump Shot,Jump Shot,12,20000012,34.0443,-157,0,-118.4268,10,1,...,2PT Field Goal,Left Side(L),Mid-Range,8-16 ft.,1610612747,Los Angeles Lakers,2000-10-31,LAL @ POR,POR,2
2,Jump Shot,Jump Shot,35,20000012,33.9093,-101,135,-118.3708,7,1,...,2PT Field Goal,Left Side Center(LC),Mid-Range,16-24 ft.,1610612747,Los Angeles Lakers,2000-10-31,LAL @ POR,POR,3
3,Jump Shot,Jump Shot,43,20000012,33.8693,138,175,-118.1318,6,1,...,2PT Field Goal,Right Side Center(RC),Mid-Range,16-24 ft.,1610612747,Los Angeles Lakers,2000-10-31,LAL @ POR,POR,4
4,Driving Dunk Shot,Dunk,155,20000012,34.0443,0,0,-118.2698,6,2,...,2PT Field Goal,Center(C),Restricted Area,Less Than 8 ft.,1610612747,Los Angeles Lakers,2000-10-31,LAL @ POR,POR,5


In [3]:
# Columns carried forward from the raw frame
selected_columns = [
    'lat', 'lon', 'minutes_remaining', 'period',
    'playoffs', 'shot_distance', 'shot_made_flag',
]

# In one pass: discard every row that has a missing value in any column,
# keep only the rows whose shot_type is '2PT Field Goal', and narrow the
# frame down to the selected columns.
kobe_data = (
    kobe_data
    .dropna()
    .loc[lambda shots: shots['shot_type'] == '2PT Field Goal', selected_columns]
)

# Show the filtered frame
print(kobe_data)

# Share of each 'shot_made_flag' value, as a percentage, to check class balance
class_distribution = kobe_data['shot_made_flag'].value_counts(normalize=True).mul(100)
print("Class Distribution for 'shot_made_flag':\n", class_distribution)

           lat       lon  minutes_remaining  period  playoffs  shot_distance  \
1      34.0443 -118.4268                 10       1         0             15   
2      33.9093 -118.3708                  7       1         0             16   
3      33.8693 -118.1318                  6       1         0             22   
4      34.0443 -118.2698                  6       2         0              0   
5      34.0553 -118.4148                  9       3         0             14   
...        ...       ...                ...     ...       ...            ...   
30690  33.9443 -118.3828                 11       4         1             15   
30691  34.0443 -118.2698                  7       4         1              0   
30692  33.9963 -118.2688                  6       4         1              4   
30694  33.8783 -118.4038                  3       4         1             21   
30696  33.9723 -118.2688                  0       4         1              7   

       shot_made_flag  
1              

In [4]:
# Split settings, defined once so the values handed to train_test_split are
# exactly the ones recorded in MLflow (previously 0.2 was typed in two places
# and the seed was not recorded at all).
TEST_SIZE = 0.2
RANDOM_STATE = 42

# Persist the filtered frame as a Parquet (binary) file
kobe_data.to_parquet("./../Data/Processed/data_filtered.parquet")

# Report the (rows, columns) shape of the filtered dataset
print("Dimensão do dataset resultante:", kobe_data.shape)

# Separate the features (X) from the target column (y)
X = kobe_data.drop("shot_made_flag", axis=1)
y = kobe_data["shot_made_flag"]

# Stratify on y so the train and test splits keep the same class proportions
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=TEST_SIZE, stratify=y, random_state=RANDOM_STATE
)

# Re-attach the target to its features (joined on the row index) and save
# each split to its own Parquet file
X_train.join(y_train).to_parquet("./../Data/Modeling/base_train.parquet")
X_test.join(y_test).to_parquet("./../Data/Modeling/base_test.parquet")

# Record the split configuration and the resulting split sizes in MLflow.
# Note: the "test_size" *param* is the requested fraction, whereas the
# "test_size" *metric* is the number of rows in the test split.
with mlflow.start_run():
    mlflow.log_param("test_size", TEST_SIZE)
    mlflow.log_param("random_state", RANDOM_STATE)
    mlflow.log_metric("train_size", len(X_train))
    mlflow.log_metric("test_size", len(X_test))

Dimensão do dataset resultante: (20285, 7)
