In [1]:
import ast

import pandas as pd

In [12]:
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split

# Read CSV file

In [2]:
# Read the raw draw history into a DataFrame.
# The path is relative, so the CSV must sit in the notebook's working directory.
LOTTO_CSV_PATH = "lotto_data_1_to_1148.csv"
lotto_data = pd.read_csv(LOTTO_CSV_PATH)

In [4]:
# Sanity-check the load by printing the last five rows (the most recent draws).
print(lotto_data.tail(n=5))

      Round        Date                  Numbers  Bonus
1143   1144  2024-11-02   [3, 4, 12, 15, 26, 34]      6
1144   1145  2024-11-09  [2, 11, 31, 33, 37, 44]     32
1145   1146  2024-11-16  [6, 11, 17, 19, 40, 43]     28
1146   1147  2024-11-23  [7, 11, 24, 26, 27, 37]     32
1147   1148  2024-11-30   [3, 6, 13, 15, 16, 22]     32


# Separate Numbers Columns

In [5]:
# Split the 'Numbers' column into six individual columns.
# Each cell holds a stringified list such as "[3, 4, 12, 15, 26, 34]".
# ast.literal_eval only parses Python literals, so a malformed or malicious
# CSV cell raises an error instead of executing code as eval() would.
numbers_df = pd.DataFrame(
    lotto_data['Numbers'].apply(ast.literal_eval).tolist(),
    columns=['Number1', 'Number2', 'Number3', 'Number4', 'Number5', 'Number6'],
    # Reuse the source index so the later column-wise concat lines up row for row.
    index=lotto_data.index,
)

In [6]:
# Attach the parsed number columns to the main DataFrame.
# Any Number1..Number6 columns left over from an earlier run of this cell are
# dropped first, so re-running the cell does not create duplicate columns.
lotto_data = pd.concat(
    [lotto_data.drop(columns=numbers_df.columns, errors='ignore'), numbers_df],
    axis=1,
)

In [7]:
# Remove the raw string column now that its values live in Number1..Number6.
# Rebinding (rather than inplace=True) together with errors='ignore' keeps the
# cell safe to re-run: a second execution finds no 'Numbers' column and simply
# leaves the frame unchanged instead of raising KeyError.
lotto_data = lotto_data.drop(columns=['Numbers'], errors='ignore')

In [17]:
# Confirm the reshaped frame: 'Numbers' is gone and Number1..Number6 are present.
print(lotto_data.tail(n=5))

      Round        Date  Bonus  Number1  Number2  Number3  Number4  Number5  \
1143   1144  2024-11-02      6        3        4       12       15       26   
1144   1145  2024-11-09     32        2       11       31       33       37   
1145   1146  2024-11-16     28        6       11       17       19       40   
1146   1147  2024-11-23     32        7       11       24       26       27   
1147   1148  2024-11-30     32        3        6       13       15       16   

      Number6  
1143       34  
1144       44  
1145       43  
1146       37  
1147       22  


# Missing value handling

In [10]:
# Count null entries in every column; all zeros means nothing needs imputing.
print("Missing values per column:", lotto_data.isna().sum(), sep="\n")

Missing values per column:
Round      0
Date       0
Bonus      0
Number1    0
Number2    0
Number3    0
Number4    0
Number5    0
Number6    0
dtype: int64


In [11]:
# No missing values were detected above, so no rows need to be filled or dropped.
# (A fill such as lotto_data.fillna(-1) would only be needed if gaps appeared.)

# Data scaling

In [13]:
# Non-scaled branch: take an independent (deep) copy so later changes to
# lotto_data or to the scaled frame cannot affect it.
original_data = lotto_data.copy(deep=True)

In [14]:
# Scaled branch: build a min-max scaled copy; lotto_data itself is left untouched.
number_cols = ['Number1', 'Number2', 'Number3', 'Number4', 'Number5', 'Number6']
scaled_data = lotto_data.copy()
scaler = MinMaxScaler()
# Only the six number columns are rescaled; Round, Date and Bonus keep their raw values.
scaled_data[number_cols] = scaler.fit_transform(scaled_data[number_cols])
# NOTE(review): the scaler is fitted on every row before the train/test split,
# so test-set minima/maxima influence the scaling. Consider fitting on the
# training rows only if this leakage matters for the model.

In [15]:
# Inspect the last five rows of the scaled frame.
print(scaled_data.tail(n=5))

      Round        Date  Bonus   Number1   Number2   Number3   Number4  \
1143   1144  2024-11-02      6  0.058824  0.057143  0.243243  0.263158   
1144   1145  2024-11-09     32  0.029412  0.257143  0.756757  0.736842   
1145   1146  2024-11-16     28  0.147059  0.257143  0.378378  0.368421   
1146   1147  2024-11-23     32  0.176471  0.257143  0.567568  0.552632   
1147   1148  2024-11-30     32  0.058824  0.114286  0.270270  0.263158   

       Number5   Number6  
1143  0.485714  0.592593  
1144  0.800000  0.962963  
1145  0.885714  0.925926  
1146  0.514286  0.703704  
1147  0.200000  0.148148  


# Dataset Separation

In [18]:
# Non-scaled dataset: the six drawn numbers are the features, the bonus number is the target.
X_original = original_data.loc[
    :, ['Number1', 'Number2', 'Number3', 'Number4', 'Number5', 'Number6']
]
y_original = original_data['Bonus']

# Hold out 20% of the rows for testing; the fixed seed makes the shuffle reproducible.
(
    X_train_original,
    X_test_original,
    y_train_original,
    y_test_original,
) = train_test_split(X_original, y_original, test_size=0.2, random_state=42)

In [19]:
# Scaled dataset: min-max scaled numbers are the features, the (unscaled) bonus is the target.
X_scaled = scaled_data.loc[
    :, ['Number1', 'Number2', 'Number3', 'Number4', 'Number5', 'Number6']
]
y_scaled = scaled_data['Bonus']

# 80/20 split using the same test_size and random_state as the non-scaled split.
(
    X_train_scaled,
    X_test_scaled,
    y_train_scaled,
    y_test_scaled,
) = train_test_split(X_scaled, y_scaled, test_size=0.2, random_state=42)

In [20]:
# Report the (rows, columns) shape of each feature split.
for label, features in (
    ("Original Data - Training set size:", X_train_original),
    ("Original Data - Testing set size:", X_test_original),
    ("Scaled Data - Training set size:", X_train_scaled),
    ("Scaled Data - Testing set size:", X_test_scaled),
):
    print(label, features.shape)

Original Data - Training set size: (918, 6)
Original Data - Testing set size: (230, 6)
Scaled Data - Training set size: (918, 6)
Scaled Data - Testing set size: (230, 6)
