### Random Forest example in predicting numbers
##### https://github.com/KN4KNG/LotteryNumberPredictor/tree/main

#### powerball

In [31]:
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from random import randint


In [2]:
power_url = r'https://www.texaslottery.com/export/sites/lottery/Games/Powerball/Winning_Numbers/powerball.csv'
mega_url = r'https://www.texaslottery.com/export/sites/lottery/Games/Mega_Millions/Winning_Numbers/megamillions.csv'


In [3]:
# read and save data from url
def get_data(url):
    df = pd.read_csv(url, header=None)
    df = df.iloc[:,:-1]
    df.columns = ['type', 'month', 'day', 'year', 'num1', 'num2','num3','num4','num5','num6']
    df['download_time'] = pd.Timestamp.today(tz='America/Chicago').strftime('%Y-%m-%d, %I:%M:%S %p %Z')
    df['date'] = pd.to_datetime(dict(year=df.year, month=df.month, day=df.day))
    df = df.reindex(columns=['type', 'date', 'num1', 'num2','num3','num4','num5','num6', 'download_time'])
    filename = df['type'].unique()[0]+ '_'+pd.Timestamp.today(tz='America/Chicago').strftime('%Y%m%d')
    df.to_csv('./data/'+filename+'.csv', index=False)
    return df


In [4]:
# get data
df = get_data(power_url)


In [25]:
df.describe()


Unnamed: 0,date,num1,num2,num3,num4,num5,num6
count,1713,1713.0,1713.0,1713.0,1713.0,1713.0,1713.0
mean,2018-01-19 20:54:13.239930112,33.317572,32.79568,33.641565,33.864565,33.928196,15.382954
min,2010-02-03 00:00:00,1.0,1.0,1.0,1.0,1.0,1.0
25%,2014-03-12 00:00:00,17.0,16.0,17.0,18.0,18.0,8.0
50%,2018-04-18 00:00:00,33.0,32.0,33.0,33.0,34.0,15.0
75%,2022-02-21 00:00:00,49.0,50.0,50.0,50.0,50.0,22.0
max,2024-11-16 00:00:00,69.0,69.0,69.0,69.0,69.0,39.0
std,,18.964601,19.385003,19.238966,19.068687,19.275429,9.046246


In [26]:
# get the data for modeling except the latest result for validating
# include only the number columns
df_model = df.iloc[:-1, 2:-1]


In [30]:
df_model.describe()


Unnamed: 0,num1,num2,num3,num4,num5,num6
count,1712.0,1712.0,1712.0,1712.0,1712.0,1712.0
mean,33.314836,32.800234,33.648364,33.872079,33.929322,15.382593
std,18.969804,19.38975,19.242528,19.071721,19.281004,9.048877
min,1.0,1.0,1.0,1.0,1.0,1.0
25%,17.0,16.0,17.0,18.0,18.0,8.0
50%,33.0,32.0,33.0,33.0,34.0,15.0
75%,49.0,50.0,50.0,50.0,50.0,22.0
max,69.0,69.0,69.0,69.0,69.0,39.0


In [None]:
len(df_model)


1712

In [49]:
# Split the data into features (X) and target (y)
# X = df_model[['num1', 'num2', 'num3', 'num4', 'num5', 'num6']]
X = df_model.copy()
y = df_model.copy()


In [None]:
X.describe()


Unnamed: 0,num1,num2,num3,num4,num5,num6
count,1712.0,1712.0,1712.0,1712.0,1712.0,1712.0
mean,33.314836,32.800234,33.648364,33.872079,33.929322,15.382593
std,18.969804,19.38975,19.242528,19.071721,19.281004,9.048877
min,1.0,1.0,1.0,1.0,1.0,1.0
25%,17.0,16.0,17.0,18.0,18.0,8.0
50%,33.0,32.0,33.0,33.0,34.0,15.0
75%,49.0,50.0,50.0,50.0,50.0,22.0
max,69.0,69.0,69.0,69.0,69.0,39.0


In [None]:
y.describe()


Unnamed: 0,num1,num2,num3,num4,num5,num6
count,1712.0,1712.0,1712.0,1712.0,1712.0,1712.0
mean,33.314836,32.800234,33.648364,33.872079,33.929322,15.382593
std,18.969804,19.38975,19.242528,19.071721,19.281004,9.048877
min,1.0,1.0,1.0,1.0,1.0,1.0
25%,17.0,16.0,17.0,18.0,18.0,8.0
50%,33.0,32.0,33.0,33.0,34.0,15.0
75%,49.0,50.0,50.0,50.0,50.0,22.0
max,69.0,69.0,69.0,69.0,69.0,39.0


In [None]:
# split data into test and train
x_train, x_test, y_train, y_test = train_test_split(X, y,
                                                    test_size= 0.2, train_size= 0.8, random_state=104, shuffle= False, stratify=None)


In [None]:
len(X), len(y), len(x_test), len(y_test), len(x_train), len(y_train)


(1712, 1712, 343, 343, 1369, 1369)

In [None]:
# Train a Random Forest Regression model
model = RandomForestRegressor(n_estimators=1000, random_state=None)
model.fit(x_test, y_test)


In [None]:
model.score(x_test, y_test)


0.9694164876311788

In [None]:
predictions = model.predict(x_test)


#### ??? This section did not make sense as how the best alternative is selected

In [None]:
# Get the most likely set of numbers based on the predictions
most_likely_set = predictions[0]
for p in predictions:
    if p[0] > most_likely_set[0]:
        print(p)
        print('yes')
        most_likely_set = p

#### ########################


[53.684 64.249 40.298 44.996 61.96   7.373]
yes
[61.692 11.74  31.359  8.446 15.245 18.785]
yes
[65.493 20.676 20.989 63.531  9.008 21.154]
yes
[65.732 21.912 17.335 63.725  7.929 12.414]
yes


In [None]:
predictions[0]


array([ 5.878, 33.442, 16.325, 28.062, 35.699, 18.441])

In [None]:
most_likely_set


array([65.732, 21.912, 17.335, 63.725,  7.929, 12.414])

In [None]:
predictions[100]


array([17.727, 53.701, 61.081, 55.667, 53.534, 11.16 ])

In [None]:
predictions[1][0]


53.684

In [None]:
most_likely_set


array([65.732, 21.912, 17.335, 63.725,  7.929, 12.414])

In [None]:
# Convert most_likely_set to whole numbers
rounded_most_likely_set = [round(x) for x in most_likely_set]


In [None]:
rounded_most_likely_set


[66, 22, 17, 64, 8, 12]

In [None]:
df[-1:]


Unnamed: 0,type,date,num1,num2,num3,num4,num5,num6,download_time
1712,Powerball,2024-11-16,38,25,22,21,32,16,"2024-11-17, 10:09:08 AM CST"


In [None]:
###################################################################################################################################


In [None]:
data = pd.read_excel("previous_data.xlsx")


In [None]:
(data.iloc[:, 1:]).describe()


In [None]:
(data[['1st_number', '2nd_number', '3rd_number', '4th_number', '5th_number', '6th_number']]).describe()


In [None]:
i = 0
while i < 10: 
    # Load the data from Excel file
    data = pd.read_excel(".\data\previous_data.xlsx")

    # Split the data into features (X) and target (y)
    X = data[['1st_number', '2nd_number', '3rd_number', '4th_number', '5th_number', '6th_number']]
    y = data.iloc[:, 1:]

    # Train a Random Forest Regression model
    model = RandomForestRegressor(n_estimators=1000, random_state=None)
    model.fit(X, y)

    # Generate a new set of random features for prediction
    new_data = pd.DataFrame({
        "1st_number": [randint(1, 70) for _ in range(100)],
        "2nd_number": [randint(1, 70) for _ in range(100)],
        "3rd_number": [randint(1, 70) for _ in range(100)],
        "4th_number": [randint(1, 70) for _ in range(100)],
        "5th_number": [randint(1, 70) for _ in range(100)],
        "6th_number": [randint(1, 25) for _ in range(100)],
    })

    # Use the trained model to predict the next 6 numbers for each set of features
    predictions = model.predict(new_data)

    # Get the most likely set of numbers based on the predictions
    most_likely_set = predictions[0]
    for p in predictions:
        if p[0] > most_likely_set[0]:
            most_likely_set = p

    # Convert most_likely_set to whole numbers
    rounded_most_likely_set = [round(x) for x in most_likely_set]

    # Print the most likely set of numbers
    print(str(f"{i+1:02d}") + ". The most likely set of numbers is:", rounded_most_likely_set)
    i += 1


01. The most likely set of numbers is: [38, 51, 57, 59, 64, 8]
02. The most likely set of numbers is: [36, 48, 54, 57, 62, 8]
03. The most likely set of numbers is: [37, 50, 56, 58, 64, 8]
04. The most likely set of numbers is: [38, 49, 56, 59, 64, 11]
05. The most likely set of numbers is: [36, 45, 54, 58, 65, 9]
06. The most likely set of numbers is: [34, 45, 51, 55, 60, 19]
07. The most likely set of numbers is: [36, 48, 54, 57, 63, 8]
08. The most likely set of numbers is: [38, 49, 54, 56, 63, 9]
09. The most likely set of numbers is: [38, 50, 57, 59, 64, 8]
10. The most likely set of numbers is: [37, 48, 53, 57, 63, 11]


In [None]:
# Load the data from Excel file
data = pd.read_excel("./data/previous_data.xlsx")

# Split the data into features (X) and target (y)
X1 = data[['1st_number', '2nd_number', '3rd_number', '4th_number', '5th_number', '6th_number']]
y1 = data.iloc[:, 1:]

# Train a Random Forest Regression model
model = RandomForestRegressor(n_estimators=1000, random_state=None)
model.fit(X1, y1)

# Generate a new set of random features for prediction
new_data = pd.DataFrame({
    "1st_number": [randint(1, 70) for _ in range(100)],
    "2nd_number": [randint(1, 70) for _ in range(100)],
    "3rd_number": [randint(1, 70) for _ in range(100)],
    "4th_number": [randint(1, 70) for _ in range(100)],
    "5th_number": [randint(1, 70) for _ in range(100)],
    "6th_number": [randint(1, 25) for _ in range(100)],
})


In [None]:
new_data


Unnamed: 0,1st_number,2nd_number,3rd_number,4th_number,5th_number,6th_number
0,45,36,10,19,20,25
1,43,53,70,6,65,5
2,1,62,40,46,12,23
3,11,65,47,3,60,25
4,15,60,63,53,45,7
...,...,...,...,...,...,...
95,45,19,7,26,2,4
96,13,7,21,11,48,19
97,49,35,68,20,67,21
98,40,55,45,6,6,1


In [None]:
# Use the trained model to predict the next 6 numbers for each set of features
predictions = model.predict(new_data)


In [None]:
predictions


array([[ 5.186,  9.651, 14.424, 18.231, 23.837, 18.603],
       [36.137, 47.497, 53.736, 56.251, 64.044,  8.061],
       [ 6.798, 37.243, 42.621, 46.244, 56.81 , 18.329],
       [ 9.548, 40.198, 45.648, 48.283, 63.566, 21.132],
       [12.38 , 46.986, 53.778, 58.244, 64.097,  9.05 ],
       [15.592, 21.42 , 38.725, 43.151, 52.41 , 14.539],
       [ 4.713, 11.372, 55.374, 60.117, 66.126,  3.865],
       [19.345, 24.85 , 38.148, 42.426, 49.943, 10.798],
       [18.589, 26.377, 31.152, 41.832, 46.453, 17.748],
       [16.122, 23.198, 27.83 , 55.385, 65.64 ,  8.378],
       [ 9.158, 16.344, 21.045, 27.955, 34.427,  7.469],
       [31.17 , 38.27 , 43.011, 47.721, 55.021, 14.607],
       [ 4.309,  8.056, 14.032, 17.788, 23.036,  8.283],
       [ 4.346,  7.462, 11.926, 19.272, 55.117, 15.408],
       [ 7.693, 13.042, 16.727, 24.057, 59.882,  7.818],
       [ 7.941, 12.135, 15.975, 25.154, 59.821, 14.319],
       [ 6.399, 11.71 , 18.922, 22.064, 28.313, 18.194],
       [10.873, 24.348, 40.61 ,

In [None]:
# single_occurence = {}
# regular_occurence = {}

# for record in allrecords:
#     for regnum in record:
#         if regnum in regular_occurence:
#             occurence = regular_occurence[regnum]
#             occurence += 1
#             regular_occurence[regnum] = occurence
#         else:
#             regular_occurence[regnum] = 1

# for pnum in num6:
#     if pnum in single_occurence:
#         occurence = single_occurence[pnum]
#         occurence += 1
#         single_occurence[pnum] = occurence
#     else:
#         single_occurence[pnum] = 1


# # print(regular_occurence)
# # regular_occurence_sorted = sorted(regular_occurence.items(), key= lambda x:x[1])
# regular_occurence_sorted = sorted(regular_occurence.items(), key= lambda x:x[1], reverse = True)
# single_occurence_sorted = sorted(single_occurence.items(), key = lambda x:x[1], reverse = True)
# # single_occurence_sorted 
# # regular_occurence_sorted

# # create a list of numbers sorted based on occurence
# regular_by_occurence = list(map(lambda x: x[0], regular_occurence_sorted ))
# single_by_occurence = list(map(lambda x: x[0], single_occurence_sorted))


# # single_by_occurence[:1][0]
# regular_by_occurence[:5]

# winningTicket = ' '.join(map(str, regular_by_occurence[:5] + single_by_occurence[:1]))
# print(f'Powerball most frequent nubers: {winningTicket}')


Powerball most frequent nubers: 39 36 32 21 23 24
