In [1]:

from pyspark.sql.functions import count, date_sub, current_date, split, lit
import pandas as pd
from decimal import Decimal
from lib import common_functions
from lib import configuration
#from lib.predict_corners import get_corner_predictions

from my_predictive_model.src.predict import get_corner_predictions

In [2]:
# Obtain the Spark session through the project helper; 'corners' is the
# argument handed to it (presumably the application name — TODO confirm).
spark = common_functions.get_spark_session('corners')
# Last expression of the cell, so its return value becomes the cell output.
spark.active()

In [3]:
# Read every CSV file in the configured input folder into a single DataFrame,
# taking column names from the header line.
corners_csv_glob = f'{configuration.football_corners_input_path}/*.csv'
df_corners = spark.read.format('csv').option("header", True).load(corners_csv_glob)
# Total number of rows loaded (displayed as the cell output).
df_corners.count()

1364

Filter to create new DataFrames

In [4]:
# History subset: rows where '1HC_Res' is populated.
has_first_half_result = df_corners['1HC_Res'].isNotNull()
df_corners_history = df_corners.where(has_first_half_result)
corners_history_ct = df_corners_history.count()
print(corners_history_ct)

1235


In [5]:
# History rows whose 'Fixture' text contains the marker '- AW'.
fixture_has_aw_marker = df_corners_history['Fixture'].contains('- AW')
df_corners_history_aw = df_corners_history.where(fixture_has_aw_marker)
df_corners_history_aw_ct = df_corners_history_aw.count()
print(df_corners_history_aw_ct)

141


In [6]:
# Rows still to be predicted: '1HC' is present but '1HC_Res' is not yet filled in.
awaiting_result = df_corners['1HC'].isNotNull() & df_corners['1HC_Res'].isNull()
df_corners_predict = df_corners.where(awaiting_result)
corners_predict_ct = df_corners_predict.count()
print(corners_predict_ct)

29


## Calculate the count of rows where the specified columns are equal to 0

In [7]:
# Count history rows with zero (or very few) corners, then express each count
# as a percentage of all history rows (corners_history_ct).

# Rows where a single result column equals 0.
count_1HC_Res_0 = df_corners_history.filter(df_corners_history['1HC_Res'] == 0).count()
count_1AC_Res_0 = df_corners_history.filter(df_corners_history['1AC_Res'] == 0).count()
count_2HC_Res_0 = df_corners_history.filter(df_corners_history['2HC_Res'] == 0).count()
count_2AC_Res_0 = df_corners_history.filter(df_corners_history['2AC_Res'] == 0).count()

# Rows where both result columns of the same half equal 0.
count_1HC_1AC_Res_0 = df_corners_history.filter((df_corners_history['1HC_Res'] == 0) & (df_corners_history['1AC_Res'] == 0)).count()
count_2HC_2AC_Res_0 = df_corners_history.filter((df_corners_history['2HC_Res'] == 0) & (df_corners_history['2AC_Res'] == 0)).count()

# Rows where the per-half sum is below a threshold.
# NOTE: despite the "df_" prefix these two names hold row counts, not DataFrames;
# the names are kept so later references keep working.
df_corners_history_filter_sum_less_3 = df_corners_history.filter((df_corners_history['1HC_Res'] + df_corners_history['1AC_Res']) < 3).count()
df_corners_history_filter_sum_less_4 = df_corners_history.filter((df_corners_history['2HC_Res'] + df_corners_history['2AC_Res']) < 4).count()


# Calculate the percentage of history rows for each count
perc_1HC_Res_0 = (count_1HC_Res_0 / corners_history_ct) * 100
perc_1AC_Res_0 = (count_1AC_Res_0 / corners_history_ct) * 100
perc_2HC_Res_0 = (count_2HC_Res_0 / corners_history_ct) * 100
perc_2AC_Res_0 = (count_2AC_Res_0 / corners_history_ct) * 100
perc_1HC_1AC_Res_0 = (count_1HC_1AC_Res_0 / corners_history_ct) * 100
perc_2HC_2AC_Res_0 = (count_2HC_2AC_Res_0 / corners_history_ct) * 100
perc_sum_less_3 = (df_corners_history_filter_sum_less_3 / corners_history_ct) * 100
perc_sum_less_4 = (df_corners_history_filter_sum_less_4 / corners_history_ct) * 100


print(f'Percentage of 1HC_Res zero: {perc_1HC_Res_0}%')
print(f'Percentage of 1AC_Res zero: {perc_1AC_Res_0}%')
print(f'Percentage of 2HC_Res zero: {perc_2HC_Res_0}%')
print(f'Percentage of 2AC_Res zero: {perc_2AC_Res_0}%')
print(f'Percentage of 1HC_Res and 1AC_Res both zero: {perc_1HC_1AC_Res_0}%')
print(f'Percentage of 2HC_Res and 2AC_Res both zero: {perc_2HC_2AC_Res_0}%')
print(f'Percentage of rows where 1HC_Res + 1AC_Res < 3: {perc_sum_less_3}%')
# This figure comes from the 2HC_Res + 2AC_Res filter, so the label names those columns.
print(f'Percentage of rows where 2HC_Res + 2AC_Res < 4: {perc_sum_less_4}%')

Percentage of 1HC_Res zero: 8.987854251012145%
Percentage of 1AC_Res zero: 16.356275303643724%
Percentage of 2HC_Res zero: 8.097165991902834%
Percentage of 2AC_Res zero: 13.360323886639677%
Percentage of 1HC_Res and 1AC_Res both zero: 0.8097165991902834%
Percentage of 2HC_Res and 2AC_Res both zero: 0.8906882591093117%
Percentage of rows where 1HC_Res + 1AC_Res < 3: 14.736842105263156%
Percentage of rows where 1HC_Res + 1AC_Res < 4: 23.481781376518217%


## Get recent history (rows inserted in the last 15 days) for re-use.

In [8]:
# Keep only history rows whose 'InsertDate' falls within the last
# HISTORY_WINDOW_DAYS days, counted back from today's date.
# NOTE(review): the variable name says "prev_month" but the window is 15 days;
# confirm which one is intended. The value is left at 15 to preserve results.
HISTORY_WINDOW_DAYS = 15
window_start = date_sub(current_date(), HISTORY_WINDOW_DAYS)
df_corners_history_prev_month = df_corners_history.filter(df_corners_history['InsertDate'] >= window_start)
df_corners_history_prev_month_ct = df_corners_history_prev_month.count()
print(df_corners_history_prev_month_ct)

139


In [9]:
# Split 'Fixture' on ' vs ': the first piece becomes 'HomeTeam', the second 'AwayTeam'.
fixture_parts = split(df_corners_history_prev_month['Fixture'], ' vs ')
df_split = (
    df_corners_history_prev_month
    .withColumn('HomeTeam', fixture_parts[0])
    .withColumn('AwayTeam', fixture_parts[1])
)

# Both per-team frames carry the same eight statistic columns next to the team name.
stat_columns = ['1HC', '1AC', '2HC', '2AC', '1HCA', '1ACA', '2HCA', '2ACA']
df_corners_history_home = df_split.select('HomeTeam', *stat_columns)
df_corners_history_away = df_split.select('AwayTeam', *stat_columns)

# In the home frame, overwrite the AC* columns with the placeholder '-'.
for masked_column in ('1AC', '2AC', '1ACA', '2ACA'):
    df_corners_history_home = df_corners_history_home.withColumn(masked_column, lit('-'))

# In the away frame, overwrite the HC* columns with the placeholder '-'.
for masked_column in ('1HC', '2HC', '1HCA', '2HCA'):
    df_corners_history_away = df_corners_history_away.withColumn(masked_column, lit('-'))

## Train and export predictions using sklearn

In [10]:
# Materialise both Spark frames as pandas DataFrames and hand them to the
# imported prediction routine (history first, rows to predict second).
# NOTE(review): what the routine trains or writes is not visible here — see its module.
history_pdf = df_corners_history.toPandas()
predict_pdf = df_corners_predict.toPandas()
get_corner_predictions(history_pdf, predict_pdf)

Prediction dataset with results saved successfully!


In [11]:
# History rows flagged with WinOverTwo == 1.
# The condition uses df_corners_history's own column rather than a column taken
# from df_corners, so the filter does not depend on the two frames sharing lineage.
df_corners_history_more_two = df_corners_history.where(df_corners_history['WinOverTwo'] == 1)
corners_history_more_two_ct = df_corners_history_more_two.count()
print(corners_history_more_two_ct)
# Share of all history rows that carry the flag, as a percentage.
print(corners_history_more_two_ct/corners_history_ct*100)

210
17.00404858299595


In [12]:
# Columns to average over the WinOverTwo history rows; each maps to the 'avg' aggregate.
averaged_columns = ('1HC', '1AC', '2HC', '2AC', '1HCA', '1ACA', '2HCA', '2ACA', 'PastForTotal', 'PastAgainstTotal')
avg_cols = {column_name: 'avg' for column_name in averaged_columns}

# Single-row frame whose columns are named 'avg(<column>)'.
df_corners_history_more_two_avg = df_corners_history_more_two.agg(avg_cols)
# Short alias for the same frame.
df_c2 = df_corners_history_more_two_avg

#df_corners_history_more_two_avg.select(df_c2['avg(1HC)'],df_c2['avg(1AC)'],df_c2['avg(2HC)'],df_c2['avg(2AC)'],df_c2['avg(1HCA)'],df_c2['avg(1ACA)'],df_c2['avg(2HCA)'],df_c2['avg(2ACA)'],df_c2['avg(PastForTotal)'],df_c2['avg(PastAgainstTotal)']).show()

## Predict over averages

In [13]:
# Take the single row of averages and scale every value by `perc`,
# rounding each result to two decimal places.
row = df_corners_history_more_two_avg.first()
perc = 0.9
two_places = Decimal("0.00")


def _scaled_average(avg_key):
    """Return row[avg_key] multiplied by perc, quantized to two decimal places."""
    return Decimal(row[avg_key] * perc).quantize(two_places)


avg_1HC = _scaled_average('avg(1HC)')
avg_1AC = _scaled_average('avg(1AC)')
avg_2HC = _scaled_average('avg(2HC)')
avg_2AC = _scaled_average('avg(2AC)')
avg_1HCA = _scaled_average('avg(1HCA)')
avg_1ACA = _scaled_average('avg(1ACA)')
avg_2HCA = _scaled_average('avg(2HCA)')
avg_2ACA = _scaled_average('avg(2ACA)')
avg_PastForTotal = _scaled_average('avg(PastForTotal)')
avg_PastAgainstTotal = _scaled_average('avg(PastAgainstTotal)')

print(f'avg_1HC - {avg_1HC} / avg_1AC - {avg_1AC} / avg_2HC - {avg_2HC} / avg_2AC - {avg_2AC} / avg_PastForTotal - {avg_PastForTotal} / avg_PastAgainstTotal - {avg_PastAgainstTotal}')

print(f'avg_1HCA - {avg_1HCA} / avg_1ACA - {avg_1ACA} / avg_2HCA - {avg_2HCA} / avg_2ACA - {avg_2ACA}')

avg_1HC - 2.58 / avg_1AC - 2.25 / avg_2HC - 2.76 / avg_2AC - 2.44 / avg_PastForTotal - 10.03 / avg_PastAgainstTotal - 9.70
avg_1HCA - 2.00 / avg_1ACA - 2.53 / avg_2HCA - 2.33 / avg_2ACA - 2.85


In [14]:
# (df_corners_history['1HCA'] > avg_1HCA) & (df_corners_history['1ACA'] > avg_1ACA) & (df_corners_history['2HCA'] > avg_2HCA) & (df_corners_history['2ACA'] > avg_2ACA)

# Select the to-be-predicted rows whose four corner columns all reach the
# scaled averages computed from the WinOverTwo history rows.
# The conditions use df_corners_predict's own columns (the frame being filtered)
# instead of columns taken from df_corners_history.
meets_scaled_averages = (
    (df_corners_predict['1HC'] >= avg_1HC)
    & (df_corners_predict['1AC'] >= avg_1AC)
    & (df_corners_predict['2HC'] >= avg_2HC)
    & (df_corners_predict['2AC'] >= avg_2AC)
)
df_pick_over_pred = df_corners_predict.where(meets_scaled_averages)
df_pick_over_pred_ct = df_pick_over_pred.count()
print(df_pick_over_pred_ct)
df_pick_over_pred.show()

4
+-----+--------------------+----+----+----+----+-------+-------+-------+-------+----+----+----+----+----------+----------+------------+-------------+-------------+-------------+------------+----------------+----------------+-----------------+-----------+----------------+-----------------+---------------+----------------+--------------+----------------+---------------------+------------------+------------+-------------+-----------+------------+----------+----------+-----------+-----------+
|   ID|             Fixture| 1HC| 1AC| 2HC| 2AC|1HC_Res|1AC_Res|2HC_Res|2AC_Res|1HCA|1ACA|2HCA|2ACA|windrawwin|InsertDate|PastForFirst|PastForSecond|PastHomeTotal|PastAwayTotal|PastForTotal|PastAgainstTotal|ResultTotalFirst|ResultTotalSecond|ResultTotal|HistoryOverFirst|HistoryOverSecond|HistoryOverBoth|HistoryOverTotal|HistoryOverTwo|HistoryOverThree|HistoryAgainstOverTwo|HistoryOverTwoBoth|WinOverFirst|WinOverSecond|WinOverBoth|WinOverTotal|WinOverOne|WinOverTwo|WinMostHome|WinMostAway|
+-----+---

## Predict over 2

## Predict over 2.5

## Write to excel

In [15]:
# Export the result frames to one workbook, one sheet per frame, in this order.
# NOTE(review): `perc` is a fraction (e.g. 0.9) yet the sheet name appends '%';
# the name is kept as-is so existing consumers of the workbook are unaffected.
output_workbook = f'{configuration.football_corners_output_path}/corners.xlsx'
sheets_to_write = {
    'over_two_avg': df_corners_history_more_two_avg,
    f'pick_over_pred - {perc}%': df_pick_over_pred,
    'history_more_two': df_corners_history_more_two,
    'df_corners_history_home': df_corners_history_home,
    'df_corners_history_away': df_corners_history_away,
}

with pd.ExcelWriter(output_workbook) as writer:
    for sheet_label, spark_frame in sheets_to_write.items():
        # Each Spark frame is converted to pandas first, then written without the index column.
        spark_frame.toPandas().to_excel(writer, sheet_name=sheet_label, index=False)