In [49]:

from pyspark.sql.functions import count, date_sub, current_date, split, lit, when
import pandas as pd
import shutil
from decimal import Decimal
from lib import common_functions
from lib import configuration
#from lib.predict_corners import get_corner_predictions

from my_predictive_model.src.predict import get_corner_predictions

In [50]:
# Obtain the Spark session through the project helper, passing 'corners'
# (presumably the application name - confirm in lib.common_functions).
spark = common_functions.get_spark_session('corners')
# Last expression of the cell, so whatever it returns is rendered as the cell output.
spark.active()

In [51]:
# Read every CSV under the configured corners input folder, using the first
# line of each file as the header.
corners_csv_glob = f'{configuration.football_corners_input_path}/*.csv'
corners_raw = spark.read.format('csv').option("header", True).load(corners_csv_glob)

# Derived columns, all computed from the 'Fixture' text:
#   HomeAway - 'AWAY' when the fixture contains '- AW', otherwise 'HOME'
#   HomeTeam - the part before ' vs '
#   AwayTeam - the part after ' vs '
fixture_text = corners_raw['Fixture']
fixture_teams = split(fixture_text, ' vs ')

df_corners = (
    corners_raw
    .withColumn('HomeAway', when(fixture_text.contains('- AW'), 'AWAY').otherwise('HOME'))
    .withColumn('HomeTeam', fixture_teams[0])
    .withColumn('AwayTeam', fixture_teams[1])
)

# Total number of loaded rows (shown as the cell output).
df_corners.count()

1609

Filter to make new DataFrames

In [52]:
# History = rows that already have a recorded '1HC_Res' value.
has_recorded_result = df_corners['1HC_Res'].isNotNull()
df_corners_history = df_corners.filter(has_recorded_result)

corners_history_ct = df_corners_history.count()
print(corners_history_ct)

1509


In [53]:
# Subset of the history whose fixture text carries the '- AW' marker.
is_aw_fixture = df_corners_history['Fixture'].contains('- AW')
df_corners_history_aw = df_corners_history.filter(is_aw_fixture)

df_corners_history_aw_ct = df_corners_history_aw.count()
print(df_corners_history_aw_ct)

231


In [54]:
# Rows to predict: '1AC' is populated but no '1AC_Res' has been recorded yet.
awaiting_result = df_corners['1AC'].isNotNull() & df_corners['1AC_Res'].isNull()
df_corners_predict = df_corners.filter(awaiting_result)

corners_predict_ct = df_corners_predict.count()
print(corners_predict_ct)

20


## Calculate the count of rows where the specified columns are equal to 0

In [55]:
# Share of history rows with zero (or very few) recorded corners.
#
# All eight counts are collected in a single aggregation pass instead of eight
# separate filter().count() jobs. count(when(cond, 1)) counts exactly the rows
# where `cond` is true (false/NULL conditions yield NULL, which count() skips),
# i.e. the same rows filter(cond) would keep.
_hist = df_corners_history
_first_half_total = _hist['1HC_Res'] + _hist['1AC_Res']
_second_half_total = _hist['2HC_Res'] + _hist['2AC_Res']

_zero_conditions = {
    'count_1HC_Res_0': _hist['1HC_Res'] == 0,
    'count_1AC_Res_0': _hist['1AC_Res'] == 0,
    'count_2HC_Res_0': _hist['2HC_Res'] == 0,
    'count_2AC_Res_0': _hist['2AC_Res'] == 0,
    'count_1HC_1AC_Res_0': (_hist['1HC_Res'] == 0) & (_hist['1AC_Res'] == 0),
    'count_2HC_2AC_Res_0': (_hist['2HC_Res'] == 0) & (_hist['2AC_Res'] == 0),
    'sum_less_3': _first_half_total < 3,
    'sum_less_4': _second_half_total < 4,
}
_zero_counts = _hist.agg(
    *[count(when(_cond, 1)).alias(_name) for _name, _cond in _zero_conditions.items()]
).first()

count_1HC_Res_0 = _zero_counts['count_1HC_Res_0']
count_1AC_Res_0 = _zero_counts['count_1AC_Res_0']
count_2HC_Res_0 = _zero_counts['count_2HC_Res_0']
count_2AC_Res_0 = _zero_counts['count_2AC_Res_0']
count_1HC_1AC_Res_0 = _zero_counts['count_1HC_1AC_Res_0']
count_2HC_2AC_Res_0 = _zero_counts['count_2HC_2AC_Res_0']
df_corners_history_filter_sum_less_3 = _zero_counts['sum_less_3']
df_corners_history_filter_sum_less_4 = _zero_counts['sum_less_4']


def _percent_of_history(matching_rows):
    """Return `matching_rows` as a percentage of `corners_history_ct` (0.0 when the history is empty)."""
    if not corners_history_ct:
        return 0.0
    return (matching_rows / corners_history_ct) * 100


# Calculate the percentage
perc_1HC_Res_0 = _percent_of_history(count_1HC_Res_0)
perc_1AC_Res_0 = _percent_of_history(count_1AC_Res_0)
perc_2HC_Res_0 = _percent_of_history(count_2HC_Res_0)
perc_2AC_Res_0 = _percent_of_history(count_2AC_Res_0)
perc_1HC_1AC_Res_0 = _percent_of_history(count_1HC_1AC_Res_0)
perc_2HC_2AC_Res_0 = _percent_of_history(count_2HC_2AC_Res_0)
perc_sum_less_3 = _percent_of_history(df_corners_history_filter_sum_less_3)
perc_sum_less_4 = _percent_of_history(df_corners_history_filter_sum_less_4)


print(f'Percentage of 1HC_Res zero: {perc_1HC_Res_0}%')
print(f'Percentage of 1AC_Res zero: {perc_1AC_Res_0}%')
print(f'Percentage of 2HC_Res zero: {perc_2HC_Res_0}%')
print(f'Percentage of 2AC_Res zero: {perc_2AC_Res_0}%')
print(f'Percentage of 1HC_Res and 1AC_Res both zero: {perc_1HC_1AC_Res_0}%')
print(f'Percentage of 2HC_Res and 2AC_Res both zero: {perc_2HC_2AC_Res_0}%')
print(f'Percentage of rows where 1HC_Res + 1AC_Res < 3: {perc_sum_less_3}%')
# This metric is computed from the second-half columns, so label it accordingly.
print(f'Percentage of rows where 2HC_Res + 2AC_Res < 4: {perc_sum_less_4}%')

Percentage of 1HC_Res zero: 9.675281643472498%
Percentage of 1AC_Res zero: 15.838303512259774%
Percentage of 2HC_Res zero: 8.28363154406892%
Percentage of 2AC_Res zero: 12.856196156394963%
Percentage of 1HC_Res and 1AC_Res both zero: 0.8614976805831678%
Percentage of 2HC_Res and 2AC_Res both zero: 0.9277667329357191%
Percentage of rows where 1HC_Res + 1AC_Res < 3: 15.440689198144467%
Percentage of rows where 1HC_Res + 1AC_Res < 4: 24.652087475149106%


## Get recent history (last 15 days) for re-use.

In [56]:
# Keep only history rows whose InsertDate falls within the last 15 days
# (InsertDate >= current_date - 15 days), then print how many there are.
# NOTE(review): the variable name says "prev_month" but the window is 15 days -
# confirm which one is intended.
df_corners_history_prev_month = df_corners_history.filter(df_corners_history['InsertDate'] >= date_sub(current_date(), 15))
df_corners_history_prev_month_ct = df_corners_history_prev_month.count()
print(df_corners_history_prev_month_ct)

93


In [57]:
# Build one frame keyed by home team and one keyed by away team from the
# recent-history rows, each carrying the same eight corner columns.
corner_columns = ['1HC', '1AC', '2HC', '2AC', '1HCA', '1ACA', '2HCA', '2ACA']
df_corners_history_home = df_corners_history_prev_month.select('HomeTeam', *corner_columns)
df_corners_history_away = df_corners_history_prev_month.select('AwayTeam', *corner_columns)

# In the home frame the away-side columns are overwritten with '-' ...
for away_side_column in ('1AC', '2AC', '1ACA', '2ACA'):
    df_corners_history_home = df_corners_history_home.withColumn(away_side_column, lit('-'))

# ... and in the away frame the home-side columns are overwritten with '-'.
for home_side_column in ('1HC', '2HC', '1HCA', '2HCA'):
    df_corners_history_away = df_corners_history_away.withColumn(home_side_column, lit('-'))

## Train and export predictions using sklearn

In [58]:
# Collect both Spark frames to the driver as pandas DataFrames and hand them to
# the project's prediction routine: history first, rows-to-predict second.
# NOTE(review): judging by the recorded output the routine saves a prediction
# dataset itself; where it writes is not visible from this notebook.
get_corner_predictions(df_corners_history.toPandas(),df_corners_predict.toPandas())

Prediction dataset with results saved successfully!


In [59]:
# History rows flagged WinOverTwo == 1.
# The column is referenced through df_corners_history itself (the frame being
# filtered) rather than through its ancestor df_corners, so the expression does
# not depend on the two frames sharing lineage.
df_corners_history_more_two = df_corners_history.filter(df_corners_history['WinOverTwo'] == 1)
corners_history_more_two_ct = df_corners_history_more_two.count()

# Absolute count, then its share of all history rows as a percentage.
print(corners_history_more_two_ct)
print(corners_history_more_two_ct/corners_history_ct*100)

255
16.898608349900595


In [60]:
# Columns to average over the WinOverTwo history rows; every entry maps to the
# 'avg' aggregate, so the result columns are named 'avg(<column>)'.
avg_cols = dict.fromkeys(
    ['1HC', '1AC', '2HC', '2AC', '1HCA', '1ACA', '2HCA', '2ACA', 'PastForTotal', 'PastAgainstTotal'],
    'avg',
)

# Single-row frame holding one average per column; df_c2 is a short alias for it.
df_corners_history_more_two_avg = df_corners_history_more_two.agg(avg_cols)
df_c2 = df_corners_history_more_two_avg

## Predict over averages

In [61]:
# Pull the single row of averages and scale each one by `perc`, rounding to
# two decimal places; the results become the pick thresholds.
row = df_corners_history_more_two_avg.first()
perc = 0.9

two_decimal_places = Decimal("0.00")
(
    avg_1HC, avg_1AC, avg_2HC, avg_2AC,
    avg_1HCA, avg_1ACA, avg_2HCA, avg_2ACA,
    avg_PastForTotal, avg_PastAgainstTotal,
) = (
    Decimal(row[f'avg({metric})'] * perc).quantize(two_decimal_places)
    for metric in (
        '1HC', '1AC', '2HC', '2AC',
        '1HCA', '1ACA', '2HCA', '2ACA',
        'PastForTotal', 'PastAgainstTotal',
    )
)

print(f'avg_1HC - {avg_1HC} / avg_1AC - {avg_1AC} / avg_2HC - {avg_2HC} / avg_2AC - {avg_2AC} / avg_PastForTotal - {avg_PastForTotal} / avg_PastAgainstTotal - {avg_PastAgainstTotal}')

print(f'avg_1HCA - {avg_1HCA} / avg_1ACA - {avg_1ACA} / avg_2HCA - {avg_2HCA} / avg_2ACA - {avg_2ACA}')

avg_1HC - 2.55 / avg_1AC - 2.19 / avg_2HC - 2.74 / avg_2AC - 2.41 / avg_PastForTotal - 9.89 / avg_PastAgainstTotal - 9.57
avg_1HCA - 2.01 / avg_1ACA - 2.47 / avg_2HCA - 2.30 / avg_2ACA - 2.79


In [62]:
# Upcoming fixtures whose four corner columns all reach the scaled averages.
# The filter columns are referenced through df_corners_predict - the frame
# actually being filtered - instead of through df_corners_history, which only
# resolved because both frames happen to derive from df_corners.
meets_all_thresholds = (
    (df_corners_predict['1HC'] >= avg_1HC)
    & (df_corners_predict['1AC'] >= avg_1AC)
    & (df_corners_predict['2HC'] >= avg_2HC)
    & (df_corners_predict['2AC'] >= avg_2AC)
)
df_pick_over_pred = df_corners_predict.filter(meets_all_thresholds)

df_pick_over_pred_ct = df_pick_over_pred.count()
print(df_pick_over_pred_ct)
df_pick_over_pred.show()

2
+-----+--------------------+----+----+----+----+-------+-------+-------+-------+----+----+----+----+----------+----------+------------+-------------+-------------+-------------+------------+----------------+----------------+-----------------+-----------+----------------+-----------------+---------------+----------------+--------------+----------------+---------------------+------------------+------------+-------------+-----------+------------+----------+----------+-----------+-----------+--------+------------+------------+
|   ID|             Fixture| 1HC| 1AC| 2HC| 2AC|1HC_Res|1AC_Res|2HC_Res|2AC_Res|1HCA|1ACA|2HCA|2ACA|windrawwin|InsertDate|PastForFirst|PastForSecond|PastHomeTotal|PastAwayTotal|PastForTotal|PastAgainstTotal|ResultTotalFirst|ResultTotalSecond|ResultTotal|HistoryOverFirst|HistoryOverSecond|HistoryOverBoth|HistoryOverTotal|HistoryOverTwo|HistoryOverThree|HistoryAgainstOverTwo|HistoryOverTwoBoth|WinOverFirst|WinOverSecond|WinOverBoth|WinOverTotal|WinOverOne|WinOverTwo|

## Get historic teams with highest averages

In [63]:
# Expose the history frame to Spark SQL under the name "corners_split"
# (the SQL cells that follow select FROM corners_split).
df_corners_history.createOrReplaceTempView("corners_split")

In [83]:
fixtures = """
West Brom vs Hull City
SV Elversberg vs Preussen Munster
Waalwijk vs PSV Eindhoven
Heart OF Midlothian vs Ross County
Kilmarnock vs Hibernian
Arsenal vs Chelsea
Fenerbahce vs Samsunspor
Guimaraes vs Estrela
"""

# Map each home team to its opponent. A line counts only when splitting on
# ' vs ' yields exactly two parts; anything else is skipped silently.
# Because the mapping is keyed by home team, a home team listed twice keeps
# only its last opponent.
split_lines = (entry.split(' vs ') for entry in fixtures.strip().split('\n'))
fixtures_dict = {
    sides[0].strip(): sides[1].strip()
    for sides in split_lines
    if len(sides) == 2
}

print(fixtures_dict)

{'West Brom': 'Hull City', 'SV Elversberg': 'Preussen Munster', 'Waalwijk': 'PSV Eindhoven', 'Heart OF Midlothian': 'Ross County', 'Kilmarnock': 'Hibernian', 'Arsenal': 'Chelsea', 'Fenerbahce': 'Samsunspor', 'Guimaraes': 'Estrela'}


In [84]:
def _sql_str(value):
    """Escape `value` for use inside a single-quoted Spark SQL string literal.

    Backslashes and single quotes are backslash-escaped so that team names such
    as "Nott'm Forest" cannot terminate the literal early.
    Assumes the default parser setting (spark.sql.parser.escapedStringLiterals=false).
    """
    return value.replace("\\", "\\\\").replace("'", "\\'")


# For every fixture, build a two-row comparison: the home team's averages from
# its home games, and the away team's averages from its away games.
# UNION ALL matches columns by position and takes the names from the first
# SELECT, so the away branch lists its away-side columns first to line them up
# under the home branch's headers.
results = []

for home_team, away_team in fixtures_dict.items():
    home_sql = _sql_str(home_team)
    away_sql = _sql_str(away_team)
    df_compare = spark.sql(
        f"""SELECT HomeTeam AS Team, COUNT(*) AS CT, avg(1HC_Res) AS 1HC_Res, avg(2HC_Res) AS 2HC_Res,
        avg(1AC_Res) AS 1AC_Res, avg(2AC_Res) AS 2AC_Res,
        avg(1HC_Res) + avg(2HC_Res) AS Tot,
        avg(1HC) AS 1HC, avg(2HC) AS 2HC, avg(1HCA) AS 1HCA, avg(2HCA) AS 2HCA,
        sum(WinOverFirst)/COUNT(*)*100 AS WinOverFirst,sum(WinOverBoth)/COUNT(*)*100 AS WinOverBoth,sum(WinOverTotal)/COUNT(*)*100 AS WinOverTotal,
        sum(WinOverOne)/COUNT(*)*100 AS WinOverOne,sum(WinOverTwo)/COUNT(*)*100 AS WinOverTwo
        FROM corners_split
        WHERE ID LIKE '%,%' AND HomeTeam = '{home_sql}'
        GROUP BY HomeTeam
        UNION ALL
        SELECT AwayTeam AS Team, COUNT(*) AS CT, avg(1AC_Res) AS 1AC_Res, avg(2AC_Res) AS 2AC_Res,
        avg(1HC_Res) AS 1HC_Res, avg(2HC_Res) AS 2HC_Res,
        avg(1AC_Res) + avg(2AC_Res) AS Tot,
        avg(1AC) AS 1AC, avg(2AC) AS 2AC, avg(1ACA) AS 1ACA, avg(2ACA) AS 2ACA,
        sum(WinOverFirst)/COUNT(*)*100 AS WinOverFirst,sum(WinOverBoth)/COUNT(*)*100 AS WinOverBoth,sum(WinOverTotal)/COUNT(*)*100 AS WinOverTotal,
        sum(WinOverOne)/COUNT(*)*100 AS WinOverOne,sum(WinOverTwo)/COUNT(*)*100 AS WinOverTwo
        FROM corners_split
        WHERE ID LIKE '%,%' AND AwayTeam = '{away_sql}'
        GROUP BY AwayTeam"""
    )
    results.append(df_compare)

if results:
    # Stack the per-fixture frames in fixture order and export them as one sheet.
    df_final_compare = results[0]
    for df in results[1:]:
        df_final_compare = df_final_compare.union(df)

    df_final_compare.toPandas().to_excel('files/output/football/corners/_temp.xlsx', index=False)
else:
    # An empty fixture list previously failed with IndexError on results[0].
    print('No fixtures to compare - nothing exported.')

In [86]:
# Ad-hoc two-sided comparison with hard-coded teams: home-game averages for
# every HomeTeam containing 'Queen', followed by away-game averages for the
# AwayTeam 'Falkirk - AW'.
# UNION ALL matches columns by position and takes the names from the first
# SELECT, so the second SELECT lists its away-side columns first to line them
# up under the first SELECT's headers.
df_compare = spark.sql(
    "SELECT HomeTeam AS Team, COUNT(*) AS CT, avg(1HC_Res) AS 1HC_Res, avg(2HC_Res) AS 2HC_Res, \
    avg(1AC_Res) AS 1AC_Res, avg(2AC_Res) AS 2AC_Res, \
    avg(1HC_Res) + avg(2HC_Res) AS Tot, \
    avg(1HC) AS 1HC, avg(2HC) AS 2HC, avg(1HCA) AS 1HCA, avg(2HCA) AS 2HCA, \
    sum(WinOverFirst)/COUNT(*)*100 AS WinOverFirst,sum(WinOverBoth)/COUNT(*)*100 AS WinOverBoth,sum(WinOverTotal)/COUNT(*)*100 AS WinOverTotal, \
    sum(WinOverOne)/COUNT(*)*100 AS WinOverOne,sum(WinOverTwo)/COUNT(*)*100 AS WinOverTwo \
    FROM corners_split \
    WHERE ID LIKE '%,%' AND HomeTeam like '%Queen%' \
    GROUP BY HomeTeam \
    UNION ALL \
    SELECT AwayTeam AS Team, COUNT(*) AS CT, avg(1AC_Res) AS 1AC_Res, avg(2AC_Res) AS 2AC_Res, \
    avg(1HC_Res) AS 1HC_Res, avg(2HC_Res) AS 2HC_Res, \
    avg(1AC_Res) + avg(2AC_Res) AS Tot, \
    avg(1AC) AS 1AC, avg(2AC) AS 2AC, avg(1ACA) AS 1ACA, avg(2ACA) AS 2ACA, \
    sum(WinOverFirst)/COUNT(*)*100 AS WinOverFirst,sum(WinOverBoth)/COUNT(*)*100 AS WinOverBoth,sum(WinOverTotal)/COUNT(*)*100 AS WinOverTotal, \
    sum(WinOverOne)/COUNT(*)*100 AS WinOverOne,sum(WinOverTwo)/COUNT(*)*100 AS WinOverTwo \
    FROM corners_split \
    WHERE ID LIKE '%,%' AND AwayTeam = 'Falkirk - AW' \
    GROUP BY AwayTeam"
    )

# NOTE(review): this writes to the same '_temp.xlsx' path as the fixtures-loop
# cell, so whichever of the two cells runs last overwrites the other's export.
df_compare.toPandas().to_excel('files/output/football/corners/_temp.xlsx', index=False)

In [65]:
# Threshold interpolated into the HAVING clauses of the per-team average
# queries below: a team is kept only when its row count CT is strictly greater
# than this value.
games_count = 2

In [66]:
# Per-home-team averages over history rows whose ID contains a comma.
# Tot = avg(1HC_Res) + avg(2HC_Res); the WinOver* columns are the sum of each
# flag divided by the row count, times 100.
# Only teams with more than `games_count` rows and Tot >= 5.0 are kept,
# ordered by Tot descending. 'avg_home' is stored in WhichTab as a label.
df_corners_history_avg_home = spark.sql(
    f"SELECT 'avg_home' AS WhichTab, HomeTeam, MAX(ID) AS ID, COUNT(*) AS CT, avg(1HC_Res) AS 1HC_Res, avg(2HC_Res) AS 2HC_Res, \
    avg(1AC_Res) AS 1AC_Res, avg(2AC_Res) AS 2AC_Res, \
    avg(1HC_Res) + avg(2HC_Res) AS Tot, \
    avg(1HC) AS 1HC, avg(2HC) AS 2HC, avg(1HCA) AS 1HCA, avg(2HCA) AS 2HCA, \
    sum(WinOverFirst)/COUNT(*)*100 AS WinOverFirst,sum(WinOverBoth)/COUNT(*)*100 AS WinOverBoth,sum(WinOverTotal)/COUNT(*)*100 AS WinOverTotal, \
    sum(WinOverOne)/COUNT(*)*100 AS WinOverOne,sum(WinOverTwo)/COUNT(*)*100 AS WinOverTwo \
    FROM corners_split \
    WHERE ID LIKE '%,%' \
    GROUP BY HomeTeam \
    HAVING CT > {games_count} AND Tot >= 5.0 \
    ORDER BY Tot DESC"
    )#.show(20)

In [67]:
# Per-away-team averages restricted to fixtures whose text ends in '- AW',
# over history rows whose ID contains a comma.
# Tot = avg(1AC_Res) + avg(2AC_Res); the WinOver* columns are the sum of each
# flag divided by the row count, times 100.
# The minimum-games threshold now comes from `games_count`, like the sibling
# avg_home / avg_away queries, instead of a hard-coded 2 (same value today).
df_corners_history_avg_away_fav = spark.sql(
    f"SELECT 'avg_away_fav' AS WhichTab, AwayTeam, MAX(ID) AS ID, COUNT(*) AS CT, avg(1AC_Res) AS 1AC_Res, avg(2AC_Res) AS 2AC_Res, \
    avg(1HC_Res) AS 1HC_Res, avg(2HC_Res) AS 2HC_Res, \
    avg(1AC_Res) + avg(2AC_Res) AS Tot, \
    avg(1AC) AS 1AC, avg(2AC) AS 2AC, avg(1ACA) AS 1ACA, avg(2ACA) AS 2ACA, \
    sum(WinOverFirst)/COUNT(*)*100 AS WinOverFirst,sum(WinOverBoth)/COUNT(*)*100 AS WinOverBoth,sum(WinOverTotal)/COUNT(*)*100 AS WinOverTotal, \
    sum(WinOverOne)/COUNT(*)*100 AS WinOverOne,sum(WinOverTwo)/COUNT(*)*100 AS WinOverTwo \
    FROM corners_split \
    WHERE ID LIKE '%,%' AND FIXTURE LIKE '%- AW' \
    GROUP BY AwayTeam \
    HAVING CT > {games_count} AND Tot >= 5.0 \
    ORDER BY Tot DESC"
    )#.show(20)

In [68]:
# Per-away-team averages over history rows whose ID contains a comma.
# Tot = avg(1AC_Res) + avg(2AC_Res); the WinOver* columns are the sum of each
# flag divided by the row count, times 100.
# Only teams with more than `games_count` rows and Tot >= 5.0 are kept,
# ordered by Tot descending. 'avg_away' is stored in WhichTab as a label.
df_corners_history_avg_away = spark.sql(
    f"SELECT 'avg_away' AS WhichTab, AwayTeam, MAX(ID) AS ID, COUNT(*) AS CT, avg(1AC_Res) AS 1AC_Res, avg(2AC_Res) AS 2AC_Res, \
    avg(1HC_Res) AS 1HC_Res, avg(2HC_Res) AS 2HC_Res, \
    avg(1AC_Res) + avg(2AC_Res) AS Tot, \
    avg(1AC) AS 1AC, avg(2AC) AS 2AC, avg(1ACA) AS 1ACA, avg(2ACA) AS 2ACA, \
    sum(WinOverFirst)/COUNT(*)*100 AS WinOverFirst,sum(WinOverBoth)/COUNT(*)*100 AS WinOverBoth,sum(WinOverTotal)/COUNT(*)*100 AS WinOverTotal, \
    sum(WinOverOne)/COUNT(*)*100 AS WinOverOne,sum(WinOverTwo)/COUNT(*)*100 AS WinOverTwo \
    FROM corners_split \
    WHERE ID LIKE '%,%' \
    GROUP BY AwayTeam \
    HAVING CT > {games_count} AND Tot >= 5.0 \
    ORDER BY Tot DESC"
    )#.show(20)

## Lookup predictions against past high avgs

In [69]:
# Match upcoming fixtures against the three high-average team tables:
# home team vs. avg_home, away team vs. avg_away_fav, away team vs. avg_away.
home_team_matches = df_corners_predict['HomeTeam'] == df_corners_history_avg_home['HomeTeam']
away_fav_team_matches = df_corners_predict['AwayTeam'] == df_corners_history_avg_away_fav['AwayTeam']
away_team_matches = df_corners_predict['AwayTeam'] == df_corners_history_avg_away['AwayTeam']

df_intersect_home = df_corners_predict.join(df_corners_history_avg_home, on=home_team_matches, how='inner')
df_intersect_away_fav = df_corners_predict.join(df_corners_history_avg_away_fav, on=away_fav_team_matches, how='inner')
df_intersect_away = df_corners_predict.join(df_corners_history_avg_away, on=away_team_matches, how='inner')

# union() stacks by column position, so away-side rows sit under the home
# frame's column headers; the WhichTab value tells the three sources apart.
df_intersect = df_intersect_home.union(df_intersect_away_fav)
df_intersect = df_intersect.union(df_intersect_away)
df_intersect.show()

+-----+--------------------+----+----+----+----+-------+-------+-------+-------+----+----+----+----+----------+----------+------------+-------------+-------------+-------------+------------+----------------+----------------+-----------------+-----------+----------------+-----------------+---------------+----------------+--------------+----------------+---------------------+------------------+------------+-------------+-----------+------------+----------+----------+-----------+-----------+--------+-------------------+----------------+------------+-------------------+-----+---+------------------+------------------+------------------+------------------+-----------------+------------------+------------------+------------------+------------------+-----------------+-----------------+-----------------+-----------------+-----------------+
|   ID|             Fixture| 1HC| 1AC| 2HC| 2AC|1HC_Res|1AC_Res|2HC_Res|2AC_Res|1HCA|1ACA|2HCA|2ACA|windrawwin|InsertDate|PastForFirst|PastForSecond|PastHome

## Write to excel

In [70]:
# Export every result frame as its own sheet of corners.xlsx in the configured
# output folder. Each frame is collected to pandas first.
with pd.ExcelWriter(f'{configuration.football_corners_output_path}/corners.xlsx') as writer:

    df_corners_history_avg_home.toPandas().to_excel(writer, sheet_name='df_corners_history_avg_home', index=False)
    df_corners_history_avg_away_fav.toPandas().to_excel(writer, sheet_name='df_corners_history_avg_away_fav', index=False)
    df_corners_history_avg_away.toPandas().to_excel(writer, sheet_name='df_corners_history_avg_away', index=False)
    df_intersect.toPandas().to_excel(writer, sheet_name='df_intersect', index=False)

    df_corners_history_more_two_avg.toPandas().to_excel(writer, sheet_name='over_two_avg', index=False)
    # `perc` is a fraction (e.g. 0.9), so format it as a percentage ("90%");
    # the previous label rendered it as "0.9%".
    df_pick_over_pred.toPandas().to_excel(writer, sheet_name=f'pick_over_pred - {perc:.0%}', index=False)
    df_corners_history_more_two.toPandas().to_excel(writer, sheet_name='history_more_two', index=False)

    # Optional sheets, currently disabled:
    #df_corners_history_home.toPandas().to_excel(writer, sheet_name='df_corners_history_home', index=False)
    #df_corners_history_away.toPandas().to_excel(writer, sheet_name='df_corners_history_away', index=False)


## Make Picks from - df_corners_predict

In [71]:
# History rows that have a windrawwin value. The column is referenced through
# df_corners_history itself (the frame being filtered) rather than through its
# ancestor df_corners.
df_temp_table = df_corners_history.filter(df_corners_history['windrawwin'].isNotNull())

In [72]:
# Register the two frames for Spark SQL: "corners" (history rows with a
# windrawwin value) and "corners_predict" (fixtures still awaiting a result).
df_temp_table.createOrReplaceTempView("corners")
df_corners_predict.createOrReplaceTempView("corners_predict")

In [73]:
# Name of the column that the SQL cells below sum and turn into a percentage
# (it is interpolated directly into the query text as SUM(<column>)).
what_to_predict = 'WinOverFirst'

In [74]:
# One-row summary over the "corners" view: the average of each corner column
# plus PERC, the share of rows where `what_to_predict` is set.
result = spark.sql(f"SELECT avg(1HC) as avg_1HC, avg(1AC) as avg_1AC, avg(2HC) as avg_2HC, avg(2AC) as avg_2AC, avg(1HCA) as avg_1HCA, avg(1ACA) as avg_1ACA, avg(2HCA) as avg_2HCA, avg(2ACA) as avg_2ACA, avg(PastForTotal) as avg_PastForTotal, avg(PastAgainstTotal) as avg_PastAgainstTotal, SUM({what_to_predict})/count(*)*100 AS PERC FROM corners").first()

# Round every average to two decimal places. These assignments rebind the
# avg_* names that the "Predict over averages" cell also sets.
two_decimal_places = Decimal("0.00")
(
    avg_1HC, avg_1AC, avg_2HC, avg_2AC,
    avg_1HCA, avg_1ACA, avg_2HCA, avg_2ACA,
    avg_PastForTotal, avg_PastAgainstTotal,
) = (
    Decimal(result[f'avg_{metric}']).quantize(two_decimal_places)
    for metric in (
        '1HC', '1AC', '2HC', '2AC',
        '1HCA', '1ACA', '2HCA', '2ACA',
        'PastForTotal', 'PastAgainstTotal',
    )
)

print(f'avg_1HC: {avg_1HC}, avg_1AC: {avg_1AC}, avg_2HC: {avg_2HC}, avg_2AC: {avg_2AC}')
print(f'avg_1HCA: {avg_1HCA}, avg_1ACA: {avg_1ACA}, avg_2HCA: {avg_2HCA}, avg_2ACA: {avg_2ACA}')
print(f'avg_PastForTotal: {avg_PastForTotal}, avg_PastAgainstTotal: {avg_PastAgainstTotal}')

avg_1HC: 2.74, avg_1AC: 2.23, avg_2HC: 3.04, avg_2AC: 2.55
avg_1HCA: 2.22, avg_1ACA: 2.64, avg_2HCA: 2.51, avg_2ACA: 3.01
avg_PastForTotal: 10.57, avg_PastAgainstTotal: 10.38


In [75]:
# Hit rate of `what_to_predict` per (HomeAway, windrawwin) group in the
# "corners" view: PERC = SUM(flag) / row count * 100. Only groups with more
# than 10 rows and PERC above 85 are shown, highest PERC first.
spark.sql(f"SELECT HomeAway,windrawwin,SUM({what_to_predict}),count(*) AS TOT,SUM({what_to_predict})/count(*)*100 AS PERC FROM corners GROUP BY HomeAway,windrawwin HAVING PERC > 85 AND TOT > 10 ORDER BY PERC DESC").show()

+--------+----------+-----------------+---+-----------------+
|HomeAway|windrawwin|sum(WinOverFirst)|TOT|             PERC|
+--------+----------+-----------------+---+-----------------+
|    AWAY|       1-3|             12.0| 14|85.71428571428571|
+--------+----------+-----------------+---+-----------------+



In [76]:
# Upcoming fixtures (corners_predict) with windrawwin '2-2' and HomeAway 'AWAY'
# whose 1HC and 1AC are at least 80% of avg_1HC / avg_1AC; shows up to 50 rows.
# avg_1HC / avg_1AC are whatever values are currently bound in the kernel -
# both the "Predict over averages" cell and the SQL-averages cell assign them.
# NOTE(review): the '2-2' / 'AWAY' filter values are hard-coded here - confirm
# they are meant to be taken from the grouping query's results.
spark.sql(f"SELECT * FROM corners_predict WHERE windrawwin IN ('2-2') AND HomeAway = 'AWAY' AND 1HC >= {avg_1HC}*0.8 AND 1AC >= {avg_1AC}*0.8").show(50)

+---+-------+---+---+---+---+-------+-------+-------+-------+----+----+----+----+----------+----------+------------+-------------+-------------+-------------+------------+----------------+----------------+-----------------+-----------+----------------+-----------------+---------------+----------------+--------------+----------------+---------------------+------------------+------------+-------------+-----------+------------+----------+----------+-----------+-----------+--------+--------+--------+
| ID|Fixture|1HC|1AC|2HC|2AC|1HC_Res|1AC_Res|2HC_Res|2AC_Res|1HCA|1ACA|2HCA|2ACA|windrawwin|InsertDate|PastForFirst|PastForSecond|PastHomeTotal|PastAwayTotal|PastForTotal|PastAgainstTotal|ResultTotalFirst|ResultTotalSecond|ResultTotal|HistoryOverFirst|HistoryOverSecond|HistoryOverBoth|HistoryOverTotal|HistoryOverTwo|HistoryOverThree|HistoryAgainstOverTwo|HistoryOverTwoBoth|WinOverFirst|WinOverSecond|WinOverBoth|WinOverTotal|WinOverOne|WinOverTwo|WinMostHome|WinMostAway|HomeAway|HomeTeam|AwayTe