In [5]:
import os
spark_version = 'spark-3.2.2'
os.environ['SPARK_VERSION']=spark_version
# Install Spark and Java
!apt-get update
!apt-get install openjdk-11-jdk-headless -qq > /dev/null
!wget -q http://www.apache.org/dist/spark/$SPARK_VERSION/$SPARK_VERSION-bin-hadoop3.2.tgz
!tar xf $SPARK_VERSION-bin-hadoop3.2.tgz
!pip install -q findspark
# Set Environment Variables
import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-11-openjdk-amd64"
os.environ["SPARK_HOME"] = f"/content/{spark_version}-bin-hadoop3.2"
# Start a SparkSession
import findspark
findspark.init()

0% [Working]            Hit:1 http://archive.ubuntu.com/ubuntu bionic InRelease
0% [Waiting for headers] [Connecting to security.ubuntu.com] [Waiting for heade                                                                               Get:2 http://archive.ubuntu.com/ubuntu bionic-updates InRelease [88.7 kB]
                                                                               Get:3 https://cloud.r-project.org/bin/linux/ubuntu bionic-cran40/ InRelease [3,626 B]
Get:4 http://archive.ubuntu.com/ubuntu bionic-backports InRelease [74.6 kB]
Get:5 http://ppa.launchpad.net/c2d4u.team/c2d4u4.0+/ubuntu bionic InRelease [15.9 kB]
Ign:6 https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64  InRelease
Get:7 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64  InRelease [1,581 B]
Get:8 http://security.ubuntu.com/ubuntu bionic-security InRelease [88.7 kB]
Hit:9 https://developer.download.nvidia.com/compute/machine-learning/re

In [6]:
# Download the PostgreSQL JDBC driver (version 42.2.16) that will allow Spark to
# interact with Postgres. wget saves the jar into the current working directory;
# the SparkSession cell refers to it as /content/postgresql-42.2.16.jar.
# NOTE(review): that only matches if the working directory is /content — confirm.
!wget https://jdbc.postgresql.org/download/postgresql-42.2.16.jar

--2022-08-29 20:22:21--  https://jdbc.postgresql.org/download/postgresql-42.2.16.jar
Resolving jdbc.postgresql.org (jdbc.postgresql.org)... 72.32.157.228, 2001:4800:3e1:1::228
Connecting to jdbc.postgresql.org (jdbc.postgresql.org)|72.32.157.228|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1002883 (979K) [application/java-archive]
Saving to: ‘postgresql-42.2.16.jar’


2022-08-29 20:22:22 (5.42 MB/s) - ‘postgresql-42.2.16.jar’ saved [1002883/1002883]



In [7]:
from pyspark.sql import SparkSession

# Create (or reuse) the session for this notebook. The downloaded Postgres JDBC
# jar is placed on the driver classpath.
spark = (
    SparkSession.builder
    .appName("NFL-Prediction")
    .config("spark.driver.extraClassPath", "/content/postgresql-42.2.16.jar")
    .getOrCreate()
)

In [8]:
from pyspark import SparkFiles

# Register the remote CSV with the Spark context, then read the copy that
# SparkFiles resolves by file name.
url = "https://nflpredictionsdataviz.s3.amazonaws.com/spreadspoke_scores.csv"
spark.sparkContext.addFile(url)

# Comma-separated, first row is the header, column types are inferred.
NFL_Guru_df = (
    spark.read
    .option("encoding", "UTF-8")
    .option("sep", ",")
    .option("header", True)
    .option("inferSchema", True)
    .csv(SparkFiles.get("spreadspoke_scores.csv"))
)
NFL_Guru_df.show()

+-------------+---------------+-------------+----------------+-------------------+----------+----------+--------------------+----------------+---------------+---------------+--------------------+---------------+-------------------+----------------+----------------+--------------+
|schedule_date|schedule_season|schedule_week|schedule_playoff|          team_home|score_home|score_away|           team_away|team_favorite_id|spread_favorite|over_under_line|             stadium|stadium_neutral|weather_temperature|weather_wind_mph|weather_humidity|weather_detail|
+-------------+---------------+-------------+----------------+-------------------+----------+----------+--------------------+----------------+---------------+---------------+--------------------+---------------+-------------------+----------------+----------------+--------------+
|     9/2/1966|           1966|            1|           false|     Miami Dolphins|        14|        23|     Oakland Raiders|            null|           null

In [9]:
# Show the column types Spark inferred from the CSV. In the recorded output,
# schedule_week and over_under_line come back as string rather than numeric.
NFL_Guru_df.printSchema()

root
 |-- schedule_date: string (nullable = true)
 |-- schedule_season: integer (nullable = true)
 |-- schedule_week: string (nullable = true)
 |-- schedule_playoff: boolean (nullable = true)
 |-- team_home: string (nullable = true)
 |-- score_home: integer (nullable = true)
 |-- score_away: integer (nullable = true)
 |-- team_away: string (nullable = true)
 |-- team_favorite_id: string (nullable = true)
 |-- spread_favorite: double (nullable = true)
 |-- over_under_line: string (nullable = true)
 |-- stadium: string (nullable = true)
 |-- stadium_neutral: boolean (nullable = true)
 |-- weather_temperature: integer (nullable = true)
 |-- weather_wind_mph: integer (nullable = true)
 |-- weather_humidity: integer (nullable = true)
 |-- weather_detail: string (nullable = true)



In [10]:
# Keep only rows whose schedule_season is strictly greater than 1978,
# i.e. the 1979 season onward.
NFL_guru_current = NFL_Guru_df.filter(NFL_Guru_df["schedule_season"] > 1978)
NFL_guru_current.show()

+-------------+---------------+-------------+----------------+--------------------+----------+----------+--------------------+----------------+---------------+---------------+--------------------+---------------+-------------------+----------------+----------------+--------------+
|schedule_date|schedule_season|schedule_week|schedule_playoff|           team_home|score_home|score_away|           team_away|team_favorite_id|spread_favorite|over_under_line|             stadium|stadium_neutral|weather_temperature|weather_wind_mph|weather_humidity|weather_detail|
+-------------+---------------+-------------+----------------+--------------------+----------+----------+--------------------+----------------+---------------+---------------+--------------------+---------------+-------------------+----------------+----------------+--------------+
|     9/1/1979|           1979|            1|           false|Tampa Bay Buccaneers|        31|        16|       Detroit Lions|              TB|           

In [18]:
from pyspark.sql import functions as F

# Outcome label stored in the new home_win column:
#   2 -> home score is higher than away score
#   1 -> home score is lower than away score
#   0 -> neither comparison holds (e.g. equal scores)
home_score = F.col('score_home')
away_score = F.col('score_away')
home_win_code = (
    F.when(home_score > away_score, 2)
    .when(home_score < away_score, 1)
    .otherwise(0)
)

NFL_guru_wins = NFL_guru_current.withColumn('home_win', home_win_code)
NFL_guru_wins.show()

+-------------+---------------+-------------+----------------+--------------------+----------+----------+--------------------+----------------+---------------+---------------+--------------------+---------------+-------------------+----------------+----------------+--------------+--------+
|schedule_date|schedule_season|schedule_week|schedule_playoff|           team_home|score_home|score_away|           team_away|team_favorite_id|spread_favorite|over_under_line|             stadium|stadium_neutral|weather_temperature|weather_wind_mph|weather_humidity|weather_detail|home_win|
+-------------+---------------+-------------+----------------+--------------------+----------+----------+--------------------+----------------+---------------+---------------+--------------------+---------------+-------------------+----------------+----------------+--------------+--------+
|     9/1/1979|           1979|            1|           false|Tampa Bay Buccaneers|        31|        16|       Detroit Lions| 

In [22]:
# Columns removed before modelling; every other column is kept.
cols = (
    'schedule_date',
    'schedule_season',
    'schedule_week',
    'schedule_playoff',
    'team_home',
    'team_away',
    'team_favorite_id',
    'stadium_neutral',
    'weather_humidity',
    'over_under_line',
)
NFL_short = NFL_guru_wins.drop(*cols)
NFL_short.show()

+----------+----------+---------------+--------------------+-------------------+----------------+--------------+--------+
|score_home|score_away|spread_favorite|             stadium|weather_temperature|weather_wind_mph|weather_detail|home_win|
+----------+----------+---------------+--------------------+-------------------+----------------+--------------+--------+
|        31|        16|           -3.0|  Houlihan's Stadium|                 79|               9|          null|       2|
|         7|         9|           -5.0|Ralph Wilson Stadium|                 74|              15|          null|       1|
|         6|         3|           -3.0|       Soldier Field|                 78|              11|          null|       2|
|        10|         0|           -3.0|   Mile High Stadium|                 69|               6|          null|       2|
|        14|         0|           -1.0|   Arrowhead Stadium|                 76|               8|          null|       2|
|        17|        24| 

In [27]:
# Recode weather_detail into a string flag: '1' when the value is exactly
# 'DOME', '0' for everything else (null values also become '0', as the
# displayed rows show).
is_dome = F.col('weather_detail') == 'DOME'
dome_flag = F.when(is_dome, '1').otherwise('0')

NFL_fix_weather = NFL_short.withColumn('weather_detail', dome_flag)
NFL_fix_weather.show()

+----------+----------+---------------+--------------------+-------------------+----------------+--------------+--------+
|score_home|score_away|spread_favorite|             stadium|weather_temperature|weather_wind_mph|weather_detail|home_win|
+----------+----------+---------------+--------------------+-------------------+----------------+--------------+--------+
|        31|        16|           -3.0|  Houlihan's Stadium|                 79|               9|             0|       2|
|         7|         9|           -5.0|Ralph Wilson Stadium|                 74|              15|             0|       1|
|         6|         3|           -3.0|       Soldier Field|                 78|              11|             0|       2|
|        10|         0|           -3.0|   Mile High Stadium|                 69|               6|             0|       2|
|        14|         0|           -1.0|   Arrowhead Stadium|                 76|               8|             0|       2|
|        17|        24| 

In [28]:
# NFL_clean is a second name for NFL_fix_weather; no further transformation is
# applied in this cell, so the displayed rows match the previous output.
NFL_clean = NFL_fix_weather
NFL_clean.show()

+----------+----------+---------------+--------------------+-------------------+----------------+--------------+--------+
|score_home|score_away|spread_favorite|             stadium|weather_temperature|weather_wind_mph|weather_detail|home_win|
+----------+----------+---------------+--------------------+-------------------+----------------+--------------+--------+
|        31|        16|           -3.0|  Houlihan's Stadium|                 79|               9|             0|       2|
|         7|         9|           -5.0|Ralph Wilson Stadium|                 74|              15|             0|       1|
|         6|         3|           -3.0|       Soldier Field|                 78|              11|             0|       2|
|        10|         0|           -3.0|   Mile High Stadium|                 69|               6|             0|       2|
|        14|         0|           -1.0|   Arrowhead Stadium|                 76|               8|             0|       2|
|        17|        24| 

In [None]:
import numpy as np
import pandas as pd
from pathlib import Path
from collections import Counter

In [None]:
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import confusion_matrix
from imblearn.metrics import classification_report_imbalanced

In [None]:
# Bring the cleaned Spark DataFrame into pandas so it can be used with
# scikit-learn / imbalanced-learn. (The original cell referenced an undefined
# `df`.)
df = NFL_clean.toPandas()

# Features: every column except the label, with string columns one-hot encoded.
# The original cell assigned the pd.get_dummies function itself instead of
# calling it.
# NOTE(review): score_home and score_away determine home_win directly, so
# keeping them as features leaks the target — consider dropping them.
X = pd.get_dummies(df.drop(columns=["home_win"]))

# Target: the outcome label built earlier (2 = home win, 1 = away win,
# 0 = otherwise).
# NOTE(review): the original cell left the column name empty; 'home_win' is
# presumed to be the intended target — confirm.
y = df["home_win"]