
Commit

completed scraper
jasonchanhku committed Feb 15, 2019
1 parent 500f08b commit da4c3ab
Showing 8 changed files with 457 additions and 1,253 deletions.
611 changes: 0 additions & 611 deletions .ipynb_checkpoints/Untitled-checkpoint.ipynb

This file was deleted.

7 changes: 7 additions & 0 deletions .ipynb_checkpoints/requirements-checkpoint.txt
@@ -0,0 +1,7 @@
# It's easy to add more libraries or choose different versions. Any libraries
# specified here will be installed and made available to your morph.io scraper.
# Find out more: https://morph.io/documentation/python
beautifulsoup4==4.6.0
pandas==0.23.0
numpy==1.14.3
requests==2.18.4
1 change: 1 addition & 0 deletions .ipynb_checkpoints/runtime-checkpoint.txt
@@ -0,0 +1 @@
python-3.6.2
222 changes: 222 additions & 0 deletions .ipynb_checkpoints/scraper-checkpoint.py
@@ -0,0 +1,222 @@
import requests
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
import string
import re
import datetime
import sqlite3
import time
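
# Module-level accumulators: scrape_data() appends one value per fight to each
# of these lists, and create_df() later assembles them into DataFrame columns.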

all_links = []
location = []
events = []
f1 = []
f2 = []
winner = []
f1_odds = []
f2_odds = []
label = []
favourite = []

def scrape_data():
    # request the page listing favourites vs. underdogs for the UFC (Org=1)
    data = requests.get("https://www.betmma.tips/mma_betting_favorites_vs_underdogs.php?Org=1")
    soup = BeautifulSoup(data.text, 'html.parser')

    # the results table is the one rendered at 98% width
    table = soup.find('table', {'width': "98%"})
    # find all event links in that table
    links = table.find_all('a', href=True)

    # append all event links to a list
    for link in links:
        all_links.append("https://www.betmma.tips/" + link.get('href'))

    # visit each event page and pull the per-fight details
    for link in all_links:
        print(f"Now scraping link: {link}")

        data = requests.get(link)
        soup = BeautifulSoup(data.text, 'html.parser')
        time.sleep(2)  # pause between requests to be polite to the server
        # each fight sits in its own table with cellspacing=5
        rows = soup.find_all('table', {'cellspacing': "5"})

        for row in rows:

            # WON/LOST result cells plus the two odds cells
            odds = row.find_all('td', {'align': "center", 'valign': "middle"})
            # skip draws, which show neither WON nor LOST
            if odds[0].text not in ['WON', 'LOST']:
                continue

            # event name
            h1 = soup.find("h1")
            # location and date
            h2 = soup.find("h2")

            events.append(h1.text)
            location.append(h2.text)

            odds_f1 = float(odds[2].text.strip(" @"))
            odds_f2 = float(odds[3].text.strip(" @"))

            f1_odds.append(odds_f1)
            f2_odds.append(odds_f2)

            # map WON/LOST to the corresponding odds to generate the label:
            # if the winner's odds were the higher of the two, the underdog won
            odds_dict = {}
            odds_dict[odds[0].text] = odds_f1
            odds_dict[odds[1].text] = odds_f2

            if odds_dict["WON"] > odds_dict["LOST"]:
                label.append("Underdog")
            else:
                label.append("Favourite")

            # the fighter with the lower odds is the favourite
            if odds_f1 > odds_f2:
                favourite.append("f2")
            else:
                favourite.append("f1")

            fighters = row.find_all('a', attrs={'href': re.compile("^fighter_profile.php")})
            f1.append(fighters[0].text)
            f2.append(fighters[1].text)
            winner.append(fighters[2].text)
    return None
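
# Editorial note: a bare requests.get() will raise on any transient network
# failure and abort the whole scrape. A minimal retry wrapper is sketched below,
# assuming a short exponential backoff is acceptable; fetch_with_retries is a
# hypothetical helper, not part of the original scraper, and could be swapped in
# wherever requests.get() is called above.

def fetch_with_retries(url, retries=3, backoff=2.0):
    # try the request a few times, doubling the pause after each failure
    for attempt in range(retries):
        try:
            response = requests.get(url, timeout=30)
            response.raise_for_status()
            return response
        except requests.RequestException:
            if attempt == retries - 1:
                raise
            time.sleep(backoff * (2 ** attempt))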

def create_df():

    # assemble the accumulated lists into one DataFrame, one row per fight
    df = pd.DataFrame()
    df["Events"] = events
    df["Location"] = location
    df["Fighter1"] = f1
    df["Fighter2"] = f2
    df["Winner"] = winner
    df["fighter1_odds"] = f1_odds
    df["fighter2_odds"] = f2_odds
    df["Favourite"] = favourite
    df["Label"] = label
    print(f"Successfully scraped {df.shape[0]} fights and last fight card was {df.iloc[-1, :]['Events']} {df.iloc[-1, :]['Location']}")
    # share of fights won by the favourite vs. the underdog
    # (normalize=True replaces the hardcoded row count 2265, which was only
    # correct for one snapshot of the data)
    print(df["Label"].value_counts(normalize=True))

    return df

def preprocessing(df):
    # disambiguate duplicate fighter names: "Dong Hyun Kim (2)" appears
    # as "Dong Hyun Ma" in the fighters database
    df = df.replace("Dong Hyun Kim (2)", "Dong Hyun Ma")
    return df
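
# Editorial note: if more duplicate fighter names surface, a single replacement
# map is easier to extend than chained replace() calls (NAME_FIXES is a
# hypothetical name, not in the original scraper):
#
# NAME_FIXES = {"Dong Hyun Kim (2)": "Dong Hyun Ma"}
# df = df.replace(NAME_FIXES)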

# functions to compute per-stat deltas, always from the favourite's perspective
# (favourite's value minus the underdog's; after the merges below, _x columns
# belong to Fighter1 and _y columns to Fighter2)

def odds_delta(df):
    if df["Favourite"] == "f1":
        return df["fighter1_odds"] - df["fighter2_odds"]
    else:
        return df["fighter2_odds"] - df["fighter1_odds"]

def reach_delta(df):
    if df["Favourite"] == "f1":
        return df["REACH_x"] - df["REACH_y"]
    else:
        return df["REACH_y"] - df["REACH_x"]

def slpm_delta(df):
    if df["Favourite"] == "f1":
        return df["SLPM_x"] - df["SLPM_y"]
    else:
        return df["SLPM_y"] - df["SLPM_x"]

def sapm_delta(df):
    if df["Favourite"] == "f1":
        return df["SAPM_x"] - df["SAPM_y"]
    else:
        return df["SAPM_y"] - df["SAPM_x"]

def stra_delta(df):
    if df["Favourite"] == "f1":
        return df["STRA_x"] - df["STRA_y"]
    else:
        return df["STRA_y"] - df["STRA_x"]

def strd_delta(df):
    if df["Favourite"] == "f1":
        return df["STRD_x"] - df["STRD_y"]
    else:
        return df["STRD_y"] - df["STRD_x"]

def td_delta(df):
    if df["Favourite"] == "f1":
        return df["TD_x"] - df["TD_y"]
    else:
        return df["TD_y"] - df["TD_x"]

def tda_delta(df):
    if df["Favourite"] == "f1":
        return df["TDA_x"] - df["TDA_y"]
    else:
        return df["TDA_y"] - df["TDA_x"]

def tdd_delta(df):
    if df["Favourite"] == "f1":
        return df["TDD_x"] - df["TDD_y"]
    else:
        return df["TDD_y"] - df["TDD_x"]

def suba_delta(df):
    if df["Favourite"] == "f1":
        return df["SUBA_x"] - df["SUBA_y"]
    else:
        return df["SUBA_y"] - df["SUBA_x"]


def merge_data(df):

    # We're always asking for JSON because it's the easiest to deal with
    morph_api_url = "https://api.morph.io/jasonchanhku/ufc_fighters_db/data.json"

    # Keep this key secret!
    morph_api_key = "mF/o1gYK/7iCHIu5h5Sw"

    r = requests.get(morph_api_url, params={
        'key': morph_api_key,
        'query': "select * from data"
    })

    j = r.json()

    # fighters database to be merged in
    fighters_db = pd.DataFrame.from_dict(j)

    # merge twice so each fight row carries both fighters' stats:
    # Fighter1's columns get the _x suffix, Fighter2's get _y
    test = pd.merge(df, fighters_db, left_on=["Fighter1"], right_on=["NAME"])
    test2 = pd.merge(test, fighters_db, left_on=["Fighter2"], right_on=["NAME"])

    test2["Odds_delta"] = test2.apply(odds_delta, axis=1)
    test2["REACH_delta"] = test2.apply(reach_delta, axis=1)
    test2["SLPM_delta"] = test2.apply(slpm_delta, axis=1)
    test2["SAPM_delta"] = test2.apply(sapm_delta, axis=1)
    test2["STRA_delta"] = test2.apply(stra_delta, axis=1)
    test2["STRD_delta"] = test2.apply(strd_delta, axis=1)
    test2["TD_delta"] = test2.apply(td_delta, axis=1)
    test2["TDA_delta"] = test2.apply(tda_delta, axis=1)
    test2["TDD_delta"] = test2.apply(tdd_delta, axis=1)
    test2["SUBA_delta"] = test2.apply(suba_delta, axis=1)

    final_df = test2[['Events', 'Location', 'Fighter1', 'Fighter2', 'Favourite', 'Label', 'REACH_delta', 'SLPM_delta', 'SAPM_delta', 'STRA_delta', 'STRD_delta', 'TD_delta', 'TDA_delta', 'TDD_delta', 'SUBA_delta', 'Odds_delta']]

    return final_df

scrape_data()
df = create_df()
df = preprocessing(df)
df = merge_data(df)

conn = sqlite3.connect('data.sqlite')
df.to_sql('data', conn, if_exists='replace')
print('Fights Merged Db successfully constructed and saved')
conn.close()
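
The script ends by writing the merged table to data.sqlite. A minimal sketch of reading it back with pandas, assuming the file sits in the working directory and the table name 'data' used by to_sql above:

import sqlite3
import pandas as pd

conn = sqlite3.connect('data.sqlite')
# read back the table written by df.to_sql('data', ...)
fights = pd.read_sql_query("select * from data", conn)
conn.close()
print(fights.shape)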
