From da4c3ab574ea25b8a5e2f0558f6b766141a7ffdb Mon Sep 17 00:00:00 2001 From: Jason Chan Date: Fri, 15 Feb 2019 13:34:55 +0000 Subject: [PATCH] completed scraper --- .ipynb_checkpoints/Untitled-checkpoint.ipynb | 611 ------------------ .../requirements-checkpoint.txt | 7 + .ipynb_checkpoints/runtime-checkpoint.txt | 1 + .ipynb_checkpoints/scraper-checkpoint.py | 222 +++++++ Untitled.ipynb | 611 ------------------ requirements.txt | 10 +- runtime.txt | 2 +- scraper.py | 246 ++++++- 8 files changed, 457 insertions(+), 1253 deletions(-) delete mode 100644 .ipynb_checkpoints/Untitled-checkpoint.ipynb create mode 100644 .ipynb_checkpoints/requirements-checkpoint.txt create mode 100644 .ipynb_checkpoints/runtime-checkpoint.txt create mode 100644 .ipynb_checkpoints/scraper-checkpoint.py delete mode 100644 Untitled.ipynb diff --git a/.ipynb_checkpoints/Untitled-checkpoint.ipynb b/.ipynb_checkpoints/Untitled-checkpoint.ipynb deleted file mode 100644 index 8bba2b5..0000000 --- a/.ipynb_checkpoints/Untitled-checkpoint.ipynb +++ /dev/null @@ -1,611 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": 255, - "metadata": {}, - "outputs": [], - "source": [ - "import requests\n", - "from bs4 import BeautifulSoup\n", - "import pandas as pd\n", - "import numpy as np\n", - "import string\n", - "import re\n", - "import datetime\n", - "import sqlite3\n", - "import time\n", - "\n", - "all_links = []\n", - "location = []\n", - "events = []\n", - "f1 = []\n", - "f2 = []\n", - "winner = []\n", - "f1_odds = []\n", - "f2_odds = []\n", - "label = []\n", - "favourite = []" - ] - }, - { - "cell_type": "code", - "execution_count": 256, - "metadata": {}, - "outputs": [], - "source": [ - "# set up page to extract table\n", - "data = requests.get(\"https://www.betmma.tips/mma_betting_favorites_vs_underdogs.php?Org=1\")\n", - "soup = BeautifulSoup(data.text, 'html.parser')" - ] - }, - { - "cell_type": "code", - "execution_count": 257, - "metadata": {}, - "outputs": [], - "source": [ - "# table with 98% width \n", - "table = soup.find('table', {'width': \"98%\"})\n", - "# find all links in that table\n", - "links = table.find_all('a', href=True)" - ] - }, - { - "cell_type": "code", - "execution_count": 258, - "metadata": {}, - "outputs": [], - "source": [ - "# append all links to a list \n", - "for link in links:\n", - " all_links.append(\"https://www.betmma.tips/\"+link.get('href'))" - ] - }, - { - "cell_type": "code", - "execution_count": 259, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Now currently in link: https://www.betmma.tips/mma_event_betting_history.php?Event=173\n", - "Now currently in link: https://www.betmma.tips/mma_event_betting_history.php?Event=151\n", - "Now currently in link: https://www.betmma.tips/mma_event_betting_history.php?Event=150\n", - "Now currently in link: https://www.betmma.tips/mma_event_betting_history.php?Event=144\n", - "Now currently in link: https://www.betmma.tips/mma_event_betting_history.php?Event=143\n", - "Now currently in link: https://www.betmma.tips/mma_event_betting_history.php?Event=142\n", - "Now currently in link: https://www.betmma.tips/mma_event_betting_history.php?Event=114\n", - "Now currently in link: https://www.betmma.tips/mma_event_betting_history.php?Event=113\n", - "Now currently in link: https://www.betmma.tips/mma_event_betting_history.php?Event=112\n", - "Now currently in link: https://www.betmma.tips/mma_event_betting_history.php?Event=110\n", - "Now currently in link: https://www.betmma.tips/mma_event_betting_history.php?Event=109\n", - "Now currently in link: https://www.betmma.tips/mma_event_betting_history.php?Event=107\n", - "Now currently in link: https://www.betmma.tips/mma_event_betting_history.php?Event=106\n", - "Now currently in link: https://www.betmma.tips/mma_event_betting_history.php?Event=105\n", - "Now currently in link: https://www.betmma.tips/mma_event_betting_history.php?Event=97\n", - "Now currently in link: https://www.betmma.tips/mma_event_betting_history.php?Event=95\n", - "Now currently in link: https://www.betmma.tips/mma_event_betting_history.php?Event=94\n", - "Now currently in link: https://www.betmma.tips/mma_event_betting_history.php?Event=93\n", - "Now currently in link: https://www.betmma.tips/mma_event_betting_history.php?Event=92\n", - "Now currently in link: https://www.betmma.tips/mma_event_betting_history.php?Event=88\n", - "Now currently in link: https://www.betmma.tips/mma_event_betting_history.php?Event=87\n", - "Now currently in link: https://www.betmma.tips/mma_event_betting_history.php?Event=85\n", - "Now currently in link: https://www.betmma.tips/mma_event_betting_history.php?Event=81\n", - "Now currently in link: https://www.betmma.tips/mma_event_betting_history.php?Event=79\n", - "Now currently in link: https://www.betmma.tips/mma_event_betting_history.php?Event=77\n", - "Now currently in link: https://www.betmma.tips/mma_event_betting_history.php?Event=75\n", - "Now currently in link: https://www.betmma.tips/mma_event_betting_history.php?Event=73\n", - "Now currently in link: https://www.betmma.tips/mma_event_betting_history.php?Event=72\n", - "Now currently in link: https://www.betmma.tips/mma_event_betting_history.php?Event=71\n", - "Now currently in link: https://www.betmma.tips/mma_event_betting_history.php?Event=1\n", - "Now currently in link: https://www.betmma.tips/mma_event_betting_history.php?Event=2\n", - "Now currently in link: https://www.betmma.tips/mma_event_betting_history.php?Event=3\n", - "Now currently in link: https://www.betmma.tips/mma_event_betting_history.php?Event=5\n", - "Now currently in link: https://www.betmma.tips/mma_event_betting_history.php?Event=6\n", - "Now currently in link: https://www.betmma.tips/mma_event_betting_history.php?Event=7\n", - "Now currently in link: https://www.betmma.tips/mma_event_betting_history.php?Event=9\n", - "Now currently in link: https://www.betmma.tips/mma_event_betting_history.php?Event=10\n", - "Now currently in link: https://www.betmma.tips/mma_event_betting_history.php?Event=11\n", - "Now currently in link: https://www.betmma.tips/mma_event_betting_history.php?Event=12\n", - "Now currently in link: https://www.betmma.tips/mma_event_betting_history.php?Event=13\n", - "Now currently in link: https://www.betmma.tips/mma_event_betting_history.php?Event=14\n", - "Now currently in link: https://www.betmma.tips/mma_event_betting_history.php?Event=15\n", - "Now currently in link: https://www.betmma.tips/mma_event_betting_history.php?Event=16\n", - "Now currently in link: https://www.betmma.tips/mma_event_betting_history.php?Event=17\n", - "Now currently in link: https://www.betmma.tips/mma_event_betting_history.php?Event=18\n", - "Now currently in link: https://www.betmma.tips/mma_event_betting_history.php?Event=19\n", - "Now currently in link: https://www.betmma.tips/mma_event_betting_history.php?Event=20\n", - "Now currently in link: https://www.betmma.tips/mma_event_betting_history.php?Event=21\n", - "Now currently in link: https://www.betmma.tips/mma_event_betting_history.php?Event=22\n", - "Now currently in link: https://www.betmma.tips/mma_event_betting_history.php?Event=23\n", - "Now currently in link: https://www.betmma.tips/mma_event_betting_history.php?Event=24\n", - "Now currently in link: https://www.betmma.tips/mma_event_betting_history.php?Event=25\n", - "Now currently in link: https://www.betmma.tips/mma_event_betting_history.php?Event=26\n", - "Now currently in link: https://www.betmma.tips/mma_event_betting_history.php?Event=27\n", - "Now currently in link: https://www.betmma.tips/mma_event_betting_history.php?Event=28\n", - "Now currently in link: https://www.betmma.tips/mma_event_betting_history.php?Event=29\n", - "Now currently in link: https://www.betmma.tips/mma_event_betting_history.php?Event=30\n", - "Now currently in link: https://www.betmma.tips/mma_event_betting_history.php?Event=31\n", - "Now currently in link: https://www.betmma.tips/mma_event_betting_history.php?Event=32\n", - "Now currently in link: https://www.betmma.tips/mma_event_betting_history.php?Event=33\n", - "Now currently in link: https://www.betmma.tips/mma_event_betting_history.php?Event=34\n", - "Now currently in link: https://www.betmma.tips/mma_event_betting_history.php?Event=35\n", - "Now currently in link: https://www.betmma.tips/mma_event_betting_history.php?Event=36\n", - "Now currently in link: https://www.betmma.tips/mma_event_betting_history.php?Event=37\n", - "Now currently in link: https://www.betmma.tips/mma_event_betting_history.php?Event=38\n", - "Now currently in link: https://www.betmma.tips/mma_event_betting_history.php?Event=39\n", - "Now currently in link: https://www.betmma.tips/mma_event_betting_history.php?Event=42\n", - "Now currently in link: https://www.betmma.tips/mma_event_betting_history.php?Event=40\n", - "Now currently in link: https://www.betmma.tips/mma_event_betting_history.php?Event=44\n", - "Now currently in link: https://www.betmma.tips/mma_event_betting_history.php?Event=45\n", - "Now currently in link: https://www.betmma.tips/mma_event_betting_history.php?Event=46\n", - "Now currently in link: https://www.betmma.tips/mma_event_betting_history.php?Event=47\n", - "Now currently in link: https://www.betmma.tips/mma_event_betting_history.php?Event=48\n", - "Now currently in link: https://www.betmma.tips/mma_event_betting_history.php?Event=49\n", - "Now currently in link: https://www.betmma.tips/mma_event_betting_history.php?Event=55\n", - "Now currently in link: https://www.betmma.tips/mma_event_betting_history.php?Event=54\n", - "Now currently in link: https://www.betmma.tips/mma_event_betting_history.php?Event=50\n", - "Now currently in link: https://www.betmma.tips/mma_event_betting_history.php?Event=51\n", - "Now currently in link: https://www.betmma.tips/mma_event_betting_history.php?Event=52\n", - "Now currently in link: https://www.betmma.tips/mma_event_betting_history.php?Event=61\n", - "Now currently in link: https://www.betmma.tips/mma_event_betting_history.php?Event=66\n", - "Now currently in link: https://www.betmma.tips/mma_event_betting_history.php?Event=100\n", - "Now currently in link: https://www.betmma.tips/mma_event_betting_history.php?Event=101\n", - "Now currently in link: https://www.betmma.tips/mma_event_betting_history.php?Event=102\n", - "Now currently in link: https://www.betmma.tips/mma_event_betting_history.php?Event=119\n", - "Now currently in link: https://www.betmma.tips/mma_event_betting_history.php?Event=120\n", - "Now currently in link: https://www.betmma.tips/mma_event_betting_history.php?Event=124\n", - "Now currently in link: https://www.betmma.tips/mma_event_betting_history.php?Event=123\n", - "Now currently in link: https://www.betmma.tips/mma_event_betting_history.php?Event=126\n", - "Now currently in link: https://www.betmma.tips/mma_event_betting_history.php?Event=165\n", - "Now currently in link: https://www.betmma.tips/mma_event_betting_history.php?Event=121\n", - "Now currently in link: https://www.betmma.tips/mma_event_betting_history.php?Event=170\n", - "Now currently in link: https://www.betmma.tips/mma_event_betting_history.php?Event=166\n", - "Now currently in link: https://www.betmma.tips/mma_event_betting_history.php?Event=167\n", - "Now currently in link: https://www.betmma.tips/mma_event_betting_history.php?Event=176\n", - "Now currently in link: https://www.betmma.tips/mma_event_betting_history.php?Event=125\n", - "Now currently in link: https://www.betmma.tips/mma_event_betting_history.php?Event=171\n", - "Now currently in link: https://www.betmma.tips/mma_event_betting_history.php?Event=180\n", - "Now currently in link: https://www.betmma.tips/mma_event_betting_history.php?Event=177\n", - "Now currently in link: https://www.betmma.tips/mma_event_betting_history.php?Event=181\n", - "Now currently in link: https://www.betmma.tips/mma_event_betting_history.php?Event=190\n", - "Now currently in link: https://www.betmma.tips/mma_event_betting_history.php?Event=184\n", - "Now currently in link: https://www.betmma.tips/mma_event_betting_history.php?Event=191\n", - "Now currently in link: https://www.betmma.tips/mma_event_betting_history.php?Event=186\n", - "Now currently in link: https://www.betmma.tips/mma_event_betting_history.php?Event=185\n", - "Now currently in link: https://www.betmma.tips/mma_event_betting_history.php?Event=194\n", - "Now currently in link: https://www.betmma.tips/mma_event_betting_history.php?Event=192\n", - "Now currently in link: https://www.betmma.tips/mma_event_betting_history.php?Event=196\n", - "Now currently in link: https://www.betmma.tips/mma_event_betting_history.php?Event=197\n", - "Now currently in link: https://www.betmma.tips/mma_event_betting_history.php?Event=199\n", - "Now currently in link: https://www.betmma.tips/mma_event_betting_history.php?Event=203\n", - "Now currently in link: https://www.betmma.tips/mma_event_betting_history.php?Event=204\n", - "Now currently in link: https://www.betmma.tips/mma_event_betting_history.php?Event=212\n", - "Now currently in link: https://www.betmma.tips/mma_event_betting_history.php?Event=205\n", - "Now currently in link: https://www.betmma.tips/mma_event_betting_history.php?Event=206\n", - "Now currently in link: https://www.betmma.tips/mma_event_betting_history.php?Event=209\n", - "Now currently in link: https://www.betmma.tips/mma_event_betting_history.php?Event=207\n", - "Now currently in link: https://www.betmma.tips/mma_event_betting_history.php?Event=210\n", - "Now currently in link: https://www.betmma.tips/mma_event_betting_history.php?Event=208\n", - "Now currently in link: https://www.betmma.tips/mma_event_betting_history.php?Event=217\n", - "Now currently in link: https://www.betmma.tips/mma_event_betting_history.php?Event=220\n", - "Now currently in link: https://www.betmma.tips/mma_event_betting_history.php?Event=221\n", - "Now currently in link: https://www.betmma.tips/mma_event_betting_history.php?Event=219\n", - "Now currently in link: https://www.betmma.tips/mma_event_betting_history.php?Event=226\n", - "Now currently in link: https://www.betmma.tips/mma_event_betting_history.php?Event=223\n", - "Now currently in link: https://www.betmma.tips/mma_event_betting_history.php?Event=224\n", - "Now currently in link: https://www.betmma.tips/mma_event_betting_history.php?Event=225\n", - "Now currently in link: https://www.betmma.tips/mma_event_betting_history.php?Event=231\n", - "Now currently in link: https://www.betmma.tips/mma_event_betting_history.php?Event=235\n", - "Now currently in link: https://www.betmma.tips/mma_event_betting_history.php?Event=234\n", - "Now currently in link: https://www.betmma.tips/mma_event_betting_history.php?Event=233\n", - "Now currently in link: https://www.betmma.tips/mma_event_betting_history.php?Event=237\n", - "Now currently in link: https://www.betmma.tips/mma_event_betting_history.php?Event=236\n", - "Now currently in link: https://www.betmma.tips/mma_event_betting_history.php?Event=241\n", - "Now currently in link: https://www.betmma.tips/mma_event_betting_history.php?Event=243\n", - "Now currently in link: https://www.betmma.tips/mma_event_betting_history.php?Event=240\n", - "Now currently in link: https://www.betmma.tips/mma_event_betting_history.php?Event=251\n", - "Now currently in link: https://www.betmma.tips/mma_event_betting_history.php?Event=246\n", - "Now currently in link: https://www.betmma.tips/mma_event_betting_history.php?Event=250\n", - "Now currently in link: https://www.betmma.tips/mma_event_betting_history.php?Event=252\n", - "Now currently in link: https://www.betmma.tips/mma_event_betting_history.php?Event=263\n", - "Now currently in link: https://www.betmma.tips/mma_event_betting_history.php?Event=264\n", - "Now currently in link: https://www.betmma.tips/mma_event_betting_history.php?Event=273\n", - "Now currently in link: https://www.betmma.tips/mma_event_betting_history.php?Event=274\n", - "Now currently in link: https://www.betmma.tips/mma_event_betting_history.php?Event=275\n", - "Now currently in link: https://www.betmma.tips/mma_event_betting_history.php?Event=280\n", - "Now currently in link: https://www.betmma.tips/mma_event_betting_history.php?Event=279\n", - "Now currently in link: https://www.betmma.tips/mma_event_betting_history.php?Event=285\n", - "Now currently in link: https://www.betmma.tips/mma_event_betting_history.php?Event=286\n", - "Now currently in link: https://www.betmma.tips/mma_event_betting_history.php?Event=295\n", - "Now currently in link: https://www.betmma.tips/mma_event_betting_history.php?Event=303\n", - "Now currently in link: https://www.betmma.tips/mma_event_betting_history.php?Event=302\n", - "Now currently in link: https://www.betmma.tips/mma_event_betting_history.php?Event=299\n", - "Now currently in link: https://www.betmma.tips/mma_event_betting_history.php?Event=294\n", - "Now currently in link: https://www.betmma.tips/mma_event_betting_history.php?Event=304\n", - "Now currently in link: https://www.betmma.tips/mma_event_betting_history.php?Event=305\n", - "Now currently in link: https://www.betmma.tips/mma_event_betting_history.php?Event=306\n", - "Now currently in link: https://www.betmma.tips/mma_event_betting_history.php?Event=312\n", - "Now currently in link: https://www.betmma.tips/mma_event_betting_history.php?Event=311\n", - "Now currently in link: https://www.betmma.tips/mma_event_betting_history.php?Event=320\n", - "Now currently in link: https://www.betmma.tips/mma_event_betting_history.php?Event=319\n", - "Now currently in link: https://www.betmma.tips/mma_event_betting_history.php?Event=321\n", - "Now currently in link: https://www.betmma.tips/mma_event_betting_history.php?Event=322\n", - "Now currently in link: https://www.betmma.tips/mma_event_betting_history.php?Event=323\n", - "Now currently in link: https://www.betmma.tips/mma_event_betting_history.php?Event=326\n", - "Now currently in link: https://www.betmma.tips/mma_event_betting_history.php?Event=328\n", - "Now currently in link: https://www.betmma.tips/mma_event_betting_history.php?Event=332\n", - "Now currently in link: https://www.betmma.tips/mma_event_betting_history.php?Event=343\n", - "Now currently in link: https://www.betmma.tips/mma_event_betting_history.php?Event=337\n", - "Now currently in link: https://www.betmma.tips/mma_event_betting_history.php?Event=329\n", - "Now currently in link: https://www.betmma.tips/mma_event_betting_history.php?Event=360\n", - "Now currently in link: https://www.betmma.tips/mma_event_betting_history.php?Event=361\n", - "Now currently in link: https://www.betmma.tips/mma_event_betting_history.php?Event=327\n", - "Now currently in link: https://www.betmma.tips/mma_event_betting_history.php?Event=359\n", - "Now currently in link: https://www.betmma.tips/mma_event_betting_history.php?Event=352\n", - "Now currently in link: https://www.betmma.tips/mma_event_betting_history.php?Event=358\n", - "Now currently in link: https://www.betmma.tips/mma_event_betting_history.php?Event=344\n", - "Now currently in link: https://www.betmma.tips/mma_event_betting_history.php?Event=381\n", - "Now currently in link: https://www.betmma.tips/mma_event_betting_history.php?Event=380\n", - "Now currently in link: https://www.betmma.tips/mma_event_betting_history.php?Event=382\n", - "Now currently in link: https://www.betmma.tips/mma_event_betting_history.php?Event=383\n", - "Now currently in link: https://www.betmma.tips/mma_event_betting_history.php?Event=391\n", - "Now currently in link: https://www.betmma.tips/mma_event_betting_history.php?Event=393\n", - "Now currently in link: https://www.betmma.tips/mma_event_betting_history.php?Event=392\n", - "Now currently in link: https://www.betmma.tips/mma_event_betting_history.php?Event=394\n", - "Now currently in link: https://www.betmma.tips/mma_event_betting_history.php?Event=408\n", - "Now currently in link: https://www.betmma.tips/mma_event_betting_history.php?Event=409\n", - "Now currently in link: https://www.betmma.tips/mma_event_betting_history.php?Event=403\n", - "Now currently in link: https://www.betmma.tips/mma_event_betting_history.php?Event=407\n", - "Now currently in link: https://www.betmma.tips/mma_event_betting_history.php?Event=406\n", - "Now currently in link: https://www.betmma.tips/mma_event_betting_history.php?Event=423\n", - "Now currently in link: https://www.betmma.tips/mma_event_betting_history.php?Event=399\n", - "Now currently in link: https://www.betmma.tips/mma_event_betting_history.php?Event=433\n", - "Now currently in link: https://www.betmma.tips/mma_event_betting_history.php?Event=434\n", - "Now currently in link: https://www.betmma.tips/mma_event_betting_history.php?Event=435\n" - ] - } - ], - "source": [ - "# test for one use case\n", - "for link in all_links:\n", - " print(f\"Now currently scraping link: {link}\")\n", - " \n", - " data = requests.get(link)\n", - " soup = BeautifulSoup(data.text, 'html.parser')\n", - " \n", - " # specific table with the information\n", - " rows = soup.find_all('table', {'cellspacing': \"5\"})\n", - " \n", - " for row in rows:\n", - " \n", - " # check for draw, if draw, then skip\n", - " # dictionary of won and lost\n", - " odds = row.find_all('td', {'align': \"center\", 'valign': \"middle\"})\n", - " if odds[0].text not in ['WON', 'LOST']:\n", - " continue\n", - " \n", - " # event name\n", - " h1 = soup.find(\"h1\")\n", - " # location and date\n", - " h2 = soup.find(\"h2\")\n", - "\n", - " events.append(h1.text)\n", - " location.append(h2.text)\n", - " \n", - " odds_f1 = float(odds[2].text.strip(\" @\"))\n", - " odds_f2 = float(odds[3].text.strip(\" @\"))\n", - " \n", - " f1_odds.append(odds_f1)\n", - " f2_odds.append(odds_f2)\n", - " \n", - " # how to generate label\n", - " odds_dict = {}\n", - " odds_dict[odds[0].text] = odds_f1\n", - " odds_dict[odds[1].text] = odds_f2 \n", - " \n", - " if odds_dict[\"WON\"] > odds_dict[\"LOST\"]:\n", - " label.append(\"Underdog\")\n", - " else:\n", - " label.append(\"Favourite\")\n", - " \n", - " if odds_f1 > odds_f2:\n", - " favourite.append(\"f2\")\n", - " else:\n", - " favourite.append(\"f1\")\n", - " \n", - " \n", - " fighters = row.find_all('a', attrs={'href': re.compile(\"^fighter_profile.php\")})\n", - " f1.append(fighters[0].text)\n", - " f2.append(fighters[1].text)\n", - " winner.append(fighters[2].text)\n", - " \n" - ] - }, - { - "cell_type": "code", - "execution_count": 252, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Events list: ['UFC 159 - Jones vs. Sonnen', 'UFC 159 - Jones vs. Sonnen', 'UFC 159 - Jones vs. Sonnen', 'UFC 159 - Jones vs. Sonnen', 'UFC 159 - Jones vs. Sonnen', 'UFC 159 - Jones vs. Sonnen', 'UFC 159 - Jones vs. Sonnen', 'UFC 159 - Jones vs. Sonnen', 'UFC 159 - Jones vs. Sonnen', 'UFC 159 - Jones vs. Sonnen', 'UFC 159 - Jones vs. Sonnen', 'UFC on FX 8 - Belfort vs. Rockhold', 'UFC on FX 8 - Belfort vs. Rockhold', 'UFC on FX 8 - Belfort vs. Rockhold', 'UFC on FX 8 - Belfort vs. Rockhold', 'UFC on FX 8 - Belfort vs. Rockhold', 'UFC on FX 8 - Belfort vs. Rockhold', 'UFC on FX 8 - Belfort vs. Rockhold', 'UFC on FX 8 - Belfort vs. Rockhold', 'UFC on FX 8 - Belfort vs. Rockhold', 'UFC on FX 8 - Belfort vs. Rockhold', 'UFC on FX 8 - Belfort vs. Rockhold', 'UFC on FX 8 - Belfort vs. Rockhold', 'UFC on FX 8 - Belfort vs. Rockhold', 'UFC 160 - Velasquez vs. Bigfoot 2', 'UFC 160 - Velasquez vs. Bigfoot 2', 'UFC 160 - Velasquez vs. Bigfoot 2', 'UFC 160 - Velasquez vs. Bigfoot 2', 'UFC 160 - Velasquez vs. Bigfoot 2', 'UFC 160 - Velasquez vs. Bigfoot 2', 'UFC 160 - Velasquez vs. Bigfoot 2', 'UFC 160 - Velasquez vs. Bigfoot 2', 'UFC 160 - Velasquez vs. Bigfoot 2', 'UFC 160 - Velasquez vs. Bigfoot 2', 'UFC 160 - Velasquez vs. Bigfoot 2', 'UFC 160 - Velasquez vs. Bigfoot 2']\n", - "Location list: ['Newark, New Jersey; 27th Apr 2013', 'Newark, New Jersey; 27th Apr 2013', 'Newark, New Jersey; 27th Apr 2013', 'Newark, New Jersey; 27th Apr 2013', 'Newark, New Jersey; 27th Apr 2013', 'Newark, New Jersey; 27th Apr 2013', 'Newark, New Jersey; 27th Apr 2013', 'Newark, New Jersey; 27th Apr 2013', 'Newark, New Jersey; 27th Apr 2013', 'Newark, New Jersey; 27th Apr 2013', 'Newark, New Jersey; 27th Apr 2013', 'Santa Catarina, Brazil; 18th May 2013', 'Santa Catarina, Brazil; 18th May 2013', 'Santa Catarina, Brazil; 18th May 2013', 'Santa Catarina, Brazil; 18th May 2013', 'Santa Catarina, Brazil; 18th May 2013', 'Santa Catarina, Brazil; 18th May 2013', 'Santa Catarina, Brazil; 18th May 2013', 'Santa Catarina, Brazil; 18th May 2013', 'Santa Catarina, Brazil; 18th May 2013', 'Santa Catarina, Brazil; 18th May 2013', 'Santa Catarina, Brazil; 18th May 2013', 'Santa Catarina, Brazil; 18th May 2013', 'Santa Catarina, Brazil; 18th May 2013', 'Las Vegas, Nevada; 25th May 2013', 'Las Vegas, Nevada; 25th May 2013', 'Las Vegas, Nevada; 25th May 2013', 'Las Vegas, Nevada; 25th May 2013', 'Las Vegas, Nevada; 25th May 2013', 'Las Vegas, Nevada; 25th May 2013', 'Las Vegas, Nevada; 25th May 2013', 'Las Vegas, Nevada; 25th May 2013', 'Las Vegas, Nevada; 25th May 2013', 'Las Vegas, Nevada; 25th May 2013', 'Las Vegas, Nevada; 25th May 2013', 'Las Vegas, Nevada; 25th May 2013']\n", - "f1 list: ['Jon Jones', 'Michael Bisping', 'Roy Nelson', 'Phil Davis', 'Pat Healy', 'Rustam Khabilov', 'Ovince St Preux', 'Sara McMann', 'Bryan Caraway', 'Cody McKenzie', 'Steven Siler', 'Vitor Belfort', 'Ronaldo Souza', 'Rafael dos Anjos', 'Rafael Natal', 'Nik Lentz', 'Francisco Trinaldo', 'Gleison Tibau', 'Paulo Thiago', 'Yuri Alcantara', 'Fabio Maldonado', 'John Lineker', 'Jussier Formiga', 'Lucas Martins', 'Cain Velasquez', 'Junior dos Santos', 'Glover Teixeira', 'T.J. Grant', 'Donald Cerrone', 'Mike Pyle', 'Dennis Bermudez', 'Robert Whittaker', 'Khabib Nurmagomedov', 'Stephen Thompson', 'George Roop', 'Jeremy Stephens']\n", - "f1 odds list: [1.13, 1.57, 1.43, 1.36, 3.4, 1.54, 1.69, 1.18, 2.15, 2.93, 1.95, 1.95, 1.17, 1.5, 1.36, 1.99, 1.33, 1.42, 1.57, 1.2, 1.38, 2.1, 1.65, 1.33, 1.16, 1.25, 1.33, 2.65, 1.37, 2.5, 1.4, 2.75, 1.4, 1.63, 3.66, 1.6]\n", - "f2 list: ['Chael Sonnen', 'Alan Belcher', 'Cheick Kongo', 'Vinny Magalhaes', 'Jim Miller', 'Yancy Medeiros', 'Gian Villante', 'Sheila Gaff', 'Johnny Bedford', 'Leonard Garcia', 'Kurt Holobaugh', 'Luke Rockhold', 'Chris Camozzi', 'Evan Dunham', 'Joao Zeferino', 'Hacran Dias', 'Mike Rio', 'John Cholish', 'Michel Prazeres', 'Iliarde Santos', 'Roger Hollett', 'Azamat Gashimov', 'Chris Cariaso', 'Jeremy Larsen', 'Antonio Silva', 'Mark Hunt', 'James Te Huna', 'Gray Maynard', 'K.J. Noons', 'Rick Story', 'Max Holloway', 'Colton Smith', 'Abel Trujillo', 'Nah-Shon Burrell', 'Brian Bowles', 'Estevan Payan']\n", - "f2 odds list: [9.0, 4.5, 3.2, 3.55, 1.4, 3.4, 2.4, 5.8, 1.91, 1.53, 2.06, 2.04, 6.0, 2.91, 3.7, 2.1, 3.9, 3.35, 2.76, 5.5, 3.45, 1.87, 2.55, 3.85, 6.75, 5.25, 4.75, 1.74, 3.48, 1.67, 3.65, 1.71, 3.75, 2.65, 1.36, 2.9]\n", - "winners list: ['Jon Jones', 'Michael Bisping', 'Roy Nelson', 'Phil Davis', 'Pat Healy', 'Rustam Khabilov', 'Ovince St Preux', 'Sara McMann', 'Bryan Caraway', 'Cody McKenzie', 'Steven Siler', 'Vitor Belfort', 'Ronaldo Souza', 'Rafael dos Anjos', 'Rafael Natal', 'Nik Lentz', 'Francisco Trinaldo', 'Gleison Tibau', 'Paulo Thiago', 'Yuri Alcantara', 'Fabio Maldonado', 'John Lineker', 'Jussier Formiga', 'Lucas Martins', 'Cain Velasquez', 'Junior dos Santos', 'Glover Teixeira', 'T.J. Grant', 'Donald Cerrone', 'Mike Pyle', 'Dennis Bermudez', 'Robert Whittaker', 'Khabib Nurmagomedov', 'Stephen Thompson', 'George Roop', 'Jeremy Stephens']\n", - "labels list: ['Favourite', 'Favourite', 'Favourite', 'Favourite', 'Underdog', 'Favourite', 'Favourite', 'Favourite', 'Underdog', 'Underdog', 'Favourite', 'Favourite', 'Favourite', 'Favourite', 'Favourite', 'Favourite', 'Favourite', 'Favourite', 'Favourite', 'Favourite', 'Favourite', 'Underdog', 'Favourite', 'Favourite', 'Favourite', 'Favourite', 'Favourite', 'Underdog', 'Favourite', 'Underdog', 'Favourite', 'Underdog', 'Favourite', 'Favourite', 'Underdog', 'Favourite']\n", - "favourite is: ['f1', 'f1', 'f1', 'f1', 'f2', 'f1', 'f1', 'f1', 'f2', 'f2', 'f1', 'f1', 'f1', 'f1', 'f1', 'f1', 'f1', 'f1', 'f1', 'f1', 'f1', 'f2', 'f1', 'f1', 'f1', 'f1', 'f1', 'f2', 'f1', 'f2', 'f1', 'f2', 'f1', 'f1', 'f2', 'f1']\n" - ] - } - ], - "source": [ - "print(f\"Events list: {events}\")\n", - "print(f\"Location list: {location}\")\n", - "print(f\"f1 list: {f1}\")\n", - "print(f\"f1 odds list: {f1_odds}\")\n", - "print(f\"f2 list: {f2}\")\n", - "print(f\"f2 odds list: {f2_odds}\")\n", - "print(f\"winners list: {winner}\")\n", - "print(f\"labels list: {label}\")\n", - "print(f\"favourite is: {favourite}\")\n" - ] - }, - { - "cell_type": "code", - "execution_count": 260, - "metadata": {}, - "outputs": [], - "source": [ - "# creating dataframe\n", - "df = pd.DataFrame()\n", - "df[\"Events\"] = events\n", - "df[\"Location\"] = location\n", - "df[\"Fighter1\"] = f1\n", - "df[\"Fighter2\"] = f2\n", - "df[\"Winner\"] = winner\n", - "df[\"fighter1_odds\"] = f1_odds\n", - "df[\"fighter2_odds\"] = f2_odds\n", - "df[\"Favourite\"] = favourite\n", - "df[\"Label\"] = label" - ] - }, - { - "cell_type": "code", - "execution_count": 277, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Successfully scraped 2265 fights and last fight card was UFC 234 Melbourne; 9th Feb 2019\n" - ] - } - ], - "source": [ - "print(f\"Successfully scraped {df.shape[0]} fights and last fight card was {df.iloc[-1, :]['Events']} {df.iloc[-1, :]['Location']}\")" - ] - }, - { - "cell_type": "code", - "execution_count": 282, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "Favourite 0.654746\n", - "Underdog 0.345254\n", - "Name: Label, dtype: float64" - ] - }, - "execution_count": 282, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "df[\"Label\"].value_counts()/2265" - ] - }, - { - "cell_type": "code", - "execution_count": 286, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
EventsLocationFighter1Fighter2Winnerfighter1_oddsfighter2_oddsFavouriteLabel
0UFC 159 - Jones vs. SonnenNewark, New Jersey; 27th Apr 2013Jon JonesChael SonnenJon Jones1.139.00f1Favourite
1UFC 159 - Jones vs. SonnenNewark, New Jersey; 27th Apr 2013Michael BispingAlan BelcherMichael Bisping1.574.50f1Favourite
2UFC 159 - Jones vs. SonnenNewark, New Jersey; 27th Apr 2013Roy NelsonCheick KongoRoy Nelson1.433.20f1Favourite
3UFC 159 - Jones vs. SonnenNewark, New Jersey; 27th Apr 2013Phil DavisVinny MagalhaesPhil Davis1.363.55f1Favourite
4UFC 159 - Jones vs. SonnenNewark, New Jersey; 27th Apr 2013Pat HealyJim MillerPat Healy3.401.40f2Underdog
\n", - "
" - ], - "text/plain": [ - " Events Location \\\n", - "0 UFC 159 - Jones vs. Sonnen Newark, New Jersey; 27th Apr 2013 \n", - "1 UFC 159 - Jones vs. Sonnen Newark, New Jersey; 27th Apr 2013 \n", - "2 UFC 159 - Jones vs. Sonnen Newark, New Jersey; 27th Apr 2013 \n", - "3 UFC 159 - Jones vs. Sonnen Newark, New Jersey; 27th Apr 2013 \n", - "4 UFC 159 - Jones vs. Sonnen Newark, New Jersey; 27th Apr 2013 \n", - "\n", - " Fighter1 Fighter2 Winner fighter1_odds \\\n", - "0 Jon Jones Chael Sonnen Jon Jones 1.13 \n", - "1 Michael Bisping Alan Belcher Michael Bisping 1.57 \n", - "2 Roy Nelson Cheick Kongo Roy Nelson 1.43 \n", - "3 Phil Davis Vinny Magalhaes Phil Davis 1.36 \n", - "4 Pat Healy Jim Miller Pat Healy 3.40 \n", - "\n", - " fighter2_odds Favourite Label \n", - "0 9.00 f1 Favourite \n", - "1 4.50 f1 Favourite \n", - "2 3.20 f1 Favourite \n", - "3 3.55 f1 Favourite \n", - "4 1.40 f2 Underdog " - ] - }, - "execution_count": 286, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "df.head()" - ] - }, - { - "cell_type": "code", - "execution_count": 288, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "'\\nTODO\\n\\n- merge df above with df with fighters information\\n- fuzzy match instead of exact match due to different sources\\n- compute the deltas\\n- save it in sqlite and push it to app\\n- separate ML from main app\\n\\n'" - ] - }, - "execution_count": 288, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "\"\"\"\n", - "TODO\n", - "\n", - "- merge df above with df with fighters information\n", - "- fuzzy match instead of exact match due to different sources\n", - "- compute the deltas\n", - "- save it in sqlite and push it to app\n", - "- separate ML from main app\n", - "\n", - "\"\"\"" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.6.5" - } - }, - "nbformat": 4, - "nbformat_minor": 2 -} diff --git a/.ipynb_checkpoints/requirements-checkpoint.txt b/.ipynb_checkpoints/requirements-checkpoint.txt new file mode 100644 index 0000000..06541cf --- /dev/null +++ b/.ipynb_checkpoints/requirements-checkpoint.txt @@ -0,0 +1,7 @@ +# It's easy to add more libraries or choose different versions. Any libraries +# specified here will be installed and made available to your morph.io scraper. +# Find out more: https://morph.io/documentation/python +beautifulsoup4==4.6.0 +pandas==0.23.0 +numpy==1.14.3 +requests==2.18.4 \ No newline at end of file diff --git a/.ipynb_checkpoints/runtime-checkpoint.txt b/.ipynb_checkpoints/runtime-checkpoint.txt new file mode 100644 index 0000000..cfa5aa5 --- /dev/null +++ b/.ipynb_checkpoints/runtime-checkpoint.txt @@ -0,0 +1 @@ +python-3.6.2 diff --git a/.ipynb_checkpoints/scraper-checkpoint.py b/.ipynb_checkpoints/scraper-checkpoint.py new file mode 100644 index 0000000..df1564f --- /dev/null +++ b/.ipynb_checkpoints/scraper-checkpoint.py @@ -0,0 +1,222 @@ +import requests +from bs4 import BeautifulSoup +import pandas as pd +import numpy as np +import string +import re +import datetime +import sqlite3 +import time + +all_links = [] +location = [] +events = [] +f1 = [] +f2 = [] +winner = [] +f1_odds = [] +f2_odds = [] +label = [] +favourite = [] + +def scrape_data(): + # set up page to extract table + data = requests.get("https://www.betmma.tips/mma_betting_favorites_vs_underdogs.php?Org=1") + soup = BeautifulSoup(data.text, 'html.parser') + + # table with 98% width + table = soup.find('table', {'width': "98%"}) + # find all links in that table + links = table.find_all('a', href=True) + + # append all links to a list + for link in links: + all_links.append("https://www.betmma.tips/"+link.get('href')) + + # test for one use case + for link in all_links: + print(f"Now currently scraping link: {link}") + + data = requests.get(link) + soup = BeautifulSoup(data.text, 'html.parser') + time.sleep(2) + # specific table with the information + rows = soup.find_all('table', {'cellspacing': "5"}) + + for row in rows: + + # check for draw, if draw, then skip + # dictionary of won and lost + odds = row.find_all('td', {'align': "center", 'valign': "middle"}) + # to avoid taking in draws + if odds[0].text not in ['WON', 'LOST']: + continue + + # event name + h1 = soup.find("h1") + # location and date + h2 = soup.find("h2") + + events.append(h1.text) + location.append(h2.text) + + odds_f1 = float(odds[2].text.strip(" @")) + odds_f2 = float(odds[3].text.strip(" @")) + + f1_odds.append(odds_f1) + f2_odds.append(odds_f2) + + # how to generate label + odds_dict = {} + odds_dict[odds[0].text] = odds_f1 + odds_dict[odds[1].text] = odds_f2 + + if odds_dict["WON"] > odds_dict["LOST"]: + label.append("Underdog") + else: + label.append("Favourite") + + if odds_f1 > odds_f2: + favourite.append("f2") + else: + favourite.append("f1") + + + fighters = row.find_all('a', attrs={'href': re.compile("^fighter_profile.php")}) + f1.append(fighters[0].text) + f2.append(fighters[1].text) + winner.append(fighters[2].text) + return None + +def create_df(): + + # creating dataframe + df = pd.DataFrame() + df["Events"] = events + df["Location"] = location + df["Fighter1"] = f1 + df["Fighter2"] = f2 + df["Winner"] = winner + df["fighter1_odds"] = f1_odds + df["fighter2_odds"] = f2_odds + df["Favourite"] = favourite + df["Label"] = label + print(f"Successfully scraped {df.shape[0]} fights and last fight card was {df.iloc[-1, :]['Events']} {df.iloc[-1, :]['Location']}") + print(df["Label"].value_counts()/2265) + + return df + +def preprocessing(df): + # replace dong hyun kim (2) with Dong Hyun Ma + # fighter replacement corner + df = df.replace("Dong Hyun Kim (2)", "Dong Hyun Ma") + return df + +# functions to compute deltas + +def odds_delta(df): + if df["Favourite"] == "f1": + return df["fighter1_odds"] - df["fighter2_odds"] + else: + return df["fighter2_odds"] - df["fighter1_odds"] + +def reach_delta(df): + if df["Favourite"] == "f1": + return df["REACH_x"] - df["REACH_y"] + else: + return df["REACH_y"] - df["REACH_x"] + +def slpm_delta(df): + if df["Favourite"] == "f1": + return df["SLPM_x"] - df["SLPM_y"] + else: + return df["SLPM_y"] - df["SLPM_x"] + +def sapm_delta(df): + if df["Favourite"] == "f1": + return df["SAPM_x"] - df["SAPM_y"] + else: + return df["SAPM_y"] - df["SAPM_x"] + +def stra_delta(df): + if df["Favourite"] == "f1": + return df["STRA_x"] - df["STRA_y"] + else: + return df["STRA_y"] - df["STRA_x"] + +def strd_delta(df): + if df["Favourite"] == "f1": + return df["STRD_x"] - df["STRD_y"] + else: + return df["STRD_y"] - df["STRD_x"] + +def td_delta(df): + if df["Favourite"] == "f1": + return df["TD_x"] - df["TD_y"] + else: + return df["TD_y"] - df["TD_x"] + +def tda_delta(df): + if df["Favourite"] == "f1": + return df["TDA_x"] - df["TDA_y"] + else: + return df["TDA_y"] - df["TDA_x"] + +def tdd_delta(df): + if df["Favourite"] == "f1": + return df["TDD_x"] - df["TDD_y"] + else: + return df["TDD_y"] - df["TDD_x"] + +def suba_delta(df): + if df["Favourite"] == "f1": + return df["SUBA_x"] - df["SUBA_y"] + else: + return df["SUBA_y"] - df["SUBA_x"] + + +def merge_data(df): + + # We're always asking for json because it's the easiest to deal with + morph_api_url = "https://api.morph.io/jasonchanhku/ufc_fighters_db/data.json" + + # Keep this key secret! + morph_api_key = "mF/o1gYK/7iCHIu5h5Sw" + + r = requests.get(morph_api_url, params={ + 'key': morph_api_key, + 'query': "select * from data" + }) + + j = r.json() + + # fighters db dataset to me merged + fighters_db = pd.DataFrame.from_dict(j) + + test = pd.merge(df, fighters_db, left_on=["Fighter1"], right_on=["NAME"]) + test2 = pd.merge(test, fighters_db, left_on=["Fighter2"], right_on=["NAME"]) + + test2["Odds_delta"] = test2.apply(odds_delta, axis=1) + test2["REACH_delta"] = test2.apply(reach_delta, axis=1) + test2["SLPM_delta"] = test2.apply(slpm_delta, axis=1) + test2["SAPM_delta"] = test2.apply(sapm_delta, axis=1) + test2["STRA_delta"] = test2.apply(stra_delta, axis=1) + test2["STRD_delta"] = test2.apply(strd_delta, axis=1) + test2["TD_delta"] = test2.apply(td_delta, axis=1) + test2["TDA_delta"] = test2.apply(tda_delta, axis=1) + test2["TDD_delta"] = test2.apply(tdd_delta, axis=1) + test2["SUBA_delta"] = test2.apply(suba_delta, axis=1) + + final_df = test2[['Events', 'Location', 'Fighter1', 'Fighter2', 'Favourite', 'Label', 'REACH_delta', 'SLPM_delta', 'SAPM_delta', 'STRA_delta', 'STRD_delta', 'TD_delta', 'TDA_delta', 'TDD_delta', 'SUBA_delta', 'Odds_delta']] + + return final_df + +scrape_data() +df = create_df() +df = preprocessing(df) +df = merge_data(df) + +conn = sqlite3.connect('data.sqlite') +df.to_sql('data', conn, if_exists='replace') +print('Fights Merged Db successfully constructed and saved') +conn.close() diff --git a/Untitled.ipynb b/Untitled.ipynb deleted file mode 100644 index 8bba2b5..0000000 --- a/Untitled.ipynb +++ /dev/null @@ -1,611 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": 255, - "metadata": {}, - "outputs": [], - "source": [ - "import requests\n", - "from bs4 import BeautifulSoup\n", - "import pandas as pd\n", - "import numpy as np\n", - "import string\n", - "import re\n", - "import datetime\n", - "import sqlite3\n", - "import time\n", - "\n", - "all_links = []\n", - "location = []\n", - "events = []\n", - "f1 = []\n", - "f2 = []\n", - "winner = []\n", - "f1_odds = []\n", - "f2_odds = []\n", - "label = []\n", - "favourite = []" - ] - }, - { - "cell_type": "code", - "execution_count": 256, - "metadata": {}, - "outputs": [], - "source": [ - "# set up page to extract table\n", - "data = requests.get(\"https://www.betmma.tips/mma_betting_favorites_vs_underdogs.php?Org=1\")\n", - "soup = BeautifulSoup(data.text, 'html.parser')" - ] - }, - { - "cell_type": "code", - "execution_count": 257, - "metadata": {}, - "outputs": [], - "source": [ - "# table with 98% width \n", - "table = soup.find('table', {'width': \"98%\"})\n", - "# find all links in that table\n", - "links = table.find_all('a', href=True)" - ] - }, - { - "cell_type": "code", - "execution_count": 258, - "metadata": {}, - "outputs": [], - "source": [ - "# append all links to a list \n", - "for link in links:\n", - " all_links.append(\"https://www.betmma.tips/\"+link.get('href'))" - ] - }, - { - "cell_type": "code", - "execution_count": 259, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Now currently in link: https://www.betmma.tips/mma_event_betting_history.php?Event=173\n", - "Now currently in link: https://www.betmma.tips/mma_event_betting_history.php?Event=151\n", - "Now currently in link: https://www.betmma.tips/mma_event_betting_history.php?Event=150\n", - "Now currently in link: https://www.betmma.tips/mma_event_betting_history.php?Event=144\n", - "Now currently in link: https://www.betmma.tips/mma_event_betting_history.php?Event=143\n", - "Now currently in link: https://www.betmma.tips/mma_event_betting_history.php?Event=142\n", - "Now currently in link: https://www.betmma.tips/mma_event_betting_history.php?Event=114\n", - "Now currently in link: https://www.betmma.tips/mma_event_betting_history.php?Event=113\n", - "Now currently in link: https://www.betmma.tips/mma_event_betting_history.php?Event=112\n", - "Now currently in link: https://www.betmma.tips/mma_event_betting_history.php?Event=110\n", - "Now currently in link: https://www.betmma.tips/mma_event_betting_history.php?Event=109\n", - "Now currently in link: https://www.betmma.tips/mma_event_betting_history.php?Event=107\n", - "Now currently in link: https://www.betmma.tips/mma_event_betting_history.php?Event=106\n", - "Now currently in link: https://www.betmma.tips/mma_event_betting_history.php?Event=105\n", - "Now currently in link: https://www.betmma.tips/mma_event_betting_history.php?Event=97\n", - "Now currently in link: https://www.betmma.tips/mma_event_betting_history.php?Event=95\n", - "Now currently in link: https://www.betmma.tips/mma_event_betting_history.php?Event=94\n", - "Now currently in link: https://www.betmma.tips/mma_event_betting_history.php?Event=93\n", - "Now currently in link: https://www.betmma.tips/mma_event_betting_history.php?Event=92\n", - "Now currently in link: https://www.betmma.tips/mma_event_betting_history.php?Event=88\n", - "Now currently in link: https://www.betmma.tips/mma_event_betting_history.php?Event=87\n", - "Now currently in link: https://www.betmma.tips/mma_event_betting_history.php?Event=85\n", - "Now currently in link: https://www.betmma.tips/mma_event_betting_history.php?Event=81\n", - "Now currently in link: https://www.betmma.tips/mma_event_betting_history.php?Event=79\n", - "Now currently in link: https://www.betmma.tips/mma_event_betting_history.php?Event=77\n", - "Now currently in link: https://www.betmma.tips/mma_event_betting_history.php?Event=75\n", - "Now currently in link: https://www.betmma.tips/mma_event_betting_history.php?Event=73\n", - "Now currently in link: https://www.betmma.tips/mma_event_betting_history.php?Event=72\n", - "Now currently in link: https://www.betmma.tips/mma_event_betting_history.php?Event=71\n", - "Now currently in link: https://www.betmma.tips/mma_event_betting_history.php?Event=1\n", - "Now currently in link: https://www.betmma.tips/mma_event_betting_history.php?Event=2\n", - "Now currently in link: https://www.betmma.tips/mma_event_betting_history.php?Event=3\n", - "Now currently in link: https://www.betmma.tips/mma_event_betting_history.php?Event=5\n", - "Now currently in link: https://www.betmma.tips/mma_event_betting_history.php?Event=6\n", - "Now currently in link: https://www.betmma.tips/mma_event_betting_history.php?Event=7\n", - "Now currently in link: https://www.betmma.tips/mma_event_betting_history.php?Event=9\n", - "Now currently in link: https://www.betmma.tips/mma_event_betting_history.php?Event=10\n", - "Now currently in link: https://www.betmma.tips/mma_event_betting_history.php?Event=11\n", - "Now currently in link: https://www.betmma.tips/mma_event_betting_history.php?Event=12\n", - "Now currently in link: https://www.betmma.tips/mma_event_betting_history.php?Event=13\n", - "Now currently in link: https://www.betmma.tips/mma_event_betting_history.php?Event=14\n", - "Now currently in link: https://www.betmma.tips/mma_event_betting_history.php?Event=15\n", - "Now currently in link: https://www.betmma.tips/mma_event_betting_history.php?Event=16\n", - "Now currently in link: https://www.betmma.tips/mma_event_betting_history.php?Event=17\n", - "Now currently in link: https://www.betmma.tips/mma_event_betting_history.php?Event=18\n", - "Now currently in link: https://www.betmma.tips/mma_event_betting_history.php?Event=19\n", - "Now currently in link: https://www.betmma.tips/mma_event_betting_history.php?Event=20\n", - "Now currently in link: https://www.betmma.tips/mma_event_betting_history.php?Event=21\n", - "Now currently in link: https://www.betmma.tips/mma_event_betting_history.php?Event=22\n", - "Now currently in link: https://www.betmma.tips/mma_event_betting_history.php?Event=23\n", - "Now currently in link: https://www.betmma.tips/mma_event_betting_history.php?Event=24\n", - "Now currently in link: https://www.betmma.tips/mma_event_betting_history.php?Event=25\n", - "Now currently in link: https://www.betmma.tips/mma_event_betting_history.php?Event=26\n", - "Now currently in link: https://www.betmma.tips/mma_event_betting_history.php?Event=27\n", - "Now currently in link: https://www.betmma.tips/mma_event_betting_history.php?Event=28\n", - "Now currently in link: https://www.betmma.tips/mma_event_betting_history.php?Event=29\n", - "Now currently in link: https://www.betmma.tips/mma_event_betting_history.php?Event=30\n", - "Now currently in link: https://www.betmma.tips/mma_event_betting_history.php?Event=31\n", - "Now currently in link: https://www.betmma.tips/mma_event_betting_history.php?Event=32\n", - "Now currently in link: https://www.betmma.tips/mma_event_betting_history.php?Event=33\n", - "Now currently in link: https://www.betmma.tips/mma_event_betting_history.php?Event=34\n", - "Now currently in link: https://www.betmma.tips/mma_event_betting_history.php?Event=35\n", - "Now currently in link: https://www.betmma.tips/mma_event_betting_history.php?Event=36\n", - "Now currently in link: https://www.betmma.tips/mma_event_betting_history.php?Event=37\n", - "Now currently in link: https://www.betmma.tips/mma_event_betting_history.php?Event=38\n", - "Now currently in link: https://www.betmma.tips/mma_event_betting_history.php?Event=39\n", - "Now currently in link: https://www.betmma.tips/mma_event_betting_history.php?Event=42\n", - "Now currently in link: https://www.betmma.tips/mma_event_betting_history.php?Event=40\n", - "Now currently in link: https://www.betmma.tips/mma_event_betting_history.php?Event=44\n", - "Now currently in link: https://www.betmma.tips/mma_event_betting_history.php?Event=45\n", - "Now currently in link: https://www.betmma.tips/mma_event_betting_history.php?Event=46\n", - "Now currently in link: https://www.betmma.tips/mma_event_betting_history.php?Event=47\n", - "Now currently in link: https://www.betmma.tips/mma_event_betting_history.php?Event=48\n", - "Now currently in link: https://www.betmma.tips/mma_event_betting_history.php?Event=49\n", - "Now currently in link: https://www.betmma.tips/mma_event_betting_history.php?Event=55\n", - "Now currently in link: https://www.betmma.tips/mma_event_betting_history.php?Event=54\n", - "Now currently in link: https://www.betmma.tips/mma_event_betting_history.php?Event=50\n", - "Now currently in link: https://www.betmma.tips/mma_event_betting_history.php?Event=51\n", - "Now currently in link: https://www.betmma.tips/mma_event_betting_history.php?Event=52\n", - "Now currently in link: https://www.betmma.tips/mma_event_betting_history.php?Event=61\n", - "Now currently in link: https://www.betmma.tips/mma_event_betting_history.php?Event=66\n", - "Now currently in link: https://www.betmma.tips/mma_event_betting_history.php?Event=100\n", - "Now currently in link: https://www.betmma.tips/mma_event_betting_history.php?Event=101\n", - "Now currently in link: https://www.betmma.tips/mma_event_betting_history.php?Event=102\n", - "Now currently in link: https://www.betmma.tips/mma_event_betting_history.php?Event=119\n", - "Now currently in link: https://www.betmma.tips/mma_event_betting_history.php?Event=120\n", - "Now currently in link: https://www.betmma.tips/mma_event_betting_history.php?Event=124\n", - "Now currently in link: https://www.betmma.tips/mma_event_betting_history.php?Event=123\n", - "Now currently in link: https://www.betmma.tips/mma_event_betting_history.php?Event=126\n", - "Now currently in link: https://www.betmma.tips/mma_event_betting_history.php?Event=165\n", - "Now currently in link: https://www.betmma.tips/mma_event_betting_history.php?Event=121\n", - "Now currently in link: https://www.betmma.tips/mma_event_betting_history.php?Event=170\n", - "Now currently in link: https://www.betmma.tips/mma_event_betting_history.php?Event=166\n", - "Now currently in link: https://www.betmma.tips/mma_event_betting_history.php?Event=167\n", - "Now currently in link: https://www.betmma.tips/mma_event_betting_history.php?Event=176\n", - "Now currently in link: https://www.betmma.tips/mma_event_betting_history.php?Event=125\n", - "Now currently in link: https://www.betmma.tips/mma_event_betting_history.php?Event=171\n", - "Now currently in link: https://www.betmma.tips/mma_event_betting_history.php?Event=180\n", - "Now currently in link: https://www.betmma.tips/mma_event_betting_history.php?Event=177\n", - "Now currently in link: https://www.betmma.tips/mma_event_betting_history.php?Event=181\n", - "Now currently in link: https://www.betmma.tips/mma_event_betting_history.php?Event=190\n", - "Now currently in link: https://www.betmma.tips/mma_event_betting_history.php?Event=184\n", - "Now currently in link: https://www.betmma.tips/mma_event_betting_history.php?Event=191\n", - "Now currently in link: https://www.betmma.tips/mma_event_betting_history.php?Event=186\n", - "Now currently in link: https://www.betmma.tips/mma_event_betting_history.php?Event=185\n", - "Now currently in link: https://www.betmma.tips/mma_event_betting_history.php?Event=194\n", - "Now currently in link: https://www.betmma.tips/mma_event_betting_history.php?Event=192\n", - "Now currently in link: https://www.betmma.tips/mma_event_betting_history.php?Event=196\n", - "Now currently in link: https://www.betmma.tips/mma_event_betting_history.php?Event=197\n", - "Now currently in link: https://www.betmma.tips/mma_event_betting_history.php?Event=199\n", - "Now currently in link: https://www.betmma.tips/mma_event_betting_history.php?Event=203\n", - "Now currently in link: https://www.betmma.tips/mma_event_betting_history.php?Event=204\n", - "Now currently in link: https://www.betmma.tips/mma_event_betting_history.php?Event=212\n", - "Now currently in link: https://www.betmma.tips/mma_event_betting_history.php?Event=205\n", - "Now currently in link: https://www.betmma.tips/mma_event_betting_history.php?Event=206\n", - "Now currently in link: https://www.betmma.tips/mma_event_betting_history.php?Event=209\n", - "Now currently in link: https://www.betmma.tips/mma_event_betting_history.php?Event=207\n", - "Now currently in link: https://www.betmma.tips/mma_event_betting_history.php?Event=210\n", - "Now currently in link: https://www.betmma.tips/mma_event_betting_history.php?Event=208\n", - "Now currently in link: https://www.betmma.tips/mma_event_betting_history.php?Event=217\n", - "Now currently in link: https://www.betmma.tips/mma_event_betting_history.php?Event=220\n", - "Now currently in link: https://www.betmma.tips/mma_event_betting_history.php?Event=221\n", - "Now currently in link: https://www.betmma.tips/mma_event_betting_history.php?Event=219\n", - "Now currently in link: https://www.betmma.tips/mma_event_betting_history.php?Event=226\n", - "Now currently in link: https://www.betmma.tips/mma_event_betting_history.php?Event=223\n", - "Now currently in link: https://www.betmma.tips/mma_event_betting_history.php?Event=224\n", - "Now currently in link: https://www.betmma.tips/mma_event_betting_history.php?Event=225\n", - "Now currently in link: https://www.betmma.tips/mma_event_betting_history.php?Event=231\n", - "Now currently in link: https://www.betmma.tips/mma_event_betting_history.php?Event=235\n", - "Now currently in link: https://www.betmma.tips/mma_event_betting_history.php?Event=234\n", - "Now currently in link: https://www.betmma.tips/mma_event_betting_history.php?Event=233\n", - "Now currently in link: https://www.betmma.tips/mma_event_betting_history.php?Event=237\n", - "Now currently in link: https://www.betmma.tips/mma_event_betting_history.php?Event=236\n", - "Now currently in link: https://www.betmma.tips/mma_event_betting_history.php?Event=241\n", - "Now currently in link: https://www.betmma.tips/mma_event_betting_history.php?Event=243\n", - "Now currently in link: https://www.betmma.tips/mma_event_betting_history.php?Event=240\n", - "Now currently in link: https://www.betmma.tips/mma_event_betting_history.php?Event=251\n", - "Now currently in link: https://www.betmma.tips/mma_event_betting_history.php?Event=246\n", - "Now currently in link: https://www.betmma.tips/mma_event_betting_history.php?Event=250\n", - "Now currently in link: https://www.betmma.tips/mma_event_betting_history.php?Event=252\n", - "Now currently in link: https://www.betmma.tips/mma_event_betting_history.php?Event=263\n", - "Now currently in link: https://www.betmma.tips/mma_event_betting_history.php?Event=264\n", - "Now currently in link: https://www.betmma.tips/mma_event_betting_history.php?Event=273\n", - "Now currently in link: https://www.betmma.tips/mma_event_betting_history.php?Event=274\n", - "Now currently in link: https://www.betmma.tips/mma_event_betting_history.php?Event=275\n", - "Now currently in link: https://www.betmma.tips/mma_event_betting_history.php?Event=280\n", - "Now currently in link: https://www.betmma.tips/mma_event_betting_history.php?Event=279\n", - "Now currently in link: https://www.betmma.tips/mma_event_betting_history.php?Event=285\n", - "Now currently in link: https://www.betmma.tips/mma_event_betting_history.php?Event=286\n", - "Now currently in link: https://www.betmma.tips/mma_event_betting_history.php?Event=295\n", - "Now currently in link: https://www.betmma.tips/mma_event_betting_history.php?Event=303\n", - "Now currently in link: https://www.betmma.tips/mma_event_betting_history.php?Event=302\n", - "Now currently in link: https://www.betmma.tips/mma_event_betting_history.php?Event=299\n", - "Now currently in link: https://www.betmma.tips/mma_event_betting_history.php?Event=294\n", - "Now currently in link: https://www.betmma.tips/mma_event_betting_history.php?Event=304\n", - "Now currently in link: https://www.betmma.tips/mma_event_betting_history.php?Event=305\n", - "Now currently in link: https://www.betmma.tips/mma_event_betting_history.php?Event=306\n", - "Now currently in link: https://www.betmma.tips/mma_event_betting_history.php?Event=312\n", - "Now currently in link: https://www.betmma.tips/mma_event_betting_history.php?Event=311\n", - "Now currently in link: https://www.betmma.tips/mma_event_betting_history.php?Event=320\n", - "Now currently in link: https://www.betmma.tips/mma_event_betting_history.php?Event=319\n", - "Now currently in link: https://www.betmma.tips/mma_event_betting_history.php?Event=321\n", - "Now currently in link: https://www.betmma.tips/mma_event_betting_history.php?Event=322\n", - "Now currently in link: https://www.betmma.tips/mma_event_betting_history.php?Event=323\n", - "Now currently in link: https://www.betmma.tips/mma_event_betting_history.php?Event=326\n", - "Now currently in link: https://www.betmma.tips/mma_event_betting_history.php?Event=328\n", - "Now currently in link: https://www.betmma.tips/mma_event_betting_history.php?Event=332\n", - "Now currently in link: https://www.betmma.tips/mma_event_betting_history.php?Event=343\n", - "Now currently in link: https://www.betmma.tips/mma_event_betting_history.php?Event=337\n", - "Now currently in link: https://www.betmma.tips/mma_event_betting_history.php?Event=329\n", - "Now currently in link: https://www.betmma.tips/mma_event_betting_history.php?Event=360\n", - "Now currently in link: https://www.betmma.tips/mma_event_betting_history.php?Event=361\n", - "Now currently in link: https://www.betmma.tips/mma_event_betting_history.php?Event=327\n", - "Now currently in link: https://www.betmma.tips/mma_event_betting_history.php?Event=359\n", - "Now currently in link: https://www.betmma.tips/mma_event_betting_history.php?Event=352\n", - "Now currently in link: https://www.betmma.tips/mma_event_betting_history.php?Event=358\n", - "Now currently in link: https://www.betmma.tips/mma_event_betting_history.php?Event=344\n", - "Now currently in link: https://www.betmma.tips/mma_event_betting_history.php?Event=381\n", - "Now currently in link: https://www.betmma.tips/mma_event_betting_history.php?Event=380\n", - "Now currently in link: https://www.betmma.tips/mma_event_betting_history.php?Event=382\n", - "Now currently in link: https://www.betmma.tips/mma_event_betting_history.php?Event=383\n", - "Now currently in link: https://www.betmma.tips/mma_event_betting_history.php?Event=391\n", - "Now currently in link: https://www.betmma.tips/mma_event_betting_history.php?Event=393\n", - "Now currently in link: https://www.betmma.tips/mma_event_betting_history.php?Event=392\n", - "Now currently in link: https://www.betmma.tips/mma_event_betting_history.php?Event=394\n", - "Now currently in link: https://www.betmma.tips/mma_event_betting_history.php?Event=408\n", - "Now currently in link: https://www.betmma.tips/mma_event_betting_history.php?Event=409\n", - "Now currently in link: https://www.betmma.tips/mma_event_betting_history.php?Event=403\n", - "Now currently in link: https://www.betmma.tips/mma_event_betting_history.php?Event=407\n", - "Now currently in link: https://www.betmma.tips/mma_event_betting_history.php?Event=406\n", - "Now currently in link: https://www.betmma.tips/mma_event_betting_history.php?Event=423\n", - "Now currently in link: https://www.betmma.tips/mma_event_betting_history.php?Event=399\n", - "Now currently in link: https://www.betmma.tips/mma_event_betting_history.php?Event=433\n", - "Now currently in link: https://www.betmma.tips/mma_event_betting_history.php?Event=434\n", - "Now currently in link: https://www.betmma.tips/mma_event_betting_history.php?Event=435\n" - ] - } - ], - "source": [ - "# test for one use case\n", - "for link in all_links:\n", - " print(f\"Now currently scraping link: {link}\")\n", - " \n", - " data = requests.get(link)\n", - " soup = BeautifulSoup(data.text, 'html.parser')\n", - " \n", - " # specific table with the information\n", - " rows = soup.find_all('table', {'cellspacing': \"5\"})\n", - " \n", - " for row in rows:\n", - " \n", - " # check for draw, if draw, then skip\n", - " # dictionary of won and lost\n", - " odds = row.find_all('td', {'align': \"center\", 'valign': \"middle\"})\n", - " if odds[0].text not in ['WON', 'LOST']:\n", - " continue\n", - " \n", - " # event name\n", - " h1 = soup.find(\"h1\")\n", - " # location and date\n", - " h2 = soup.find(\"h2\")\n", - "\n", - " events.append(h1.text)\n", - " location.append(h2.text)\n", - " \n", - " odds_f1 = float(odds[2].text.strip(\" @\"))\n", - " odds_f2 = float(odds[3].text.strip(\" @\"))\n", - " \n", - " f1_odds.append(odds_f1)\n", - " f2_odds.append(odds_f2)\n", - " \n", - " # how to generate label\n", - " odds_dict = {}\n", - " odds_dict[odds[0].text] = odds_f1\n", - " odds_dict[odds[1].text] = odds_f2 \n", - " \n", - " if odds_dict[\"WON\"] > odds_dict[\"LOST\"]:\n", - " label.append(\"Underdog\")\n", - " else:\n", - " label.append(\"Favourite\")\n", - " \n", - " if odds_f1 > odds_f2:\n", - " favourite.append(\"f2\")\n", - " else:\n", - " favourite.append(\"f1\")\n", - " \n", - " \n", - " fighters = row.find_all('a', attrs={'href': re.compile(\"^fighter_profile.php\")})\n", - " f1.append(fighters[0].text)\n", - " f2.append(fighters[1].text)\n", - " winner.append(fighters[2].text)\n", - " \n" - ] - }, - { - "cell_type": "code", - "execution_count": 252, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Events list: ['UFC 159 - Jones vs. Sonnen', 'UFC 159 - Jones vs. Sonnen', 'UFC 159 - Jones vs. Sonnen', 'UFC 159 - Jones vs. Sonnen', 'UFC 159 - Jones vs. Sonnen', 'UFC 159 - Jones vs. Sonnen', 'UFC 159 - Jones vs. Sonnen', 'UFC 159 - Jones vs. Sonnen', 'UFC 159 - Jones vs. Sonnen', 'UFC 159 - Jones vs. Sonnen', 'UFC 159 - Jones vs. Sonnen', 'UFC on FX 8 - Belfort vs. Rockhold', 'UFC on FX 8 - Belfort vs. Rockhold', 'UFC on FX 8 - Belfort vs. Rockhold', 'UFC on FX 8 - Belfort vs. Rockhold', 'UFC on FX 8 - Belfort vs. Rockhold', 'UFC on FX 8 - Belfort vs. Rockhold', 'UFC on FX 8 - Belfort vs. Rockhold', 'UFC on FX 8 - Belfort vs. Rockhold', 'UFC on FX 8 - Belfort vs. Rockhold', 'UFC on FX 8 - Belfort vs. Rockhold', 'UFC on FX 8 - Belfort vs. Rockhold', 'UFC on FX 8 - Belfort vs. Rockhold', 'UFC on FX 8 - Belfort vs. Rockhold', 'UFC 160 - Velasquez vs. Bigfoot 2', 'UFC 160 - Velasquez vs. Bigfoot 2', 'UFC 160 - Velasquez vs. Bigfoot 2', 'UFC 160 - Velasquez vs. Bigfoot 2', 'UFC 160 - Velasquez vs. Bigfoot 2', 'UFC 160 - Velasquez vs. Bigfoot 2', 'UFC 160 - Velasquez vs. Bigfoot 2', 'UFC 160 - Velasquez vs. Bigfoot 2', 'UFC 160 - Velasquez vs. Bigfoot 2', 'UFC 160 - Velasquez vs. Bigfoot 2', 'UFC 160 - Velasquez vs. Bigfoot 2', 'UFC 160 - Velasquez vs. Bigfoot 2']\n", - "Location list: ['Newark, New Jersey; 27th Apr 2013', 'Newark, New Jersey; 27th Apr 2013', 'Newark, New Jersey; 27th Apr 2013', 'Newark, New Jersey; 27th Apr 2013', 'Newark, New Jersey; 27th Apr 2013', 'Newark, New Jersey; 27th Apr 2013', 'Newark, New Jersey; 27th Apr 2013', 'Newark, New Jersey; 27th Apr 2013', 'Newark, New Jersey; 27th Apr 2013', 'Newark, New Jersey; 27th Apr 2013', 'Newark, New Jersey; 27th Apr 2013', 'Santa Catarina, Brazil; 18th May 2013', 'Santa Catarina, Brazil; 18th May 2013', 'Santa Catarina, Brazil; 18th May 2013', 'Santa Catarina, Brazil; 18th May 2013', 'Santa Catarina, Brazil; 18th May 2013', 'Santa Catarina, Brazil; 18th May 2013', 'Santa Catarina, Brazil; 18th May 2013', 'Santa Catarina, Brazil; 18th May 2013', 'Santa Catarina, Brazil; 18th May 2013', 'Santa Catarina, Brazil; 18th May 2013', 'Santa Catarina, Brazil; 18th May 2013', 'Santa Catarina, Brazil; 18th May 2013', 'Santa Catarina, Brazil; 18th May 2013', 'Las Vegas, Nevada; 25th May 2013', 'Las Vegas, Nevada; 25th May 2013', 'Las Vegas, Nevada; 25th May 2013', 'Las Vegas, Nevada; 25th May 2013', 'Las Vegas, Nevada; 25th May 2013', 'Las Vegas, Nevada; 25th May 2013', 'Las Vegas, Nevada; 25th May 2013', 'Las Vegas, Nevada; 25th May 2013', 'Las Vegas, Nevada; 25th May 2013', 'Las Vegas, Nevada; 25th May 2013', 'Las Vegas, Nevada; 25th May 2013', 'Las Vegas, Nevada; 25th May 2013']\n", - "f1 list: ['Jon Jones', 'Michael Bisping', 'Roy Nelson', 'Phil Davis', 'Pat Healy', 'Rustam Khabilov', 'Ovince St Preux', 'Sara McMann', 'Bryan Caraway', 'Cody McKenzie', 'Steven Siler', 'Vitor Belfort', 'Ronaldo Souza', 'Rafael dos Anjos', 'Rafael Natal', 'Nik Lentz', 'Francisco Trinaldo', 'Gleison Tibau', 'Paulo Thiago', 'Yuri Alcantara', 'Fabio Maldonado', 'John Lineker', 'Jussier Formiga', 'Lucas Martins', 'Cain Velasquez', 'Junior dos Santos', 'Glover Teixeira', 'T.J. Grant', 'Donald Cerrone', 'Mike Pyle', 'Dennis Bermudez', 'Robert Whittaker', 'Khabib Nurmagomedov', 'Stephen Thompson', 'George Roop', 'Jeremy Stephens']\n", - "f1 odds list: [1.13, 1.57, 1.43, 1.36, 3.4, 1.54, 1.69, 1.18, 2.15, 2.93, 1.95, 1.95, 1.17, 1.5, 1.36, 1.99, 1.33, 1.42, 1.57, 1.2, 1.38, 2.1, 1.65, 1.33, 1.16, 1.25, 1.33, 2.65, 1.37, 2.5, 1.4, 2.75, 1.4, 1.63, 3.66, 1.6]\n", - "f2 list: ['Chael Sonnen', 'Alan Belcher', 'Cheick Kongo', 'Vinny Magalhaes', 'Jim Miller', 'Yancy Medeiros', 'Gian Villante', 'Sheila Gaff', 'Johnny Bedford', 'Leonard Garcia', 'Kurt Holobaugh', 'Luke Rockhold', 'Chris Camozzi', 'Evan Dunham', 'Joao Zeferino', 'Hacran Dias', 'Mike Rio', 'John Cholish', 'Michel Prazeres', 'Iliarde Santos', 'Roger Hollett', 'Azamat Gashimov', 'Chris Cariaso', 'Jeremy Larsen', 'Antonio Silva', 'Mark Hunt', 'James Te Huna', 'Gray Maynard', 'K.J. Noons', 'Rick Story', 'Max Holloway', 'Colton Smith', 'Abel Trujillo', 'Nah-Shon Burrell', 'Brian Bowles', 'Estevan Payan']\n", - "f2 odds list: [9.0, 4.5, 3.2, 3.55, 1.4, 3.4, 2.4, 5.8, 1.91, 1.53, 2.06, 2.04, 6.0, 2.91, 3.7, 2.1, 3.9, 3.35, 2.76, 5.5, 3.45, 1.87, 2.55, 3.85, 6.75, 5.25, 4.75, 1.74, 3.48, 1.67, 3.65, 1.71, 3.75, 2.65, 1.36, 2.9]\n", - "winners list: ['Jon Jones', 'Michael Bisping', 'Roy Nelson', 'Phil Davis', 'Pat Healy', 'Rustam Khabilov', 'Ovince St Preux', 'Sara McMann', 'Bryan Caraway', 'Cody McKenzie', 'Steven Siler', 'Vitor Belfort', 'Ronaldo Souza', 'Rafael dos Anjos', 'Rafael Natal', 'Nik Lentz', 'Francisco Trinaldo', 'Gleison Tibau', 'Paulo Thiago', 'Yuri Alcantara', 'Fabio Maldonado', 'John Lineker', 'Jussier Formiga', 'Lucas Martins', 'Cain Velasquez', 'Junior dos Santos', 'Glover Teixeira', 'T.J. Grant', 'Donald Cerrone', 'Mike Pyle', 'Dennis Bermudez', 'Robert Whittaker', 'Khabib Nurmagomedov', 'Stephen Thompson', 'George Roop', 'Jeremy Stephens']\n", - "labels list: ['Favourite', 'Favourite', 'Favourite', 'Favourite', 'Underdog', 'Favourite', 'Favourite', 'Favourite', 'Underdog', 'Underdog', 'Favourite', 'Favourite', 'Favourite', 'Favourite', 'Favourite', 'Favourite', 'Favourite', 'Favourite', 'Favourite', 'Favourite', 'Favourite', 'Underdog', 'Favourite', 'Favourite', 'Favourite', 'Favourite', 'Favourite', 'Underdog', 'Favourite', 'Underdog', 'Favourite', 'Underdog', 'Favourite', 'Favourite', 'Underdog', 'Favourite']\n", - "favourite is: ['f1', 'f1', 'f1', 'f1', 'f2', 'f1', 'f1', 'f1', 'f2', 'f2', 'f1', 'f1', 'f1', 'f1', 'f1', 'f1', 'f1', 'f1', 'f1', 'f1', 'f1', 'f2', 'f1', 'f1', 'f1', 'f1', 'f1', 'f2', 'f1', 'f2', 'f1', 'f2', 'f1', 'f1', 'f2', 'f1']\n" - ] - } - ], - "source": [ - "print(f\"Events list: {events}\")\n", - "print(f\"Location list: {location}\")\n", - "print(f\"f1 list: {f1}\")\n", - "print(f\"f1 odds list: {f1_odds}\")\n", - "print(f\"f2 list: {f2}\")\n", - "print(f\"f2 odds list: {f2_odds}\")\n", - "print(f\"winners list: {winner}\")\n", - "print(f\"labels list: {label}\")\n", - "print(f\"favourite is: {favourite}\")\n" - ] - }, - { - "cell_type": "code", - "execution_count": 260, - "metadata": {}, - "outputs": [], - "source": [ - "# creating dataframe\n", - "df = pd.DataFrame()\n", - "df[\"Events\"] = events\n", - "df[\"Location\"] = location\n", - "df[\"Fighter1\"] = f1\n", - "df[\"Fighter2\"] = f2\n", - "df[\"Winner\"] = winner\n", - "df[\"fighter1_odds\"] = f1_odds\n", - "df[\"fighter2_odds\"] = f2_odds\n", - "df[\"Favourite\"] = favourite\n", - "df[\"Label\"] = label" - ] - }, - { - "cell_type": "code", - "execution_count": 277, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Successfully scraped 2265 fights and last fight card was UFC 234 Melbourne; 9th Feb 2019\n" - ] - } - ], - "source": [ - "print(f\"Successfully scraped {df.shape[0]} fights and last fight card was {df.iloc[-1, :]['Events']} {df.iloc[-1, :]['Location']}\")" - ] - }, - { - "cell_type": "code", - "execution_count": 282, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "Favourite 0.654746\n", - "Underdog 0.345254\n", - "Name: Label, dtype: float64" - ] - }, - "execution_count": 282, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "df[\"Label\"].value_counts()/2265" - ] - }, - { - "cell_type": "code", - "execution_count": 286, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
EventsLocationFighter1Fighter2Winnerfighter1_oddsfighter2_oddsFavouriteLabel
0UFC 159 - Jones vs. SonnenNewark, New Jersey; 27th Apr 2013Jon JonesChael SonnenJon Jones1.139.00f1Favourite
1UFC 159 - Jones vs. SonnenNewark, New Jersey; 27th Apr 2013Michael BispingAlan BelcherMichael Bisping1.574.50f1Favourite
2UFC 159 - Jones vs. SonnenNewark, New Jersey; 27th Apr 2013Roy NelsonCheick KongoRoy Nelson1.433.20f1Favourite
3UFC 159 - Jones vs. SonnenNewark, New Jersey; 27th Apr 2013Phil DavisVinny MagalhaesPhil Davis1.363.55f1Favourite
4UFC 159 - Jones vs. SonnenNewark, New Jersey; 27th Apr 2013Pat HealyJim MillerPat Healy3.401.40f2Underdog
\n", - "
" - ], - "text/plain": [ - " Events Location \\\n", - "0 UFC 159 - Jones vs. Sonnen Newark, New Jersey; 27th Apr 2013 \n", - "1 UFC 159 - Jones vs. Sonnen Newark, New Jersey; 27th Apr 2013 \n", - "2 UFC 159 - Jones vs. Sonnen Newark, New Jersey; 27th Apr 2013 \n", - "3 UFC 159 - Jones vs. Sonnen Newark, New Jersey; 27th Apr 2013 \n", - "4 UFC 159 - Jones vs. Sonnen Newark, New Jersey; 27th Apr 2013 \n", - "\n", - " Fighter1 Fighter2 Winner fighter1_odds \\\n", - "0 Jon Jones Chael Sonnen Jon Jones 1.13 \n", - "1 Michael Bisping Alan Belcher Michael Bisping 1.57 \n", - "2 Roy Nelson Cheick Kongo Roy Nelson 1.43 \n", - "3 Phil Davis Vinny Magalhaes Phil Davis 1.36 \n", - "4 Pat Healy Jim Miller Pat Healy 3.40 \n", - "\n", - " fighter2_odds Favourite Label \n", - "0 9.00 f1 Favourite \n", - "1 4.50 f1 Favourite \n", - "2 3.20 f1 Favourite \n", - "3 3.55 f1 Favourite \n", - "4 1.40 f2 Underdog " - ] - }, - "execution_count": 286, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "df.head()" - ] - }, - { - "cell_type": "code", - "execution_count": 288, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "'\\nTODO\\n\\n- merge df above with df with fighters information\\n- fuzzy match instead of exact match due to different sources\\n- compute the deltas\\n- save it in sqlite and push it to app\\n- separate ML from main app\\n\\n'" - ] - }, - "execution_count": 288, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "\"\"\"\n", - "TODO\n", - "\n", - "- merge df above with df with fighters information\n", - "- fuzzy match instead of exact match due to different sources\n", - "- compute the deltas\n", - "- save it in sqlite and push it to app\n", - "- separate ML from main app\n", - "\n", - "\"\"\"" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.6.5" - } - }, - "nbformat": 4, - "nbformat_minor": 2 -} diff --git a/requirements.txt b/requirements.txt index fce25cc..06541cf 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,9 +1,7 @@ # It's easy to add more libraries or choose different versions. Any libraries # specified here will be installed and made available to your morph.io scraper. # Find out more: https://morph.io/documentation/python - -# Custom version of scraperwiki library --e git+http://github.com/openaustralia/scraperwiki-python.git@morph_defaults#egg=scraperwiki - -lxml==3.4.4 -cssselect==0.9.1 +beautifulsoup4==4.6.0 +pandas==0.23.0 +numpy==1.14.3 +requests==2.18.4 \ No newline at end of file diff --git a/runtime.txt b/runtime.txt index c47075b..cfa5aa5 100644 --- a/runtime.txt +++ b/runtime.txt @@ -1 +1 @@ -python-2.7.9 +python-3.6.2 diff --git a/scraper.py b/scraper.py index 69bea68..df1564f 100644 --- a/scraper.py +++ b/scraper.py @@ -1,24 +1,222 @@ -# This is a template for a Python scraper on morph.io (https://morph.io) -# including some code snippets below that you should find helpful - -# import scraperwiki -# import lxml.html -# -# # Read in a page -# html = scraperwiki.scrape("http://foo.com") -# -# # Find something on the page using css selectors -# root = lxml.html.fromstring(html) -# root.cssselect("div[align='left']") -# -# # Write out to the sqlite database using scraperwiki library -# scraperwiki.sqlite.save(unique_keys=['name'], data={"name": "susan", "occupation": "software developer"}) -# -# # An arbitrary query against the database -# scraperwiki.sql.select("* from data where 'name'='peter'") - -# You don't have to do things with the ScraperWiki and lxml libraries. -# You can use whatever libraries you want: https://morph.io/documentation/python -# All that matters is that your final data is written to an SQLite database -# called "data.sqlite" in the current working directory which has at least a table -# called "data". +import requests +from bs4 import BeautifulSoup +import pandas as pd +import numpy as np +import string +import re +import datetime +import sqlite3 +import time + +all_links = [] +location = [] +events = [] +f1 = [] +f2 = [] +winner = [] +f1_odds = [] +f2_odds = [] +label = [] +favourite = [] + +def scrape_data(): + # set up page to extract table + data = requests.get("https://www.betmma.tips/mma_betting_favorites_vs_underdogs.php?Org=1") + soup = BeautifulSoup(data.text, 'html.parser') + + # table with 98% width + table = soup.find('table', {'width': "98%"}) + # find all links in that table + links = table.find_all('a', href=True) + + # append all links to a list + for link in links: + all_links.append("https://www.betmma.tips/"+link.get('href')) + + # test for one use case + for link in all_links: + print(f"Now currently scraping link: {link}") + + data = requests.get(link) + soup = BeautifulSoup(data.text, 'html.parser') + time.sleep(2) + # specific table with the information + rows = soup.find_all('table', {'cellspacing': "5"}) + + for row in rows: + + # check for draw, if draw, then skip + # dictionary of won and lost + odds = row.find_all('td', {'align': "center", 'valign': "middle"}) + # to avoid taking in draws + if odds[0].text not in ['WON', 'LOST']: + continue + + # event name + h1 = soup.find("h1") + # location and date + h2 = soup.find("h2") + + events.append(h1.text) + location.append(h2.text) + + odds_f1 = float(odds[2].text.strip(" @")) + odds_f2 = float(odds[3].text.strip(" @")) + + f1_odds.append(odds_f1) + f2_odds.append(odds_f2) + + # how to generate label + odds_dict = {} + odds_dict[odds[0].text] = odds_f1 + odds_dict[odds[1].text] = odds_f2 + + if odds_dict["WON"] > odds_dict["LOST"]: + label.append("Underdog") + else: + label.append("Favourite") + + if odds_f1 > odds_f2: + favourite.append("f2") + else: + favourite.append("f1") + + + fighters = row.find_all('a', attrs={'href': re.compile("^fighter_profile.php")}) + f1.append(fighters[0].text) + f2.append(fighters[1].text) + winner.append(fighters[2].text) + return None + +def create_df(): + + # creating dataframe + df = pd.DataFrame() + df["Events"] = events + df["Location"] = location + df["Fighter1"] = f1 + df["Fighter2"] = f2 + df["Winner"] = winner + df["fighter1_odds"] = f1_odds + df["fighter2_odds"] = f2_odds + df["Favourite"] = favourite + df["Label"] = label + print(f"Successfully scraped {df.shape[0]} fights and last fight card was {df.iloc[-1, :]['Events']} {df.iloc[-1, :]['Location']}") + print(df["Label"].value_counts()/2265) + + return df + +def preprocessing(df): + # replace dong hyun kim (2) with Dong Hyun Ma + # fighter replacement corner + df = df.replace("Dong Hyun Kim (2)", "Dong Hyun Ma") + return df + +# functions to compute deltas + +def odds_delta(df): + if df["Favourite"] == "f1": + return df["fighter1_odds"] - df["fighter2_odds"] + else: + return df["fighter2_odds"] - df["fighter1_odds"] + +def reach_delta(df): + if df["Favourite"] == "f1": + return df["REACH_x"] - df["REACH_y"] + else: + return df["REACH_y"] - df["REACH_x"] + +def slpm_delta(df): + if df["Favourite"] == "f1": + return df["SLPM_x"] - df["SLPM_y"] + else: + return df["SLPM_y"] - df["SLPM_x"] + +def sapm_delta(df): + if df["Favourite"] == "f1": + return df["SAPM_x"] - df["SAPM_y"] + else: + return df["SAPM_y"] - df["SAPM_x"] + +def stra_delta(df): + if df["Favourite"] == "f1": + return df["STRA_x"] - df["STRA_y"] + else: + return df["STRA_y"] - df["STRA_x"] + +def strd_delta(df): + if df["Favourite"] == "f1": + return df["STRD_x"] - df["STRD_y"] + else: + return df["STRD_y"] - df["STRD_x"] + +def td_delta(df): + if df["Favourite"] == "f1": + return df["TD_x"] - df["TD_y"] + else: + return df["TD_y"] - df["TD_x"] + +def tda_delta(df): + if df["Favourite"] == "f1": + return df["TDA_x"] - df["TDA_y"] + else: + return df["TDA_y"] - df["TDA_x"] + +def tdd_delta(df): + if df["Favourite"] == "f1": + return df["TDD_x"] - df["TDD_y"] + else: + return df["TDD_y"] - df["TDD_x"] + +def suba_delta(df): + if df["Favourite"] == "f1": + return df["SUBA_x"] - df["SUBA_y"] + else: + return df["SUBA_y"] - df["SUBA_x"] + + +def merge_data(df): + + # We're always asking for json because it's the easiest to deal with + morph_api_url = "https://api.morph.io/jasonchanhku/ufc_fighters_db/data.json" + + # Keep this key secret! + morph_api_key = "mF/o1gYK/7iCHIu5h5Sw" + + r = requests.get(morph_api_url, params={ + 'key': morph_api_key, + 'query': "select * from data" + }) + + j = r.json() + + # fighters db dataset to me merged + fighters_db = pd.DataFrame.from_dict(j) + + test = pd.merge(df, fighters_db, left_on=["Fighter1"], right_on=["NAME"]) + test2 = pd.merge(test, fighters_db, left_on=["Fighter2"], right_on=["NAME"]) + + test2["Odds_delta"] = test2.apply(odds_delta, axis=1) + test2["REACH_delta"] = test2.apply(reach_delta, axis=1) + test2["SLPM_delta"] = test2.apply(slpm_delta, axis=1) + test2["SAPM_delta"] = test2.apply(sapm_delta, axis=1) + test2["STRA_delta"] = test2.apply(stra_delta, axis=1) + test2["STRD_delta"] = test2.apply(strd_delta, axis=1) + test2["TD_delta"] = test2.apply(td_delta, axis=1) + test2["TDA_delta"] = test2.apply(tda_delta, axis=1) + test2["TDD_delta"] = test2.apply(tdd_delta, axis=1) + test2["SUBA_delta"] = test2.apply(suba_delta, axis=1) + + final_df = test2[['Events', 'Location', 'Fighter1', 'Fighter2', 'Favourite', 'Label', 'REACH_delta', 'SLPM_delta', 'SAPM_delta', 'STRA_delta', 'STRD_delta', 'TD_delta', 'TDA_delta', 'TDD_delta', 'SUBA_delta', 'Odds_delta']] + + return final_df + +scrape_data() +df = create_df() +df = preprocessing(df) +df = merge_data(df) + +conn = sqlite3.connect('data.sqlite') +df.to_sql('data', conn, if_exists='replace') +print('Fights Merged Db successfully constructed and saved') +conn.close()