<a href="https://colab.research.google.com/github/halaaab/IT312/blob/main/DataScience.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Import required libraries
import requests
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
import json
import time
from datetime import datetime
import warnings
warnings.filterwarnings('ignore')

print("Libraries imported successfully!")
print(f"Data collection started: {datetime.now()}")

Libraries imported successfully!
Data collection started: 2026-01-31 12:43:44.751047


In [2]:
# API Configuration
API_KEY = '928d65f77396adac26c83dd83294c39e'  # Replace with your actual API key from api-football.com
BASE_URL = 'https://v3.football.api-sports.io'

headers = {
    'x-rapidapi-host': 'v3.football.api-sports.io',
    'x-rapidapi-key': API_KEY
}

# Define leagues to collect data from (League IDs for API-Football)
# 39 = Premier League, 140 = La Liga, 135 = Serie A, 78 = Bundesliga, 61 = Ligue 1
LEAGUE_IDS = {
    39: 'Premier League',
    140: 'La Liga',
    135: 'Serie A',
    78: 'Bundesliga',
    61: 'Ligue 1'
}

print("API configuration complete!")
print(f"Leagues to collect: {list(LEAGUE_IDS.values())}")

API configuration complete!
Leagues to collect: ['Premier League', 'La Liga', 'Serie A', 'Bundesliga', 'Ligue 1']


In [3]:
def get_league_matches(league_id, season='2024'):
    """
    Fetch match data for a specific league and season from API-Football
    """
    url = f'{BASE_URL}/fixtures'
    params = {
        'league': league_id,
        'season': season
    }

    try:
        response = requests.get(url, headers=headers, params=params)
        response.raise_for_status()
        data = response.json()

        # API-Football returns data in 'response' field
        matches_count = len(data.get('response', []))
        print(f"✓ Successfully fetched {matches_count} matches from {LEAGUE_IDS[league_id]}")

        # Check API quota
        if 'requests' in data:
            print(f"  API usage: {data['requests']['current']}/{data['requests']['limit_day']} requests today")

        return data
    except requests.exceptions.RequestException as e:
        print(f"✗ Error fetching data for league {league_id}: {e}")
        return None

def get_team_details(team_id):
    """
    Fetch detailed information about a specific team
    """
    url = f'{BASE_URL}/teams'
    params = {'id': team_id}

    try:
        response = requests.get(url, headers=headers, params=params)
        response.raise_for_status()
        return response.json()
    except requests.exceptions.RequestException as e:
        print(f"✗ Error fetching team {team_id}: {e}")
        return None

print("Collection functions defined!")

Collection functions defined!


In [4]:
# Collect match data from all leagues
all_matches_data = {}

for league_id in LEAGUE_IDS.keys():
    print(f"\nFetching data for {LEAGUE_IDS[league_id]}...")
    matches_data = get_league_matches(league_id, season='2024')

    if matches_data and matches_data.get('response'):
        all_matches_data[league_id] = matches_data

    # Respect API rate limit - small delay between requests
    time.sleep(2)

print(f"\n{'='*60}")
print(f"Data collection complete!")
print(f"Total leagues collected: {len(all_matches_data)}")
print(f"{'='*60}")


Fetching data for Premier League...
✓ Successfully fetched 380 matches from Premier League

Fetching data for La Liga...
✓ Successfully fetched 380 matches from La Liga

Fetching data for Serie A...
✓ Successfully fetched 380 matches from Serie A

Fetching data for Bundesliga...
✓ Successfully fetched 308 matches from Bundesliga

Fetching data for Ligue 1...
✓ Successfully fetched 308 matches from Ligue 1

Data collection complete!
Total leagues collected: 5


In [5]:
# Process and structure the match data
processed_matches = []

for league_id, league_data in all_matches_data.items():
    for match in league_data.get('response', []):
        # Extract data from API-Football nested structure
        fixture = match.get('fixture', {})
        league = match.get('league', {})
        teams = match.get('teams', {})
        goals = match.get('goals', {})
        score = match.get('score', {})

        # Determine winner
        if teams.get('home', {}).get('winner') is True:
            winner = 'HOME_TEAM'
        elif teams.get('away', {}).get('winner') is True:
            winner = 'AWAY_TEAM'
        else:
            winner = 'DRAW'

        match_info = {
            'match_id': fixture.get('id'),
            'league': LEAGUE_IDS[league_id],
            'league_id': league_id,
            'season': league.get('season'),
            'round': league.get('round'),
            'date': fixture.get('date'),
            'status': fixture.get('status', {}).get('long'),
            'home_team_id': teams.get('home', {}).get('id'),
            'home_team': teams.get('home', {}).get('name'),
            'away_team_id': teams.get('away', {}).get('id'),
            'away_team': teams.get('away', {}).get('name'),
            'home_score': goals.get('home'),
            'away_score': goals.get('away'),
            'home_halftime_score': score.get('halftime', {}).get('home'),
            'away_halftime_score': score.get('halftime', {}).get('away'),
            'winner': winner,
            'venue': fixture.get('venue', {}).get('name'),
            'venue_city': fixture.get('venue', {}).get('city'),
            'referee': fixture.get('referee')
        }
        processed_matches.append(match_info)

# Create DataFrame
matches_df = pd.DataFrame(processed_matches)
print(f"\nProcessed {len(matches_df)} total matches")
print(f"\nFeatures ({len(matches_df.columns)}): {list(matches_df.columns)}")
print(f"\nSample data:")
matches_df.head()


Processed 1756 total matches

Features (19): ['match_id', 'league', 'league_id', 'season', 'round', 'date', 'status', 'home_team_id', 'home_team', 'away_team_id', 'away_team', 'home_score', 'away_score', 'home_halftime_score', 'away_halftime_score', 'winner', 'venue', 'venue_city', 'referee']

Sample data:


Unnamed: 0,match_id,league,league_id,season,round,date,status,home_team_id,home_team,away_team_id,away_team,home_score,away_score,home_halftime_score,away_halftime_score,winner,venue,venue_city,referee
0,1208021,Premier League,39,2024,Regular Season - 1,2024-08-16T19:00:00+00:00,Match Finished,33,Manchester United,36,Fulham,1,0,0,0,HOME_TEAM,Old Trafford,Manchester,R. Jones
1,1208022,Premier League,39,2024,Regular Season - 1,2024-08-17T11:30:00+00:00,Match Finished,57,Ipswich,40,Liverpool,0,2,0,0,AWAY_TEAM,Portman Road,"Ipswich, Suffolk",T. Robinson
2,1208025,Premier League,39,2024,Regular Season - 1,2024-08-17T14:00:00+00:00,Match Finished,34,Newcastle,41,Southampton,1,0,1,0,HOME_TEAM,St. James' Park,Newcastle upon Tyne,C. Pawson
3,1208023,Premier League,39,2024,Regular Season - 1,2024-08-17T14:00:00+00:00,Match Finished,42,Arsenal,39,Wolves,2,0,1,0,HOME_TEAM,Emirates Stadium,London,J. Gillett
4,1208024,Premier League,39,2024,Regular Season - 1,2024-08-17T14:00:00+00:00,Match Finished,45,Everton,51,Brighton,0,3,0,1,AWAY_TEAM,Goodison Park,Liverpool,S. Hooper


In [7]:
def scrape_bbc_football_news():
    """
    Scrape recent football news and match reports from BBC Sport
    Returns unstructured text data
    """
    url = 'https://www.bbc.com/sport/football'

    try:
        response = requests.get(url, headers={'User-Agent': 'Mozilla/5.0'})
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')

        articles = []

        # Look for article containers
        article_containers = soup.find_all(['article', 'div'], class_=lambda x: x and ('promo' in x.lower() or 'article' in x.lower()))

        for container in article_containers[:20]:
            title_elem = container.find(['h3', 'h2', 'h1'])
            summary_elem = container.find('p')
            link_elem = container.find('a', href=True)

            if title_elem:
                article = {
                    'title': title_elem.get_text(strip=True),
                    'summary': summary_elem.get_text(strip=True) if summary_elem else '',
                    'link': link_elem['href'] if link_elem else '',
                    'source': 'BBC Sport',
                    'scraped_date': datetime.now().strftime('%Y-%m-%d %H:%M:%S')
                }
                articles.append(article)

        print(f"✓ Scraped {len(articles)} articles from BBC Sport")
        return articles

    except Exception as e:
        print(f"✗ Error scraping BBC Sport: {e}")
        return []

In [8]:
def scrape_espn_match_reports():
    """
    Scrape match reports from ESPN Soccer section
    """
    url = 'https://www.espn.com/soccer/'

    try:
        response = requests.get(url, headers={'User-Agent': 'Mozilla/5.0'})
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')

        articles = []

        # Find all headlines and content
        headlines = soup.find_all(['h1', 'h2', 'h3'], class_=lambda x: x and 'headline' in str(x).lower())

        for headline in headlines[:15]:
            article_text = headline.get_text(strip=True)
            if article_text and len(article_text) > 10:
                articles.append({
                    'text': article_text,
                    'source': 'ESPN',
                    'scraped_date': datetime.now().strftime('%Y-%m-%d %H:%M:%S')
                })

        print(f"✓ Scraped {len(articles)} headlines from ESPN")
        return articles

    except Exception as e:
        print(f"✗ Error scraping ESPN: {e}")
        return []

In [9]:
# Collect unstructured data from multiple sources
print("Collecting unstructured data (news articles and match reports)...\n")

bbc_articles = scrape_bbc_football_news()
time.sleep(2)  # Be respectful to servers

espn_articles = scrape_espn_match_reports()

# Combine all unstructured data
all_articles = bbc_articles + espn_articles

# Create DataFrame for unstructured data
if all_articles:
    articles_df = pd.DataFrame(all_articles)
    print(f"\nTotal articles collected: {len(articles_df)}")
    print(f"\nSample articles:")
    display(articles_df.head())
else:
    print("\nNo articles collected. Creating sample data for demonstration...")

    # Create sample data
    sample_articles = [
        {
            'title': 'Manchester City dominate possession in 3-1 victory',
            'summary': 'City controlled the game with 68% possession and clinical finishing.',
            'source': 'Sample',
            'scraped_date': datetime.now().strftime('%Y-%m-%d %H:%M:%S')
        },
        {
            'title': 'Liverpool come from behind to secure late win',
            'summary': 'Two goals in the final 10 minutes seal dramatic comeback.',
            'source': 'Sample',
            'scraped_date': datetime.now().strftime('%Y-%m-%d %H:%M:%S')
        }
    ]
    articles_df = pd.DataFrame(sample_articles)
    print(f"Created {len(articles_df)} sample articles")

Collecting unstructured data (news articles and match reports)...

✓ Scraped 0 articles from BBC Sport
✓ Scraped 2 headlines from ESPN

Total articles collected: 2

Sample articles:


Unnamed: 0,text,source,scraped_date
0,Customize ESPN,ESPN,2026-01-31 12:46:56
1,Customize ESPN,ESPN,2026-01-31 12:46:56


In [10]:
# Dataset statistics
print("=" * 60)
print("STRUCTURED DATA SUMMARY (Match Results from API-Football)")
print("=" * 60)
print(f"Total observations: {len(matches_df)}")
print(f"\nFeatures ({len(matches_df.columns)}):")
for col in matches_df.columns:
    dtype = matches_df[col].dtype
    null_count = matches_df[col].isnull().sum()
    null_pct = (null_count / len(matches_df) * 100) if len(matches_df) > 0 else 0
    print(f"  - {col}: {dtype} ({null_count} missing, {null_pct:.1f}%)")

if 'league' in matches_df.columns:
    print(f"\nLeagues covered:")
    print(matches_df['league'].value_counts())

if 'date' in matches_df.columns and not matches_df['date'].empty:
    print(f"\nDate range: {matches_df['date'].min()} to {matches_df['date'].max()}")

# Check match outcomes
if 'winner' in matches_df.columns:
    print(f"\nMatch outcomes:")
    print(matches_df['winner'].value_counts())

STRUCTURED DATA SUMMARY (Match Results from API-Football)
Total observations: 1756

Features (19):
  - match_id: int64 (0 missing, 0.0%)
  - league: object (0 missing, 0.0%)
  - league_id: int64 (0 missing, 0.0%)
  - season: int64 (0 missing, 0.0%)
  - round: object (0 missing, 0.0%)
  - date: object (0 missing, 0.0%)
  - status: object (0 missing, 0.0%)
  - home_team_id: int64 (0 missing, 0.0%)
  - home_team: object (0 missing, 0.0%)
  - away_team_id: int64 (0 missing, 0.0%)
  - away_team: object (0 missing, 0.0%)
  - home_score: int64 (0 missing, 0.0%)
  - away_score: int64 (0 missing, 0.0%)
  - home_halftime_score: int64 (0 missing, 0.0%)
  - away_halftime_score: int64 (0 missing, 0.0%)
  - winner: object (0 missing, 0.0%)
  - venue: object (0 missing, 0.0%)
  - venue_city: object (0 missing, 0.0%)
  - referee: object (0 missing, 0.0%)

Leagues covered:
league
Premier League    380
La Liga           380
Serie A           380
Bundesliga        308
Ligue 1           308
Name: count, d

In [11]:
print("\n" + "=" * 60)
print("UNSTRUCTURED DATA SUMMARY (News & Match Reports)")
print("=" * 60)
print(f"Total articles: {len(articles_df)}")
print(f"\nFeatures ({len(articles_df.columns)}): {list(articles_df.columns)}")

if 'source' in articles_df.columns:
    print(f"\nArticles by source:")
    print(articles_df['source'].value_counts())

# Sample of unstructured text
print("\nSample articles:")
for idx, row in articles_df.head(3).iterrows():
    title = row.get('title', row.get('text', '')[:100])
    print(f"\n{idx + 1}. {title}")
    if 'summary' in row and row['summary']:
        print(f"   {row['summary'][:150]}...")


UNSTRUCTURED DATA SUMMARY (News & Match Reports)
Total articles: 2

Features (3): ['text', 'source', 'scraped_date']

Articles by source:
source
ESPN    2
Name: count, dtype: int64

Sample articles:

1. Customize ESPN

2. Customize ESPN


In [12]:
# Save structured data
matches_df.to_csv('raw_match_data.csv', index=False)
print("✓ Saved structured match data to 'raw_match_data.csv'")

# Save unstructured data
articles_df.to_csv('raw_articles_data.csv', index=False)
print("✓ Saved unstructured article data to 'raw_articles_data.csv'")

# Save raw JSON from API (preserves full structure)
with open('raw_api_response.json', 'w') as f:
    json.dump(all_matches_data, f, indent=2)
print("✓ Saved raw API response to 'raw_api_response.json'")

# Save collection metadata
metadata = {
    'collection_date': datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
    'structured_data': {
        'source': 'API-Football (api-football.com)',
        'api_version': 'v3',
        'observations': len(matches_df),
        'features': list(matches_df.columns),
        'leagues': list(LEAGUE_IDS.values()),
        'league_ids': list(LEAGUE_IDS.keys())
    },
    'unstructured_data': {
        'sources': ['BBC Sport', 'ESPN'],
        'articles': len(articles_df),
        'collection_method': 'Web scraping (BeautifulSoup)'
    },
    'api_info': {
        'free_tier_limit': '100 requests/day',
        'registration_url': 'https://dashboard.api-football.com/register'
    }
}

with open('collection_metadata.json', 'w') as f:
    json.dump(metadata, f, indent=2)
print("✓ Saved collection metadata to 'collection_metadata.json'")

print("\n" + "=" * 60)
print("DATA COLLECTION COMPLETE!")
print("=" * 60)
print(f"Files created:")
print("  1. raw_match_data.csv - Structured match results")
print("  2. raw_articles_data.csv - Unstructured news articles")
print("  3. raw_api_response.json - Complete API response")
print("  4. collection_metadata.json - Collection details")

✓ Saved structured match data to 'raw_match_data.csv'
✓ Saved unstructured article data to 'raw_articles_data.csv'
✓ Saved raw API response to 'raw_api_response.json'
✓ Saved collection metadata to 'collection_metadata.json'

DATA COLLECTION COMPLETE!
Files created:
  1. raw_match_data.csv - Structured match results
  2. raw_articles_data.csv - Unstructured news articles
  3. raw_api_response.json - Complete API response
  4. collection_metadata.json - Collection details
