In [1]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
import os

# Function to scrape individual articles
def scrape_article(url, timeout=30):
    """Download one article page and extract its title, date and body text.

    Args:
        url: Address of the article page to fetch.
        timeout: Seconds to wait for the server before the request is
            abandoned.

    Returns:
        A dict with the keys "url", "title", "date" and "content". A field
        whose selector matched nothing is "N/A". If anything goes wrong
        (connection error, timeout, HTTP error status, parsing problem) the
        error is printed and "title", "date" and "content" are None.
    """
    try:
        # Without a timeout a single stalled connection blocks the whole run.
        response = requests.get(url, timeout=timeout)
        # Treat 4xx/5xx replies as failures instead of scraping the error page.
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')

        # Generic placeholder selectors: replace them with the ones that
        # match the site being scraped.
        title = soup.select_one('h1')
        date = soup.select_one('.article-meta span')
        # Every <p> on the page is treated as part of the article body.
        content = "\n".join(p.text.strip() for p in soup.select('p'))

        return {
            "url": url,
            "title": title.text.strip() if title is not None else "N/A",
            "date": date.text.strip() if date is not None else "N/A",
            "content": content if content else "N/A",
        }
    except Exception as e:
        # Best effort: report the failure and keep going with the next URL.
        print(f"Error scraping {url}: {e}")
        return {"url": url, "title": None, "date": None, "content": None}

# Where the URL list is read from and where the scraped rows are written
input_file = "/Users/hemantg/Desktop/dl-project-data-scraping-2010.csv"  # point this at your own URL list
output_file = "/Users/hemantg/Desktop/dl-projectscraped_articles_2010_titles.csv"

# Stop right away when the URL list is missing
if not os.path.exists(input_file):
    print(f"File not found: {input_file}")
    exit()

# The file is assumed to be headerless with a single column, so the
# column name is supplied here
df_links = pd.read_csv(input_file, names=["url"], header=None)

# Fetch the articles in file order, announcing each URL before requesting it
scraped_data = []
for url in df_links["url"]:
    print(f"Scraping: {url}")
    scraped_data.append(scrape_article(url))

# Turn the collected records into a table and write it without the index column
df_scraped = pd.DataFrame(scraped_data)
df_scraped.to_csv(output_file, index=False)

print(f"Scraped data saved to {output_file}")

Scraping: https://www.espncricinfo.com/story/faqs-the-indian-premier-league-337868
Scraping: https://www.espncricinfo.com/story/four-new-venues-announced-for-ipl-s-third-season-419058
Scraping: https://www.espncricinfo.com/story/steyn-cleared-of-doping-violation-419202
Scraping: https://www.espncricinfo.com/story/wright-confirms-kolkata-s-interest-419763
Scraping: https://www.espncricinfo.com/story/public-reaction-to-doping-scare-was-hard-to-deal-with-steyn-420414
Scraping: https://www.espncricinfo.com/story/reeve-on-kolkata-knight-riders-coaching-shortlist-420590
Scraping: https://www.espncricinfo.com/story/deal-directly-with-players-ipl-franchises-told-420678
Scraping: https://www.espncricinfo.com/story/hayden-suggests-scrapping-champions-trophy-421026
Scraping: https://www.espncricinfo.com/story/vaughan-urges-careful-ipl-scheduling-421893
Scraping: https://www.espncricinfo.com/story/akram-among-candidates-for-kkr-coach-s-job-422085
Scraping: https://www.espncricinfo.com/story/battle

Scraping: https://www.espncricinfo.com/story/ijaz-butt-highly-disappointed-by-ipl-cold-shoulder-445201
Scraping: https://www.espncricinfo.com/story/ipl-matches-to-be-broadcast-live-on-youtube-445173
Scraping: https://www.espncricinfo.com/story/i-m-nowhere-near-test-cricket-eoin-morgan-445215
Scraping: https://www.espncricinfo.com/story/morgan-confirms-growing-reputation-616938
Scraping: https://www.espncricinfo.com/story/what-they-said-about-pakistan-s-exclusion-from-ipl-445328
Scraping: https://www.espncricinfo.com/story/challenge-is-to-sustain-what-i-ve-started-kieron-pollard-445271
Scraping: https://www.espncricinfo.com/story/ipl-s-pakistan-snub-was-avoidable-616952
Scraping: https://www.espncricinfo.com/story/ramiz-raja-ipl-could-have-handled-pakistan-players-better-445233
Scraping: https://www.espncricinfo.com/story/twenty20-a-virus-says-javed-miandad-445303
Scraping: https://www.espncricinfo.com/story/a-setback-to-indo-pak-ties-616946
Scraping: https://www.espncricinfo.com/story/

Scraping: https://www.espncricinfo.com/story/red-carpet-resemblance-617221
Scraping: https://www.espncricinfo.com/story/australian-players-told-ipl-threat-not-credible-450295
Scraping: https://www.espncricinfo.com/story/news-agencies-all-set-to-abandon-ipl-450496
Scraping: https://www.espncricinfo.com/story/heavy-fixture-list-threatens-ipl-warm-up-plans-450468
Scraping: https://www.espncricinfo.com/story/punjab-get-yuvraj-singh-fitness-boost-450460
Scraping: https://www.espncricinfo.com/story/ravi-bopara-delays-departure-to-india-for-ipl-450417
Scraping: https://www.espncricinfo.com/story/irfan-pathan-to-be-match-fit-for-most-of-ipl-450526
Scraping: https://www.espncricinfo.com/story/graham-napier-out-to-prove-selectors-wrong-450618
Scraping: https://www.espncricinfo.com/story/ipl-assures-players-of-security-plan-implementation-450522
Scraping: https://www.espncricinfo.com/story/brett-lee-aims-for-full-fitness-during-ipl-450601
Scraping: https://www.espncricinfo.com/story/white-faces-s

Scraping: https://www.espncricinfo.com/story/determined-gambhir-secures-vital-victory-456197
Scraping: https://www.espncricinfo.com/story/fan-following-chennai-super-kings-v-deccan-chargers-ipl-chennai-452009
Scraping: https://www.espncricinfo.com/story/-452083
Scraping: https://www.espncricinfo.com/story/deccan-s-tactics-deliver-first-victory-452006
Scraping: https://www.espncricinfo.com/story/rajasthan-royals-v-delhi-daredevils-ahmedabad-452082
Scraping: https://www.espncricinfo.com/story/coach-greg-shipperd-to-join-delhi-after-sheffield-shield-452039
Scraping: https://www.espncricinfo.com/story/injury-concerns-for-rajasthan-trio-452087
Scraping: https://www.espncricinfo.com/story/plays-of-the-day-plenty-of-dogs-but-no-mongoose-452004
Scraping: https://www.espncricinfo.com/story/virender-sehwag-lights-up-ahmedabad-452081
Scraping: https://www.espncricinfo.com/story/can-the-ipl-lure-lara-617308
Scraping: https://www.espncricinfo.com/story/graeme-smith-fails-to-impose-452084
Scraping: 

Scraping: https://www.espncricinfo.com/story/cool-tendulkar-stars-in-seven-wicket-win-453083
Scraping: https://www.espncricinfo.com/story/new-franchises-likely-to-take-a-hefty-hit-617356
Scraping: https://www.espncricinfo.com/story/plays-of-the-day-tare-fumbles-kartik-goes-blind-453076
Scraping: https://www.espncricinfo.com/story/anand-ramchandran-the-ipl-gets-nationalised-452760
Scraping: https://www.espncricinfo.com/story/uthappa-sets-up-bangalore-s-victory-453217
Scraping: https://www.espncricinfo.com/story/fan-following-mumbai-indians-v-kolkata-knight-riders-ipl-mumbai-452906
Scraping: https://www.espncricinfo.com/story/the-economics-of-the-ipl-617364
Scraping: https://www.espncricinfo.com/story/ipl-bangalore-hold-aces-against-chennai-453052
Scraping: https://www.espncricinfo.com/story/robin-uthappa-bangalore-s-game-changer-453235
Scraping: https://www.espncricinfo.com/story/franchise-breeds-friendship-617369
Scraping: https://www.espncricinfo.com/story/royal-challengers-bangalore-

Scraping: https://www.espncricinfo.com/story/tait-s-misery-and-a-needless-time-out-454222
Scraping: https://www.espncricinfo.com/story/delhi-daredevils-v-rajasthan-royals-ipl-2010-delhi-454210
Scraping: https://www.espncricinfo.com/story/no-room-to-experiment-in-twenty20-harbhajan-454092
Scraping: https://www.espncricinfo.com/story/delhi-dominate-all-fronts-454221
Scraping: https://www.espncricinfo.com/story/fan-following-chennai-super-kings-v-royal-challengers-bangalore-ipl-chennai-454189
Scraping: https://www.espncricinfo.com/story/cheerleaders-cheer-for-south-over-north-617424
Scraping: https://www.espncricinfo.com/story/vijay-sizzles-as-chennai-finally-win-454205
Scraping: https://www.espncricinfo.com/story/kolkata-v-deccan-ipl-2010-kolkata-454439
Scraping: https://www.espncricinfo.com/story/anand-ramchandran-the-perils-of-playing-your-natural-game-and-a-marvel-of-technology-454262
Scraping: https://www.espncricinfo.com/story/sourav-ganguly-inspires-kolkata-to-victory-454438
Scrapi

Scraping: https://www.espncricinfo.com/story/missing-ipl-a-blessing-in-disguise-shahid-afridi-455301
Scraping: https://www.espncricinfo.com/story/numbers-game-mumbai-s-one-chink-and-the-gains-from-batting-first-455226
Scraping: https://www.espncricinfo.com/story/punjab-v-mumbai-ipl-2010-mohali-455438
Scraping: https://www.espncricinfo.com/story/punjab-finally-get-their-basics-right-455447
Scraping: https://www.espncricinfo.com/story/chawla-finally-finds-form-for-punjab-455437
Scraping: https://www.espncricinfo.com/story/sambit-bal-tendulkar-transcends-the-format-and-parochialism-455361
Scraping: https://www.espncricinfo.com/story/ablish-s-redemption-and-chawla-s-triumph-455446
Scraping: https://www.espncricinfo.com/story/harsha-bhogle-gambhir-got-a-raw-deal-455085
Scraping: https://www.espncricinfo.com/story/deccan-v-chennai-ipl-2010-nagpur-455521
Scraping: https://www.espncricinfo.com/story/plays-of-the-day-audacious-shots-and-mis-matched-pads-455558
Scraping: https://www.espncricinfo

Scraping: https://www.espncricinfo.com/story/a-lesson-in-civility-and-generosity-617510
Scraping: https://www.espncricinfo.com/story/rayudu-serves-bangalore-notice-456507
Scraping: https://www.espncricinfo.com/story/bangalore-royal-challengers-v-mumbai-indians-bangalore-456443
Scraping: https://www.espncricinfo.com/story/things-to-do-to-reach-the-semi-finals-456466
Scraping: https://www.espncricinfo.com/story/kolkata-knight-riders-v-rajasthan-royals-kolkata-456453
Scraping: https://www.espncricinfo.com/story/nothing-succeeds-like-excess-617509
Scraping: https://www.espncricinfo.com/story/mclaren-chips-in-with-bat-and-ball-456436
Scraping: https://www.espncricinfo.com/story/low-intensity-blasts-outside-chinnaswamy-stadium-456382
Scraping: https://www.espncricinfo.com/story/mcleodganj-a-little-global-village-456449
Scraping: https://www.espncricinfo.com/story/fan-following-bangalore-v-mumbai-indians-ipl-bangalore-456482
Scraping: https://www.espncricinfo.com/story/indian-minister-tharoor

Scraping: https://www.espncricinfo.com/story/all-round-strength-gives-mumbai-the-edge-457119
Scraping: https://www.espncricinfo.com/story/hayden-s-troubles-and-sloppy-mumbai-457313
Scraping: https://www.espncricinfo.com/story/efforts-on-to-force-lalit-modi-s-resignation-457201
Scraping: https://www.espncricinfo.com/story/suresh-raina-turns-it-on-in-the-final-457299
Scraping: https://www.espncricinfo.com/story/modi-s-meteoric-rise-and-fall-457323
Scraping: https://www.espncricinfo.com/story/ian-chappell-beware-the-backlash-against-the-ipl-457107
Scraping: https://www.espncricinfo.com/story/the-cricket-crisis-617546
Scraping: https://www.espncricinfo.com/story/the-key-players-in-the-ipl-controversy-457235
Scraping: https://www.espncricinfo.com/story/dr-doosra-s-demented-devices-the-un-blimp-457198
Scraping: https://www.espncricinfo.com/story/jayaditya-gupta-bcci-bitten-by-its-own-buzzword-457282
Scraping: https://www.espncricinfo.com/story/all-ipl-decisions-taken-collectively-modi-457319

Scraping: https://www.espncricinfo.com/story/anand-ramachandran-england-win-the-wrong-world-cup-459991
Scraping: https://www.espncricinfo.com/story/ad-overkill-puts-off-ipl-viewers-460330
Scraping: https://www.espncricinfo.com/story/letting-the-facts-get-in-the-way-617683
Scraping: https://www.espncricinfo.com/story/ipl-3-clean-but-uncertainty-over-previous-editions-icc-460389
Scraping: https://www.espncricinfo.com/story/less-taxing-schedule-the-way-forward-for-the-ipl-617691
Scraping: https://www.espncricinfo.com/story/lalit-modi-s-legal-team-confident-of-positive-response-461016
Scraping: https://www.espncricinfo.com/story/lalit-modi-wants-independent-panel-to-hear-his-case-460914
Scraping: https://www.espncricinfo.com/story/viewers-are-people-too-617714
Scraping: https://www.espncricinfo.com/story/rahul-bhattacharya-the-ipl-s-equality-kills-excellence-461282
Scraping: https://www.espncricinfo.com/story/modi-served-notice-over-ipl-theatre-rights-461577
Scraping: https://www.espncrici

In [3]:
import pandas as pd
import requests
from bs4 import BeautifulSoup

# Function to scrape individual articles
def scrape_article(url, timeout=30):
    """Download one article page and extract title, date, summary and body.

    Args:
        url: Address of the article page to fetch.
        timeout: Seconds to wait for the server before the request is
            abandoned.

    Returns:
        A dict with the keys "url", "title", "date", "summary" and
        "content". A field for which nothing was found is "N/A". If
        anything goes wrong (connection error, timeout, HTTP error status,
        parsing problem) the error is printed and every field except "url"
        is the string "Error".
    """
    try:
        # Without a timeout a single stalled connection blocks the whole run.
        response = requests.get(url, timeout=timeout)
        # Treat 4xx/5xx replies as failures instead of scraping the error page.
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')

        # Extract the title
        title = soup.select_one('h1.ds-text-title-l.ds-font-bold')
        title_text = title.get_text(strip=True) if title is not None else "N/A"

        # Extract the date
        date = soup.select_one('div[data-behavior="date_time"] span.ds-text-tight-xs.ds-text-typo-mid3')
        date_text = date.get_text(strip=True) if date is not None else "N/A"

        # Extract the one-line summary: the first <p> found anywhere on the page
        summary = soup.select_one('p')
        summary_text = summary.get_text(strip=True) if summary is not None else "N/A"

        # Extract the main content: one line per matching element
        content = soup.select('.ci-html-content')
        full_content = "\n".join(node.get_text(strip=True) for node in content)

        return {
            "url": url,
            "title": title_text,
            "date": date_text,
            "summary": summary_text,
            # Fall back on the extracted text, not on the match list, so
            # elements without any text do not yield a blank field.
            "content": full_content if full_content.strip() else "N/A",
        }
    except Exception as e:
        # Best effort: report the failure and keep going with the next URL.
        print(f"Error scraping {url}: {e}")
        return {"url": url, "title": "Error", "date": "Error", "summary": "Error", "content": "Error"}

# Where the URL list is read from and where the scraped rows are written
input_file = "/Users/hemantg/Desktop/dl-project-data-scraping-2011.csv"  # point this at your own URL list
output_file = "/Users/hemantg/Desktop/dl-projectscraped_articles-2011-content.csv"  # point this at the file to create

# The file is assumed to be headerless with a single column, so the
# column name is supplied here
df_links = pd.read_csv(input_file, names=["url"], header=None)

# Fetch the articles in file order, announcing each URL before requesting it
scraped_data = []
for url in df_links["url"]:
    print(f"Scraping: {url}")
    scraped_data.append(scrape_article(url))

# Turn the collected records into a table and write it without the index column
df_scraped = pd.DataFrame(scraped_data)
df_scraped.to_csv(output_file, index=False)

print(f"Scraped data saved to {output_file}")

Scraping: https://www.espncricinfo.com/story/mumbai-v-rajasthan-plays-of-the-day-the-battle-royale-and-a-memorable-last-over-516001
Scraping: https://www.espncricinfo.com/story/ipl-2011-michael-bevan-named-kings-xi-punjab-coach-495533
Scraping: https://www.espncricinfo.com/story/mumbai-indians-v-deccan-chargers-ipl-2011-mumbai-515148
Scraping: https://www.espncricinfo.com/story/the-ipl-mess-arbitrator-stays-rajasthan-royals-expulsion-489806
Scraping: https://www.espncricinfo.com/story/yuvraj-singh-s-all-round-effort-in-vain-511503
Scraping: https://www.espncricinfo.com/story/indian-premier-league-2011-rca-rajasthan-royals-to-meet-over-warne-comments-515528
Scraping: https://www.espncricinfo.com/story/kolkata-v-kochi-mahela-nostradamus-512044
Scraping: https://www.espncricinfo.com/story/ipl-news-shaun-tait-recall-can-keep-rajasthan-top-shane-watson-513392
Scraping: https://www.espncricinfo.com/story/sizzling-ishant-sucker-punched-kochi-512859
Scraping: https://www.espncricinfo.com/story

Scraping: https://www.espncricinfo.com/story/valthaty-does-a-midas-511335
Scraping: https://www.espncricinfo.com/story/punjab-v-chennai-ipl-2011-mohali-valthaty-graduates-from-school-of-hard-knocks-510997
Scraping: https://www.espncricinfo.com/story/chennai-v-deccan-the-comedy-of-errors-and-the-sunny-sohal-show-513374
Scraping: https://www.espncricinfo.com/story/ian-chappell-why-do-mumbai-have-both-symonds-and-pollard-512332
Scraping: https://www.espncricinfo.com/story/jayaditya-gupta-an-opportunity-for-the-ipl-to-clean-itself-up-480849
Scraping: https://www.espncricinfo.com/story/the-ipl-mess-the-bombay-high-court-dismisses-lalit-modi-s-petition-467406
Scraping: https://www.espncricinfo.com/story/valthaty-jumps-into-the-limelight-619397
Scraping: https://www.espncricinfo.com/story/who-is-the-greediest-of-them-all-619441
Scraping: https://www.espncricinfo.com/story/daniel-vettori-asks-for-ipl-window-in-ftp-512671
Scraping: https://www.espncricinfo.com/story/rajasthan-royals-v-mumbai-in

Scraping: https://www.espncricinfo.com/story/vijay-half-century-powers-chennai-win-514440
Scraping: https://www.espncricinfo.com/story/indian-premier-league-2011-ipl-beats-cricket-fatigue-worries-511037
Scraping: https://www.espncricinfo.com/story/gayle-s-happy-and-it-shows-619532
Scraping: https://www.espncricinfo.com/story/ipl-auction-2011-a-great-feeling-for-the-pathan-family-says-irfan-pathan-495981
Scraping: https://www.espncricinfo.com/story/kochi-v-delhi-ipl-2011-virender-sehwag-revels-in-responsibility-513207
Scraping: https://www.espncricinfo.com/story/dileep-premachandran-how-do-you-top-a-world-cup-win-510391
Scraping: https://www.espncricinfo.com/story/pune-warriors-v-kochi-tuskers-kerala-ipl-2011-mumbai-510950
Scraping: https://www.espncricinfo.com/story/deccan-chargers-v-mumbai-indians-ipl-2011-hyderabad-512468
Scraping: https://www.espncricinfo.com/story/ganguly-s-back-619494
Scraping: https://www.espncricinfo.com/story/sri-lanka-in-england-2011-tillakaratne-dilshan-to-re

Scraping: https://www.espncricinfo.com/story/plays-of-the-day-the-game-turning-spat-and-rajan-s-initiation-515156
Scraping: https://www.espncricinfo.com/story/rajasthan-royals-v-chennai-super-kings-ipl-2011-jaipur-514437
Scraping: https://www.espncricinfo.com/story/mumbai-v-kochi-tendulkar-does-a-dhoni-511234
Scraping: https://www.espncricinfo.com/story/winged-creatures-attack-bangalore-test-618349
Scraping: https://www.espncricinfo.com/story/the-inscrutable-silence-of-graeme-smith-619379
Scraping: https://www.espncricinfo.com/story/royal-challengers-bangalore-v-kochi-tuskers-kerala-ipl-2011-bangalore-514332
Scraping: https://www.espncricinfo.com/story/ipl-2011-the-ipl-needs-independent-watchdogs-516834
Scraping: https://www.espncricinfo.com/story/ipl-five-that-could-surprise-495865
Scraping: https://www.espncricinfo.com/story/the-ipl-mess-no-difference-between-ipl-and-bcci-revenue-department-467386
Scraping: https://www.espncricinfo.com/story/india-news-be-moral-leaders-pataudi-tells-

Scraping: https://www.espncricinfo.com/story/indian-premier-league-2011-low-key-delhi-hope-to-ride-on-sehwag-510147
Scraping: https://www.espncricinfo.com/story/sri-lanka-in-england-2011-boards-in-talks-over-dilshan-s-ipl-release-date-512552
Scraping: https://www.espncricinfo.com/story/pune-warriors-v-kolkata-knight-riders-ipl-2011-navi-mumbai-515846
Scraping: https://www.espncricinfo.com/story/bangalore-v-mumbai-ipl-2011-chennai-bowling-gives-mumbai-slight-edge-516777
Scraping: https://www.espncricinfo.com/story/hussey-stars-in-easy-chennai-win-513702
Scraping: https://www.espncricinfo.com/story/deccan-v-rajasthan-ipl-2011-hyderabad-510250
Scraping: https://www.espncricinfo.com/story/business-over-sentiment-in-auction-618865
Scraping: https://www.espncricinfo.com/story/rajasthan-v-kochi-team-mates-turned-opponents-and-an-unforgiving-captain-512466
Scraping: https://www.espncricinfo.com/story/why-fans-were-not-emotionally-involved-in-the-2011-ipl-517169
Scraping: https://www.espncricin

Scraping: https://www.espncricinfo.com/story/indian-premier-league-gavaskar-outlines-player-retention-method-466709
Scraping: https://www.espncricinfo.com/story/rajasthan-royals-v-mumbai-indians-ipl-2011-jaipur-513076
Scraping: https://www.espncricinfo.com/story/ipl-news-ipl-4-to-have-eight-teams-irrespective-of-kochi-fate-483803
Scraping: https://www.espncricinfo.com/story/throwing-out-the-baby-with-the-bath-water-618345
Scraping: https://www.espncricinfo.com/story/rahul-sharma-s-dream-season-continues-514348
Scraping: https://www.espncricinfo.com/story/70th-match-kolkata-v-mumbai-ipl-2011-kolkata-516299
Scraping: https://www.espncricinfo.com/story/bcci-plans-move-to-counter-auction-allegations-474678
Scraping: https://www.espncricinfo.com/story/bangalore-v-punjab-gayle-s-celebration-and-his-version-of-vvs-514115
Scraping: https://www.espncricinfo.com/story/shane-watson-humbles-mumbai-515999
Scraping: https://www.espncricinfo.com/story/the-ipl-mess-rajasthan-verdict-likely-on-monday-4

Scraping: https://www.espncricinfo.com/story/indian-minister-says-signs-of-ipl-teams-violating-corporate-law-470753
Scraping: https://www.espncricinfo.com/story/bangalore-v-punjab-ipl-2011-dharamsala-515623
Scraping: https://www.espncricinfo.com/story/indian-domestic-news-sourav-ganguly-to-play-ranji-trophy-to-prepare-for-ipl-489715
Scraping: https://www.espncricinfo.com/story/kochi-tuskers-kerala-v-kings-xi-punjab-ipl-2011-indore-515022
Scraping: https://www.espncricinfo.com/story/all-round-botha-stars-in-mumbai-hammering-513077
Scraping: https://www.espncricinfo.com/story/sidin-vadukut-test-cricket-haters-come-here-to-be-slapped-518441
Scraping: https://www.espncricinfo.com/story/plays-of-the-day-bangalore-v-chennai-a-day-that-could-have-been-much-worse-516537
Scraping: https://www.espncricinfo.com/story/sohal-stars-in-deccan-victory-511821
Scraping: https://www.espncricinfo.com/story/rajasthan-v-kolkata-botch-ups-at-the-fortress-511199
Scraping: https://www.espncricinfo.com/story/ch

Scraping: https://www.espncricinfo.com/story/ipl-news-chennai-to-host-ipl-opening-game-and-final-501172
Scraping: https://www.espncricinfo.com/story/amin-to-meet-franchise-heads-on-june-24-in-mumbai-462622
Scraping: https://www.espncricinfo.com/story/indian-premier-league-news-bcci-protest-too-facile-to-cut-ice-490270
Scraping: https://www.espncricinfo.com/story/bangalore-v-mumbai-ipl-2011-2nd-qualifier-chennai-516942
Scraping: https://www.espncricinfo.com/story/west-indies-news-wicb-disappointed-at-chris-gayle-s-ipl-decision-511899
Scraping: https://www.espncricinfo.com/story/indian-premier-league-2011-tv-ratings-plunge-but-viewership-rises-514063
Scraping: https://www.espncricinfo.com/story/plays-of-the-day-mumbai-v-kolkata-disbelieving-stares-and-a-match-turning-he-516719
Scraping: https://www.espncricinfo.com/story/ipl-2011-starts-and-finishes-best-dot-ball-percentages-and-more-515438
Scraping: https://www.espncricinfo.com/story/ipl-2011-anil-kumble-pulls-out-of-ipl-auction-495265


Scraping: https://www.espncricinfo.com/story/kolkata-v-mumbai-eliminator-ipl-2011-mumbai-516715
Scraping: https://www.espncricinfo.com/story/rohit-sharma-unstoppable-in-the-ipl-512465
Scraping: https://www.espncricinfo.com/story/kochi-v-bangalore-ipl-2011-kochi-510270
Scraping: https://www.espncricinfo.com/story/mumbai-and-chennai-in-favour-of-player-retention-464582
Scraping: https://www.espncricinfo.com/story/sehwag-masterclass-lifts-delhi-to-victory-513236
Scraping: https://www.espncricinfo.com/story/ipl-weekly-review-iii-grassy-trysts-nail-biters-and-double-paybacks-513043
Scraping: https://www.espncricinfo.com/story/west-indies-news-players-must-go-through-wicb-to-play-ipl-485482
Scraping: https://www.espncricinfo.com/story/deccan-v-mumbai-the-stumping-the-inevitables-and-vintage-symonds-512478
Scraping: https://www.espncricinfo.com/story/ipl-the-bcci-cancelled-its-ipl-media-rights-deal-with-wsg-465322
Scraping: https://www.espncricinfo.com/story/rajasthan-royals-v-chennai-super-k

In [4]:
import pandas as pd
import requests
from bs4 import BeautifulSoup

# Function to scrape individual articles
def scrape_article(url, timeout=30):
    """Download one article page and extract title, date, summary and body.

    Args:
        url: Address of the article page to fetch.
        timeout: Seconds to wait for the server before the request is
            abandoned.

    Returns:
        A dict with the keys "url", "title", "date", "summary" and
        "content". A field for which nothing was found is "N/A". If
        anything goes wrong (connection error, timeout, HTTP error status,
        parsing problem) the error is printed and every field except "url"
        is the string "Error".
    """
    try:
        # Without a timeout a single stalled connection blocks the whole run.
        response = requests.get(url, timeout=timeout)
        # Treat 4xx/5xx replies as failures instead of scraping the error page.
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')

        # Extract the title
        title = soup.select_one('h1.ds-text-title-l.ds-font-bold')
        title_text = title.get_text(strip=True) if title is not None else "N/A"

        # Extract the date
        date = soup.select_one('div[data-behavior="date_time"] span')
        date_text = date.get_text(strip=True) if date is not None else "N/A"

        # Extract the one-line summary: the first <p> found anywhere on the page
        summary = soup.select_one('p')
        summary_text = summary.get_text(strip=True) if summary is not None else "N/A"

        # Extract the main content: only paragraphs inside the first
        # 'div.ds-px-4' container are considered. The placeholder stays
        # unless both the container and at least one paragraph are found.
        full_content = "N/A"
        content_div = soup.select_one('div.ds-px-4')
        if content_div is not None:
            content_paragraphs = content_div.select('p.ds-text-comfortable-l.ci-html-content')
            if content_paragraphs:
                full_content = "\n".join(p.get_text(strip=True) for p in content_paragraphs)

        return {
            "url": url,
            "title": title_text,
            "date": date_text,
            "summary": summary_text,
            "content": full_content,
        }
    except Exception as e:
        # Best effort: report the failure and keep going with the next URL.
        print(f"Error scraping {url}: {e}")
        return {"url": url, "title": "Error", "date": "Error", "summary": "Error", "content": "Error"}

# Where the URL list is read from and where the scraped rows are written
input_file = "/Users/hemantg/Desktop/dl-project-data-scraping-2011.csv"  # point this at your own URL list
output_file = "/Users/hemantg/Desktop/dl-project-scraped_articles_2011_dates.csv"  # point this at the file to create

# The file is assumed to be headerless with a single column, so the
# column name is supplied here
df_links = pd.read_csv(input_file, names=["url"], header=None)

# Fetch the articles in file order, announcing each URL before requesting it
scraped_data = []
for url in df_links["url"]:
    print(f"Scraping: {url}")
    scraped_data.append(scrape_article(url))

# Turn the collected records into a table and write it without the index column
df_scraped = pd.DataFrame(scraped_data)
df_scraped.to_csv(output_file, index=False)

print(f"Scraped data saved to {output_file}")

Scraping: https://www.espncricinfo.com/story/mumbai-v-rajasthan-plays-of-the-day-the-battle-royale-and-a-memorable-last-over-516001
Scraping: https://www.espncricinfo.com/story/ipl-2011-michael-bevan-named-kings-xi-punjab-coach-495533
Scraping: https://www.espncricinfo.com/story/mumbai-indians-v-deccan-chargers-ipl-2011-mumbai-515148
Scraping: https://www.espncricinfo.com/story/the-ipl-mess-arbitrator-stays-rajasthan-royals-expulsion-489806
Scraping: https://www.espncricinfo.com/story/yuvraj-singh-s-all-round-effort-in-vain-511503
Scraping: https://www.espncricinfo.com/story/indian-premier-league-2011-rca-rajasthan-royals-to-meet-over-warne-comments-515528
Scraping: https://www.espncricinfo.com/story/kolkata-v-kochi-mahela-nostradamus-512044
Scraping: https://www.espncricinfo.com/story/ipl-news-shaun-tait-recall-can-keep-rajasthan-top-shane-watson-513392
Scraping: https://www.espncricinfo.com/story/sizzling-ishant-sucker-punched-kochi-512859
Scraping: https://www.espncricinfo.com/story

Scraping: https://www.espncricinfo.com/story/valthaty-does-a-midas-511335
Scraping: https://www.espncricinfo.com/story/punjab-v-chennai-ipl-2011-mohali-valthaty-graduates-from-school-of-hard-knocks-510997
Scraping: https://www.espncricinfo.com/story/chennai-v-deccan-the-comedy-of-errors-and-the-sunny-sohal-show-513374
Scraping: https://www.espncricinfo.com/story/ian-chappell-why-do-mumbai-have-both-symonds-and-pollard-512332
Scraping: https://www.espncricinfo.com/story/jayaditya-gupta-an-opportunity-for-the-ipl-to-clean-itself-up-480849
Scraping: https://www.espncricinfo.com/story/the-ipl-mess-the-bombay-high-court-dismisses-lalit-modi-s-petition-467406
Scraping: https://www.espncricinfo.com/story/valthaty-jumps-into-the-limelight-619397
Scraping: https://www.espncricinfo.com/story/who-is-the-greediest-of-them-all-619441
Scraping: https://www.espncricinfo.com/story/daniel-vettori-asks-for-ipl-window-in-ftp-512671
Scraping: https://www.espncricinfo.com/story/rajasthan-royals-v-mumbai-in

Scraping: https://www.espncricinfo.com/story/vijay-half-century-powers-chennai-win-514440
Scraping: https://www.espncricinfo.com/story/indian-premier-league-2011-ipl-beats-cricket-fatigue-worries-511037
Scraping: https://www.espncricinfo.com/story/gayle-s-happy-and-it-shows-619532
Scraping: https://www.espncricinfo.com/story/ipl-auction-2011-a-great-feeling-for-the-pathan-family-says-irfan-pathan-495981
Scraping: https://www.espncricinfo.com/story/kochi-v-delhi-ipl-2011-virender-sehwag-revels-in-responsibility-513207
Scraping: https://www.espncricinfo.com/story/dileep-premachandran-how-do-you-top-a-world-cup-win-510391
Scraping: https://www.espncricinfo.com/story/pune-warriors-v-kochi-tuskers-kerala-ipl-2011-mumbai-510950
Scraping: https://www.espncricinfo.com/story/deccan-chargers-v-mumbai-indians-ipl-2011-hyderabad-512468
Scraping: https://www.espncricinfo.com/story/ganguly-s-back-619494
Scraping: https://www.espncricinfo.com/story/sri-lanka-in-england-2011-tillakaratne-dilshan-to-re

Scraping: https://www.espncricinfo.com/story/plays-of-the-day-the-game-turning-spat-and-rajan-s-initiation-515156
Scraping: https://www.espncricinfo.com/story/rajasthan-royals-v-chennai-super-kings-ipl-2011-jaipur-514437
Scraping: https://www.espncricinfo.com/story/mumbai-v-kochi-tendulkar-does-a-dhoni-511234
Scraping: https://www.espncricinfo.com/story/winged-creatures-attack-bangalore-test-618349
Scraping: https://www.espncricinfo.com/story/the-inscrutable-silence-of-graeme-smith-619379
Scraping: https://www.espncricinfo.com/story/royal-challengers-bangalore-v-kochi-tuskers-kerala-ipl-2011-bangalore-514332
Scraping: https://www.espncricinfo.com/story/ipl-2011-the-ipl-needs-independent-watchdogs-516834
Scraping: https://www.espncricinfo.com/story/ipl-five-that-could-surprise-495865
Scraping: https://www.espncricinfo.com/story/the-ipl-mess-no-difference-between-ipl-and-bcci-revenue-department-467386
Scraping: https://www.espncricinfo.com/story/india-news-be-moral-leaders-pataudi-tells-

Scraping: https://www.espncricinfo.com/story/indian-premier-league-2011-low-key-delhi-hope-to-ride-on-sehwag-510147
Scraping: https://www.espncricinfo.com/story/sri-lanka-in-england-2011-boards-in-talks-over-dilshan-s-ipl-release-date-512552
Scraping: https://www.espncricinfo.com/story/pune-warriors-v-kolkata-knight-riders-ipl-2011-navi-mumbai-515846
Scraping: https://www.espncricinfo.com/story/bangalore-v-mumbai-ipl-2011-chennai-bowling-gives-mumbai-slight-edge-516777
Scraping: https://www.espncricinfo.com/story/hussey-stars-in-easy-chennai-win-513702
Scraping: https://www.espncricinfo.com/story/deccan-v-rajasthan-ipl-2011-hyderabad-510250
Scraping: https://www.espncricinfo.com/story/business-over-sentiment-in-auction-618865
Scraping: https://www.espncricinfo.com/story/rajasthan-v-kochi-team-mates-turned-opponents-and-an-unforgiving-captain-512466
Scraping: https://www.espncricinfo.com/story/why-fans-were-not-emotionally-involved-in-the-2011-ipl-517169
Scraping: https://www.espncricin

Scraping: https://www.espncricinfo.com/story/indian-premier-league-gavaskar-outlines-player-retention-method-466709
Scraping: https://www.espncricinfo.com/story/rajasthan-royals-v-mumbai-indians-ipl-2011-jaipur-513076
Scraping: https://www.espncricinfo.com/story/ipl-news-ipl-4-to-have-eight-teams-irrespective-of-kochi-fate-483803
Scraping: https://www.espncricinfo.com/story/throwing-out-the-baby-with-the-bath-water-618345
Scraping: https://www.espncricinfo.com/story/rahul-sharma-s-dream-season-continues-514348
Scraping: https://www.espncricinfo.com/story/70th-match-kolkata-v-mumbai-ipl-2011-kolkata-516299
Scraping: https://www.espncricinfo.com/story/bcci-plans-move-to-counter-auction-allegations-474678
Scraping: https://www.espncricinfo.com/story/bangalore-v-punjab-gayle-s-celebration-and-his-version-of-vvs-514115
Scraping: https://www.espncricinfo.com/story/shane-watson-humbles-mumbai-515999
Scraping: https://www.espncricinfo.com/story/the-ipl-mess-rajasthan-verdict-likely-on-monday-4

Scraping: https://www.espncricinfo.com/story/indian-minister-says-signs-of-ipl-teams-violating-corporate-law-470753
Scraping: https://www.espncricinfo.com/story/bangalore-v-punjab-ipl-2011-dharamsala-515623
Scraping: https://www.espncricinfo.com/story/indian-domestic-news-sourav-ganguly-to-play-ranji-trophy-to-prepare-for-ipl-489715
Scraping: https://www.espncricinfo.com/story/kochi-tuskers-kerala-v-kings-xi-punjab-ipl-2011-indore-515022
Scraping: https://www.espncricinfo.com/story/all-round-botha-stars-in-mumbai-hammering-513077
Scraping: https://www.espncricinfo.com/story/sidin-vadukut-test-cricket-haters-come-here-to-be-slapped-518441
Scraping: https://www.espncricinfo.com/story/plays-of-the-day-bangalore-v-chennai-a-day-that-could-have-been-much-worse-516537
Scraping: https://www.espncricinfo.com/story/sohal-stars-in-deccan-victory-511821
Scraping: https://www.espncricinfo.com/story/rajasthan-v-kolkata-botch-ups-at-the-fortress-511199
Scraping: https://www.espncricinfo.com/story/ch

Scraping: https://www.espncricinfo.com/story/ipl-news-chennai-to-host-ipl-opening-game-and-final-501172
Scraping: https://www.espncricinfo.com/story/amin-to-meet-franchise-heads-on-june-24-in-mumbai-462622
Scraping: https://www.espncricinfo.com/story/indian-premier-league-news-bcci-protest-too-facile-to-cut-ice-490270
Scraping: https://www.espncricinfo.com/story/bangalore-v-mumbai-ipl-2011-2nd-qualifier-chennai-516942
Scraping: https://www.espncricinfo.com/story/west-indies-news-wicb-disappointed-at-chris-gayle-s-ipl-decision-511899
Scraping: https://www.espncricinfo.com/story/indian-premier-league-2011-tv-ratings-plunge-but-viewership-rises-514063
Scraping: https://www.espncricinfo.com/story/plays-of-the-day-mumbai-v-kolkata-disbelieving-stares-and-a-match-turning-he-516719
Scraping: https://www.espncricinfo.com/story/ipl-2011-starts-and-finishes-best-dot-ball-percentages-and-more-515438
Scraping: https://www.espncricinfo.com/story/ipl-2011-anil-kumble-pulls-out-of-ipl-auction-495265


Scraping: https://www.espncricinfo.com/story/kolkata-v-mumbai-eliminator-ipl-2011-mumbai-516715
Scraping: https://www.espncricinfo.com/story/rohit-sharma-unstoppable-in-the-ipl-512465
Scraping: https://www.espncricinfo.com/story/kochi-v-bangalore-ipl-2011-kochi-510270
Scraping: https://www.espncricinfo.com/story/mumbai-and-chennai-in-favour-of-player-retention-464582
Scraping: https://www.espncricinfo.com/story/sehwag-masterclass-lifts-delhi-to-victory-513236
Scraping: https://www.espncricinfo.com/story/ipl-weekly-review-iii-grassy-trysts-nail-biters-and-double-paybacks-513043
Scraping: https://www.espncricinfo.com/story/west-indies-news-players-must-go-through-wicb-to-play-ipl-485482
Scraping: https://www.espncricinfo.com/story/deccan-v-mumbai-the-stumping-the-inevitables-and-vintage-symonds-512478
Scraping: https://www.espncricinfo.com/story/ipl-the-bcci-cancelled-its-ipl-media-rights-deal-with-wsg-465322
Scraping: https://www.espncricinfo.com/story/rajasthan-royals-v-chennai-super-k

In [5]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
import os

# Function to scrape individual articles
def scrape_article(url, timeout=30):
    """Download one article page and extract its title, date and body text.

    Args:
        url: Address of the article page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` may wait indefinitely on a stalled
            connection, which would freeze the whole batch.

    Returns:
        A dict with the keys ``url``, ``title``, ``date`` and ``content``.
        A field whose element is missing (or empty) on the page is ``"N/A"``.
        If the download or parsing fails, the error is printed and ``title``,
        ``date`` and ``content`` are all ``None``.
    """
    try:
        response = requests.get(url, timeout=timeout)
        # An HTTP error page (404, 5xx, ...) is not an article; route it to
        # the failure record instead of scraping the error page's markup.
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')

        # NOTE(review): these are generic selectors; confirm they match the
        # markup of the site being scraped.
        title = soup.select_one('h1')
        date = soup.select_one('.article-meta span')
        # Skip empty paragraphs so they neither add blank lines nor defeat
        # the "N/A" fallback below.
        paragraphs = (p.text.strip() for p in soup.select('p'))
        content = "\n".join(text for text in paragraphs if text)

        return {
            "url": url,
            "title": title.text.strip() if title else "N/A",
            "date": date.text.strip() if date else "N/A",
            "content": content if content else "N/A"
        }
    except Exception as e:
        # Deliberately broad: one bad page must not abort a long batch run.
        print(f"Error scraping {url}: {e}")
        return {"url": url, "title": None, "date": None, "content": None}

# File paths
input_file = "/Users/hemantg/Desktop/dl-project-data-scraping-2012.csv"  # Update with your actual file name
output_file = "/Users/hemantg/Desktop/dl-projectscraped_articles_2012_titles.csv"

# Stop early with a clear message when the list of links is missing.
# `raise SystemExit` replaces `exit()`: the latter is a helper injected by
# the `site` module for interactive shells and is not meant for programs.
# A non-zero status marks the run as failed.
if not os.path.exists(input_file):
    print(f"File not found: {input_file}")
    raise SystemExit(1)

# Read the CSV file (assuming it's a single column without a header)
df_links = pd.read_csv(input_file, header=None, names=["url"])

# Scrape each URL; scrape_article returns a record even on failure, so the
# output keeps one row per input link.
scraped_data = []
for url in df_links["url"]:
    print(f"Scraping: {url}")
    article_data = scrape_article(url)
    scraped_data.append(article_data)

# Save the scraped data to a new CSV file
df_scraped = pd.DataFrame(scraped_data)
df_scraped.to_csv(output_file, index=False)

print(f"Scraped data saved to {output_file}")

Scraping: https://www.espncricinfo.com/story/india-news-ankle-surgery-rules-ishant-sharma-out-of-ipl-558851
Scraping: https://www.espncricinfo.com/story/why-the-ipl-needs-an-upgrade-620807
Scraping: https://www.espncricinfo.com/story/ipl-news-kochi-to-appeal-against-court-s-rejection-533248
Scraping: https://www.espncricinfo.com/story/kolkata-v-chennai-ipl-2012-final-chennai-how-dhoni-and-gambhir-have-shaped-their-sides-566320
Scraping: https://www.espncricinfo.com/story/bbq-stories-for-the-rest-of-your-life-621025
Scraping: https://www.espncricinfo.com/story/ipl-news-mumbai-indians-richard-levi-excited-by-ipl-opportunityrichard-levi-wants-to-master-indian-557082
Scraping: https://www.espncricinfo.com/story/andrew-leipus-on-avoiding-shoulder-problems-561209
Scraping: https://www.espncricinfo.com/story/ipl-news-not-sure-about-my-role-in-the-future-rahul-dravid-565687
Scraping: https://www.espncricinfo.com/story/india-news-rajiv-shukla-tipped-to-replace-chirayu-amin-as-ipl-chairman-53274

Scraping: https://www.espncricinfo.com/story/promoting-a-club-culture-621395
Scraping: https://www.espncricinfo.com/story/indian-premier-league-2012-deccan-chargers-sign-kenya-s-tanmay-mishra-as-indian-554185
Scraping: https://www.espncricinfo.com/story/time-out-the-ipl-needs-more-professional-governance-567305
Scraping: https://www.espncricinfo.com/story/indian-premier-league-2011-12-pragyan-ojha-moves-to-mumbai-indians-549419
Scraping: https://www.espncricinfo.com/story/ipl-news-ipl-2012-to-begin-on-april-4-548395
Scraping: https://www.espncricinfo.com/story/ipl-2012-the-rahane-shah-stand-and-munaf-s-eight-561447
Scraping: https://www.espncricinfo.com/story/ipl-2012-performance-analysis-sehwag-makes-his-move-563364
Scraping: https://www.espncricinfo.com/story/ipl-2012-steven-smith-and-robin-uthappa-shine-for-pune-warriors-560490
Scraping: https://www.espncricinfo.com/story/ipl-news-franchises-oppose-foreign-player-allowance-to-pune-554084
Scraping: https://www.espncricinfo.com/story/

Scraping: https://www.espncricinfo.com/story/ipl-2012-performance-analysis-rahane-dominates-spinners-take-top-spots-561449
Scraping: https://www.espncricinfo.com/story/ipl-continues-to-unbalance-the-senses-621373
Scraping: https://www.espncricinfo.com/story/ipl-news-sourav-ganguly-s-final-ipl-year-pune-owner-564403
Scraping: https://www.espncricinfo.com/story/ipl-2012-cameos-who-pressed-the-nitro-boost-button-566483
Scraping: https://www.espncricinfo.com/story/ipl-2012-moments-small-guys-big-stage-566660
Scraping: https://www.espncricinfo.com/story/ipl-2012-michael-clarke-signs-with-pune-warriors-559020
Scraping: https://www.espncricinfo.com/story/play-five-foreign-players-to-help-indian-cricket-621018
Scraping: https://www.espncricinfo.com/story/ipl-governing-council-postpones-decision-on-kochi-replacement-536562
Scraping: https://www.espncricinfo.com/story/fan-following-delhi-daredevils-v-kolkata-knight-riders-first-qualifier-ipl-pune-565902
Scraping: https://www.espncricinfo.com/sto

Scraping: https://www.espncricinfo.com/story/indian-premier-league-2012-pity-if-pune-warriors-don-t-play-allan-donald-552375
Scraping: https://www.espncricinfo.com/story/andrew-leipus-on-the-kolkata-knight-riders-ipl-2012-season-567057
Scraping: https://www.espncricinfo.com/story/it-s-a-cosy-little-club-621404
Scraping: https://www.espncricinfo.com/story/indian-premier-league-2012-pune-s-shopping-list-554824
Scraping: https://www.espncricinfo.com/story/india-news-bcci-sahara-likely-to-soften-stance-at-meeting-552952
Scraping: https://www.espncricinfo.com/story/the-ipl-is-bad-for-capitalism-democracy-and-cricket-621349
Scraping: https://www.espncricinfo.com/story/punjab-v-mumbai-ipl-2012-mohali-the-over-that-cost-punjab-the-match-562628
Scraping: https://www.espncricinfo.com/story/ipl-2012-season-review-kings-xi-punjab-fall-just-short-again-565516
Scraping: https://www.espncricinfo.com/story/rcb-v-kxip-ipl-2012-bangalore-adam-gilchrist-praises-david-hussey-s-strong-leadership-563601
Scr

Scraping: https://www.espncricinfo.com/story/ipl-2012-late-wankhede-pitch-change-cost-rajasthan-sreevats-goswami-561128
Scraping: https://www.espncricinfo.com/story/ipl-2012-mumbai-indians-add-five-uncapped-players-554148
Scraping: https://www.espncricinfo.com/story/ipl-2012-rajasthan-have-new-captain-and-more-batsmen-559658
Scraping: https://www.espncricinfo.com/story/fan-following-rajasthan-royals-v-mumbai-indians-ipl-2012-jaipur-565642
Scraping: https://www.espncricinfo.com/story/indian-premier-league-news-eric-simons-named-delhi-daredevils-coach-546588
Scraping: https://www.espncricinfo.com/story/deccan-chargers-v-royal-challengers-bangalore-poor-shot-selection-cost-us-playoff-spot-kohli-565648
Scraping: https://www.espncricinfo.com/story/mumbai-indians-v-chennai-super-kings-ipl-dwayne-smith-backed-himself-to-score-14-from-last-three-564029
Scraping: https://www.espncricinfo.com/story/deccan-chargers-v-mumbai-indians-ipl-2012-wounded-chargers-take-on-hurt-mumbai-indians-560370
Scra

Scraping: https://www.espncricinfo.com/story/ipl-news-i-m-helping-younger-bowlers-deal-with-pressure-morne-morkel-565659
Scraping: https://www.espncricinfo.com/story/time-for-a-strategic-break-out-621387
Scraping: https://www.espncricinfo.com/story/ipl-news-ipl-player-auction-2012-551994
Scraping: https://www.espncricinfo.com/story/ipl-not-a-conspiracy-against-test-cricket-621268
Scraping: https://www.espncricinfo.com/story/quintessentially-combative-ganguly-lifts-pune-621058
Scraping: https://www.espncricinfo.com/story/the-unique-appeal-of-the-champions-league-620257
Scraping: https://www.espncricinfo.com/story/newsfile-april-27-621135
Scraping: https://www.espncricinfo.com/story/ipl-news-gautam-gambhir-urges-franchise-to-control-their-players-566891
Scraping: https://www.espncricinfo.com/story/india-news-mohnish-mishra-admits-to-making-frivolous-statements-564948
Scraping: https://www.espncricinfo.com/story/the-numbers-game-ipl-s-most-and-least-productive-overs-563633
Scraping: https

In [8]:
import pandas as pd
import requests
from bs4 import BeautifulSoup

# Function to scrape individual articles
def scrape_article(url, timeout=30):
    """Download one article page and extract title, date, summary and body.

    Args:
        url: Address of the article page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` may wait indefinitely on a stalled
            connection, which would freeze the whole batch.

    Returns:
        A dict with the keys ``url``, ``title``, ``date``, ``summary`` and
        ``content``. A field whose element is missing (or empty) on the page
        is ``"N/A"``. If the download or parsing fails, the error is printed
        and every field except ``url`` is the string ``"Error"``.
    """
    def _text_of(node):
        # get_text(strip=True) with the default empty separator strips each
        # text fragment and joins them with nothing, gluing words together
        # around inline tags such as links. Join fragments with a space.
        return node.get_text(" ", strip=True)

    try:
        response = requests.get(url, timeout=timeout)
        # An HTTP error page (404, 5xx, ...) is not an article; route it to
        # the failure record instead of scraping the error page's markup.
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')

        # Extract the title
        title = soup.select_one('h1.ds-text-title-l.ds-font-bold')
        title_text = _text_of(title) if title else "N/A"

        # Extract the date
        date = soup.select_one('div[data-behavior="date_time"] span.ds-text-tight-xs.ds-text-typo-mid3')
        date_text = _text_of(date) if date else "N/A"

        # Extract the one-line summary (the first <p> found in the document)
        summary = soup.select_one('p')
        summary_text = _text_of(summary) if summary else "N/A"

        # Extract the main content, one line per matched element
        content = soup.select('.ci-html-content')
        pieces = (_text_of(element) for element in content)
        full_content = "\n".join(piece for piece in pieces if piece)

        return {
            "url": url,
            "title": title_text,
            "date": date_text,
            "summary": summary_text,
            # Also covers elements that matched but held no text.
            "content": full_content or "N/A",
        }
    except Exception as e:
        # Deliberately broad: one bad page must not abort a long batch run.
        print(f"Error scraping {url}: {e}")
        return {"url": url, "title": "Error", "date": "Error", "summary": "Error", "content": "Error"}

# File paths
input_file = "/Users/hemantg/Desktop/dl-project-data-scraping-2013.csv"  # Update with your actual file name
output_file = "/Users/hemantg/Desktop/dl-projectscraped_articles_2013_content.csv"

# Read the CSV file (assuming it's a single column without a header).
# A missing input file is reported with a short message and a non-zero exit
# status rather than a raw traceback.
try:
    df_links = pd.read_csv(input_file, header=None, names=["url"])
except FileNotFoundError:
    print(f"File not found: {input_file}")
    raise SystemExit(1)

# Scrape each URL; scrape_article returns a record even on failure, so the
# output keeps one row per input link.
scraped_data = []
for url in df_links["url"]:
    print(f"Scraping: {url}")
    article_data = scrape_article(url)
    scraped_data.append(article_data)

# Save the scraped data to a new CSV file
df_scraped = pd.DataFrame(scraped_data)
df_scraped.to_csv(output_file, index=False)

print(f"Scraped data saved to {output_file}")

Scraping: https://www.espncricinfo.com/story/mitchell-johnson-thriving-in-the-fast-lane-633180
Scraping: https://www.espncricinfo.com/story/rajasthan-v-chennai-saving-the-best-for-the-last-635610
Scraping: https://www.espncricinfo.com/story/sachin-tendulkar-emphasises-on-situational-awareness-in-t20s-632527
Scraping: https://www.espncricinfo.com/story/the-case-for-forced-player-rotation-in-the-ipl-638165
Scraping: https://www.espncricinfo.com/story/the-surprise-no-8-633871
Scraping: https://www.espncricinfo.com/story/sreesanth-and-others-custody-extended-till-june-18-638951
Scraping: https://www.espncricinfo.com/story/aakash-chopra-dhoni-and-the-art-of-controlling-an-ipl-match-633784
Scraping: https://www.espncricinfo.com/story/ipl-schedule-may-undergo-tweak-626110
Scraping: https://www.espncricinfo.com/story/aakash-chopra-unravelling-the-narine-mystery-629562
Scraping: https://www.espncricinfo.com/story/andrew-hughes-on-the-final-of-the-indian-denial-league-638132
Scraping: https://ww

Scraping: https://www.espncricinfo.com/story/will-fans-really-stop-watching-the-ipl-636933
Scraping: https://www.espncricinfo.com/story/quinton-de-kock-replaces-faf-du-plessis-for-last-two-odis-625803
Scraping: https://www.espncricinfo.com/story/bhuvneshwar-kumar-impresses-allan-donald-627988
Scraping: https://www.espncricinfo.com/story/the-praveen-and-awana-show-635407
Scraping: https://www.espncricinfo.com/story/harbhajan-singh-s-send-off-to-yusuf-pathan-and-kieron-pollard-s-anguish-632020
Scraping: https://www.espncricinfo.com/story/numbers-game-ipl-s-two-halves-and-its-most-productive-overs-634956
Scraping: https://www.espncricinfo.com/story/michael-clarke-ricky-ponting-get-maximum-reserve-price-602845
Scraping: https://www.espncricinfo.com/story/pollard-v-bravo-part-ii-628507
Scraping: https://www.espncricinfo.com/story/david-miller-lives-up-to-his-potential-634429
Scraping: https://www.espncricinfo.com/story/the-fine-art-of-balancing-schedules-and-commercialism-628888
Scraping: h

Scraping: https://www.espncricinfo.com/story/after-pollard-got-out-seemed-like-mumbai-were-all-out-ravindra-jadeja-637142
Scraping: https://www.espncricinfo.com/story/sunrisers-showed-character-tom-moody-636562
Scraping: https://www.espncricinfo.com/story/fan-following-mumbai-indians-v-kolkata-knight-riders-ipl-2013-mumbai-634635
Scraping: https://www.espncricinfo.com/story/michel-van-oorschot-learning-from-eoin-morgan-630728
Scraping: https://www.espncricinfo.com/story/an-evening-of-altercations-in-bangalore-629396
Scraping: https://www.espncricinfo.com/story/david-hopps-hate-it-tolerate-it-england-can-t-quite-ignore-the-ipl-630904
Scraping: https://www.espncricinfo.com/story/who-was-sold-to-whom-603172
Scraping: https://www.espncricinfo.com/story/ipl-playoffs-likely-to-shift-out-of-chennai-631210
Scraping: https://www.espncricinfo.com/story/time-to-play-fearless-cricket-gautam-gambhir-627686
Scraping: https://www.espncricinfo.com/story/gayle-s-cool-fall-and-sreesanth-s-reprieve-63295

Scraping: https://www.espncricinfo.com/story/amol-karhadkar-on-the-increasingly-important-role-of-the-performance-analyst-in-t20-635622
Scraping: https://www.espncricinfo.com/story/fan-following-chennai-super-kings-v-delhi-daredevils-ipl-2013-chennai-635972
Scraping: https://www.espncricinfo.com/story/ipl-focus-a-worrying-trend-622451
Scraping: https://www.espncricinfo.com/story/bangalore-s-annual-loyalty-test-631457
Scraping: https://www.espncricinfo.com/story/onus-on-the-ipl-owners-639315
Scraping: https://www.espncricinfo.com/story/indian-t20-tournament-parthiv-patel-s-innings-seals-playoff-spot-for-hyderabad-636891
Scraping: https://www.espncricinfo.com/story/chris-morris-sheds-nerves-for-debut-success-629241
Scraping: https://www.espncricinfo.com/story/n-srinivasan-steps-aside-temporarily-dalmiya-to-run-bcci-affairs-638730
Scraping: https://www.espncricinfo.com/story/gilchrist-fails-to-show-up-again-631890
Scraping: https://www.espncricinfo.com/story/collapse-tells-bad-story-of-rc

Scraping: https://www.espncricinfo.com/story/indian-t20-tournament-kieron-pollard-blitz-leaves-hyderabad-undone-635768
Scraping: https://www.espncricinfo.com/story/dravid-happy-with-sreesanth-s-response-630064
Scraping: https://www.espncricinfo.com/story/bcci-suspends-gurunath-meiyappan-pending-investigations-637788
Scraping: https://www.espncricinfo.com/story/gurunath-meiyappan-lands-in-mumbai-for-questioning-637518
Scraping: https://www.espncricinfo.com/story/indian-t20-tournament-ms-dhoni-s-onslaught-keeps-chennai-in-top-spot-633291
Scraping: https://www.espncricinfo.com/story/bowling-yorkers-slower-ones-my-strength-unadkat-635214
Scraping: https://www.espncricinfo.com/story/ipl-to-meet-over-disputed-pune-payment-601765
Scraping: https://www.espncricinfo.com/story/live-updates-bcci-meeting-on-ipl-spot-fixing-crisis-636744
Scraping: https://www.espncricinfo.com/story/chris-gayle-marries-brain-with-brawn-629501
Scraping: https://www.espncricinfo.com/story/fan-following-royal-challenge

Scraping: https://www.espncricinfo.com/story/another-season-as-also-rans-628040
Scraping: https://www.espncricinfo.com/story/you-re-calling-me-a-cheat-634948
Scraping: https://www.espncricinfo.com/story/ipl-franchises-still-in-the-red-627781
Scraping: https://www.espncricinfo.com/story/fan-following-sunrisers-hyderabad-v-rajasthan-royals-ipl-2013-hyderabad-636554
Scraping: https://www.espncricinfo.com/story/dinesh-chandimal-rules-himself-out-of-ipl-627363
Scraping: https://www.espncricinfo.com/story/badgers-stop-play-pitbull-starts-it-628466
Scraping: https://www.espncricinfo.com/story/happy-with-where-ms-dhoni-batted-stephen-fleming-638003
Scraping: https://www.espncricinfo.com/story/death-bowling-in-focus-for-rcb-630238
Scraping: https://www.espncricinfo.com/story/nagraj-gollapudi-on-amit-mishra-s-comeback-634671
Scraping: https://www.espncricinfo.com/story/controversy-mars-another-top-csk-showing-638042
Scraping: https://www.espncricinfo.com/story/jayaditya-gupta-on-lalit-modi-a-pio

Scraping: https://www.espncricinfo.com/story/harmeet-singh-questioned-in-spot-fixing-probe-648249
Scraping: https://www.espncricinfo.com/story/ed-smith-what-the-ipl-can-learn-from-champions-league-football-638209
Scraping: https://www.espncricinfo.com/story/jayaditya-gupta-the-sound-of-silence-636576
Scraping: https://www.espncricinfo.com/story/bcci-working-committee-to-meet-next-week-638486
Scraping: https://www.espncricinfo.com/story/csk-making-effort-to-improve-away-record-stephen-fleming-634095
Scraping: https://www.espncricinfo.com/story/everything-went-right-for-me-today-chris-gayle-631928
Scraping: https://www.espncricinfo.com/story/rohit-sharma-s-moment-of-reckoning-638140
Scraping: https://www.espncricinfo.com/story/and-the-ipl-machine-continues-636588
Scraping: https://www.espncricinfo.com/story/fan-following-rajasthan-royals-v-sunrisers-hyderabad-ipl-2013-jaipur-632757
Scraping: https://www.espncricinfo.com/story/t20-cricket-expect-the-unexpected-635519
Scraping: https://www

Scraping: https://www.espncricinfo.com/story/angelo-mathews-named-pune-warriors-captain-627151
Scraping: https://www.espncricinfo.com/story/mushtaq-ahmed-set-to-join-delhi-daredevils-for-ipl-2013-604475
Scraping: https://www.espncricinfo.com/story/brad-hodge-thrilled-with-win-after-emotional-week-637314
Scraping: https://www.espncricinfo.com/story/the-fickle-finger-of-franchise-favour-622591
Scraping: https://www.espncricinfo.com/story/qualification-scenarios-for-ipl-2013-seven-teams-four-slots-635638
Scraping: https://www.espncricinfo.com/story/expunge-from-records-if-found-guilty-aakash-chopra-636911
Scraping: https://www.espncricinfo.com/story/ipl-2013-the-importance-of-indian-captains-in-the-ipl-634829
Scraping: https://www.espncricinfo.com/story/ben-rohrer-live-from-the-super-over-630546
Scraping: https://www.espncricinfo.com/story/plays-of-the-day-tendulkar-v-steyn-633298
Scraping: https://www.espncricinfo.com/story/dhawan-awaiting-nca-clearance-to-play-in-ipl-630031
Scraping: ht

In [9]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
import os

# Function to scrape individual articles
def scrape_article(url, timeout=30):
    """Download one article page and extract its title, date and body text.

    Args:
        url: Address of the article page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` may wait indefinitely on a stalled
            connection, which would freeze the whole batch.

    Returns:
        A dict with the keys ``url``, ``title``, ``date`` and ``content``.
        A field whose element is missing (or empty) on the page is ``"N/A"``.
        If the download or parsing fails, the error is printed and ``title``,
        ``date`` and ``content`` are all ``None``.
    """
    try:
        response = requests.get(url, timeout=timeout)
        # An HTTP error page (404, 5xx, ...) is not an article; route it to
        # the failure record instead of scraping the error page's markup.
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')

        # NOTE(review): these are generic selectors; confirm they match the
        # markup of the site being scraped.
        title = soup.select_one('h1')
        date = soup.select_one('.article-meta span')
        # Skip empty paragraphs so they neither add blank lines nor defeat
        # the "N/A" fallback below.
        paragraphs = (p.text.strip() for p in soup.select('p'))
        content = "\n".join(text for text in paragraphs if text)

        return {
            "url": url,
            "title": title.text.strip() if title else "N/A",
            "date": date.text.strip() if date else "N/A",
            "content": content if content else "N/A"
        }
    except Exception as e:
        # Deliberately broad: one bad page must not abort a long batch run.
        print(f"Error scraping {url}: {e}")
        return {"url": url, "title": None, "date": None, "content": None}

# File paths
input_file = "/Users/hemantg/Desktop/dl-project-data-scraping-2013.csv"  # Update with your actual file name
output_file = "/Users/hemantg/Desktop/dl-projectscraped_articles_2013_titles.csv"

# Stop early with a clear message when the list of links is missing.
# `raise SystemExit` replaces `exit()`: the latter is a helper injected by
# the `site` module for interactive shells and is not meant for programs.
# A non-zero status marks the run as failed.
if not os.path.exists(input_file):
    print(f"File not found: {input_file}")
    raise SystemExit(1)

# Read the CSV file (assuming it's a single column without a header)
df_links = pd.read_csv(input_file, header=None, names=["url"])

# Scrape each URL; scrape_article returns a record even on failure, so the
# output keeps one row per input link.
scraped_data = []
for url in df_links["url"]:
    print(f"Scraping: {url}")
    article_data = scrape_article(url)
    scraped_data.append(article_data)

# Save the scraped data to a new CSV file
df_scraped = pd.DataFrame(scraped_data)
df_scraped.to_csv(output_file, index=False)

print(f"Scraped data saved to {output_file}")

Scraping: https://www.espncricinfo.com/story/mitchell-johnson-thriving-in-the-fast-lane-633180
Scraping: https://www.espncricinfo.com/story/rajasthan-v-chennai-saving-the-best-for-the-last-635610
Scraping: https://www.espncricinfo.com/story/sachin-tendulkar-emphasises-on-situational-awareness-in-t20s-632527
Scraping: https://www.espncricinfo.com/story/the-case-for-forced-player-rotation-in-the-ipl-638165
Scraping: https://www.espncricinfo.com/story/the-surprise-no-8-633871
Scraping: https://www.espncricinfo.com/story/sreesanth-and-others-custody-extended-till-june-18-638951
Scraping: https://www.espncricinfo.com/story/aakash-chopra-dhoni-and-the-art-of-controlling-an-ipl-match-633784
Scraping: https://www.espncricinfo.com/story/ipl-schedule-may-undergo-tweak-626110
Scraping: https://www.espncricinfo.com/story/aakash-chopra-unravelling-the-narine-mystery-629562
Scraping: https://www.espncricinfo.com/story/andrew-hughes-on-the-final-of-the-indian-denial-league-638132
Scraping: https://ww

Scraping: https://www.espncricinfo.com/story/will-fans-really-stop-watching-the-ipl-636933
Scraping: https://www.espncricinfo.com/story/quinton-de-kock-replaces-faf-du-plessis-for-last-two-odis-625803
Scraping: https://www.espncricinfo.com/story/bhuvneshwar-kumar-impresses-allan-donald-627988
Scraping: https://www.espncricinfo.com/story/the-praveen-and-awana-show-635407
Scraping: https://www.espncricinfo.com/story/harbhajan-singh-s-send-off-to-yusuf-pathan-and-kieron-pollard-s-anguish-632020
Scraping: https://www.espncricinfo.com/story/numbers-game-ipl-s-two-halves-and-its-most-productive-overs-634956
Scraping: https://www.espncricinfo.com/story/michael-clarke-ricky-ponting-get-maximum-reserve-price-602845
Scraping: https://www.espncricinfo.com/story/pollard-v-bravo-part-ii-628507
Scraping: https://www.espncricinfo.com/story/david-miller-lives-up-to-his-potential-634429
Scraping: https://www.espncricinfo.com/story/the-fine-art-of-balancing-schedules-and-commercialism-628888
Scraping: h

Scraping: https://www.espncricinfo.com/story/after-pollard-got-out-seemed-like-mumbai-were-all-out-ravindra-jadeja-637142
Scraping: https://www.espncricinfo.com/story/sunrisers-showed-character-tom-moody-636562
Scraping: https://www.espncricinfo.com/story/fan-following-mumbai-indians-v-kolkata-knight-riders-ipl-2013-mumbai-634635
Scraping: https://www.espncricinfo.com/story/michel-van-oorschot-learning-from-eoin-morgan-630728
Scraping: https://www.espncricinfo.com/story/an-evening-of-altercations-in-bangalore-629396
Scraping: https://www.espncricinfo.com/story/david-hopps-hate-it-tolerate-it-england-can-t-quite-ignore-the-ipl-630904
Scraping: https://www.espncricinfo.com/story/who-was-sold-to-whom-603172
Scraping: https://www.espncricinfo.com/story/ipl-playoffs-likely-to-shift-out-of-chennai-631210
Scraping: https://www.espncricinfo.com/story/time-to-play-fearless-cricket-gautam-gambhir-627686
Scraping: https://www.espncricinfo.com/story/gayle-s-cool-fall-and-sreesanth-s-reprieve-63295

Scraping: https://www.espncricinfo.com/story/amol-karhadkar-on-the-increasingly-important-role-of-the-performance-analyst-in-t20-635622
Scraping: https://www.espncricinfo.com/story/fan-following-chennai-super-kings-v-delhi-daredevils-ipl-2013-chennai-635972
Scraping: https://www.espncricinfo.com/story/ipl-focus-a-worrying-trend-622451
Scraping: https://www.espncricinfo.com/story/bangalore-s-annual-loyalty-test-631457
Scraping: https://www.espncricinfo.com/story/onus-on-the-ipl-owners-639315
Scraping: https://www.espncricinfo.com/story/indian-t20-tournament-parthiv-patel-s-innings-seals-playoff-spot-for-hyderabad-636891
Scraping: https://www.espncricinfo.com/story/chris-morris-sheds-nerves-for-debut-success-629241
Scraping: https://www.espncricinfo.com/story/n-srinivasan-steps-aside-temporarily-dalmiya-to-run-bcci-affairs-638730
Scraping: https://www.espncricinfo.com/story/gilchrist-fails-to-show-up-again-631890
Scraping: https://www.espncricinfo.com/story/collapse-tells-bad-story-of-rc

Scraping: https://www.espncricinfo.com/story/indian-t20-tournament-kieron-pollard-blitz-leaves-hyderabad-undone-635768
Scraping: https://www.espncricinfo.com/story/dravid-happy-with-sreesanth-s-response-630064
Scraping: https://www.espncricinfo.com/story/bcci-suspends-gurunath-meiyappan-pending-investigations-637788
Scraping: https://www.espncricinfo.com/story/gurunath-meiyappan-lands-in-mumbai-for-questioning-637518
Scraping: https://www.espncricinfo.com/story/indian-t20-tournament-ms-dhoni-s-onslaught-keeps-chennai-in-top-spot-633291
Scraping: https://www.espncricinfo.com/story/bowling-yorkers-slower-ones-my-strength-unadkat-635214
Scraping: https://www.espncricinfo.com/story/ipl-to-meet-over-disputed-pune-payment-601765
Scraping: https://www.espncricinfo.com/story/live-updates-bcci-meeting-on-ipl-spot-fixing-crisis-636744
Scraping: https://www.espncricinfo.com/story/chris-gayle-marries-brain-with-brawn-629501
Scraping: https://www.espncricinfo.com/story/fan-following-royal-challenge

Scraping: https://www.espncricinfo.com/story/another-season-as-also-rans-628040
Scraping: https://www.espncricinfo.com/story/you-re-calling-me-a-cheat-634948
Scraping: https://www.espncricinfo.com/story/ipl-franchises-still-in-the-red-627781
Scraping: https://www.espncricinfo.com/story/fan-following-sunrisers-hyderabad-v-rajasthan-royals-ipl-2013-hyderabad-636554
Scraping: https://www.espncricinfo.com/story/dinesh-chandimal-rules-himself-out-of-ipl-627363
Scraping: https://www.espncricinfo.com/story/badgers-stop-play-pitbull-starts-it-628466
Scraping: https://www.espncricinfo.com/story/happy-with-where-ms-dhoni-batted-stephen-fleming-638003
Scraping: https://www.espncricinfo.com/story/death-bowling-in-focus-for-rcb-630238
Scraping: https://www.espncricinfo.com/story/nagraj-gollapudi-on-amit-mishra-s-comeback-634671
Scraping: https://www.espncricinfo.com/story/controversy-mars-another-top-csk-showing-638042
Scraping: https://www.espncricinfo.com/story/jayaditya-gupta-on-lalit-modi-a-pio

Scraping: https://www.espncricinfo.com/story/harmeet-singh-questioned-in-spot-fixing-probe-648249
Scraping: https://www.espncricinfo.com/story/ed-smith-what-the-ipl-can-learn-from-champions-league-football-638209
Scraping: https://www.espncricinfo.com/story/jayaditya-gupta-the-sound-of-silence-636576
Scraping: https://www.espncricinfo.com/story/bcci-working-committee-to-meet-next-week-638486
Scraping: https://www.espncricinfo.com/story/csk-making-effort-to-improve-away-record-stephen-fleming-634095
Scraping: https://www.espncricinfo.com/story/everything-went-right-for-me-today-chris-gayle-631928
Scraping: https://www.espncricinfo.com/story/rohit-sharma-s-moment-of-reckoning-638140
Scraping: https://www.espncricinfo.com/story/and-the-ipl-machine-continues-636588
Scraping: https://www.espncricinfo.com/story/fan-following-rajasthan-royals-v-sunrisers-hyderabad-ipl-2013-jaipur-632757
Scraping: https://www.espncricinfo.com/story/t20-cricket-expect-the-unexpected-635519
Scraping: https://www

Scraping: https://www.espncricinfo.com/story/angelo-mathews-named-pune-warriors-captain-627151
Scraping: https://www.espncricinfo.com/story/mushtaq-ahmed-set-to-join-delhi-daredevils-for-ipl-2013-604475
Scraping: https://www.espncricinfo.com/story/brad-hodge-thrilled-with-win-after-emotional-week-637314
Scraping: https://www.espncricinfo.com/story/the-fickle-finger-of-franchise-favour-622591
Scraping: https://www.espncricinfo.com/story/qualification-scenarios-for-ipl-2013-seven-teams-four-slots-635638
Scraping: https://www.espncricinfo.com/story/expunge-from-records-if-found-guilty-aakash-chopra-636911
Scraping: https://www.espncricinfo.com/story/ipl-2013-the-importance-of-indian-captains-in-the-ipl-634829
Scraping: https://www.espncricinfo.com/story/ben-rohrer-live-from-the-super-over-630546
Scraping: https://www.espncricinfo.com/story/plays-of-the-day-tendulkar-v-steyn-633298
Scraping: https://www.espncricinfo.com/story/dhawan-awaiting-nca-clearance-to-play-in-ipl-630031
Scraping: ht

In [10]:
import pandas as pd
import requests
from bs4 import BeautifulSoup

# Function to scrape individual articles
def scrape_article(url, timeout=30):
    """Download one article page and extract title, date, summary and body.

    Args:
        url: Address of the article page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` may wait indefinitely on a stalled
            connection, which would freeze the whole batch.

    Returns:
        A dict with the keys ``url``, ``title``, ``date``, ``summary`` and
        ``content``. A field whose element is missing (or empty) on the page
        is ``"N/A"``. If the download or parsing fails, the error is printed
        and every field except ``url`` is the string ``"Error"``.
    """
    def _text_of(node):
        # get_text(strip=True) with the default empty separator strips each
        # text fragment and joins them with nothing, gluing words together
        # around inline tags such as links. Join fragments with a space.
        return node.get_text(" ", strip=True)

    try:
        response = requests.get(url, timeout=timeout)
        # An HTTP error page (404, 5xx, ...) is not an article; route it to
        # the failure record instead of scraping the error page's markup.
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')

        # Extract the title
        title = soup.select_one('h1.ds-text-title-l.ds-font-bold')
        title_text = _text_of(title) if title else "N/A"

        # Extract the date
        date = soup.select_one('div[data-behavior="date_time"] span')
        date_text = _text_of(date) if date else "N/A"

        # Extract the one-line summary (the first <p> found in the document)
        summary = soup.select_one('p')
        summary_text = _text_of(summary) if summary else "N/A"

        # Extract the main content: paragraphs inside the first div.ds-px-4.
        # NOTE(review): confirm that container is the article body.
        content_div = soup.select_one('div.ds-px-4')
        if content_div:
            content_paragraphs = content_div.select('p.ds-text-comfortable-l.ci-html-content')
        else:
            content_paragraphs = []
        pieces = (_text_of(p) for p in content_paragraphs)
        # A missing container, no paragraphs, or only empty paragraphs all
        # collapse to the same "N/A" marker.
        full_content = "\n".join(piece for piece in pieces if piece) or "N/A"

        return {
            "url": url,
            "title": title_text,
            "date": date_text,
            "summary": summary_text,
            "content": full_content,
        }
    except Exception as e:
        # Deliberately broad: one bad page must not abort a long batch run.
        print(f"Error scraping {url}: {e}")
        return {"url": url, "title": "Error", "date": "Error", "summary": "Error", "content": "Error"}

# File paths
input_file = "/Users/hemantg/Desktop/dl-project-data-scraping-2013.csv"  # Replace with your actual input file path
output_file = "/Users/hemantg/Desktop/dl-project-scraped_articles_2013_dates.csv"  # Replace with your desired output file path

# Read the CSV file (assuming it's a single column without a header).
# A missing input file is reported with a short message and a non-zero exit
# status rather than a raw traceback.
try:
    df_links = pd.read_csv(input_file, header=None, names=["url"])
except FileNotFoundError:
    print(f"File not found: {input_file}")
    raise SystemExit(1)

# Scrape each URL; scrape_article returns a record even on failure, so the
# output keeps one row per input link.
scraped_data = []
for url in df_links["url"]:
    print(f"Scraping: {url}")
    article_data = scrape_article(url)
    scraped_data.append(article_data)

# Save the scraped data to a new CSV file
df_scraped = pd.DataFrame(scraped_data)
df_scraped.to_csv(output_file, index=False)

print(f"Scraped data saved to {output_file}")

Scraping: https://www.espncricinfo.com/story/mitchell-johnson-thriving-in-the-fast-lane-633180
Scraping: https://www.espncricinfo.com/story/rajasthan-v-chennai-saving-the-best-for-the-last-635610
Scraping: https://www.espncricinfo.com/story/sachin-tendulkar-emphasises-on-situational-awareness-in-t20s-632527
Scraping: https://www.espncricinfo.com/story/the-case-for-forced-player-rotation-in-the-ipl-638165
Scraping: https://www.espncricinfo.com/story/the-surprise-no-8-633871
Scraping: https://www.espncricinfo.com/story/sreesanth-and-others-custody-extended-till-june-18-638951
Scraping: https://www.espncricinfo.com/story/aakash-chopra-dhoni-and-the-art-of-controlling-an-ipl-match-633784
Scraping: https://www.espncricinfo.com/story/ipl-schedule-may-undergo-tweak-626110
Scraping: https://www.espncricinfo.com/story/aakash-chopra-unravelling-the-narine-mystery-629562
Scraping: https://www.espncricinfo.com/story/andrew-hughes-on-the-final-of-the-indian-denial-league-638132
Scraping: https://ww

Scraping: https://www.espncricinfo.com/story/will-fans-really-stop-watching-the-ipl-636933
Scraping: https://www.espncricinfo.com/story/quinton-de-kock-replaces-faf-du-plessis-for-last-two-odis-625803
Scraping: https://www.espncricinfo.com/story/bhuvneshwar-kumar-impresses-allan-donald-627988
Scraping: https://www.espncricinfo.com/story/the-praveen-and-awana-show-635407
Scraping: https://www.espncricinfo.com/story/harbhajan-singh-s-send-off-to-yusuf-pathan-and-kieron-pollard-s-anguish-632020
Scraping: https://www.espncricinfo.com/story/numbers-game-ipl-s-two-halves-and-its-most-productive-overs-634956
Scraping: https://www.espncricinfo.com/story/michael-clarke-ricky-ponting-get-maximum-reserve-price-602845
Scraping: https://www.espncricinfo.com/story/pollard-v-bravo-part-ii-628507
Scraping: https://www.espncricinfo.com/story/david-miller-lives-up-to-his-potential-634429
Scraping: https://www.espncricinfo.com/story/the-fine-art-of-balancing-schedules-and-commercialism-628888
Scraping: h

Scraping: https://www.espncricinfo.com/story/after-pollard-got-out-seemed-like-mumbai-were-all-out-ravindra-jadeja-637142
Scraping: https://www.espncricinfo.com/story/sunrisers-showed-character-tom-moody-636562
Scraping: https://www.espncricinfo.com/story/fan-following-mumbai-indians-v-kolkata-knight-riders-ipl-2013-mumbai-634635
Scraping: https://www.espncricinfo.com/story/michel-van-oorschot-learning-from-eoin-morgan-630728
Scraping: https://www.espncricinfo.com/story/an-evening-of-altercations-in-bangalore-629396
Scraping: https://www.espncricinfo.com/story/david-hopps-hate-it-tolerate-it-england-can-t-quite-ignore-the-ipl-630904
Scraping: https://www.espncricinfo.com/story/who-was-sold-to-whom-603172
Scraping: https://www.espncricinfo.com/story/ipl-playoffs-likely-to-shift-out-of-chennai-631210
Scraping: https://www.espncricinfo.com/story/time-to-play-fearless-cricket-gautam-gambhir-627686
Scraping: https://www.espncricinfo.com/story/gayle-s-cool-fall-and-sreesanth-s-reprieve-63295

Scraping: https://www.espncricinfo.com/story/amol-karhadkar-on-the-increasingly-important-role-of-the-performance-analyst-in-t20-635622
Scraping: https://www.espncricinfo.com/story/fan-following-chennai-super-kings-v-delhi-daredevils-ipl-2013-chennai-635972
Scraping: https://www.espncricinfo.com/story/ipl-focus-a-worrying-trend-622451
Scraping: https://www.espncricinfo.com/story/bangalore-s-annual-loyalty-test-631457
Scraping: https://www.espncricinfo.com/story/onus-on-the-ipl-owners-639315
Scraping: https://www.espncricinfo.com/story/indian-t20-tournament-parthiv-patel-s-innings-seals-playoff-spot-for-hyderabad-636891
Scraping: https://www.espncricinfo.com/story/chris-morris-sheds-nerves-for-debut-success-629241
Scraping: https://www.espncricinfo.com/story/n-srinivasan-steps-aside-temporarily-dalmiya-to-run-bcci-affairs-638730
Scraping: https://www.espncricinfo.com/story/gilchrist-fails-to-show-up-again-631890
Scraping: https://www.espncricinfo.com/story/collapse-tells-bad-story-of-rc

Scraping: https://www.espncricinfo.com/story/indian-t20-tournament-kieron-pollard-blitz-leaves-hyderabad-undone-635768
Scraping: https://www.espncricinfo.com/story/dravid-happy-with-sreesanth-s-response-630064
Scraping: https://www.espncricinfo.com/story/bcci-suspends-gurunath-meiyappan-pending-investigations-637788
Scraping: https://www.espncricinfo.com/story/gurunath-meiyappan-lands-in-mumbai-for-questioning-637518
Scraping: https://www.espncricinfo.com/story/indian-t20-tournament-ms-dhoni-s-onslaught-keeps-chennai-in-top-spot-633291
Scraping: https://www.espncricinfo.com/story/bowling-yorkers-slower-ones-my-strength-unadkat-635214
Scraping: https://www.espncricinfo.com/story/ipl-to-meet-over-disputed-pune-payment-601765
Scraping: https://www.espncricinfo.com/story/live-updates-bcci-meeting-on-ipl-spot-fixing-crisis-636744
Scraping: https://www.espncricinfo.com/story/chris-gayle-marries-brain-with-brawn-629501
Scraping: https://www.espncricinfo.com/story/fan-following-royal-challenge

Scraping: https://www.espncricinfo.com/story/another-season-as-also-rans-628040
Scraping: https://www.espncricinfo.com/story/you-re-calling-me-a-cheat-634948
Scraping: https://www.espncricinfo.com/story/ipl-franchises-still-in-the-red-627781
Scraping: https://www.espncricinfo.com/story/fan-following-sunrisers-hyderabad-v-rajasthan-royals-ipl-2013-hyderabad-636554
Scraping: https://www.espncricinfo.com/story/dinesh-chandimal-rules-himself-out-of-ipl-627363
Scraping: https://www.espncricinfo.com/story/badgers-stop-play-pitbull-starts-it-628466
Scraping: https://www.espncricinfo.com/story/happy-with-where-ms-dhoni-batted-stephen-fleming-638003
Scraping: https://www.espncricinfo.com/story/death-bowling-in-focus-for-rcb-630238
Scraping: https://www.espncricinfo.com/story/nagraj-gollapudi-on-amit-mishra-s-comeback-634671
Scraping: https://www.espncricinfo.com/story/controversy-mars-another-top-csk-showing-638042
Scraping: https://www.espncricinfo.com/story/jayaditya-gupta-on-lalit-modi-a-pio

Scraping: https://www.espncricinfo.com/story/harmeet-singh-questioned-in-spot-fixing-probe-648249
Scraping: https://www.espncricinfo.com/story/ed-smith-what-the-ipl-can-learn-from-champions-league-football-638209
Scraping: https://www.espncricinfo.com/story/jayaditya-gupta-the-sound-of-silence-636576
Scraping: https://www.espncricinfo.com/story/bcci-working-committee-to-meet-next-week-638486
Scraping: https://www.espncricinfo.com/story/csk-making-effort-to-improve-away-record-stephen-fleming-634095
Scraping: https://www.espncricinfo.com/story/everything-went-right-for-me-today-chris-gayle-631928
Scraping: https://www.espncricinfo.com/story/rohit-sharma-s-moment-of-reckoning-638140
Scraping: https://www.espncricinfo.com/story/and-the-ipl-machine-continues-636588
Scraping: https://www.espncricinfo.com/story/fan-following-rajasthan-royals-v-sunrisers-hyderabad-ipl-2013-jaipur-632757
Scraping: https://www.espncricinfo.com/story/t20-cricket-expect-the-unexpected-635519
Scraping: https://www

Scraping: https://www.espncricinfo.com/story/angelo-mathews-named-pune-warriors-captain-627151
Scraping: https://www.espncricinfo.com/story/mushtaq-ahmed-set-to-join-delhi-daredevils-for-ipl-2013-604475
Scraping: https://www.espncricinfo.com/story/brad-hodge-thrilled-with-win-after-emotional-week-637314
Scraping: https://www.espncricinfo.com/story/the-fickle-finger-of-franchise-favour-622591
Scraping: https://www.espncricinfo.com/story/qualification-scenarios-for-ipl-2013-seven-teams-four-slots-635638
Scraping: https://www.espncricinfo.com/story/expunge-from-records-if-found-guilty-aakash-chopra-636911
Scraping: https://www.espncricinfo.com/story/ipl-2013-the-importance-of-indian-captains-in-the-ipl-634829
Scraping: https://www.espncricinfo.com/story/ben-rohrer-live-from-the-super-over-630546
Scraping: https://www.espncricinfo.com/story/plays-of-the-day-tendulkar-v-steyn-633298
Scraping: https://www.espncricinfo.com/story/dhawan-awaiting-nca-clearance-to-play-in-ipl-630031
Scraping: ht

In [11]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
import os

# Function to scrape individual articles
def scrape_article(url, timeout=30):
    """Download one article page and extract its title, date and body text.

    Args:
        url: Address of the article page to fetch.
        timeout: Seconds to wait for the server before giving up, so a
            single stalled connection cannot block the whole run.

    Returns:
        A dict with the keys "url", "title", "date" and "content". A field
        that is not found on the page is "N/A". If the page cannot be
        fetched or parsed at all, "title", "date" and "content" are None.
    """
    try:
        response = requests.get(url, timeout=timeout)
        # Treat 4xx/5xx replies as failures so that an error page is not
        # stored as if it were the article.
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')

        # Replace these with actual selectors from the website you're scraping
        title = soup.select_one('h1')  # Update if the title tag is different
        date = soup.select_one('.article-meta span')  # Update with the correct selector for the date
        # Assuming paragraphs are wrapped in <p>; paragraphs without text are
        # dropped so they neither add blank lines nor hide an empty page.
        paragraphs = (p.text.strip() for p in soup.select('p'))
        content = "\n".join(text for text in paragraphs if text)

        return {
            "url": url,
            "title": title.text.strip() if title else "N/A",
            "date": date.text.strip() if date else "N/A",
            "content": content if content else "N/A"
        }
    except Exception as e:
        # Best-effort scraping: report the failure and keep the URL in the
        # results so the caller can continue with the remaining links.
        print(f"Error scraping {url}: {e}")
        return {"url": url, "title": None, "date": None, "content": None}

# File paths
input_file = "/Users/hemantg/Desktop/dl-project-data-scraping-2014.csv"  # Update with your actual file name
output_file = "/Users/hemantg/Desktop/dl-projectscraped_articles_2014_titles.csv"

# Verify file existence. SystemExit stops the run and reports the message;
# exit() is avoided because the site module provides it for interactive use.
if not os.path.exists(input_file):
    raise SystemExit(f"File not found: {input_file}")

# Read the CSV file (assuming it's a single column without a header)
df_links = pd.read_csv(input_file, header=None, names=["url"])

# Scrape each URL, skipping rows that hold no value
scraped_data = []
for url in df_links["url"].dropna():
    print(f"Scraping: {url}")
    article_data = scrape_article(url)
    scraped_data.append(article_data)

# Save the scraped data to a new CSV file. The columns are named explicitly
# so the header row is written even when nothing was scraped.
df_scraped = pd.DataFrame(scraped_data, columns=["url", "title", "date", "content"])
df_scraped.to_csv(output_file, index=False)

print(f"Scraped data saved to {output_file}")

Scraping: https://www.espncricinfo.com/story/ganguly-advises-captain-kohli-and-praveen-s-nightmare-743617
Scraping: https://www.espncricinfo.com/story/rcb-lodge-complaint-over-yuvraj-bid-718405
Scraping: https://www.espncricinfo.com/story/ipl-longlist-features-651-uncapped-players-714115
Scraping: https://www.espncricinfo.com/story/george-bailey-to-captain-kings-xi-punjab-728575
Scraping: https://www.espncricinfo.com/story/james-marsh-the-healing-powers-of-the-ipl-750437
Scraping: https://www.espncricinfo.com/story/seven-off-one-ball-739943
Scraping: https://www.espncricinfo.com/story/mundane-second-half-showing-mutes-mumbai-s-roar-748585
Scraping: https://www.espncricinfo.com/story/andrew-hughes-dear-cricket-snobs-744513
Scraping: https://www.espncricinfo.com/story/andrew-hughes-how-do-you-consistently-turn-victory-into-defeat-746383
Scraping: https://www.espncricinfo.com/story/cummins-ends-long-wait-with-swift-strike-746295
Scraping: https://www.espncricinfo.com/story/plays-of-the-da

Scraping: https://www.espncricinfo.com/story/anderson-benefitting-from-zaheer-kumble-739695
Scraping: https://www.espncricinfo.com/story/rajasthan-6-4-in-royal-rumble-against-csk-738793
Scraping: https://www.espncricinfo.com/story/a-privilege-to-work-with-kirsten-pietersen-736557
Scraping: https://www.espncricinfo.com/story/hard-to-get-over-world-t20-defeat-says-yuvraj-singh-737669
Scraping: https://www.espncricinfo.com/story/uthappa-to-remain-in-opener-s-slot-for-kkr-trevor-bayliss-741865
Scraping: https://www.espncricinfo.com/story/suryakumar-yadav-could-play-for-india-one-day-virender-sehwag-742841
Scraping: https://www.espncricinfo.com/story/sunil-gavaskar-to-head-bcci-during-ipl-csk-and-rr-can-play-731955
Scraping: https://www.espncricinfo.com/story/fresh-start-for-daredevils-and-pietersen-736733
Scraping: https://www.espncricinfo.com/story/dhoni-can-lead-india-for-three-four-years-fleming-741507
Scraping: https://www.espncricinfo.com/story/super-kings-hit-on-pitch-perfect-strateg

Scraping: https://www.espncricinfo.com/story/daryll-cullinan-is-t20-giving-fielding-short-shrift-745379
Scraping: https://www.espncricinfo.com/story/munaf-patel-mystified-by-ipl-exclusion-721091
Scraping: https://www.espncricinfo.com/story/jacques-kallis-aging-but-still-agile-737623
Scraping: https://www.espncricinfo.com/story/the-run-out-that-changed-the-game-748929
Scraping: https://www.espncricinfo.com/story/virat-kohli-s-d-j-vu-743295
Scraping: https://www.espncricinfo.com/story/maxwell-clobbers-89-to-help-mohali-beat-jaipur-738493
Scraping: https://www.espncricinfo.com/story/the-duck-tales-738197
Scraping: https://www.espncricinfo.com/story/royals-retain-samson-binny-rahane-watson-faulkner-708569
Scraping: https://www.espncricinfo.com/story/kkr-s-bang-it-in-plan-goes-awry-741313
Scraping: https://www.espncricinfo.com/story/manan-vohra-s-misplaced-boot-746511
Scraping: https://www.espncricinfo.com/story/fan-following-chennai-super-kings-v-sunrisers-hyderabad-ipl-sharjah-740157
Scra

Scraping: https://www.espncricinfo.com/story/kings-xi-s-middle-overs-strength-and-the-narine-factor-749187
Scraping: https://www.espncricinfo.com/story/rcb-is-a-place-i-can-express-myself-yuvraj-718573
Scraping: https://www.espncricinfo.com/story/pollard-plays-the-air-guitar-747599
Scraping: https://www.espncricinfo.com/story/v-ramnarayan-the-pravin-tambe-fairytale-742517
Scraping: https://www.espncricinfo.com/story/lalit-modi-on-the-ipl-innovations-that-weren-t-744583
Scraping: https://www.espncricinfo.com/story/niggles-for-mumbai-amid-big-name-hype-736725
Scraping: https://www.espncricinfo.com/story/delhi-bring-in-tahir-lose-saurabh-tiwary-742301
Scraping: https://www.espncricinfo.com/story/fan-following-chennai-super-kings-v-mumbai-indians-ipl-dubai-739671
Scraping: https://www.espncricinfo.com/story/plays-of-the-day-left-arm-spinner-1-kp-0-745877
Scraping: https://www.espncricinfo.com/story/runs-and-sixes-galore-but-grief-for-fast-bowlers-749901
Scraping: https://www.espncricinfo.c

Scraping: https://www.espncricinfo.com/story/ben-stokes-and-co-face-ipl-dilemma-707777
Scraping: https://www.espncricinfo.com/story/all-round-ravindra-jadeja-stars-in-chennai-win-739223
Scraping: https://www.espncricinfo.com/story/plays-of-the-day-stupendous-pollard-745777
Scraping: https://www.espncricinfo.com/story/knight-riders-retain-gambhir-and-narine-708715
Scraping: https://www.espncricinfo.com/story/a-turnaround-built-on-hurt-belief-and-a-strong-leader-749979
Scraping: https://www.espncricinfo.com/story/chinnaswamy-stadium-s-ticket-mystery-741875
Scraping: https://www.espncricinfo.com/story/bcci-should-do-better-to-restore-fans-faith-harsha-bhogle-747029
Scraping: https://www.espncricinfo.com/story/bowling-confines-csk-to-below-par-third-749027
Scraping: https://www.espncricinfo.com/story/royals-want-many-kkr-two-daredevils-unsure-708409
Scraping: https://www.espncricinfo.com/story/sanjay-bangar-has-brought-the-best-out-of-youngsters-george-bailey-744061
Scraping: https://www.e

In [12]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
import os

# Function to scrape individual articles
def scrape_article(url, timeout=30):
    """Download one article page and extract its title, date and body text.

    Args:
        url: Address of the article page to fetch.
        timeout: Seconds to wait for the server before giving up, so a
            single stalled connection cannot block the whole run.

    Returns:
        A dict with the keys "url", "title", "date" and "content". A field
        that is not found on the page is "N/A". If the page cannot be
        fetched or parsed at all, "title", "date" and "content" are None.
    """
    try:
        response = requests.get(url, timeout=timeout)
        # Treat 4xx/5xx replies as failures so that an error page is not
        # stored as if it were the article.
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')

        # Replace these with actual selectors from the website you're scraping
        title = soup.select_one('h1')  # Update if the title tag is different
        date = soup.select_one('.article-meta span')  # Update with the correct selector for the date
        # Assuming paragraphs are wrapped in <p>; paragraphs without text are
        # dropped so they neither add blank lines nor hide an empty page.
        paragraphs = (p.text.strip() for p in soup.select('p'))
        content = "\n".join(text for text in paragraphs if text)

        return {
            "url": url,
            "title": title.text.strip() if title else "N/A",
            "date": date.text.strip() if date else "N/A",
            "content": content if content else "N/A"
        }
    except Exception as e:
        # Best-effort scraping: report the failure and keep the URL in the
        # results so the caller can continue with the remaining links.
        print(f"Error scraping {url}: {e}")
        return {"url": url, "title": None, "date": None, "content": None}

# File paths
input_file = "/Users/hemantg/Desktop/dl-project-data-scraping-2015.csv"  # Update with your actual file name
output_file = "/Users/hemantg/Desktop/dl-projectscraped_articles_2015_titles.csv"

# Verify file existence. SystemExit stops the run and reports the message;
# exit() is avoided because the site module provides it for interactive use.
if not os.path.exists(input_file):
    raise SystemExit(f"File not found: {input_file}")

# Read the CSV file (assuming it's a single column without a header)
df_links = pd.read_csv(input_file, header=None, names=["url"])

# Scrape each URL, skipping rows that hold no value
scraped_data = []
for url in df_links["url"].dropna():
    print(f"Scraping: {url}")
    article_data = scrape_article(url)
    scraped_data.append(article_data)

# Save the scraped data to a new CSV file. The columns are named explicitly
# so the header row is written even when nothing was scraped.
df_scraped = pd.DataFrame(scraped_data, columns=["url", "title", "date", "content"])
df_scraped.to_csv(output_file, index=False)

print(f"Scraped data saved to {output_file}")

Scraping: https://www.espncricinfo.com/story/r-rajkumar-bayliss-to-learn-english-for-new-job-881633
Scraping: https://www.espncricinfo.com/story/arun-venugopal-nostalgic-in-kolkata-881695
Scraping: https://www.espncricinfo.com/story/andrew-hughes-an-ipl-review-without-having-watched-a-game-881423
Scraping: https://www.espncricinfo.com/story/the-southee-and-nair-trick-in-pune-881317
Scraping: https://www.espncricinfo.com/story/how-well-do-you-know-ipl-2015-881381
Scraping: https://www.espncricinfo.com/story/an-ipl-for-openers-and-big-hitters-881175
Scraping: https://www.espncricinfo.com/story/skillful-super-kings-wobble-at-the-final-hurdle-880941
Scraping: https://www.espncricinfo.com/story/the-assisted-hokey-pokey-and-bravo-zone-881079
Scraping: https://www.espncricinfo.com/story/mumbai-thrill-with-typical-bouncebackability-880943
Scraping: https://www.espncricinfo.com/story/ab-s-shy-exhibition-and-pollard-the-punisher-881083
Scraping: https://www.espncricinfo.com/story/going-against-t

Scraping: https://www.espncricinfo.com/story/karn-sharma-version-2-0-875719
Scraping: https://www.espncricinfo.com/story/pandya-pollard-keep-mumbai-in-the-hunt-875613
Scraping: https://www.espncricinfo.com/story/fuloria-pandya-pollard-pathan-power-875949
Scraping: https://www.espncricinfo.com/story/alex-hales-replaces-corey-anderson-for-mumbai-875705
Scraping: https://www.espncricinfo.com/story/bhuvneshwar-writing-unusual-script-as-india-pacer-875139
Scraping: https://www.espncricinfo.com/story/the-ipl-s-un-awarded-868517
Scraping: https://www.espncricinfo.com/story/striking-it-hot-in-the-ipl-873869
Scraping: https://www.espncricinfo.com/story/hendricks-axar-down-royal-challengers-875205
Scraping: https://www.espncricinfo.com/story/gollapudi-saha-hendricks-silence-rcb-875361
Scraping: https://www.espncricinfo.com/story/injured-kevin-pietersen-abandons-sunrisers-return-875257
Scraping: https://www.espncricinfo.com/story/let-down-kevin-pietersen-heads-for-ipl-874653
Scraping: https://www

Scraping: https://www.espncricinfo.com/story/zaheer-s-dream-return-and-thakur-s-nightmare-debut-869097
Scraping: https://www.espncricinfo.com/story/ajinkya-rahane-if-you-dominate-you-go-to-the-next-level-868123
Scraping: https://www.espncricinfo.com/story/simmons-luck-and-suchith-s-mixed-day-869273
Scraping: https://www.espncricinfo.com/story/two-paced-pitches-and-one-sided-games-869135
Scraping: https://www.espncricinfo.com/story/coulter-nile-zaheer-rout-kings-xi-868983
Scraping: https://www.espncricinfo.com/story/uthappa-determined-to-be-kkr-s-pillar-868841
Scraping: https://www.espncricinfo.com/story/halfway-trends-numbers-that-have-defined-each-ipl-team-868489
Scraping: https://www.espncricinfo.com/story/russell-uthappa-power-knight-riders-home-868617
Scraping: https://www.espncricinfo.com/story/krishnaswamy-knight-riders-turn-back-the-clock-to-2014-formula-868749
Scraping: https://www.espncricinfo.com/story/mccullum-s-riposte-and-ten-doeschate-s-stunner-868643
Scraping: https://ww

Scraping: https://www.espncricinfo.com/story/butter-fingered-knight-riders-and-an-economical-sunil-narine-863089
Scraping: https://www.espncricinfo.com/story/kings-xi-dance-to-russell-rap-863171
Scraping: https://www.espncricinfo.com/story/proactive-and-hands-on-duminy-takes-charge-863119
Scraping: https://www.espncricinfo.com/story/russell-special-sinks-kings-xi-863133
Scraping: https://www.espncricinfo.com/story/duminy-stars-in-thrilling-daredevils-victory-862715
Scraping: https://www.espncricinfo.com/story/abhishek-purohit-top-order-woes-mask-mumbai-s-bowling-conundrum-862647
Scraping: https://www.espncricinfo.com/story/super-kings-rough-and-tumble-harbhajan-out-of-the-hat-862561
Scraping: https://www.espncricinfo.com/story/i-am-wicketkeeper-first-then-batsman-saha-862437
Scraping: https://www.espncricinfo.com/story/smith-mccullum-steamroll-mumbai-indians-862583
Scraping: https://www.espncricinfo.com/story/middle-order-bluntness-holds-sunrisers-back-862533
Scraping: https://www.espn

Scraping: https://www.espncricinfo.com/story/bcci-tells-officials-to-remain-tough-on-chucking-858393
Scraping: https://www.espncricinfo.com/story/sunrisers-reliant-on-imported-muscle-858351
Scraping: https://www.espncricinfo.com/story/sunil-narine-cleared-to-bowl-in-ipl-858373
Scraping: https://www.espncricinfo.com/story/less-pressure-on-our-key-batsmen-now-kohli-858357
Scraping: https://www.espncricinfo.com/story/underachievers-rcb-look-for-substance-over-style-858255
Scraping: https://www.espncricinfo.com/story/rohit-sharma-seeks-room-for-on-field-banter-858333
Scraping: https://www.espncricinfo.com/story/ipl-giants-seek-to-end-title-drought-858241
Scraping: https://www.espncricinfo.com/story/knight-riders-relent-as-narine-appears-for-retest-858197
Scraping: https://www.espncricinfo.com/story/world-cup-performers-missing-at-the-ipl-858225
Scraping: https://www.espncricinfo.com/story/kane-richardson-withdraws-from-ipl-858179
Scraping: https://www.espncricinfo.com/story/alfonso-thomas-

In [13]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
import os

# Function to scrape individual articles
def scrape_article(url, timeout=30):
    """Download one article page and extract its title, date and body text.

    Args:
        url: Address of the article page to fetch.
        timeout: Seconds to wait for the server before giving up, so a
            single stalled connection cannot block the whole run.

    Returns:
        A dict with the keys "url", "title", "date" and "content". A field
        that is not found on the page is "N/A". If the page cannot be
        fetched or parsed at all, "title", "date" and "content" are None.
    """
    try:
        response = requests.get(url, timeout=timeout)
        # Treat 4xx/5xx replies as failures so that an error page is not
        # stored as if it were the article.
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')

        # Replace these with actual selectors from the website you're scraping
        title = soup.select_one('h1')  # Update if the title tag is different
        date = soup.select_one('.article-meta span')  # Update with the correct selector for the date
        # Assuming paragraphs are wrapped in <p>; paragraphs without text are
        # dropped so they neither add blank lines nor hide an empty page.
        paragraphs = (p.text.strip() for p in soup.select('p'))
        content = "\n".join(text for text in paragraphs if text)

        return {
            "url": url,
            "title": title.text.strip() if title else "N/A",
            "date": date.text.strip() if date else "N/A",
            "content": content if content else "N/A"
        }
    except Exception as e:
        # Best-effort scraping: report the failure and keep the URL in the
        # results so the caller can continue with the remaining links.
        print(f"Error scraping {url}: {e}")
        return {"url": url, "title": None, "date": None, "content": None}

# File paths
input_file = "/Users/hemantg/Desktop/dl-project-data-scraping-2017.csv"  # Update with your actual file name
output_file = "/Users/hemantg/Desktop/dl-projectscraped_articles_2017_titles.csv"

# Verify file existence. SystemExit stops the run and reports the message;
# exit() is avoided because the site module provides it for interactive use.
if not os.path.exists(input_file):
    raise SystemExit(f"File not found: {input_file}")

# Read the CSV file (assuming it's a single column without a header)
df_links = pd.read_csv(input_file, header=None, names=["url"])

# Scrape each URL, skipping rows that hold no value
scraped_data = []
for url in df_links["url"].dropna():
    print(f"Scraping: {url}")
    article_data = scrape_article(url)
    scraped_data.append(article_data)

# Save the scraped data to a new CSV file. The columns are named explicitly
# so the header row is written even when nothing was scraped.
df_scraped = pd.DataFrame(scraped_data, columns=["url", "title", "date", "content"])
df_scraped.to_csv(output_file, index=False)

print(f"Scraped data saved to {output_file}")

Scraping: https://www.espncricinfo.com/story/an-all-time-ipl-xi-chosen-by-our-readers-1098724
Scraping: https://www.espncricinfo.com/story/jarrod-kimber-where-will-t20-cricket-go-next-1099221
Scraping: https://www.espncricinfo.com/story/srinath-sripath-watches-the-ipl-final-with-fans-in-a-pub-1099097
Scraping: https://www.espncricinfo.com/story/how-a-hotel-takes-care-of-ipl-teams-1098971
Scraping: https://www.espncricinfo.com/story/quiz-how-well-do-you-remember-the-2017-ipl-1099124
Scraping: https://www.espncricinfo.com/story/young-indian-players-carve-new-rising-pune-supergiant-identity-1098902
Scraping: https://www.espncricinfo.com/story/four-overs-three-bowlers-one-pulsating-finish-how-bumrah-malinga-and-johnson-won-it-for-mumbai-1098982
Scraping: https://www.espncricinfo.com/story/mumbai-indians-bench-strength-bosses-the-ipl-1098924
Scraping: https://www.espncricinfo.com/story/spinners-make-powerplay-impact-1099007
Scraping: https://www.espncricinfo.com/story/krunal-pandya-mitchell

Scraping: https://www.espncricinfo.com/story/all-time-ipl-xi-the-final-nominees-1097008
Scraping: https://www.espncricinfo.com/story/ipl-2017-who-will-be-future-stars-for-india-1096996
Scraping: https://www.espncricinfo.com/story/shikhar-dhawan-moises-henriques-keep-hyderabad-s-playoff-hopes-alive-1096946
Scraping: https://www.espncricinfo.com/story/rahul-tewatia-s-legspin-turns-kings-xi-s-chances-1097076
Scraping: https://www.espncricinfo.com/story/partnerships-of-the-week-sunil-narine-chris-lynn-floor-bangalore-with-awe-inspiring-display-1096813
Scraping: https://www.espncricinfo.com/story/weekly-ipl-social-media-round-up-1096909
Scraping: https://www.espncricinfo.com/story/smith-kishan-stand-dents-punjab-s-hopes-of-making-playoffs-1096811
Scraping: https://www.espncricinfo.com/story/mohammad-nabi-s-accuracy-handcuffs-mumbai-indians-1096929
Scraping: https://www.espncricinfo.com/story/narine-lynn-script-win-over-bangalore-with-attacking-display-1096812
Scraping: https://www.espncrici

Scraping: https://www.espncricinfo.com/story/robin-uthappa-gautam-gambhir-help-kolkata-grab-top-spot-1094833
Scraping: https://www.espncricinfo.com/story/net-run-rate-deficit-sparked-urgency-for-aaron-finch-in-gujarat-lions-chase-1094984
Scraping: https://www.espncricinfo.com/story/corey-anderson-i-ve-got-to-progress-my-game-instead-of-trying-to-please-people-1094727
Scraping: https://www.espncricinfo.com/story/aaron-finch-s-brutish-masterclass-1094954
Scraping: https://www.espncricinfo.com/story/rana-credits-batting-success-to-gambhir-s-advice-1094671
Scraping: https://www.espncricinfo.com/story/mid-season-review-rising-pune-soaring-mumbai-sinking-rcb-1094712
Scraping: https://www.espncricinfo.com/story/jarrod-kimber-how-to-build-the-ideal-t20-side-1094594
Scraping: https://www.espncricinfo.com/story/have-immense-faith-in-my-batting-robin-uthappa-1094836
Scraping: https://www.espncricinfo.com/story/robin-uthappa-lives-the-batsman-s-dream-1094798
Scraping: https://www.espncricinfo.com/

Scraping: https://www.espncricinfo.com/story/kieron-pollard-krunal-pandya-impress-with-93-run-stand-1092299
Scraping: https://www.espncricinfo.com/story/the-narine-gamble-and-uthappa-s-innovations-1092432
Scraping: https://www.espncricinfo.com/story/dwayne-smith-brendon-mccullum-lead-gujarat-to-first-win-of-season-1092305
Scraping: https://www.espncricinfo.com/story/knuckle-ball-my-most-powerful-weapon-in-t20s-andrew-tye-1092307
Scraping: https://www.espncricinfo.com/story/all-time-ipl-xi-the-allrounders-1092517
Scraping: https://www.espncricinfo.com/story/some-luck-and-uthappa-s-sublime-timing-1092425
Scraping: https://www.espncricinfo.com/story/ben-laughlin-i-stopped-caring-about-results-and-really-enjoyed-the-game-1092318
Scraping: https://www.espncricinfo.com/story/corey-anderson-blasts-it-beyond-kings-xi-punjab-s-reach-1092496
Scraping: https://www.espncricinfo.com/story/andrew-tye-s-knuckle-ball-punches-wind-out-of-pune-1092259
Scraping: https://www.espncricinfo.com/story/having-

Scraping: https://www.espncricinfo.com/story/kings-xi-punjab-replace-injured-m-vijay-with-ishant-sharma-1089984
Scraping: https://www.espncricinfo.com/story/kings-xi-punjab-reboot-hinged-on-indian-bowlers-1089897
Scraping: https://www.espncricinfo.com/story/top-heavy-gujarat-lions-look-to-outdo-debut-success-1089757
Scraping: https://www.espncricinfo.com/story/big-brand-big-impact-10-years-of-the-ipl-1089994
Scraping: https://www.espncricinfo.com/story/the-tamasha-in-all-its-glory-ten-things-the-ipl-has-brought-to-the-mainstream-1089881
Scraping: https://www.espncricinfo.com/story/pace-heavy-delhi-daredevils-look-to-change-fortunes-1089445
Scraping: https://www.espncricinfo.com/story/domestic-batting-crew-key-to-kolkata-knight-riders-success-1089537
Scraping: https://www.espncricinfo.com/story/shane-watson-to-stand-in-as-royal-challengers-captain-1089896
Scraping: https://www.espncricinfo.com/story/the-playalike-contest-faqs-1089869
Scraping: https://www.espncricinfo.com/story/defendin

Scraping: https://www.espncricinfo.com/story/rcb-and-mitchell-starc-part-ways-ahead-of-2017-season-1083303
Scraping: https://www.espncricinfo.com/story/arun-venugopal-and-gaurav-sundaraman-how-teams-prepare-for-the-ipl-auction-1083327
Scraping: https://www.espncricinfo.com/story/steven-smith-replaces-ms-dhoni-as-pune-supergiants-captain-1083286
Scraping: https://www.espncricinfo.com/story/mohammad-kaif-appointed-deputy-to-gujarat-lions-head-coach-brad-hodge-for-2017-ipl-season-1082905
Scraping: https://www.espncricinfo.com/story/the-big-ipl-auction-poll-1082826
Scraping: https://www.espncricinfo.com/story/ipl-player-auction-new-faces-and-where-they-could-end-up-1082695
Scraping: https://www.espncricinfo.com/story/indore-returns-as-ipl-venue-final-in-hyderabad-on-may-21-1082581
Scraping: https://www.espncricinfo.com/story/ipl-2017-player-auction-how-the-teams-stand-and-whom-they-might-buy-1082439
Scraping: https://www.espncricinfo.com/story/south-africa-england-players-likely-to-leave-i

In [14]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
import os

# Function to scrape individual articles
def scrape_article(url, timeout=30):
    """Download one article page and extract its title, date and body text.

    Args:
        url: Address of the article page to fetch.
        timeout: Seconds to wait for the server before giving up, so a
            single stalled connection cannot block the whole run.

    Returns:
        A dict with the keys "url", "title", "date" and "content". A field
        that is not found on the page is "N/A". If the page cannot be
        fetched or parsed at all, "title", "date" and "content" are None.
    """
    try:
        response = requests.get(url, timeout=timeout)
        # Treat 4xx/5xx replies as failures so that an error page is not
        # stored as if it were the article.
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')

        # Replace these with actual selectors from the website you're scraping
        title = soup.select_one('h1')  # Update if the title tag is different
        date = soup.select_one('.article-meta span')  # Update with the correct selector for the date
        # Assuming paragraphs are wrapped in <p>; paragraphs without text are
        # dropped so they neither add blank lines nor hide an empty page.
        paragraphs = (p.text.strip() for p in soup.select('p'))
        content = "\n".join(text for text in paragraphs if text)

        return {
            "url": url,
            "title": title.text.strip() if title else "N/A",
            "date": date.text.strip() if date else "N/A",
            "content": content if content else "N/A"
        }
    except Exception as e:
        # Best-effort scraping: report the failure and keep the URL in the
        # results so the caller can continue with the remaining links.
        print(f"Error scraping {url}: {e}")
        return {"url": url, "title": None, "date": None, "content": None}

# File paths
input_file = "/Users/hemantg/Desktop/dl-project-data-scraping-2018.csv"  # Update with your actual file name
output_file = "/Users/hemantg/Desktop/dl-projectscraped_articles_2018_titles.csv"

# Verify file existence. SystemExit stops the run and reports the message;
# exit() is avoided because the site module provides it for interactive use.
if not os.path.exists(input_file):
    raise SystemExit(f"File not found: {input_file}")

# Read the CSV file (assuming it's a single column without a header)
df_links = pd.read_csv(input_file, header=None, names=["url"])

# Scrape each URL, skipping rows that hold no value
scraped_data = []
for url in df_links["url"].dropna():
    print(f"Scraping: {url}")
    article_data = scrape_article(url)
    scraped_data.append(article_data)

# Save the scraped data to a new CSV file. The columns are named explicitly
# so the header row is written even when nothing was scraped.
df_scraped = pd.DataFrame(scraped_data, columns=["url", "title", "date", "content"])
df_scraped.to_csv(output_file, index=False)

print(f"Scraped data saved to {output_file}")

Scraping: https://www.espncricinfo.com/story/the-ten-fastest-fifties-in-ipl-history-1086140
Scraping: https://www.espncricinfo.com/story/the-greatest-ipl-performances-no-2-shane-watson-117-not-out-vs-the-sunrisers-hyderabad-1258682
Scraping: https://www.espncricinfo.com/story/the-greatest-ipl-performances-no-4-rashid-khan-3-for-19-and-34-not-out-vs-the-kolkata-knight-riders-1257858
Scraping: https://www.espncricinfo.com/story/with-age-batting-down-the-order-was-like-quicksand-ms-dhoni-1149163
Scraping: https://www.espncricinfo.com/story/sanjay-manjrekar-ipl-2018-competitive-yes-high-quality-no-1148298
Scraping: https://www.espncricinfo.com/story/aakash-chopra-why-india-will-rely-heavily-on-jasprit-bumrah-in-the-coming-season-1147746
Scraping: https://www.espncricinfo.com/story/vote-for-your-ipl-2018-team-of-the-tournament-1148237
Scraping: https://www.espncricinfo.com/story/stats-ambati-rayudu-shane-watson-s-most-prolific-ipl-seasons-1148064
Scraping: https://www.espncricinfo.com/story

Scraping: https://www.espncricinfo.com/story/lanning-raj-goswami-to-feature-in-women-s-exhibition-t20-1146890
Scraping: https://www.espncricinfo.com/story/from-tests-to-t20s-kane-williamson-masters-his-own-range-1146936
Scraping: https://www.espncricinfo.com/story/jasprit-bumrah-the-master-of-death-over-stifles-1146829
Scraping: https://www.espncricinfo.com/story/talking-points-kl-rahul-or-bust-for-kings-xi-1146813
Scraping: https://www.espncricinfo.com/story/ipl-2018-qualification-scenarios-what-if-mumbai-indians-beat-kings-xi-punjab-1146766
Scraping: https://www.espncricinfo.com/story/how-does-kkr-s-win-against-royals-impact-playoff-qualification-1146682
Scraping: https://www.espncricinfo.com/story/ipl-qualification-scenarios-rcb-mumbai-keep-pressure-on-kkr-royals-1146603
Scraping: https://www.espncricinfo.com/story/confidence-and-wickets-return-to-kuldeep-yadav-s-repertoire-1146736
Scraping: https://www.espncricinfo.com/story/jos-buttler-as-opener-does-half-the-job-for-rajasthan-roy

Scraping: https://www.espncricinfo.com/story/kane-williamson-gives-sunrisers-the-extra-gear-1144792
Scraping: https://www.espncricinfo.com/story/remodelled-chris-lynn-delivers-another-win-1144825
Scraping: https://www.espncricinfo.com/story/rcb-s-plans-go-awry-as-narine-lynn-live-on-the-edge-1144841
Scraping: https://www.espncricinfo.com/story/shreyas-iyer-takes-on-the-wristspinners-as-planned-1144615
Scraping: https://www.espncricinfo.com/story/talking-points-four-balls-that-changed-the-game-1144734
Scraping: https://www.espncricinfo.com/story/mumbai-indians-ride-on-the-rohit-sharma-factor-1144725
Scraping: https://www.espncricinfo.com/story/what-makes-sunrisers-hyderabad-s-bowling-attack-so-good-1144738
Scraping: https://www.espncricinfo.com/story/will-royal-challengers-bangalore-find-the-spark-their-season-needs-1144690
Scraping: https://www.espncricinfo.com/story/junior-dala-replaces-injured-chris-morris-at-delhi-daredevils-1144571
Scraping: https://www.espncricinfo.com/story/advis

Scraping: https://www.espncricinfo.com/story/shakib-al-hasan-gives-sunrisers-all-round-edge-1143309
Scraping: https://www.espncricinfo.com/story/mumbai-rue-bad-luck-and-a-poor-finish-1143293
Scraping: https://www.espncricinfo.com/story/injured-kamlesh-nagarkoti-ruled-out-of-ipl-season-1143238
Scraping: https://www.espncricinfo.com/story/kohli-and-ashwin-s-contrasting-use-of-spin-1143220
Scraping: https://www.espncricinfo.com/story/vishal-dikshit-third-seamer-in-focus-for-mumbai-after-two-narrow-losses-1143205
Scraping: https://www.espncricinfo.com/story/how-the-t20-stars-fare-against-the-yorker-1143183
Scraping: https://www.espncricinfo.com/story/rejuvenated-umesh-yadav-breathes-fire-1143216
Scraping: https://www.espncricinfo.com/story/talking-points-sunrisers-hyderabad-nearly-throw-it-away-1143109
Scraping: https://www.espncricinfo.com/story/suresh-raina-to-miss-next-two-csk-games-with-calf-injury-1143034
Scraping: https://www.espncricinfo.com/story/gutted-that-we-have-to-move-from-ch

Scraping: https://www.espncricinfo.com/story/jsw-sports-buys-50-stake-in-delhi-daredevils-1139543
Scraping: https://www.espncricinfo.com/story/gautam-gambhir-returns-to-delhi-daredevils-as-captain-1139262
Scraping: https://www.espncricinfo.com/story/dinesh-karthik-to-captain-kkr-in-ipl-2018-1138738
Scraping: https://www.espncricinfo.com/story/r-ashwin-to-captain-kings-xi-punjab-1137998
Scraping: https://www.espncricinfo.com/story/i-ll-never-be-predictable-r-ashwin-1138016
Scraping: https://www.espncricinfo.com/story/no-surgery-required-for-chris-lynn-s-shoulder-injury-1137617
Scraping: https://www.espncricinfo.com/story/poll-who-should-be-team-captains-in-the-ipl-1137200
Scraping: https://www.espncricinfo.com/story/mumbai-to-play-chennai-super-kings-in-ipl-season-opener-1136560
Scraping: https://www.espncricinfo.com/story/shane-warne-returns-to-rajasthan-royals-as-mentor-1136394
Scraping: https://www.espncricinfo.com/story/pune-demands-to-host-ipl-play-offs-1136307
Scraping: https://ww

In [15]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
import os

# Function to scrape individual articles
def scrape_article(url, timeout=30):
    """Fetch one article page and extract its title, date and paragraph text.

    Args:
        url: Address of the article page to download.
        timeout: Seconds to wait for the server before giving up; passed
            straight to ``requests.get``.

    Returns:
        A dict with the keys ``url``, ``title``, ``date`` and ``content``.
        A field whose element is not found on the page is ``"N/A"``. If the
        download or the parsing raises (including an HTTP error status), the
        error is printed and ``title``, ``date`` and ``content`` are ``None``.
    """
    try:
        # Without a timeout requests waits indefinitely on a stalled server,
        # which would hang the whole scraping loop on a single URL.
        response = requests.get(url, timeout=timeout)
        # Treat 4xx/5xx answers as failures instead of parsing an error page
        # as if it were the article.
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')

        # NOTE(review): these selectors are generic placeholders; confirm
        # them against the markup of the site being scraped.
        title = soup.select_one('h1')
        date = soup.select_one('.article-meta span')
        # Every <p> in the document is collected, one paragraph per line.
        content = "\n".join(p.text.strip() for p in soup.select('p'))

        return {
            "url": url,
            "title": title.text.strip() if title else "N/A",
            "date": date.text.strip() if date else "N/A",
            "content": content if content else "N/A"
        }
    except Exception as e:
        # Best-effort boundary: one bad URL must not abort the whole run, so
        # the failure is reported and an empty row is returned for it.
        print(f"Error scraping {url}: {e}")
        return {"url": url, "title": None, "date": None, "content": None}

# Input list of article URLs and destination for the scraped rows.
input_file = "/Users/hemantg/Desktop/dl-project-data-scraping-2019.csv"  # Update with your actual file name
output_file = "/Users/hemantg/Desktop/dl-projectscraped_articles_2019_titles.csv"

# Stop before doing any network work when the URL list is missing.
if not os.path.exists(input_file):
    print(f"File not found: {input_file}")
    # `exit()` is a convenience the `site` module adds for interactive
    # sessions and is not meant for programs; raising SystemExit directly
    # ends execution with the same (zero) status without relying on it.
    raise SystemExit

# Read the CSV file (assuming it's a single column without a header)
df_links = pd.read_csv(input_file, header=None, names=["url"])

# Scrape each URL in file order, collecting one dict per article
scraped_data = []
for url in df_links["url"]:
    print(f"Scraping: {url}")
    article_data = scrape_article(url)
    scraped_data.append(article_data)

# Save the scraped data to a new CSV file (index column omitted)
df_scraped = pd.DataFrame(scraped_data)
df_scraped.to_csv(output_file, index=False)

print(f"Scraped data saved to {output_file}")

Scraping: https://www.espncricinfo.com/story/talking-points-why-kings-xi-were-unusually-aggressive-in-the-powerplay-1181563
Scraping: https://www.espncricinfo.com/story/imran-tahir-shows-spin-still-rules-at-eden-gardens-1180937
Scraping: https://www.espncricinfo.com/story/who-will-be-the-biggest-buy-at-ipl-2019-auction-1169141
Scraping: https://www.espncricinfo.com/story/talking-points-kl-rahul-s-slow-starts-r-ashwin-s-management-of-bowlers-1182570
Scraping: https://www.espncricinfo.com/story/williamson-to-lead-sunrisers-despite-warner-s-availability-1178705
Scraping: https://www.espncricinfo.com/story/ms-dhoni-fined-50-of-match-fees-for-umpiring-outburst-1180678
Scraping: https://www.espncricinfo.com/story/shimron-hetmyer-oshane-thomas-could-set-off-bidding-wars-in-the-ipl-auction-1169016
Scraping: https://www.espncricinfo.com/story/i-know-i-m-special-andre-russell-1180133
Scraping: https://www.espncricinfo.com/story/turner-set-for-further-shoulder-surgery-to-boost-australia-chances-1

Scraping: https://www.espncricinfo.com/story/martin-guptill-or-billy-stanlake-who-after-david-warner-for-sunrisers-hyderabad-1182801
Scraping: https://www.espncricinfo.com/story/smart-stats-pant-286-trumps-rahane-183-5-1181779
Scraping: https://www.espncricinfo.com/story/road-to-ipl-playoffs-is-paved-by-the-bowlers-1183418
Scraping: https://www.espncricinfo.com/story/seven-wickets-for-eight-runs-the-great-delhi-capitulation-1179707
Scraping: https://www.espncricinfo.com/story/ipl-2019-auction-how-the-teams-stack-up-for-next-season-1169297
Scraping: https://www.espncricinfo.com/story/talking-points-bumrah-swings-see-saw-game-mumbai-s-way-1179234
Scraping: https://www.espncricinfo.com/story/hardik-pandya-has-a-great-chance-of-world-cup-success-yuvraj-singh-1183183
Scraping: https://www.espncricinfo.com/story/super-kings-left-to-seek-more-batting-support-for-ms-dhoni-1181688
Scraping: https://www.espncricinfo.com/story/how-archer-and-unadkat-beat-the-ipl-s-best-batting-side-in-the-death-1

Scraping: https://www.espncricinfo.com/story/with-the-bat-and-in-the-mind-steven-smith-is-happy-where-things-are-at-1181561
Scraping: https://www.espncricinfo.com/story/mitchell-starc-released-from-ipl-amid-australia-contracts-debate-1165299
Scraping: https://www.espncricinfo.com/story/talking-points-ajinkya-rahane-s-dream-t20-innings-1181749
Scraping: https://www.espncricinfo.com/story/the-super-kings-ride-again-but-it-might-be-time-for-some-fresh-blood-1184342
Scraping: https://www.espncricinfo.com/story/royal-challengers-bangalore-s-hits-kings-xi-punjab-s-misses-in-the-last-three-overs-1181982
Scraping: https://www.espncricinfo.com/story/ipl-2019-mid-season-review-where-the-teams-stand-and-what-they-need-to-do-1181110
Scraping: https://www.espncricinfo.com/story/i-am-no-slouch-in-white-ball-cricket-r-ashwin-1178257
Scraping: https://www.espncricinfo.com/story/capitals-strong-indian-core-weak-overseas-underbelly-1184054
Scraping: https://www.espncricinfo.com/story/aussies-overseas-wa

Scraping: https://www.espncricinfo.com/story/live-blog-ipl-2019-auction-1166897
Scraping: https://www.espncricinfo.com/story/what-earned-matt-kelly-ipl-attention-1180689
Scraping: https://www.espncricinfo.com/story/shreyas-gopal-puts-a-new-spin-on-his-skills-1181306
Scraping: https://www.espncricinfo.com/story/varun-shetty-delhi-capitals-and-the-madness-of-youth-1183833
Scraping: https://www.espncricinfo.com/story/chepauk-choke-chennai-super-kings-game-plan-a-throwback-to-2011-1180482
Scraping: https://www.espncricinfo.com/story/south-africa-coach-ottis-gibson-wanted-world-cup-players-to-make-early-return-from-ipl-1184005
Scraping: https://www.espncricinfo.com/story/hamilton-boys-santner-and-kuggeleijn-tussle-to-fill-bravo-sized-hole-1180801
Scraping: https://www.espncricinfo.com/story/ms-dhoni-s-on-field-outburst-probably-not-right-jos-buttler-1180700
Scraping: https://www.espncricinfo.com/story/quiz-whose-records-did-curran-and-joseph-better-1180461
Scraping: https://www.espncricinfo

Scraping: https://www.espncricinfo.com/story/superstats-ashwin-or-harbhajan-whose-contribution-was-bigger-1180177
Scraping: https://www.espncricinfo.com/story/football-s-var-system-could-end-no-ball-farce-1179282
Scraping: https://www.espncricinfo.com/story/these-games-mess-with-your-mind-we-were-off-on-a-number-of-fronts-stephen-fleming-1183352
Scraping: https://www.espncricinfo.com/story/shane-watson-repays-csk-s-faith-with-memorable-assault-1181882
Scraping: https://www.espncricinfo.com/story/dhoni-concerned-by-low-scoring-chepauk-pitch-1180493
Scraping: https://www.espncricinfo.com/story/nitish-rana-yearns-to-make-good-on-another-promising-start-1179201
Scraping: https://www.espncricinfo.com/story/australia-players-to-have-only-limited-ipl-role-ahead-of-world-cup-1165432
Scraping: https://www.espncricinfo.com/story/ipl-s-el-clasico-rohit-sharma-s-best-laid-plans-v-ms-dhoni-s-instincts-1183478
Scraping: https://www.espncricinfo.com/story/bhuvneshwar-kumar-attacks-hardik-pandya-s-wea

Scraping: https://www.espncricinfo.com/story/talking-points-the-ishan-kishan-run-out-explained-1180197
Scraping: https://www.espncricinfo.com/story/don-t-care-if-i-m-judged-on-not-winning-the-ipl-virat-kohli-1178668
Scraping: https://www.espncricinfo.com/story/jos-buttler-dismissal-deemed-legal-by-mcc-amid-scrutiny-of-the-laws-1179046
Scraping: https://www.espncricinfo.com/story/conflict-of-interest-sachin-tendulkar-holds-bcci-responsible-for-the-situation-1183338
Scraping: https://www.espncricinfo.com/story/west-indies-changes-give-pollard-world-cup-hope-1180581
Scraping: https://www.espncricinfo.com/story/aussies-overseas-smith-times-his-run-pattinson-knocks-over-maxwell-1181793
Scraping: https://www.espncricinfo.com/story/scenarios-how-the-mumbai-sunrisers-result-will-affect-the-ipl-playoff-race-1182973
Scraping: https://www.espncricinfo.com/story/paddy-upton-appointed-rajasthan-royals-coach-1171625
Scraping: https://www.espncricinfo.com/story/ipl-central-kieron-pollard-fined-for-sh

In [16]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
import os

# Function to scrape individual articles
def scrape_article(url, timeout=30):
    """Fetch one article page and extract its title, date and paragraph text.

    Args:
        url: Address of the article page to download.
        timeout: Seconds to wait for the server before giving up; passed
            straight to ``requests.get``.

    Returns:
        A dict with the keys ``url``, ``title``, ``date`` and ``content``.
        A field whose element is not found on the page is ``"N/A"``. If the
        download or the parsing raises (including an HTTP error status), the
        error is printed and ``title``, ``date`` and ``content`` are ``None``.
    """
    try:
        # A request without a timeout can block forever on an unresponsive
        # server and stall every URL queued behind it.
        response = requests.get(url, timeout=timeout)
        # An HTTP error status means there is no article to parse; route it
        # to the error row below rather than scraping the error page.
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')

        # NOTE(review): these selectors are generic placeholders; confirm
        # them against the markup of the site being scraped.
        title = soup.select_one('h1')
        date = soup.select_one('.article-meta span')
        # Every <p> in the document is collected, one paragraph per line.
        content = "\n".join(p.text.strip() for p in soup.select('p'))

        return {
            "url": url,
            "title": title.text.strip() if title else "N/A",
            "date": date.text.strip() if date else "N/A",
            "content": content if content else "N/A"
        }
    except Exception as e:
        # Best-effort boundary: report the failure and keep the run going
        # with an empty row for this URL.
        print(f"Error scraping {url}: {e}")
        return {"url": url, "title": None, "date": None, "content": None}

# Input list of article URLs and destination for the scraped rows.
input_file = "/Users/hemantg/Desktop/dl-project-data-scraping-2023.csv"  # Update with your actual file name
output_file = "/Users/hemantg/Desktop/dl-projectscraped_articles_2023_titles.csv"

# Stop before doing any network work when the URL list is missing.
if not os.path.exists(input_file):
    print(f"File not found: {input_file}")
    # `exit()` is a convenience the `site` module adds for interactive
    # sessions and is not meant for programs; raising SystemExit directly
    # ends execution with the same (zero) status without relying on it.
    raise SystemExit

# Read the CSV file (assuming it's a single column without a header)
df_links = pd.read_csv(input_file, header=None, names=["url"])

# Scrape each URL in file order, collecting one dict per article
scraped_data = []
for url in df_links["url"]:
    print(f"Scraping: {url}")
    article_data = scrape_article(url)
    scraped_data.append(article_data)

# Save the scraped data to a new CSV file (index column omitted)
df_scraped = pd.DataFrame(scraped_data)
df_scraped.to_csv(output_file, index=False)

print(f"Scraped data saved to {output_file}")

Scraping: https://www.espncricinfo.com/story/espncricinfo-awards-men-s-t20-leagues-bowling-winner-yuzvendra-chahal-4-for-29-vs-sunrisers-hyderabad-jaipur-1416817
Scraping: https://www.espncricinfo.com/story/devon-conway-playing-in-high-intensity-games-has-allowed-me-to-be-clearer-in-different-t20-scenarios-1381242
Scraping: https://www.espncricinfo.com/story/jitesh-sharma-i-want-it-to-come-down-to-me-to-finish-matches-so-i-can-crack-20-off-ten-balls-ipl-2023-1380599
Scraping: https://www.espncricinfo.com/story/what-india-england-and-australia-can-learn-from-ms-dhoni-as-a-big-test-summer-begins-mark-nicholas-1379875
Scraping: https://www.espncricinfo.com/story/stump-mic-podcast-a-wish-list-for-indian-stadiums-and-india-s-t20i-xi-after-ipl-1379636
Scraping: https://www.espncricinfo.com/story/ms-dhoni-undergoes-knee-surgery-in-mumbai-1379332
Scraping: https://www.espncricinfo.com/story/ben-stokes-ipl-gymwork-john-terry-at-csk-2023-1379256
Scraping: https://www.espncricinfo.com/story/ms-dh

Scraping: https://www.espncricinfo.com/story/ipl-rcb-virat-kohli-passes-chris-gayle-for-most-hundreds-in-the-ipl-1377575
Scraping: https://www.espncricinfo.com/story/middle-order-woes-rear-its-ugly-head-as-rcb-go-home-emptyhanded-again-1377583
Scraping: https://www.espncricinfo.com/story/ipl-2023-rcb-vs-gt-virat-kohli-i-am-playing-my-best-t20-cricket-again-1377535
Scraping: https://www.espncricinfo.com/story/rohit-sharma-if-we-don-t-qualify-for-playoffs-we-have-only-ourselves-to-blame-1377476
Scraping: https://www.espncricinfo.com/story/stats-mi-s-season-of-200-plus-totals-cameron-green-s-rapid-century-1377477
Scraping: https://www.espncricinfo.com/story/ipl-2023-unsettled-unit-confusing-selection-calls-make-it-another-season-to-forget-for-sunrisers-hyderabad-1377483
Scraping: https://www.espncricinfo.com/story/ipl-2023-hardik-pandya-is-neymar-as-neymar-could-rarely-be-1377390
Scraping: https://www.espncricinfo.com/story/ipl-2023-rain-threatens-crucial-rcb-vs-gujarat-titans-game-in-ben

Scraping: https://www.espncricinfo.com/story/yuzvendra-chahal-keeps-his-style-of-wristspin-alive-while-being-himself-1374831
Scraping: https://www.espncricinfo.com/story/ms-dhoni-shivam-dube-cameos-highlight-csks-no-holds-barred-approach-1374828
Scraping: https://www.espncricinfo.com/story/ipl-2023-what-do-kkr-and-rr-need-to-do-to-qualify-for-the-playoffs-1374816
Scraping: https://www.espncricinfo.com/story/meet-nehal-wadhera-the-batter-from-punjab-who-is-lighting-up-ipl-2023-for-mumbai-indians-1374664
Scraping: https://www.espncricinfo.com/story/ipl-2023-batters-on-top-a-relentless-run-fest-with-no-signs-of-slowdown-1374632
Scraping: https://www.espncricinfo.com/story/trent-boult-as-a-bowler-i-ve-always-thought-i-ve-got-full-control-of-dictating-the-play-1374418
Scraping: https://www.espncricinfo.com/story/ipl-2023-mumbai-indians-show-off-their-batting-might-one-200-plus-total-at-a-time-1374616
Scraping: https://www.espncricinfo.com/story/ipl-2023-mi-vs-rcb-du-plessis-disappointed-wit

Scraping: https://www.espncricinfo.com/story/lucknow-super-giants-smash-the-second-highest-total-in-ipl-history-1371957
Scraping: https://www.espncricinfo.com/story/ricky-ponting-it-was-a-tough-decision-to-leave-prithvi-shaw-out-1371880
Scraping: https://www.espncricinfo.com/story/ravi-shastri-backs-ajinkya-rahane-s-inclusion-in-wtc-final-squad-1371824
Scraping: https://www.espncricinfo.com/story/ipl-2023-srh-batter-harry-brooks-struggle-to-prove-he-belongs-in-the-ipl-1371797
Scraping: https://www.espncricinfo.com/story/ipl-2023-yashasvi-jaiswal-takes-a-step-towards-becoming-a-complete-batter-1371786
Scraping: https://www.espncricinfo.com/story/virat-kohli-at-rcb-witnessing-a-hero-in-the-flesh-in-ipl-2023-1371644
Scraping: https://www.espncricinfo.com/story/hamstring-injury-rules-washington-sundar-out-of-ipl-2023-1371621
Scraping: https://www.espncricinfo.com/story/ipl-2023-half-time-report-csk-and-titans-fly-but-mid-table-logjam-leaves-everything-to-play-for-1371609
Scraping: https://

Scraping: https://www.espncricinfo.com/story/ipl-2023-nicholas-pooran-to-rishabh-pant-that-first-step-you-take-thats-when-you-become-motivated-1368687
Scraping: https://www.espncricinfo.com/story/ipl-2023-week-2-podcast-from-rinku-singh-s-fireworks-to-more-ms-dhoni-magic-1368677
Scraping: https://www.espncricinfo.com/story/ipl-2023-mohit-sharma-puts-on-a-stirring-show-for-gujarat-titans-against-punjab-kings-1368621
Scraping: https://www.espncricinfo.com/story/ipl-2023-kkr-how-amateur-suyash-sharma-signed-up-for-kkr-magical-mystery-spin-tour-1368462
Scraping: https://www.espncricinfo.com/story/ipl-shane-watson-i-will-be-blown-away-if-david-warner-doesnt-set-the-ipl-alight-1368474
Scraping: https://www.espncricinfo.com/story/csk-vs-rr-ipl-2023-r-ashwin-criticises-umpiring-decisions-in-ipl-have-left-me-flummoxed-1368452
Scraping: https://www.espncricinfo.com/story/csk-vs-rr-ipl-2023-r-ashwin-crashes-chennai-super-kings-party-to-silence-chepauk-1368448
Scraping: https://www.espncricinfo.co

Scraping: https://www.espncricinfo.com/story/ipl-2023-rcb-batter-rajat-patidar-ruled-out-with-heel-injury-1366833
Scraping: https://www.espncricinfo.com/story/ipl-2023-ms-dhoni-no-balls-and-wides-hurting-chennai-super-kings-really-bad-1366807
Scraping: https://www.espncricinfo.com/story/lsg-coach-morne-morkel-sees-mark-wood-and-kyle-mayers-a-bonus-in-ipl-2023-1366788
Scraping: https://www.espncricinfo.com/story/ipl-2023-ms-dhoni-and-chennai-super-kings-recreate-old-chepauk-magic-1366772
Scraping: https://www.espncricinfo.com/story/does-mark-wood-hold-the-record-for-the-best-figures-on-ipl-debut-ask-steven-1366588
Scraping: https://www.espncricinfo.com/story/aussies-overseas-ipl-and-county-cricket-smith-labuschagne-hazlewood-green-warner-1365275
Scraping: https://www.espncricinfo.com/story/kkr-allrounder-shakib-al-hasan-opts-out-of-ipl-2023-1366681
Scraping: https://www.espncricinfo.com/story/ipl-2023-the-josh-little-journey-from-pembroke-under-11s-to-the-ipl-bright-lights-1366645
Scrap

Scraping: https://www.espncricinfo.com/story/englands-harry-brook-on-ashes-ipl-mlb-and-kevin-pietersen-1363738
Scraping: https://www.espncricinfo.com/story/david-warner-to-captain-delhi-capitals-in-ipl-2023-1363737
Scraping: https://www.espncricinfo.com/story/rcb-allrounder-will-jacks-ruled-out-of-ipl-2023-due-to-injury-1363679
Scraping: https://www.espncricinfo.com/story/punjab-kings-sweating-on-jonny-bairstow-availability-for-ipl-2023-1361192
Scraping: https://www.espncricinfo.com/story/jhye-richardson-out-of-ipl-2023-likely-to-miss-ashes-too-1363009
Scraping: https://www.espncricinfo.com/story/sa-players-kagiso-rabada-david-miller-aiden-markram-to-join-ipl-on-april-3-1362396
Scraping: https://www.espncricinfo.com/story/india-news-jasprit-bumrah-undergoes-back-surgery-plan-is-to-have-him-ready-in-time-for-2023-odi-world-cup-1362296
Scraping: https://www.espncricinfo.com/story/players-can-review-on-field-wide-and-no-ball-decisions-in-wpl-ipl-2023-1361902
Scraping: https://www.espncric

Scraping: https://www.espncricinfo.com/story/ipl-auction-set-for-december-23-in-kochi-1344093
Scraped data saved to /Users/hemantg/Desktop/dl-projectscraped_articles_2023_titles.csv


In [17]:
import pandas as pd
import requests
from bs4 import BeautifulSoup

# Function to scrape individual articles
def scrape_article(url, timeout=30):
    """Download one article page and extract title, date, summary and body.

    Args:
        url: Address of the article page to download.
        timeout: Seconds to wait for the server before giving up; passed
            straight to ``requests.get``.

    Returns:
        A dict with the keys ``url``, ``title``, ``date``, ``summary`` and
        ``content``. A field whose element is not found on the page is
        ``"N/A"``. If the download or the parsing raises (including an HTTP
        error status), the error is printed and every field except ``url``
        is the string ``"Error"``.
    """
    try:
        # Without a timeout requests waits indefinitely on a stalled server,
        # which would hang the whole scraping loop on a single URL.
        response = requests.get(url, timeout=timeout)
        # Treat 4xx/5xx answers as failures instead of parsing an error page
        # as if it were the article.
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')

        # Extract the title
        title = soup.select_one('h1.ds-text-title-l.ds-font-bold')
        title_text = title.get_text(strip=True) if title else "N/A"

        # Extract the date
        date = soup.select_one('div[data-behavior="date_time"] span.ds-text-tight-xs.ds-text-typo-mid3')
        date_text = date.get_text(strip=True) if date else "N/A"

        # The first <p> in the document is used as the one-line summary.
        # NOTE(review): assumes the summary precedes any other paragraph.
        summary = soup.select_one('p')
        summary_text = summary.get_text(strip=True) if summary else "N/A"

        # Main content: every element with the ci-html-content class,
        # joined one element per line.
        content = soup.select('.ci-html-content')
        full_content = "\n".join(block.get_text(strip=True) for block in content)

        return {
            "url": url,
            "title": title_text,
            "date": date_text,
            "summary": summary_text,
            # "N/A" only when no content element matched at all.
            "content": full_content if content else "N/A",
        }
    except Exception as e:
        # Best-effort boundary: one bad URL must not abort the whole run.
        print(f"Error scraping {url}: {e}")
        return {"url": url, "title": "Error", "date": "Error", "summary": "Error", "content": "Error"}

# Source list of URLs and the CSV that receives the scraped rows
input_file = "/Users/hemantg/Desktop/dl-project-data-scraping-2018.csv"  # Replace with your actual input file path
output_file = "/Users/hemantg/Desktop/dl-projectscraped_articles-2018-content.csv"  # Replace with your desired output file path

# Load the links: one URL per row, no header line
df_links = pd.read_csv(
    input_file,
    names=["url"],
    header=None,
)

# Visit every link in file order and keep one record per article
scraped_data = []
for url in df_links["url"]:
    print(f"Scraping: {url}")
    article_data = scrape_article(url)
    scraped_data += [article_data]

# Write all records out in one go, leaving the index column off
df_scraped = pd.DataFrame(scraped_data)
df_scraped.to_csv(output_file, index=False)

print(f"Scraped data saved to {output_file}")

Scraping: https://www.espncricinfo.com/story/the-ten-fastest-fifties-in-ipl-history-1086140
Scraping: https://www.espncricinfo.com/story/the-greatest-ipl-performances-no-2-shane-watson-117-not-out-vs-the-sunrisers-hyderabad-1258682
Scraping: https://www.espncricinfo.com/story/the-greatest-ipl-performances-no-4-rashid-khan-3-for-19-and-34-not-out-vs-the-kolkata-knight-riders-1257858
Scraping: https://www.espncricinfo.com/story/with-age-batting-down-the-order-was-like-quicksand-ms-dhoni-1149163
Scraping: https://www.espncricinfo.com/story/sanjay-manjrekar-ipl-2018-competitive-yes-high-quality-no-1148298
Scraping: https://www.espncricinfo.com/story/aakash-chopra-why-india-will-rely-heavily-on-jasprit-bumrah-in-the-coming-season-1147746
Scraping: https://www.espncricinfo.com/story/vote-for-your-ipl-2018-team-of-the-tournament-1148237
Scraping: https://www.espncricinfo.com/story/stats-ambati-rayudu-shane-watson-s-most-prolific-ipl-seasons-1148064
Scraping: https://www.espncricinfo.com/story

Scraping: https://www.espncricinfo.com/story/lanning-raj-goswami-to-feature-in-women-s-exhibition-t20-1146890
Scraping: https://www.espncricinfo.com/story/from-tests-to-t20s-kane-williamson-masters-his-own-range-1146936
Scraping: https://www.espncricinfo.com/story/jasprit-bumrah-the-master-of-death-over-stifles-1146829
Scraping: https://www.espncricinfo.com/story/talking-points-kl-rahul-or-bust-for-kings-xi-1146813
Scraping: https://www.espncricinfo.com/story/ipl-2018-qualification-scenarios-what-if-mumbai-indians-beat-kings-xi-punjab-1146766
Scraping: https://www.espncricinfo.com/story/how-does-kkr-s-win-against-royals-impact-playoff-qualification-1146682
Scraping: https://www.espncricinfo.com/story/ipl-qualification-scenarios-rcb-mumbai-keep-pressure-on-kkr-royals-1146603
Scraping: https://www.espncricinfo.com/story/confidence-and-wickets-return-to-kuldeep-yadav-s-repertoire-1146736
Scraping: https://www.espncricinfo.com/story/jos-buttler-as-opener-does-half-the-job-for-rajasthan-roy

Scraping: https://www.espncricinfo.com/story/kane-williamson-gives-sunrisers-the-extra-gear-1144792
Scraping: https://www.espncricinfo.com/story/remodelled-chris-lynn-delivers-another-win-1144825
Scraping: https://www.espncricinfo.com/story/rcb-s-plans-go-awry-as-narine-lynn-live-on-the-edge-1144841
Scraping: https://www.espncricinfo.com/story/shreyas-iyer-takes-on-the-wristspinners-as-planned-1144615
Scraping: https://www.espncricinfo.com/story/talking-points-four-balls-that-changed-the-game-1144734
Scraping: https://www.espncricinfo.com/story/mumbai-indians-ride-on-the-rohit-sharma-factor-1144725
Scraping: https://www.espncricinfo.com/story/what-makes-sunrisers-hyderabad-s-bowling-attack-so-good-1144738
Scraping: https://www.espncricinfo.com/story/will-royal-challengers-bangalore-find-the-spark-their-season-needs-1144690
Scraping: https://www.espncricinfo.com/story/junior-dala-replaces-injured-chris-morris-at-delhi-daredevils-1144571
Scraping: https://www.espncricinfo.com/story/advis

Scraping: https://www.espncricinfo.com/story/shakib-al-hasan-gives-sunrisers-all-round-edge-1143309
Scraping: https://www.espncricinfo.com/story/mumbai-rue-bad-luck-and-a-poor-finish-1143293
Scraping: https://www.espncricinfo.com/story/injured-kamlesh-nagarkoti-ruled-out-of-ipl-season-1143238
Scraping: https://www.espncricinfo.com/story/kohli-and-ashwin-s-contrasting-use-of-spin-1143220
Scraping: https://www.espncricinfo.com/story/vishal-dikshit-third-seamer-in-focus-for-mumbai-after-two-narrow-losses-1143205
Scraping: https://www.espncricinfo.com/story/how-the-t20-stars-fare-against-the-yorker-1143183
Scraping: https://www.espncricinfo.com/story/rejuvenated-umesh-yadav-breathes-fire-1143216
Scraping: https://www.espncricinfo.com/story/talking-points-sunrisers-hyderabad-nearly-throw-it-away-1143109
Scraping: https://www.espncricinfo.com/story/suresh-raina-to-miss-next-two-csk-games-with-calf-injury-1143034
Scraping: https://www.espncricinfo.com/story/gutted-that-we-have-to-move-from-ch

Scraping: https://www.espncricinfo.com/story/jsw-sports-buys-50-stake-in-delhi-daredevils-1139543
Scraping: https://www.espncricinfo.com/story/gautam-gambhir-returns-to-delhi-daredevils-as-captain-1139262
Scraping: https://www.espncricinfo.com/story/dinesh-karthik-to-captain-kkr-in-ipl-2018-1138738
Scraping: https://www.espncricinfo.com/story/r-ashwin-to-captain-kings-xi-punjab-1137998
Scraping: https://www.espncricinfo.com/story/i-ll-never-be-predictable-r-ashwin-1138016
Scraping: https://www.espncricinfo.com/story/no-surgery-required-for-chris-lynn-s-shoulder-injury-1137617
Scraping: https://www.espncricinfo.com/story/poll-who-should-be-team-captains-in-the-ipl-1137200
Scraping: https://www.espncricinfo.com/story/mumbai-to-play-chennai-super-kings-in-ipl-season-opener-1136560
Scraping: https://www.espncricinfo.com/story/shane-warne-returns-to-rajasthan-royals-as-mentor-1136394
Scraping: https://www.espncricinfo.com/story/pune-demands-to-host-ipl-play-offs-1136307
Scraping: https://ww

In [19]:
import pandas as pd
import requests
from bs4 import BeautifulSoup

# Function to scrape individual articles
def scrape_article(url, timeout=30):
    """Download one article page and extract title, date, summary and body.

    Args:
        url: Address of the article page to download.
        timeout: Seconds to wait for the server before giving up; passed
            straight to ``requests.get``.

    Returns:
        A dict with the keys ``url``, ``title``, ``date``, ``summary`` and
        ``content``. A field whose element is not found on the page is
        ``"N/A"``. If the download or the parsing raises (including an HTTP
        error status), the error is printed and every field except ``url``
        is the string ``"Error"``.
    """
    try:
        # Without a timeout requests waits indefinitely on a stalled server,
        # which would hang the whole scraping loop on a single URL.
        response = requests.get(url, timeout=timeout)
        # Treat 4xx/5xx answers as failures instead of parsing an error page
        # as if it were the article.
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')

        # Extract the title
        title = soup.select_one('h1.ds-text-title-l.ds-font-bold')
        title_text = title.get_text(strip=True) if title else "N/A"

        # Extract the date
        date = soup.select_one('div[data-behavior="date_time"] span')
        date_text = date.get_text(strip=True) if date else "N/A"

        # The first <p> in the document is used as the one-line summary.
        # NOTE(review): assumes the summary precedes any other paragraph.
        summary = soup.select_one('p')
        summary_text = summary.get_text(strip=True) if summary else "N/A"

        # Main content: body paragraphs inside the first div.ds-px-4.
        # NOTE(review): presumably that div is the article container;
        # confirm against the page markup.
        content_div = soup.select_one('div.ds-px-4')
        content_paragraphs = (
            content_div.select('p.ds-text-comfortable-l.ci-html-content')
            if content_div
            else []
        )
        # A missing container and a container without matching paragraphs
        # both yield "N/A".
        if content_paragraphs:
            full_content = "\n".join(p.get_text(strip=True) for p in content_paragraphs)
        else:
            full_content = "N/A"

        return {
            "url": url,
            "title": title_text,
            "date": date_text,
            "summary": summary_text,
            "content": full_content,
        }
    except Exception as e:
        # Best-effort boundary: one bad URL must not abort the whole run.
        print(f"Error scraping {url}: {e}")
        return {"url": url, "title": "Error", "date": "Error", "summary": "Error", "content": "Error"}

# Source list of URLs and the CSV that receives the scraped rows
input_file = "/Users/hemantg/Desktop/dl-project-data-scraping-2023.csv"  # Replace with your actual input file path
output_file = "/Users/hemantg/Desktop/dl-project-scraped_articles_2023_dates.csv"  # Replace with your desired output file path

# Load the links: one URL per row, no header line
df_links = pd.read_csv(
    input_file,
    names=["url"],
    header=None,
)

# Visit every link in file order and keep one record per article
scraped_data = []
for url in df_links["url"]:
    print(f"Scraping: {url}")
    article_data = scrape_article(url)
    scraped_data += [article_data]

# Write all records out in one go, leaving the index column off
df_scraped = pd.DataFrame(scraped_data)
df_scraped.to_csv(output_file, index=False)

print(f"Scraped data saved to {output_file}")

Scraping: https://www.espncricinfo.com/story/espncricinfo-awards-men-s-t20-leagues-bowling-winner-yuzvendra-chahal-4-for-29-vs-sunrisers-hyderabad-jaipur-1416817
Scraping: https://www.espncricinfo.com/story/devon-conway-playing-in-high-intensity-games-has-allowed-me-to-be-clearer-in-different-t20-scenarios-1381242
Scraping: https://www.espncricinfo.com/story/jitesh-sharma-i-want-it-to-come-down-to-me-to-finish-matches-so-i-can-crack-20-off-ten-balls-ipl-2023-1380599
Scraping: https://www.espncricinfo.com/story/what-india-england-and-australia-can-learn-from-ms-dhoni-as-a-big-test-summer-begins-mark-nicholas-1379875
Scraping: https://www.espncricinfo.com/story/stump-mic-podcast-a-wish-list-for-indian-stadiums-and-india-s-t20i-xi-after-ipl-1379636
Scraping: https://www.espncricinfo.com/story/ms-dhoni-undergoes-knee-surgery-in-mumbai-1379332
Scraping: https://www.espncricinfo.com/story/ben-stokes-ipl-gymwork-john-terry-at-csk-2023-1379256
Scraping: https://www.espncricinfo.com/story/ms-dh

Scraping: https://www.espncricinfo.com/story/ipl-rcb-virat-kohli-passes-chris-gayle-for-most-hundreds-in-the-ipl-1377575
Scraping: https://www.espncricinfo.com/story/middle-order-woes-rear-its-ugly-head-as-rcb-go-home-emptyhanded-again-1377583
Scraping: https://www.espncricinfo.com/story/ipl-2023-rcb-vs-gt-virat-kohli-i-am-playing-my-best-t20-cricket-again-1377535
Scraping: https://www.espncricinfo.com/story/rohit-sharma-if-we-don-t-qualify-for-playoffs-we-have-only-ourselves-to-blame-1377476
Scraping: https://www.espncricinfo.com/story/stats-mi-s-season-of-200-plus-totals-cameron-green-s-rapid-century-1377477
Scraping: https://www.espncricinfo.com/story/ipl-2023-unsettled-unit-confusing-selection-calls-make-it-another-season-to-forget-for-sunrisers-hyderabad-1377483
Scraping: https://www.espncricinfo.com/story/ipl-2023-hardik-pandya-is-neymar-as-neymar-could-rarely-be-1377390
Scraping: https://www.espncricinfo.com/story/ipl-2023-rain-threatens-crucial-rcb-vs-gujarat-titans-game-in-ben

Scraping: https://www.espncricinfo.com/story/yuzvendra-chahal-keeps-his-style-of-wristspin-alive-while-being-himself-1374831
Scraping: https://www.espncricinfo.com/story/ms-dhoni-shivam-dube-cameos-highlight-csks-no-holds-barred-approach-1374828
Scraping: https://www.espncricinfo.com/story/ipl-2023-what-do-kkr-and-rr-need-to-do-to-qualify-for-the-playoffs-1374816
Scraping: https://www.espncricinfo.com/story/meet-nehal-wadhera-the-batter-from-punjab-who-is-lighting-up-ipl-2023-for-mumbai-indians-1374664
Scraping: https://www.espncricinfo.com/story/ipl-2023-batters-on-top-a-relentless-run-fest-with-no-signs-of-slowdown-1374632
Scraping: https://www.espncricinfo.com/story/trent-boult-as-a-bowler-i-ve-always-thought-i-ve-got-full-control-of-dictating-the-play-1374418
Scraping: https://www.espncricinfo.com/story/ipl-2023-mumbai-indians-show-off-their-batting-might-one-200-plus-total-at-a-time-1374616
Scraping: https://www.espncricinfo.com/story/ipl-2023-mi-vs-rcb-du-plessis-disappointed-wit

Scraping: https://www.espncricinfo.com/story/lucknow-super-giants-smash-the-second-highest-total-in-ipl-history-1371957
Scraping: https://www.espncricinfo.com/story/ricky-ponting-it-was-a-tough-decision-to-leave-prithvi-shaw-out-1371880
Scraping: https://www.espncricinfo.com/story/ravi-shastri-backs-ajinkya-rahane-s-inclusion-in-wtc-final-squad-1371824
Scraping: https://www.espncricinfo.com/story/ipl-2023-srh-batter-harry-brooks-struggle-to-prove-he-belongs-in-the-ipl-1371797
Scraping: https://www.espncricinfo.com/story/ipl-2023-yashasvi-jaiswal-takes-a-step-towards-becoming-a-complete-batter-1371786
Scraping: https://www.espncricinfo.com/story/virat-kohli-at-rcb-witnessing-a-hero-in-the-flesh-in-ipl-2023-1371644
Scraping: https://www.espncricinfo.com/story/hamstring-injury-rules-washington-sundar-out-of-ipl-2023-1371621
Scraping: https://www.espncricinfo.com/story/ipl-2023-half-time-report-csk-and-titans-fly-but-mid-table-logjam-leaves-everything-to-play-for-1371609
Scraping: https://

Scraping: https://www.espncricinfo.com/story/ipl-2023-nicholas-pooran-to-rishabh-pant-that-first-step-you-take-thats-when-you-become-motivated-1368687
Scraping: https://www.espncricinfo.com/story/ipl-2023-week-2-podcast-from-rinku-singh-s-fireworks-to-more-ms-dhoni-magic-1368677
Scraping: https://www.espncricinfo.com/story/ipl-2023-mohit-sharma-puts-on-a-stirring-show-for-gujarat-titans-against-punjab-kings-1368621
Scraping: https://www.espncricinfo.com/story/ipl-2023-kkr-how-amateur-suyash-sharma-signed-up-for-kkr-magical-mystery-spin-tour-1368462
Scraping: https://www.espncricinfo.com/story/ipl-shane-watson-i-will-be-blown-away-if-david-warner-doesnt-set-the-ipl-alight-1368474
Scraping: https://www.espncricinfo.com/story/csk-vs-rr-ipl-2023-r-ashwin-criticises-umpiring-decisions-in-ipl-have-left-me-flummoxed-1368452
Scraping: https://www.espncricinfo.com/story/csk-vs-rr-ipl-2023-r-ashwin-crashes-chennai-super-kings-party-to-silence-chepauk-1368448
Scraping: https://www.espncricinfo.co

Scraping: https://www.espncricinfo.com/story/ipl-2023-rcb-batter-rajat-patidar-ruled-out-with-heel-injury-1366833
Scraping: https://www.espncricinfo.com/story/ipl-2023-ms-dhoni-no-balls-and-wides-hurting-chennai-super-kings-really-bad-1366807
Scraping: https://www.espncricinfo.com/story/lsg-coach-morne-morkel-sees-mark-wood-and-kyle-mayers-a-bonus-in-ipl-2023-1366788
Scraping: https://www.espncricinfo.com/story/ipl-2023-ms-dhoni-and-chennai-super-kings-recreate-old-chepauk-magic-1366772
Scraping: https://www.espncricinfo.com/story/does-mark-wood-hold-the-record-for-the-best-figures-on-ipl-debut-ask-steven-1366588
Scraping: https://www.espncricinfo.com/story/aussies-overseas-ipl-and-county-cricket-smith-labuschagne-hazlewood-green-warner-1365275
Scraping: https://www.espncricinfo.com/story/kkr-allrounder-shakib-al-hasan-opts-out-of-ipl-2023-1366681
Scraping: https://www.espncricinfo.com/story/ipl-2023-the-josh-little-journey-from-pembroke-under-11s-to-the-ipl-bright-lights-1366645
Scrap

Scraping: https://www.espncricinfo.com/story/englands-harry-brook-on-ashes-ipl-mlb-and-kevin-pietersen-1363738
Scraping: https://www.espncricinfo.com/story/david-warner-to-captain-delhi-capitals-in-ipl-2023-1363737
Scraping: https://www.espncricinfo.com/story/rcb-allrounder-will-jacks-ruled-out-of-ipl-2023-due-to-injury-1363679
Scraping: https://www.espncricinfo.com/story/punjab-kings-sweating-on-jonny-bairstow-availability-for-ipl-2023-1361192
Scraping: https://www.espncricinfo.com/story/jhye-richardson-out-of-ipl-2023-likely-to-miss-ashes-too-1363009
Scraping: https://www.espncricinfo.com/story/sa-players-kagiso-rabada-david-miller-aiden-markram-to-join-ipl-on-april-3-1362396
Scraping: https://www.espncricinfo.com/story/india-news-jasprit-bumrah-undergoes-back-surgery-plan-is-to-have-him-ready-in-time-for-2023-odi-world-cup-1362296
Scraping: https://www.espncricinfo.com/story/players-can-review-on-field-wide-and-no-ball-decisions-in-wpl-ipl-2023-1361902
Scraping: https://www.espncric

Scraping: https://www.espncricinfo.com/story/ipl-auction-set-for-december-23-in-kochi-1344093
Scraped data saved to /Users/hemantg/Desktop/dl-project-scraped_articles_2023_dates.csv


In [2]:
import os
import pandas as pd
import requests
from bs4 import BeautifulSoup

# Function to scrape individual articles
def scrape_article(url, timeout=30):
    """Download one article page and extract its title, date, summary and body.

    Args:
        url: Address of the article page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` may wait indefinitely on an unresponsive
            connection, which would stall the whole batch.

    Returns:
        A dict with the keys ``url``, ``title``, ``date``, ``summary`` and
        ``content``. A field whose selector matches nothing is ``"N/A"``.
        If fetching or parsing fails for any reason (including a 4xx/5xx
        HTTP status), every field except ``url`` is ``"Error"``.
    """
    try:
        response = requests.get(url, timeout=timeout)
        # Treat HTTP error pages as failures instead of parsing them as if
        # they were articles.
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')

        # Extract the title
        title = soup.select_one('h1.ds-text-title-l.ds-font-bold')
        title_text = title.get_text(strip=True) if title else "N/A"

        # Extract the date
        date = soup.select_one('div[data-behavior="date_time"] span')
        date_text = date.get_text(strip=True) if date else "N/A"

        # Extract the one-line summary (the first <p> found in the document)
        summary = soup.select_one('p')
        summary_text = summary.get_text(strip=True) if summary else "N/A"

        # Extract the main content: locate the container first, then collect
        # only the body paragraphs inside it.
        content_div = soup.select_one('div.ds-px-4')
        content_paragraphs = (
            content_div.select('p.ds-text-comfortable-l.ci-html-content')
            if content_div
            else []
        )
        if content_paragraphs:
            full_content = "\n".join(
                p.get_text(strip=True) for p in content_paragraphs
            )
        else:
            full_content = "N/A"

        return {
            "url": url,
            "title": title_text,
            "date": date_text,
            "summary": summary_text,
            "content": full_content,
        }
    except Exception as e:
        # Deliberately broad: a single bad page must not abort the batch, so
        # the failure is reported and recorded as an "Error" row instead.
        print(f"Error scraping {url}: {e}")
        return {"url": url, "title": "Error", "date": "Error", "summary": "Error", "content": "Error"}

# Base paths for input and output
base_input_path = "/Users/hemantg/Desktop/"
base_output_path = "/Users/hemantg/Desktop/"

# Process one URL list per year, 2008 through 2023 inclusive (range() excludes
# its stop value). Years without an input file are reported and skipped.
for year in range(2008, 2024):
    input_file = os.path.join(base_input_path, f"dl-project-data-scraping-{year}.csv")
    output_file = os.path.join(base_output_path, f"dl-project-scraped_articles_rp-{year}_dates.csv")

    # Check if the input file exists
    if not os.path.exists(input_file):
        print(f"Input file for {year} does not exist. Skipping...")
        continue

    # Read the CSV file (assuming it's a single column without a header)
    df_links = pd.read_csv(input_file, header=None, names=["url"])

    # Scrape each URL
    scraped_data = []
    try:
        for url in df_links["url"]:
            print(f"Scraping: {url}")
            article_data = scrape_article(url)
            scraped_data.append(article_data)
    except KeyboardInterrupt:
        # A long crawl is often stopped by hand. Keep the rows gathered so far
        # in a separate file (so a complete output from an earlier run is not
        # overwritten), then let the interrupt propagate.
        if scraped_data:
            partial_file = os.path.join(
                base_output_path,
                f"dl-project-scraped_articles_rp-{year}_dates.partial.csv",
            )
            pd.DataFrame(scraped_data).to_csv(partial_file, index=False)
            print(
                f"Interrupted: {len(scraped_data)} partial rows for {year} "
                f"saved to {partial_file}"
            )
        raise

    # Save the scraped data to a new CSV file
    df_scraped = pd.DataFrame(scraped_data)
    df_scraped.to_csv(output_file, index=False)

    print(f"Scraped data for {year} saved to {output_file}")

Input file for 2008 does not exist. Skipping...
Scraping: https://www.espncricinfo.com/story/faqs-the-indian-premier-league-337868
Scraping: https://www.espncricinfo.com/story/warne-still-the-best-612366
Scraping: https://www.espncricinfo.com/story/time-for-a-reality-check-612364
Scraping: https://www.espncricinfo.com/story/management-skills-learnt-from-the-ipl-612440
Scraping: https://www.espncricinfo.com/story/cricket-lovely-cricket-612466
Scraping: https://www.espncricinfo.com/story/ipl-auction-tentatively-set-for-january-29-369935
Scraping: https://www.espncricinfo.com/story/bloodaxe-insists-it-was-not-my-show-612949
Scraping: https://www.espncricinfo.com/story/first-year-figures-spell-success-for-franchises-371471
Scraping: https://www.espncricinfo.com/story/gilchrist-to-lead-deccan-chargers-371655
Scraping: https://www.espncricinfo.com/story/laxman-removal-unfair-371715
Scraping: https://www.espncricinfo.com/story/sign-of-things-to-come-613024
Scraping: https://www.espncricinfo.c

Scraping: https://www.espncricinfo.com/story/i-really-did-not-expect-it-duminy-389528
Scraping: https://www.espncricinfo.com/story/no-takers-for-haddin-kapugedera-614279
Scraping: https://www.espncricinfo.com/story/no-buyers-for-shakib-al-hasan-614286
Scraping: https://www.espncricinfo.com/story/the-moments-and-the-math-389495
Scraping: https://www.espncricinfo.com/story/player-auction-to-begin-at-10-45-am-614275
Scraping: https://www.espncricinfo.com/story/tyron-henderson-goes-to-rajasthan-for-650-000-614282
Scraping: https://www.espncricinfo.com/story/proud-mortaza-credits-hard-work-389524
Scraping: https://www.espncricinfo.com/story/pietersen-and-flintoff-sell-for-1-55-million-each-614278
Scraping: https://www.espncricinfo.com/story/ryder-goes-to-bangalore-mills-to-mumbai-614285
Scraping: https://www.espncricinfo.com/story/flintoff-and-pietersen-most-expensive-buys-389466
Scraping: https://www.espncricinfo.com/story/ipl-success-driven-by-quality-not-just-money-614274
Scraping: https

Scraping: https://www.espncricinfo.com/story/ipl-schedule-likely-to-be-finalised-next-week-396052
Scraping: https://www.espncricinfo.com/story/negotiations-betwen-ipl-and-sony-fail-396020
Scraping: https://www.espncricinfo.com/story/in-a-league-of-its-own-614736
Scraping: https://www.espncricinfo.com/story/bcci-calls-for-emergency-meeting-on-ipl-396183
Scraping: https://www.espncricinfo.com/story/reports-of-ipl-move-to-south-africa-denied-396100
Scraping: https://www.espncricinfo.com/story/point-taken-614749
Scraping: https://www.espncricinfo.com/story/pietersen-to-lead-bangalore-royal-challengers-396253
Scraping: https://www.espncricinfo.com/story/we-are-ready-to-host-ipl-csa-396333
Scraping: https://www.espncricinfo.com/story/franchises-back-venue-switch-396377
Scraping: https://www.espncricinfo.com/story/ipl-to-be-played-outside-india-396319
Scraping: https://www.espncricinfo.com/story/buchanan-still-hopeful-of-india-hosting-ipl-396396
Scraping: https://www.espncricinfo.com/story/en

Scraping: https://www.espncricinfo.com/story/lost-on-commercial-street-400053
Scraping: https://www.espncricinfo.com/story/profiteers-at-large-614929
Scraping: https://www.espncricinfo.com/story/ipl-seek-compromise-in-box-row-399242
Scraping: https://www.espncricinfo.com/story/prasad-keen-to-learn-from-chennai-s-foreign-stars-399230
Scraping: https://www.espncricinfo.com/story/dirty-harry-to-be-at-ipl-opening-ceremony-614935
Scraping: https://www.espncricinfo.com/story/warne-plots-repeat-of-ipl-success-614934
Scraping: https://www.espncricinfo.com/story/ipl-can-help-kp-work-on-poor-twenty20-skills-614943
Scraping: https://www.espncricinfo.com/story/warne-regrets-losing-watson-and-tanvir-399293
Scraping: https://www.espncricinfo.com/story/the-sixes-don-t-add-up-399274
Scraping: https://www.espncricinfo.com/story/looking-back-looking-forward-399184
Scraping: https://www.espncricinfo.com/story/famous-in-india-400146
Scraping: https://www.espncricinfo.com/story/coachie-coo-397777
Scraping:

Scraping: https://www.espncricinfo.com/story/pietersen-warned-for-dissent-400850
Scraping: https://www.espncricinfo.com/story/old-and-beautiful-400712
Scraping: https://www.espncricinfo.com/story/in-defence-of-cheerleaders-400742
Scraping: https://www.espncricinfo.com/story/playing-hot-staying-cool-400849
Scraping: https://www.espncricinfo.com/story/making-fools-of-viewers-strategically-615015
Scraping: https://www.espncricinfo.com/story/a-question-of-jesse-400705
Scraping: https://www.espncricinfo.com/story/deck-the-halls-with-boughs-of-lolly-398859
Scraping: https://www.espncricinfo.com/story/chappell-leapfrogs-to-second-place-400774
Scraping: https://www.espncricinfo.com/story/gilchrist-lends-weight-to-opening-statements-401035
Scraping: https://www.espncricinfo.com/story/adventures-of-a-co-pilot-400905
Scraping: https://www.espncricinfo.com/story/ipl-brings-the-good-the-bad-and-modi-to-cape-town-615030
Scraping: https://www.espncricinfo.com/story/finding-humour-in-rumour-615034
Scr

Scraping: https://www.espncricinfo.com/story/bangalore-win-but-can-t-hide-the-flaws-402183
Scraping: https://www.espncricinfo.com/story/sharp-nicks-400934
Scraping: https://www.espncricinfo.com/story/the-eloquent-silence-of-sourav-ganguly-401969
Scraping: https://www.espncricinfo.com/story/kulasekara-called-up-by-kings-xi-punjab-402042
Scraping: https://www.espncricinfo.com/story/bowled-over-by-confidence-402196
Scraping: https://www.espncricinfo.com/story/warm-that-bench-lads-402024
Scraping: https://www.espncricinfo.com/story/chin-up-kolkata-402238
Scraping: https://www.espncricinfo.com/story/warm-that-bench-lads-402024
Scraping: https://www.espncricinfo.com/story/bal-inches-closer-402192
Scraping: https://www.espncricinfo.com/story/punjab-look-to-keep-the-winning-habit-402359
Scraping: https://www.espncricinfo.com/story/ipl-as-a-launch-pad-615084
Scraping: https://www.espncricinfo.com/story/the-ipl-movie-unexpected-hair-and-a-team-voyeur-402229
Scraping: https://www.espncricinfo.com

Scraping: https://www.espncricinfo.com/story/look-beyond-the-stars-615140
Scraping: https://www.espncricinfo.com/story/watson-bracken-and-hopes-out-of-ipl-403314
Scraping: https://www.espncricinfo.com/story/rajasthan-s-amit-singh-reported-for-suspect-action-403550
Scraping: https://www.espncricinfo.com/story/manjrekar-makes-it-to-no-3-403530
Scraping: https://www.espncricinfo.com/story/i-value-my-wicket-more-than-ever-403566
Scraping: https://www.espncricinfo.com/story/what-s-to-love-about-the-ipl-615148
Scraping: https://www.espncricinfo.com/story/never-mind-the-break-403546
Scraping: https://www.espncricinfo.com/story/a-braai-in-the-park-403623
Scraping: https://www.espncricinfo.com/story/arthur-backs-cricket-australia-s-decision-403563
Scraping: https://www.espncricinfo.com/story/wanted-drama-403541
Scraping: https://www.espncricinfo.com/story/resurgent-teams-look-to-continue-winning-403590
Scraping: https://www.espncricinfo.com/story/just-chillin-402292
Scraping: https://www.espncr

Scraping: https://www.espncricinfo.com/story/deccan-seek-to-complete-perfect-weekend-404655
Scraping: https://www.espncricinfo.com/story/a-quiet-day-at-the-top-404653
Scraping: https://www.espncricinfo.com/story/they-re-playing-our-song-400072
Scraping: https://www.espncricinfo.com/story/sehwag-targets-return-to-batting-form-404777
Scraping: https://www.espncricinfo.com/story/south-africa-bowling-coach-joins-deccan-chargers-404694
Scraping: https://www.espncricinfo.com/story/amit-singh-s-bowling-action-cleared-404776
Scraping: https://www.espncricinfo.com/story/mccullum-reprimanded-for-dissent-404721
Scraping: https://www.espncricinfo.com/story/the-other-amla-404691
Scraping: https://www.espncricinfo.com/story/canny-spinners-keep-punjab-alive-404741
Scraping: https://www.espncricinfo.com/story/enough-with-the-friendliness-404818
Scraping: https://www.espncricinfo.com/story/twist-in-the-tale-404897
Scraping: https://www.espncricinfo.com/story/shaz-pom-ali-and-co-404793
Scraping: https:/

Scraping: https://www.espncricinfo.com/story/endorsements-get-a-shake-up-405833
Scraping: https://www.espncricinfo.com/story/twenty-thoughts-on-the-ipl-405910
Scraping: https://www.espncricinfo.com/story/rookies-a-blog-a-dog-and-sreesanth-405971
Scraping: https://www.espncricinfo.com/story/all-pain-no-gain-615278
Scraping: https://www.espncricinfo.com/story/of-endings-and-other-things-405930
Scraping: https://www.espncricinfo.com/story/handling-twenty20-domination-needs-new-mindset-majola-405991
Scraping: https://www.espncricinfo.com/story/packed-wanderers-enjoys-the-battle-405950
Scraping: https://www.espncricinfo.com/story/bitter-for-kumble-sweet-for-gilchrist-615274
Scraping: https://www.espncricinfo.com/story/thirteen-from-two-405927
Scraping: https://www.espncricinfo.com/story/gilchrist-reflects-on-a-job-well-done-405989
Scraping: https://www.espncricinfo.com/story/parting-is-such-sweet-sorrow-406002
Scraping: https://www.espncricinfo.com/story/meeting-manish-405852
Scraping: http

Scraping: https://www.espncricinfo.com/story/hedonic-regression-and-the-ipl-616252
Scraping: https://www.espncricinfo.com/story/whatmore-to-be-kolkata-coach-429319
Scraping: https://www.espncricinfo.com/story/clarke-keen-to-play-ipl-next-year-430369
Scraping: https://www.espncricinfo.com/story/the-ipl-at-a-theatre-near-you-616388
Scraping: https://www.espncricinfo.com/story/ipl-to-trial-pink-balls-in-practice-games-433990
Scraping: https://www.espncricinfo.com/story/cricket-and-generation-xbox-616484
Scraping: https://www.espncricinfo.com/story/ipl-revises-terms-for-overseas-players-434799
Scraping: https://www.espncricinfo.com/story/ipl-sets-november-20-deadline-for-pakistan-players-434764
Scraping: https://www.espncricinfo.com/story/pakistan-players-close-to-ipl-return-434906
Scraping: https://www.espncricinfo.com/story/the-pcb-has-not-recevied-any-invitation-letters-from-ipl-436043
Scraping: https://www.espncricinfo.com/story/syed-sahabuddin-joins-mumbai-indians-436152
Scraping: htt

Scraping: https://www.espncricinfo.com/story/pakistan-sports-minister-demands-inquiry-into-ipl-snub-445956
Scraping: https://www.espncricinfo.com/story/cartoon-guide-to-spin-445930
Scraping: https://www.espncricinfo.com/story/abdul-razzaq-linked-to-kolkata-knight-riders-446045
Scraping: https://www.espncricinfo.com/story/modi-and-the-messy-pakistan-affair-617018
Scraping: https://www.espncricinfo.com/story/the-british-summer-pastime-that-is-the-ipl-617015
Scraping: https://www.espncricinfo.com/story/in-this-game-of-greed-no-one-is-above-board-617012
Scraping: https://www.espncricinfo.com/story/ian-chappell-why-modi-is-like-sehwag-446349
Scraping: https://www.espncricinfo.com/story/players-associations-frustrated-by-ipl-rebuff-over-security-446508
Scraping: https://www.espncricinfo.com/story/cartoon-squillionaire-446482
Scraping: https://www.espncricinfo.com/story/indian-home-minister-guarantees-player-security-446535
Scraping: https://www.espncricinfo.com/story/players-want-access-to-s

Scraping: https://www.espncricinfo.com/story/ipl-2010-bangalore-bank-on-batsmen-and-india-contingent-451135
Scraping: https://www.espncricinfo.com/story/dhoni-and-fleming-place-faith-in-local-talent-in-ipl-451162
Scraping: https://www.espncricinfo.com/story/brad-hodge-likely-to-open-for-kolkata-451126
Scraping: https://www.espncricinfo.com/story/deccan-chargers-bank-on-batting-for-an-encore-451241
Scraping: https://www.espncricinfo.com/story/deccan-chargers-bank-on-batting-for-an-encore-451241
Scraping: https://www.espncricinfo.com/story/ipl-won-t-grow-beyond-seven-weeks-modi-451193
Scraping: https://www.espncricinfo.com/story/shane-warne-gives-thumbs-up-for-new-ipl-franchises-451235
Scraping: https://www.espncricinfo.com/story/island-effect-on-the-ipl-451337
Scraping: https://www.espncricinfo.com/story/delhi-daredevils-hope-to-get-third-time-lucky-451228
Scraping: https://www.espncricinfo.com/story/tendulkar-rewinds-time-at-the-bombay-gymkhana-451310
Scraping: https://www.espncricinfo

Scraping: https://www.espncricinfo.com/story/chennai-super-kings-hit-by-ms-dhoni-injury-452270
Scraping: https://www.espncricinfo.com/story/delhi-daredevils-v-mumbai-indians-delhi-452324
Scraping: https://www.espncricinfo.com/story/with-great-paean-617323
Scraping: https://www.espncricinfo.com/story/plays-of-the-day-praveen-swings-warnie-sinks-452420
Scraping: https://www.espncricinfo.com/story/dr-doosra-s-demented-devices-the-goggly-452279
Scraping: https://www.espncricinfo.com/story/the-many-faces-of-sachin-tendulkar-452325
Scraping: https://www.espncricinfo.com/story/jolted-delhi-daredevils-look-for-consistency-452407
Scraping: https://www.espncricinfo.com/story/ahmedabad-takes-steps-to-curb-moth-menace-452372
Scraping: https://www.espncricinfo.com/story/gautam-gambhir-ruled-out-for-chennai-game-452416
Scraping: https://www.espncricinfo.com/story/why-the-ipl-shouldn-t-suffer-our-censure-617330
Scraping: https://www.espncricinfo.com/story/punjab-seek-first-win-against-favourite-oppon

Scraping: https://www.espncricinfo.com/story/end-of-ipl-for-damien-martyn-453427
Scraping: https://www.espncricinfo.com/story/silly-season-has-arrived-in-india-617384
Scraping: https://www.espncricinfo.com/story/royal-challengers-bangalore-v-delhi-daredevils-ipl-2010-bangalore-453486
Scraping: https://www.espncricinfo.com/story/chennai-s-bare-fast-bowling-cupboard-453522
Scraping: https://www.espncricinfo.com/story/buoyant-rajasthan-aim-to-rein-in-deccan-453500
Scraping: https://www.espncricinfo.com/story/sidharth-monga-when-danny-met-pandee-453411
Scraping: https://www.espncricinfo.com/story/harbhajan-v-hayden-453517
Scraping: https://www.espncricinfo.com/story/amit-mishra-stays-classical-in-twenty20-453499
Scraping: https://www.espncricinfo.com/story/kedar-jadhav-helps-delhi-end-losing-streak-453482
Scraping: https://www.espncricinfo.com/story/shikhar-dhawan-propels-mumbai-s-chase-453503
Scraping: https://www.espncricinfo.com/story/tanya-aldred-ipl-greedy-or-good-453025
Scraping: htt

KeyboardInterrupt: 