# Scrap Proxy Ips

In [6]:
import requests
from bs4 import BeautifulSoup

# URL of the website
url = "https://free-proxy-list.net/"

response = requests.get(url)

soup = BeautifulSoup(response.content, "html.parser")

ip_table = soup.find("table", class_="table table-striped table-bordered")

ip_addresses = []

rows = ip_table.find_all("tr")[1:]

# Iterate over the first 100 rows to get the IPs
for row in rows[:10]:
    ip_address = row.find("td").text
    ip_addresses.append(ip_address)

# Append IP addresses to the existing file
with open("ips.txt", "w") as file:
    for ip_address in ip_addresses:
        file.write(ip_address + "\n")



# Filter the valid ips

In [4]:
import requests
from concurrent.futures import ThreadPoolExecutor

def check_proxy(proxy):
    try:
        ip, port = proxy.split(':')
        response = requests.get('http://ipinfo.io', proxies={'http': f'http://{ip}:{port}', 'https': f'http://{ip}:{port}'}, timeout=5)
        if response.status_code == 200:
            print(f"{proxy} is valid and working")
            return proxy
        else:
            print(f"{proxy} is invalid")
            return None
    except Exception as e:
        print(f"Error checking {proxy}: {e}")
        return None

def save_to_file(proxies, filename):
    with open(filename, 'w') as f:
        for proxy in proxies:
            f.write(proxy + '\n')
    print(f"Saved {len(proxies)} valid proxies to {filename}")

def main():
    with open('ips.txt', 'r') as f:
        proxies = f.read().splitlines()

    with ThreadPoolExecutor(max_workers=20) as executor:
        valid_proxies = list(filter(None, executor.map(check_proxy, proxies)))

    print("Valid proxies:", valid_proxies)
    save_to_file(valid_proxies, 'valid-proxies.txt')

if __name__ == "__main__":
    main()


154.6.98.23:3128 is valid and working
38.62.221.4:3128 is valid and working
154.6.96.102:3128 is valid and working
154.6.97.217:3128 is valid and working
154.6.97.53:3128 is valid and working
154.6.97.49:3128 is valid and working
38.62.220.194:3128 is valid and working
154.6.96.45:3128 is valid and working
154.6.98.132:3128 is valid and working
154.6.98.203:3128 is valid and working
154.6.98.148:3128 is valid and working
38.62.221.74:3128 is valid and working
154.6.99.91:3128 is valid and working
38.62.223.189:3128 is valid and working
38.62.221.140:3128 is valid and working
38.62.221.47:3128 is valid and working
38.62.222.108:3128 is valid and working
154.6.97.73:3128 is valid and working
38.62.221.155:3128 is valid and working
38.62.223.192:3128 is valid and working
154.6.98.41:3128 is valid and working
154.6.98.222:3128 is valid and working
38.62.221.49:3128 is valid and working
154.6.97.112:3128 is valid and working
38.62.221.176:3128 is valid and working
38.62.223.109:3128 is vali

# Extract the links of the articles from pagination

In [15]:
import requests
from bs4 import BeautifulSoup
import csv
import time

# Function to read proxy IPs from a file
def read_proxy_ips(file_path):
    with open(file_path, "r") as file:
        proxy_ips = [line.strip() for line in file if line.strip()]
    return proxy_ips

# Function to fetch article links from a page
def fetch_article_links(url, proxy):
    try:
        response = requests.get(url, proxies={"http": proxy, "https": proxy}, timeout=5)
        if response.status_code == 200:
            soup = BeautifulSoup(response.content, "html.parser")
            article_section = soup.find("section", class_="editorial river")
            if article_section:
                article_links = []
                for article_element in article_section.find_all("a", class_="card-item__link"):
                    article_link = article_element.get("href")
                    full_article_link = "https://www.gamespot.com" + article_link
                    article_links.append(full_article_link)
                return article_links
    except Exception as e:
        print(f"Error fetching article links from {url} using proxy {proxy}: {e}")
    return []

#Main function to fetch article links from multiple pages with pagination
def main():
    base_url = "https://www.gamespot.com/games/reviews/?page={}"
    proxy_ips = read_proxy_ips("valid-proxies.txt")  #Assuming proxy IPs are stored in validIps.txt

    with open("article_links.csv", "w", newline="", encoding="utf-8") as file:
        writer = csv.writer(file)
        writer.writerow(["Article Link", "Proxy IP"])

        for page in range(1, 126):  
            print(f"Fetching page {page}...")
            page_url = base_url.format(page)
            for proxy in proxy_ips:
                article_links = fetch_article_links(page_url, proxy)
                for article_link in article_links:
                    writer.writerow([article_link, proxy])
                if article_links:  
                    break
            time.sleep(2)  

if __name__ == "__main__":
    main()


Fetching page 1...
Fetching page 2...
Fetching page 3...
Fetching page 4...
Fetching page 5...
Fetching page 6...
Fetching page 7...
Fetching page 8...
Fetching page 9...
Fetching page 10...
Fetching page 11...
Fetching page 12...
Fetching page 13...
Fetching page 14...
Fetching page 15...
Fetching page 16...
Fetching page 17...
Fetching page 18...
Fetching page 19...
Fetching page 20...
Fetching page 21...
Fetching page 22...
Fetching page 23...
Fetching page 24...
Fetching page 25...
Fetching page 26...
Fetching page 27...
Fetching page 28...
Fetching page 29...
Fetching page 30...
Fetching page 31...
Fetching page 32...
Fetching page 33...
Fetching page 34...
Fetching page 35...
Fetching page 36...
Fetching page 37...
Fetching page 38...
Fetching page 39...
Fetching page 40...
Fetching page 41...
Fetching page 42...
Fetching page 43...
Fetching page 44...
Fetching page 45...
Fetching page 46...
Fetching page 47...
Fetching page 48...
Fetching page 49...
Fetching page 50...
Fetching 

# Sample of a single page

In [None]:
import requests
from bs4 import BeautifulSoup

# Website link
url = "https://www.gamespot.com/reviews/like-a-dragon-gaiden-review-the-man-who-cant-escape-the-yakuza/1900-6418151/"

# Send a GET request to the URL
response = requests.get(url)

# Check if the request was successful (status code 200)
if response.status_code == 200:
    # Parse the HTML content
    soup = BeautifulSoup(response.content, "html.parser")

    # Extract the URL of the image
    image_url = soup.find("div", class_="kubrick-lead")["style"].split("url(")[1].split(")")[0]

    # Extract the title
    title = soup.find("h1", class_="kubrick-info__title").text.strip()

    # Extract the content represented by paragraph tags within the specified section
    article_section = soup.find("section", class_="article-body typography-format")
    paragraphs = article_section.find_all("p")
    content = "\n".join([p.get_text(strip=True) for p in paragraphs])

    # Extract review rating
    review_rating_container = soup.find("div", class_="review-breakdown__score-container")
    review_rating = review_rating_container.find("div", class_="review-ring-score__score").text.strip()

    # Extract the good and bad aspects
    review_breakdown = soup.find("div", class_="review-breakdown__lists")
    good_aspects = [li.text.strip() for li in review_breakdown.find("ul", class_="review-breakdown__list").find_all("li")]
    bad_aspects = [li.text.strip() for li in review_breakdown.find_all("ul", class_="review-breakdown__list")[1].find_all("li")]

    print("Image URL:", image_url)
    print("Title:", title)
    print("Content:")
    print(content)
    print("Review Rating:", review_rating)
    print("Good Aspects:", good_aspects)
    print("Bad Aspects:", bad_aspects)
else:
    print("Failed to load the page. Status code:", response.status_code)


# Scrapping 2000+ Articles on gaming reviews

In [26]:
import csv
import requests
from bs4 import BeautifulSoup
import random
import time

# Function to scrape article details
def scrape_article_details(url, proxy):
    try:
        proxies = {"http": proxy, "https": proxy}
        response = requests.get(url, proxies=proxies)

        if response.status_code == 200:
            soup = BeautifulSoup(response.content, "html.parser")

            #extracting the article
            image_url = soup.find("div", class_="kubrick-lead")["style"].split("url(")[1].split(")")[0]

            #extracting the articles
            title = soup.find("h1", class_="kubrick-info__title").text.strip()

            #extracting all the pargraphs within the article body only
            article_section = soup.find("section", class_="article-body typography-format")
            paragraphs = article_section.find_all("p")
            content = "\n".join([p.get_text(strip=True) for p in paragraphs])

            # Extract review rating
            review_rating_container = soup.find("div", class_="review-breakdown__score-container")
            review_rating = review_rating_container.find("div", class_="review-ring-score__score").text.strip()

            # Extract the good and bad aspects
            review_breakdown = soup.find("div", class_="review-breakdown__lists")
            good_aspects = [li.text.strip() for li in review_breakdown.find("ul", class_="review-breakdown__list").find_all("li")]
            bad_aspects = [li.text.strip() for li in review_breakdown.find_all("ul", class_="review-breakdown__list")[1].find_all("li")]

            # Return the scraped data
            return {
                "Proxy":proxies,
                "URL": url,
                "Image URL": image_url,
                "Title": title,
                "Content": content,
                "Review Rating": review_rating,
                "Good Aspects": good_aspects,
                "Bad Aspects": bad_aspects
            }
    except Exception as e:
        print(f"An error occurred with proxy {proxy} while scraping {url}: {str(e)}")
        return None

#Function to rotate proxy IPs
def rotate_proxies(proxies):
    while True:
        yield random.choice(proxies)

# Function to scrape article details using rotating proxies
def scrape_with_rotating_proxies(article_links, proxy_ips):
    with open("gaming_reviews.csv", mode='w', newline='', encoding='utf-8') as file:
        fieldnames = ["Proxy","URL", "Image URL", "Title", "Content", "Review Rating", "Good Aspects", "Bad Aspects"]
        writer = csv.DictWriter(file, fieldnames=fieldnames)
        writer.writeheader()
        count = 0
        proxy_rotator = rotate_proxies(proxy_ips)
        for article_link in article_links:
            proxy = next(proxy_rotator)
            scraped_data = scrape_article_details(article_link, proxy)
            if scraped_data:
                writer.writerow(scraped_data)
                print(f"{count } Scraped data for {article_link} saved to 'scraped_data.csv'.",proxy)
                count+=1
                time.sleep(2)  #Delay
            else:
                print(f"Failed to scrape data for {article_link}.")

#reading links from the csv file
article_links = []
with open("article_links.csv", mode='r', newline='', encoding='utf-8') as file:
    reader = csv.DictReader(file)
    for row in reader:
        article_links.append(row["Article Link"])

#reading the valid ips
proxy_ips = []
with open("valid-proxies.txt", mode='r', newline='', encoding='utf-8') as file:
    for line in file:
        proxy_ips.append(line.strip())

# Scrape article details using rotating proxies
scrape_with_rotating_proxies(article_links, proxy_ips)


0 Scraped data for https://www.gamespot.com/reviews/tekken-8-review-the-heat-of-battle/1900-6418169/ saved to 'scraped_data.csv'. 38.62.222.89:3128
1 Scraped data for https://www.gamespot.com/reviews/silent-hill-the-short-message-review-in-my-restless-dreams-i-flee-that-town/1900-6418173/ saved to 'scraped_data.csv'. 154.6.98.169:3128
2 Scraped data for https://www.gamespot.com/reviews/persona-3-reload-review-burn-your-dread/1900-6418171/ saved to 'scraped_data.csv'. 154.6.96.178:3128
3 Scraped data for https://www.gamespot.com/reviews/like-a-dragon-infinite-wealth-review-the-things-money-cant-buy/1900-6418170/ saved to 'scraped_data.csv'. 38.62.221.150:3128
4 Scraped data for https://www.gamespot.com/reviews/another-code-recollection-full-of-mysteries/1900-6418168/ saved to 'scraped_data.csv'. 154.6.96.197:3128
5 Scraped data for https://www.gamespot.com/reviews/prince-of-persia-the-lost-crown-review-crowning-achievement/1900-6418167/ saved to 'scraped_data.csv'. 38.62.221.140:3128
6 