
Part B

Pick your favorite between 'query_original' and 'query_dt_c' and let's add some more functionality.

Queries used daily are stored in a CSV file on the drive (queries.csv).

Make your class able to instantiate from this CSV and create as many instances as there are lines in the CSV.
Each line in the CSV has two columns, 'Query' and 'Number', as expected.
The CSV file path argument shall be optional: if a path is provided it is used; otherwise a default path is used.
Print the 'query' string of every instance created from the CSV (not the instances as a whole).

In [None]:
import pandas as pd
from collections.abc import Iterable


class Query:
    """A query string paired with a non-negative usage counter.

    ``no_of_query`` is a validated property backed by ``_no_of_query``;
    assigning a negative value raises ``ValueError``.
    """

    def __init__(self, query_string: str, no_of_query: int = 0):
        self.query_string = query_string
        # Assigned through the property, so the constructor argument is
        # validated by the same setter as later assignments.
        self.no_of_query = no_of_query

    @property
    def no_of_query(self):
        """Return the stored counter."""
        return self._no_of_query

    @no_of_query.setter
    def no_of_query(self, value):
        """Store ``value``; raise ``ValueError`` when it is below zero."""
        if value < 0:
            message = f"no_of_query must be zero or greater not {value}"
            raise ValueError(message)
        self._no_of_query = value

    def __str__(self):
        label = self.__class__.__name__
        return f"{label} object qstring={self.query_string}, no_of_query={self.no_of_query}"


class QueryOriginal:
    """Collection of ``Query`` objects loaded from a CSV file or an iterable.

    Iterating the collection yields its ``Query`` objects; ``str()`` renders
    the class name followed by one line per query.
    """

    def __init__(self, queries: str | Iterable[Query] = "queries.csv"):
        """Build the collection.

        Args:
            queries: Either a CSV path (defaults to ``"queries.csv"``) or an
                iterable of ``Query`` objects.

        Raises:
            ValueError: If ``queries`` is neither a ``str`` nor an iterable
                made up exclusively of ``Query`` objects.
        """
        if isinstance(queries, str):
            self._load_queries_from_csv(queries)
            return
        if not isinstance(queries, Iterable):
            raise ValueError(
                "Can only pass a str or an Iterable of Query objects"
            )
        # Materialise before validating: checking a one-shot iterator (e.g. a
        # generator) in place would exhaust it and leave nothing to store.
        query_list = list(queries)
        if not all(isinstance(query, Query) for query in query_list):
            raise ValueError(
                "Can only pass a str or an Iterable of Query objects"
            )
        self.query_list = query_list

    def __iter__(self):
        """Iterate over the stored ``Query`` objects."""
        return iter(self.query_list)

    def _load_queries_from_csv(self, csvpath: str):
        """Populate ``query_list`` with one ``Query`` per row of the CSV.

        The first two columns are read positionally as the query string and
        the number; column names are not consulted.
        """
        df = pd.read_csv(csvpath)
        # index=False/name=None yields plain tuples without the index, so the
        # two data columns sit at positions 0 and 1.
        self.query_list = [
            Query(row[0], row[1])
            for row in df.itertuples(index=False, name=None)
        ]

    def __str__(self):
        class_name = self.__class__.__name__
        queries = "\n".join(map(str, self.query_list))
        return f"{class_name}: \n{queries}"

In [51]:
# No argument given, so the collection is loaded from the default "queries.csv".
query_collection = QueryOriginal()

In [52]:
# Uses QueryOriginal.__str__: the class name followed by one line per query.
print(query_collection)

QueryOriginal: 
Query object qstring=csvtest1, no_of_query=11
Query object qstring=csvtest2, no_of_query=22
Query object qstring=csvtest3, no_of_query=33
Query object qstring=csvtest0, no_of_query=0


In [53]:
# The exercise asks for each instance's query string only, not the instance
# as a whole, so print the attribute rather than the Query object.
for query in query_collection:
    print(query.query_string)

csvtest1
csvtest2
csvtest3
csvtest0


Create a guessing game to be initialized as a class method. You can, for example, pick one random integer from 0-10 and have the user guess it in three tries.
Hint: The method for step 3 will not access or use any class attribute. So what type of method should be used?

In [None]:
import random


class GuessingGame:
    """Console game: guess a random integer from 0 to 10 within three tries."""

    @staticmethod
    def start_game():
        """Play rounds until the player answers 'n' to the replay prompt.

        Replays run in a loop rather than by calling ``start_game`` again, so
        the call stack does not grow with the number of rounds played.
        """
        while True:
            GuessingGame._play_round()
            if not GuessingGame._wants_replay():
                break

    @staticmethod
    def _play_round():
        """Run one round: pick the number and give the player three attempts."""
        correct = random.randint(0, 10)
        print("Start  of game, guess a number from 0 to 10")
        for i in range(3):
            answer = GuessingGame._read_guess(3 - i)
            if answer == correct:
                print("Congrats! You won!")
                break
            print("wrong answer")
        else:
            # Only reached when all three attempts were used without a win.
            print("Game Over")

    @staticmethod
    def _read_guess(attempts_left):
        """Prompt until the player types something that parses as an int."""
        while True:
            raw = input(f"you got {attempts_left} attempts left: ")
            try:
                return int(raw)
            except ValueError:
                # Non-numeric input does not cost an attempt; ask again.
                continue

    @staticmethod
    def _wants_replay():
        """Ask until the player answers 'y' or 'n'; return True for 'y'."""
        play_again = None
        while play_again not in ("y", "n"):
            play_again = input(
                "Would you like to play again? ('y', 'n'): "
            )
        return play_again == "y"

In [None]:
# Start the interactive game; it reads the player's guesses via input().
GuessingGame.start_game()

Part C  

Convert the web scraping code you developed previously into an object-oriented design (use the code from flexcar or atp rankings).

## Refactor ATP Scraping Exercise

In [1]:
import re
import requests
from bs4 import BeautifulSoup
import logging
import time
import random


logger = logging.getLogger(__name__)
# getLogger returns the same logger object every time this code is re-run in
# one process (e.g. re-executing the notebook cell). Attach the file handler
# only once, otherwise every record is written to log.txt once per re-run.
if not logger.handlers:
    logger.addHandler(logging.FileHandler("log.txt"))
logger.setLevel(logging.INFO)


class Week:
    def __init__(self, html_content: str):
        self._soup = BeautifulSoup(html_content, "html.parser")
        self._week = self._get_active_week()
        self._ranks = list(range(1, 101))
        self._player_names = self._get_player_names()
        self._countries = self._get_countries()
        self._points_all = self._get_points()
        sizes = map(
            len,
            [
                self._ranks,
                self._player_names,
                self._countries,
                self._points_all,
            ],
        )
        if not all(v > 0 for v in sizes):
            logger.info(
                f"week {self._week} error\nranks: {self._ranks}\nplayer_names: {self._player_names}\ncountries: {self._countries} \n points_all: {self._points_all}"
            )

    def to_list(self):
        return [
            {
                "week": self._week,
                "rank": rank,
                "player_name": player_name,
                "country": country,
                "points": points,
            }
            for rank, player_name, country, points in zip(
                self._ranks,
                self._player_names,
                self._countries,
                self._points_all,
            )
        ]

    def _get_active_week(self) -> str:
        """
        returns the active week, directly through scraping and not through the url
        unlike the get_all_urls it will extract the text contents instead of
        the value attribute, otherwise we would get date 'Current-Date' for our
        most recent date.
        """

        select_tag = self._soup.find("select", id="dateWeek-filter")
        return select_tag.find("option", selected=True).text

    def _get_player_names(self) -> list[str]:
        """
        returns a list of the top-100 players names ordered by rank
        """
        li_tags = self._soup.find_all("li", class_="name center")
        return [li.find("span").text for li in li_tags]

    def _get_countries(self):
        """
        returns a list of the top-100 players countries ordered by rank
        """
        svg_tags = self._soup.find_all("svg", class_="atp-flag")[:100]
        use_tags = [svg_tag.find("use") for svg_tag in svg_tags]
        links = [use_tag["href"] for use_tag in use_tags]
        countries = [
            self.__class__._extract_flag_abbr(link) for link in links
        ]
        return self._convert_flag_abbr(countries)

    @staticmethod
    def _extract_flag_abbr(string: str) -> str:
        """
        extract the flag abbreviation out of a link to the flag png
        """
        return re.search(r"(?<=#flag-)[A-Za-z]{3}$", string).group(0)

    def _convert_flag_abbr(self, countries: str) -> str:
        """
        helper function to convert a flag/country abbreviation to the
        actual country name. The relationship between the can be find inside
        the source code and I did not have rely on external sources
        """
        select_region_filter = self._soup.find(
            "select", id="region-filter"
        )
        region_option_tags = select_region_filter.find_all("option")
        countries_tuple = [
            (region_option_tag["value"], region_option_tag.text)
            for region_option_tag in region_option_tags
        ]
        country_dict = {k.lower(): v for k, v in countries_tuple}
        return [
            country_dict.get(country_abbr, country_abbr)
            for country_abbr in countries
        ]

    def _get_points(self) -> list[str]:
        """
        after some debugging a edge case failures, I adjusted and tested
        the points extraction to the following code.
        I essentially had to find the tag before the one I was looking for,
        and the seek the sibling.
        The reason for picking the slice is that I sometimes get 101 results
        instead of 100 and in that case the first one does not lead to any points.
        Taking the slice of the last 100 seems safe.
        """
        points_tds = self._soup.find_all("td", class_="age small-cell")
        return [
            points_td.find_next_sibling("td").find("a").text.strip()
            for points_td in points_tds[-100:]
        ]


class AtpDb:
    """Scrape every weekly ATP singles ranking page into a flat list of rows.

    All requests are made in ``__init__``: the main rankings page is fetched
    to discover the available weeks, each weekly page is downloaded, and each
    is parsed with ``Week``. The combined rows are available via ``weeks``.
    """

    _RANKINGS_URL = "https://www.atptour.com/en/rankings/singles"
    # Seconds. Without an explicit timeout a stalled connection would block
    # the whole scrape indefinitely.
    _REQUEST_TIMEOUT = 30

    def __init__(self):
        """Download and parse all weekly ranking pages.

        Raises:
            ConnectionError: If any page is answered with a non-200 status.
        """
        self._headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0 Safari/537.36"
        }
        response = self._fetch(self._RANKINGS_URL)
        self._soup = BeautifulSoup(response.content, "html.parser")
        self._response_contents = self._get_response_contents()
        content_length = len(self._response_contents)
        self._weeks = []
        for i, html_content in enumerate(self._response_contents, start=1):
            logger.info(f"parsing week {i} out of {content_length}")
            self._weeks.extend(Week(html_content).to_list())

    @property
    def weeks(self):
        """Row dicts of all parsed weeks, in the order they were parsed."""
        return self._weeks

    def _fetch(self, url: str):
        """GET *url* with the scraper's headers and timeout.

        Raises:
            ConnectionError: If the response status is not 200.
        """
        response = requests.get(
            url, headers=self._headers, timeout=self._REQUEST_TIMEOUT
        )
        if response.status_code != 200:
            raise ConnectionError(f"failed to fetch data from {url}")
        return response

    def _get_all_urls(self) -> list[str]:
        """Return a sorted list with all weekly urls from the main soup object."""
        select_tag = self._soup.find("select", id="dateWeek-filter")
        date_tags = select_tag.find_all("option")
        dates = [
            (
                date_tag["value"]
                if "Current" not in date_tag["value"]
                else "Current+Date"
            )
            for date_tag in date_tags
        ]
        base_url = f"{self._RANKINGS_URL}?dateWeek="
        return sorted(f"{base_url}{date}" for date in dates)

    def _get_response_contents(self):
        """Download every weekly page and return the response bodies in url order.

        A short random pause precedes each request.

        Raises:
            ConnectionError: If any page is answered with a non-200 status.
        """
        urls = self._get_all_urls()
        urls_length = len(urls)
        response_content_list = []
        for i, url in enumerate(urls, start=1):
            logger.info(f"processing request {i} out of {urls_length}")
            time.sleep(random.uniform(0.3, 0.4))
            response_content_list.append(self._fetch(url).content)
        return response_content_list

In [None]:
import pandas as pd

# Instantiating AtpDb runs the whole scrape; `weeks` is the resulting list of
# per-player dicts, which becomes one DataFrame row per dict.
df = pd.DataFrame(AtpDb().weeks)

In [4]:
# Save the DataFrame to the pickle file 'df.pkl' (relative path).
df.to_pickle('df.pkl')