In [None]:
import requests
from bs4 import BeautifulSoup
import csv
import pandas as pd

# Listing categories to crawl. Each entry carries a 1-based "index", a display
# "name", the category's listing URL ("link") and "pageNumber", which the
# crawl loop uses as the upper bound of its page range.
list_room_type = [
    {"index": position, "name": label, "link": listing_url, "pageNumber": 1000}
    for position, (label, listing_url) in enumerate(
        [
            ("House", "https://mogi.vn/ho-chi-minh/thue-nha"),
            ("Apartment", "https://mogi.vn/ho-chi-minh/thue-can-ho"),
            ("Room for Rent", "https://mogi.vn/ho-chi-minh/thue-phong-tro-nha-tro"),
        ],
        start=1,
    )
]

# Placeholder stored in the CSV when a listing field is missing or empty
NO_INFO = "No information"

# Seconds to wait for the server before giving up on a single page request
REQUEST_TIMEOUT = 30


def _clean_text(element):
    """Return the stripped text of *element*, or the placeholder when it is empty."""
    text = element.text.strip()
    return text if text else NO_INFO


# Flag variable to check if the header has been written or not
header_written = False

# Crawl every page of every room type and append one CSV row per listing.
# The output file and one HTTP session are opened once for the whole crawl
# instead of once per page.
with open('raw_data_mogi2.csv', 'a', newline='', encoding='utf-8') as file, \
        requests.Session() as session:
    writer = csv.writer(file)

    # Append mode positions the stream at the end of the file, so a non-zero
    # offset means the file already has content (e.g. from an earlier run)
    # and must not receive a second header row in the middle of the data.
    header_written = file.tell() > 0
    if not header_written:
        writer.writerow(['Type', 'Room Name', 'Price', 'Location', 'Time', 'Area', 'Bedrooms', 'Bathrooms'])
        header_written = True  # Mark that the header has been written

    for room_type in list_room_type:
        for page_number in range(room_type['pageNumber']):
            url = f"{room_type['link']}?cp={page_number}"

            # A network failure or timeout on one page must not abort the whole crawl
            try:
                response = session.get(url, timeout=REQUEST_TIMEOUT)
            except requests.RequestException as error:
                print(f"Unable to connect to the page {url}. Error: {error}")
                continue

            if response.status_code != 200:
                print(f"Unable to connect to the page {url}. Status code: {response.status_code}")
                continue

            soup = BeautifulSoup(response.text, 'html.parser')
            room_name_elements = soup.find_all('a', class_='link-overlay')
            room_price_elements = soup.find_all('div', {'class': 'price'})
            location_elements = soup.find_all('div', {'class': 'prop-addr'})
            time_elements = soup.find_all('div', {'class': 'prop-created'})

            # Each <ul class="prop-attr"> holds area, bedrooms and bathrooms as <li> items
            prop_attr_elements = soup.find_all('ul', class_='prop-attr')

            # The lists are collected independently and paired by position;
            # zip() stops at the shortest one, so unequal lengths mean some
            # listings are dropped or paired with another listing's fields.
            columns = (
                room_name_elements,
                room_price_elements,
                location_elements,
                time_elements,
                prop_attr_elements,
            )
            if len({len(column) for column in columns}) > 1:
                print(f"Warning: element counts differ on {url}; rows may be misaligned or dropped")

            for room_name, room_price, location, posted, prop_attr in zip(*columns):
                # Take up to three <li> texts and pad with the placeholder so
                # a listing with fewer attributes cannot crash the run.
                attribute_texts = [_clean_text(item) for item in prop_attr.find_all('li')[:3]]
                attribute_texts += [NO_INFO] * (3 - len(attribute_texts))
                area_text, bedrooms_text, bathrooms_text = attribute_texts

                writer.writerow([
                    room_type['name'],
                    _clean_text(room_name),
                    _clean_text(room_price),
                    _clean_text(location),
                    _clean_text(posted),
                    area_text,
                    bedrooms_text,
                    bathrooms_text,
                ])

            # Push each finished page to disk so progress is visible during a long crawl
            file.flush()

# Read the scraped CSV back into a DataFrame (its first column becomes the
# index), then report how many rows were loaded and display the table.
data = pd.read_csv(
    "raw_data_mogi2.csv",
    index_col=0,
    encoding='utf-8',
)
print(f"Data collection complete. Total records collected: {len(data)}")
print(data)
