In [2]:
import requests
from bs4 import BeautifulSoup
import csv

# URL of the page to scrape
url = "https://www.mubawab.ma/en/ct/casablanca/real-estate-for-sale:p:"

# Open a CSV file to write the data to
with open('scrapper/data.csv', 'w', newline='', encoding='utf-8') as csvfile:
    # Define the field names for the CSV file
    fieldnames = ['Title', 'Date', 'Size', 'Location', 'Description', 'Price']
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)

    # Write the header row to the CSV file
    writer.writeheader()

    # Loop over the first five pages of search results
    for page in range(1, 40):
        # Send a request to the URL and get the HTML content
        response = requests.get(url + str(page))
        html_content = response.content

        # Parse the HTML content with Beautiful Soup
        soup = BeautifulSoup(html_content, 'html.parser')
        price_elem = soup.find_all('span', class_=lambda x: x and 'priceTag' in x)
        p=0
        
        for listing in soup.find_all(class_=lambda x: x and 'contentBox' in x):
            # Initialize the row dictionary
            row = {}
            row['Price']= price_elem[p]
            # Extract the title, date, size, price, location, and description from the listing
            title_elem = listing.find('h2', class_='listingTit')
            row['Title'] = title_elem.text.strip() if title_elem is not None else None

            date_elem = listing.find('span', class_='listingDetails iconPadR')
            row['Date'] = date_elem.text.strip() if date_elem is not None else None

            size_elem = listing.find(['h4', 'p'], {'class': 'listingH4 floatR'})
            row['Size'] = size_elem.text.strip() if size_elem is not None else None



            location_elem = listing.find('h3', class_='listingH3')
            row['Location'] = location_elem.text.strip() if location_elem is not None else None

            description_elem = listing.find('p', class_='listingP descLi')
            row['Description'] = description_elem.text.strip() if description_elem is not None else None

            # Write the row to the CSV file
            writer.writerow(row)
            p=p+1
        

In [3]:
import pandas as pd
df=pd.read_csv('scrapper/data.csv')
df.head()

Unnamed: 0,Title,Date,Size,Location,Description,Price
0,Apartment to purchase in Oasis. Surface a...,Published today,"3 bedrooms, 128 m²",Oasis in\n\t \t\t\t\t\tCasablanca,don't miss out on this apartment for sale. pri...,"<span class=""priceTag hardShadow float-right f..."
1,Beautiful apartment for sale in Oasis. 3...,Published today,"2 bedrooms, 98 m²",Oasis in\n\t \t\t\t\t\tCasablanca,amazing deal on this apartment for sale. price...,"<span class=""priceTag hardShadow float-right f..."
2,Apartment to purchase in Maârif Extension...,Published today,"2 bedrooms, 79 m²",Maârif Extension in\n\t \t\t\t\t\tCasablanca,ideal for investors or young professionals loo...,"<span class=""priceTag hardShadow float-right f..."
3,Apartment to purchase in Ahl Loghlam (Hay...,Published today,"3 bedrooms, 82 m²",Ahl Loghlam (Hay Assalam) in\n\t \t\t\t\t\tCas...,it is in the heart of the new city of tit mell...,"<span class=""priceTag hardShadow float-right f..."
4,Apartment to purchase in Ahl Loghlam (Hay...,Published today,"3 bedrooms, 67 m²",Ahl Loghlam (Hay Assalam) in\n\t \t\t\t\t\tCas...,it is in the heart of the new city of tit mell...,"<span class=""priceTag hardShadow float-right f..."


In [4]:
import re
df['Price'] = df['Price'].apply(lambda x: re.sub('[^0-9,]', '', x))  # remove non-numeric characters
df['Price'] = df['Price'].apply(lambda x: re.sub(',', '.', x))  # replace commas with dots


In [5]:
df.head()

Unnamed: 0,Title,Date,Size,Location,Description,Price
0,Apartment to purchase in Oasis. Surface a...,Published today,"3 bedrooms, 128 m²",Oasis in\n\t \t\t\t\t\tCasablanca,don't miss out on this apartment for sale. pri...,2.640.000
1,Beautiful apartment for sale in Oasis. 3...,Published today,"2 bedrooms, 98 m²",Oasis in\n\t \t\t\t\t\tCasablanca,amazing deal on this apartment for sale. price...,1.900.000
2,Apartment to purchase in Maârif Extension...,Published today,"2 bedrooms, 79 m²",Maârif Extension in\n\t \t\t\t\t\tCasablanca,ideal for investors or young professionals loo...,1.561.000
3,Apartment to purchase in Ahl Loghlam (Hay...,Published today,"3 bedrooms, 82 m²",Ahl Loghlam (Hay Assalam) in\n\t \t\t\t\t\tCas...,it is in the heart of the new city of tit mell...,340.000
4,Apartment to purchase in Ahl Loghlam (Hay...,Published today,"3 bedrooms, 67 m²",Ahl Loghlam (Hay Assalam) in\n\t \t\t\t\t\tCas...,it is in the heart of the new city of tit mell...,250.000


In [6]:
df[['NumBedrooms', 'Size']] = df['Size'].str.split(',', expand=True)
df.head()

Unnamed: 0,Title,Date,Size,Location,Description,Price,NumBedrooms
0,Apartment to purchase in Oasis. Surface a...,Published today,128 m²,Oasis in\n\t \t\t\t\t\tCasablanca,don't miss out on this apartment for sale. pri...,2.640.000,3 bedrooms
1,Beautiful apartment for sale in Oasis. 3...,Published today,98 m²,Oasis in\n\t \t\t\t\t\tCasablanca,amazing deal on this apartment for sale. price...,1.900.000,2 bedrooms
2,Apartment to purchase in Maârif Extension...,Published today,79 m²,Maârif Extension in\n\t \t\t\t\t\tCasablanca,ideal for investors or young professionals loo...,1.561.000,2 bedrooms
3,Apartment to purchase in Ahl Loghlam (Hay...,Published today,82 m²,Ahl Loghlam (Hay Assalam) in\n\t \t\t\t\t\tCas...,it is in the heart of the new city of tit mell...,340.000,3 bedrooms
4,Apartment to purchase in Ahl Loghlam (Hay...,Published today,67 m²,Ahl Loghlam (Hay Assalam) in\n\t \t\t\t\t\tCas...,it is in the heart of the new city of tit mell...,250.000,3 bedrooms


In [8]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1326 entries, 0 to 1325
Data columns (total 7 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   Title        1326 non-null   object
 1   Date         1287 non-null   object
 2   Size         1284 non-null   object
 3   Location     1326 non-null   object
 4   Description  1326 non-null   object
 5   Price        1326 non-null   object
 6   NumBedrooms  1287 non-null   object
dtypes: object(7)
memory usage: 72.6+ KB


In [24]:

df['Location'] = df['Location'].apply(lambda x: re.sub('in\n   \t\t\t\t\tCasablanca', '', x))  # replace commas with dots
df['Location'] = df['Location'].apply(lambda x: re.sub('in\n   \t\t\t\t\t\tCasablanca', '', x))  # replace commas with dots
df['Location'] = df['Location'].apply(lambda x: re.sub('Casablanca', '', x))  # replace commas with dots

df['Location'] = df['Location'].apply(lambda x: re.sub('in\n\t \t\t\t\t\t', '', x))  # replace commas with dots
df['Location'] = df['Location'].apply(lambda x: re.sub('in\n \t\t\t\t\t', '', x))  # replace commas with dots

df.head()

Unnamed: 0,Title,Date,Size,Location,Description,Price,NumBedrooms
0,Apartment to purchase in Oasis. Surface a...,Published today,128 m²,Oasis,don't miss out on this apartment for sale. pri...,2.640.000,3
1,Beautiful apartment for sale in Oasis. 3...,Published today,98 m²,Oasis,amazing deal on this apartment for sale. price...,1.900.000,2
2,Apartment to purchase in Maârif Extension...,Published today,79 m²,Maârif Extension,ideal for investors or young professionals loo...,1.561.000,2
3,Apartment to purchase in Ahl Loghlam (Hay...,Published today,82 m²,Ahl Loghlam (Hay Assalam),it is in the heart of the new city of tit mell...,340.000,3
4,Apartment to purchase in Ahl Loghlam (Hay...,Published today,67 m²,Ahl Loghlam (Hay Assalam),it is in the heart of the new city of tit mell...,250.000,3


In [12]:

df['NumBedrooms'] = df['NumBedrooms'].str.extract('(\d+)', expand=False)  # extract the digits from the string

df.head()

Unnamed: 0,Title,Date,Size,Location,Description,Price,NumBedrooms
0,Apartment to purchase in Oasis. Surface a...,Published today,128 m²,Oasis Casablanca,don't miss out on this apartment for sale. pri...,2.640.000,3
1,Beautiful apartment for sale in Oasis. 3...,Published today,98 m²,Oasis Casablanca,amazing deal on this apartment for sale. price...,1.900.000,2
2,Apartment to purchase in Maârif Extension...,Published today,79 m²,Maârif Extension Casablanca,ideal for investors or young professionals loo...,1.561.000,2
3,Apartment to purchase in Ahl Loghlam (Hay...,Published today,82 m²,Ahl Loghlam (Hay Assalam) Casablanca,it is in the heart of the new city of tit mell...,340.000,3
4,Apartment to purchase in Ahl Loghlam (Hay...,Published today,67 m²,Ahl Loghlam (Hay Assalam) Casablanca,it is in the heart of the new city of tit mell...,250.000,3


In [25]:
df.describe()

Unnamed: 0,Title,Date,Size,Location,Description,Price,NumBedrooms
count,1326,1287,1284,1326,1326,1326.0,1287
unique,839,8,376,93,1156,573.0,12
top,Apartment to purchase in Al Azhar. 1 love...,Published today,50 m²,Californie,the boulevard de biarritz becomes the new luxu...,250.0,3
freq,20,454,23,107,13,50.0,436
