# Crawl data and preprocessing notebook

## Crawl data

### Import libs

In [85]:
import requests
import string
import csv
import os
import time
import numpy as np
import random
import pandas as pd
import pymongo
import re
import glob
import py_vncorenlp
pd.set_option('display.max_columns', None)

### Variables definition

In [104]:
BASE_URL = 'https://gateway.chotot.com/v1/public/ad-listing'

LIMIT = 100
# Vị trí: HCM
REGION_V2 = 13000
# Category: Bất động sản
CG = 1000
# Type: Mua bán
ST = "s,k"

# MongoDB constants
ATLAS_URL = "mongodb+srv://conghuynt1999:foWcElkNDeLlDNIX@cluster1.oq1l7.mongodb.net/?retryWrites=false&replicaSet=atlas-5tjaoq-shard-0"
DATABASE_NAME = "chotot"
COLLECTION_NAME = "real_estate_for_sell"

# Crawl data 
ERROR_LIMIT = 6
USER_AGENTS = [
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
    'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.107 Safari/537.36',
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.212 Safari/537.36',
    'Mozilla/5.0 (iPhone; CPU iPhone OS 12_2 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Mobile/15E148',
    'Mozilla/5.0 (Linux; Android 11; SM-G960U) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.72 Mobile Safari/537.36'
]

# NLP
VN_CORE_NLP_PATH = "/Users/viet/Documents/other/p4ds/vncorenlp"

### Scrape data function

In [45]:
def scrape_data():
    """Fetches data from Cho Tot listings for the given city code.

    Args:
        city_code (int, optional): The city code to fetch data for. Defaults to 13000 (Ho Chi Minh City).

    Returns:
        list: A list of dictionaries containing the fetched data.
    """
    error = 0
    data = []
    previous = time.time()

    while (True):
        page = 0
        o = -20

        while (True):
            try:

                page = page + 1
                o = o + 20
                url = BASE_URL + 'region_v2=' + str(REGION_V2) + '&area_v2=' + str(AREA_CODE) + '&cg=' + str(CG) + '&o=' + str(o) + '&page=' + str(page) + '&st=s,k&limit=20&key_param_included=true'
                headers = {'User-Agent': random.choice(USER_AGENTS)}
                # print(headers)
                r = requests.get(headers=headers, url=url)
                # print(r.content)
                r.json()
                if 0 == len(r.json()['ads']):
                    break
                data.extend(r.json()['ads'])
                delta = time.time() - previous
                previous = time.time()
            except:
                pass

            time.sleep(np.random.choice([x/10 for x in range(3, 12)]))

        if (page == 1):
            error += 1
        else:
            pass
        if (error > ERROR_LIMIT):
            break
        AREA_CODE += 1

    print('\nFinish %d items' % (len(data)))
    return data

### Save data to csv function

In [46]:
def save_data_to_csv(data, csv_file_path):
    """Saves the collected data to a CSV file.

    Args:
        data (list): A list of dictionaries containing the data to save.
        csv_file_path (str, optional): The path to the CSV file. Defaults to 'ads.csv'.
    """

    with open(csv_file_path, 'a', newline='', encoding='utf-8') as csvfile:
        fieldnames = set()
        for row in data:
            fieldnames.update(row.keys())

        # Check for expected columns
        expected_columns = data[0].keys() if data else []
        for col in expected_columns:
            if col not in fieldnames:
                fieldnames.add(col)

        writer = csv.DictWriter(csvfile, fieldnames=list(fieldnames))
        if not os.path.exists(csv_file_path) or os.stat(csv_file_path).st_size == 0:
            writer.writeheader()
        writer.writerows(data)
    print(f"Data saved to {csv_file_path}")

### Perform craw data and export to csv

In [57]:
save_data_to_csv(scrape_data(), './data/raw_data_0.csv')

KeyboardInterrupt: 

### Load data from saved csv (only in local)

In [74]:
# Specify the folder containing CSV files
folder_path = './data/'

# Get a list of all CSV files in the folder
file_list = glob.glob(folder_path + '*.csv')
selected_columns = ['list_id', 'price', 'price_million_per_m2', 'size', 'width', 'length', 'area', 'subject', 'body', 'category_name', 'latitude', 'list_time', 'longitude', 'toilets', 'rooms', 'floors', 'region_name', 'ward_name', 'area_name', 'street_name', 'street_number', 'owner', 'company_ad', 'apartment_type', 'zero_deposit']

# Initialize an empty list to store DataFrames
dfs = []

# Iterate through each CSV file and read data
for file in file_list:
    df = pd.read_csv(file)

    # Ensure all columns in selected_columns exist and set to NaN if not
    for col in selected_columns:
        if col not in df.columns:
            df[col] = np.nan

    # Set NaN for empty values in columns present in selected_columns
    df[selected_columns] = df[selected_columns].apply(lambda col: col.mask(col == '', np.nan))

    dfs.append(df)

# Concatenate all DataFrames
concatenated_df = pd.concat(dfs, ignore_index=True)

# Remove duplicate rows based on the 'list_id'
concatenated_df = concatenated_df.drop_duplicates(subset='list_id')

# Extract only the selected columns from the concatenated_df
houses_df = concatenated_df.dropna(how='all')
houses_df = houses_df[selected_columns]
houses_df = houses_df.reset_index(drop=True)

  df = pd.read_csv(file)
  df = pd.read_csv(file)


## Preprocessing data

### Keep house and apartment rows

In [75]:
houses_df = houses_df[(houses_df['category_name'] != 'Đất') & (houses_df['category_name'] != 'Văn phòng, Mặt bằng kinh doanh')]

### Fill missing value for column `price_million_per_m2`

In [76]:
houses_df['price_million_per_m2'] = houses_df['price'] / houses_df['area']

# Convert the result to million units
houses_df['price_million_per_m2'] = houses_df['price_million_per_m2'] / 1e6

### Handle missing values

In [77]:
# drop rows with missing values
houses_df = houses_df[houses_df["longitude"].isna() == False]
houses_df = houses_df[houses_df["latitude"].isna() == False]
# fill missing values
houses_df["toilets"].fillna(0, inplace=True)
houses_df["ward_name"].fillna("", inplace=True)
houses_df["street_name"].fillna("", inplace=True)
houses_df["owner"].fillna(0, inplace=True)
houses_df["company_ad"].fillna(0, inplace=True)


### Cast boolean columns to int

In [78]:
houses_df["owner"] = houses_df["owner"].astype(int)
houses_df["company_ad"] = houses_df["company_ad"].astype(int)

### Concatenate fields into one colum

In [79]:
address = (
    houses_df["street_name"]
    + ", "
    + houses_df["ward_name"]
    + ", "
    + houses_df["area_name"]
    + ", "
    + houses_df["region_name"]
)
houses_df["text"] = houses_df["subject"] + " " + houses_df["body"] + " " + address

### One hot encoding for aparment type

In [80]:
columns_to_one_hot_encode = ["apartment_type"]
# one hot encode
houses_df = pd.get_dummies(houses_df, columns=columns_to_one_hot_encode)

### Process data in `subject` and `body`

#### Remove punctuation

In [82]:
def remove_punctuation(text):
    try:
        return text.translate(str.maketrans("", "", PUNCT_TO_REMOVE))
    except Exception as e:
        print(text)
        return text

#### Remove urls

In [83]:
def remove_urls(text):
    url_pattern = re.compile(r"https?://\S+|www\.\S+")
    return url_pattern.sub(r"", text)

#### Remove emojis

In [84]:
def remove_emojis(text):
    regrex_pattern = re.compile(
        pattern="["
        "\U0001F600-\U0001F64F"  # emoticons
        "\U0001F300-\U0001F5FF"  # symbols & pictographs
        "\U0001F680-\U0001F6FF"  # transport & map symbols
        "\U0001F1E0-\U0001F1FF"  # flags (iOS)
        "\U00002500-\U00002BEF"  # chinese char
        "\U00002702-\U000027B0"
        "\U000024C2-\U0001F251"
        "\U0001f926-\U0001f937"
        "\U00010000-\U0010ffff"
        "\u2640-\u2642"
        "\u2600-\u2B55"
        "\u200d"
        "\u23cf"
        "\u23e9"
        "\u231a"
        "\ufe0f"  # dingbats
        "\u3030"
        "]+",
        flags=re.UNICODE,
    )
    return regrex_pattern.sub(r"", text)

#### NLP processing for `text` column

In [99]:
# lowercase
houses_df["text"] = houses_df["text"].str.lower()

# remove punctuations
PUNCT_TO_REMOVE = string.punctuation

houses_df["text"] = houses_df["text"].apply(lambda x: remove_punctuation(x))
houses_df["text"] = houses_df["text"].apply(lambda x: remove_urls(x))
houses_df["text"] = houses_df["text"].apply(lambda x: remove_emojis(x))

In [110]:

# word segmentation
# Check if the directory or file already exists
if not os.path.exists(VN_CORE_NLP_PATH):
    # If it doesn't exist, download the model
    py_vncorenlp.download_model(save_dir=VN_CORE_NLP_PATH)
else:
    print(f"The VnCoreNLP model is already present in {VN_CORE_NLP_PATH}. No need to download.")
# Load the word and sentence segmentation component
rdrsegmenter = py_vncorenlp.VnCoreNLP(annotators=["wseg"], save_dir=VN_CORE_NLP_PATH)


def word_segment(text):
    return "".join(rdrsegmenter.word_segment(text))


houses_df["text"] = houses_df["text"].progress_apply(lambda x: word_segment(x))

The VnCoreNLP model is already present in /Users/viet/Documents/other/p4ds/vncorenlp. No need to download.


ValueError: VM is already running, can't set classpath/options; VM started at  File "/Users/viet/miniconda3/envs/min_ds-env/lib/python3.10/runpy.py", line 196, in _run_module_as_main
    return _run_code(code, main_globals, None,
  File "/Users/viet/miniconda3/envs/min_ds-env/lib/python3.10/runpy.py", line 86, in _run_code
    exec(code, run_globals)
  File "/Users/viet/miniconda3/envs/min_ds-env/lib/python3.10/site-packages/ipykernel_launcher.py", line 17, in <module>
    app.launch_new_instance()
  File "/Users/viet/miniconda3/envs/min_ds-env/lib/python3.10/site-packages/traitlets/config/application.py", line 1053, in launch_instance
    app.start()
  File "/Users/viet/miniconda3/envs/min_ds-env/lib/python3.10/site-packages/ipykernel/kernelapp.py", line 737, in start
    self.io_loop.start()
  File "/Users/viet/miniconda3/envs/min_ds-env/lib/python3.10/site-packages/tornado/platform/asyncio.py", line 195, in start
    self.asyncio_loop.run_forever()
  File "/Users/viet/miniconda3/envs/min_ds-env/lib/python3.10/asyncio/base_events.py", line 603, in run_forever
    self._run_once()
  File "/Users/viet/miniconda3/envs/min_ds-env/lib/python3.10/asyncio/base_events.py", line 1899, in _run_once
    handle._run()
  File "/Users/viet/miniconda3/envs/min_ds-env/lib/python3.10/asyncio/events.py", line 80, in _run
    self._context.run(self._callback, *self._args)
  File "/Users/viet/miniconda3/envs/min_ds-env/lib/python3.10/site-packages/ipykernel/kernelbase.py", line 524, in dispatch_queue
    await self.process_one()
  File "/Users/viet/miniconda3/envs/min_ds-env/lib/python3.10/site-packages/ipykernel/kernelbase.py", line 513, in process_one
    await dispatch(*args)
  File "/Users/viet/miniconda3/envs/min_ds-env/lib/python3.10/site-packages/ipykernel/kernelbase.py", line 418, in dispatch_shell
    await result
  File "/Users/viet/miniconda3/envs/min_ds-env/lib/python3.10/site-packages/ipykernel/kernelbase.py", line 758, in execute_request
    reply_content = await reply_content
  File "/Users/viet/miniconda3/envs/min_ds-env/lib/python3.10/site-packages/ipykernel/ipkernel.py", line 426, in do_execute
    res = shell.run_cell(
  File "/Users/viet/miniconda3/envs/min_ds-env/lib/python3.10/site-packages/ipykernel/zmqshell.py", line 549, in run_cell
    return super().run_cell(*args, **kwargs)
  File "/Users/viet/miniconda3/envs/min_ds-env/lib/python3.10/site-packages/IPython/core/interactiveshell.py", line 3046, in run_cell
    result = self._run_cell(
  File "/Users/viet/miniconda3/envs/min_ds-env/lib/python3.10/site-packages/IPython/core/interactiveshell.py", line 3101, in _run_cell
    result = runner(coro)
  File "/Users/viet/miniconda3/envs/min_ds-env/lib/python3.10/site-packages/IPython/core/async_helpers.py", line 129, in _pseudo_sync_runner
    coro.send(None)
  File "/Users/viet/miniconda3/envs/min_ds-env/lib/python3.10/site-packages/IPython/core/interactiveshell.py", line 3306, in run_cell_async
    has_raised = await self.run_ast_nodes(code_ast.body, cell_name,
  File "/Users/viet/miniconda3/envs/min_ds-env/lib/python3.10/site-packages/IPython/core/interactiveshell.py", line 3488, in run_ast_nodes
    if await self.run_code(code, result, async_=asy):
  File "/Users/viet/miniconda3/envs/min_ds-env/lib/python3.10/site-packages/IPython/core/interactiveshell.py", line 3548, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "/var/folders/6s/39hp1vqd6nb_gjfr68pvl_n00000gn/T/ipykernel_2646/2734409503.py", line 14, in <module>
    rdrsegmenter = py_vncorenlp.VnCoreNLP(annotators=["wseg"], save_dir=VN_CORE_NLP_PATH)
  File "/Users/viet/miniconda3/envs/min_ds-env/lib/python3.10/site-packages/py_vncorenlp/vncorenlp.py", line 53, in __init__
    from jnius import autoclass
  File "<frozen importlib._bootstrap>", line 1027, in _find_and_load
  File "<frozen importlib._bootstrap>", line 1006, in _find_and_load_unlocked
  File "<frozen importlib._bootstrap>", line 688, in _load_unlocked
  File "<frozen importlib._bootstrap_external>", line 883, in exec_module
  File "<frozen importlib._bootstrap>", line 241, in _call_with_frames_removed
  File "/Users/viet/miniconda3/envs/min_ds-env/lib/python3.10/site-packages/jnius/__init__.py", line 45, in <module>
    from .reflect import *  # noqa
  File "<frozen importlib._bootstrap>", line 1027, in _find_and_load
  File "<frozen importlib._bootstrap>", line 1006, in _find_and_load_unlocked
  File "<frozen importlib._bootstrap>", line 688, in _load_unlocked
  File "<frozen importlib._bootstrap_external>", line 883, in exec_module
  File "<frozen importlib._bootstrap>", line 241, in _call_with_frames_removed
  File "/Users/viet/miniconda3/envs/min_ds-env/lib/python3.10/site-packages/jnius/reflect.py", line 19, in <module>
    class Class(JavaClass, metaclass=MetaJavaClass):


### Export data to csv

In [None]:
houses_df.to_csv('real_estate.csv', index=False)

### Upload data to MongoDB

In [113]:
houses_df.shape

(203328, 31)

In [116]:
db = pymongo.MongoClient(ATLAS_URL)[DATABASE_NAME]
collection = db.get_collection(COLLECTION_NAME)

batch_size = 20000

# Insert data in batches
for i in range(0, len(houses_df), batch_size):
    batch = houses_df.iloc[i:i+batch_size].to_dict('records')
    try:
        result = collection.insert_many(batch)
        print(f"Inserted {len(result.inserted_ids)} documents.")
    except Exception as e:
        print(f"Error inserting batch {i}-{i+batch_size}: {e}")


Inserted 20000 documents.
Inserted 20000 documents.
Inserted 20000 documents.
Inserted 20000 documents.
Inserted 20000 documents.
Inserted 20000 documents.
Inserted 20000 documents.
Inserted 20000 documents.
Inserted 20000 documents.
Inserted 20000 documents.
Inserted 3328 documents.
