In [17]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [15]:
from utils.helpers import *

In [18]:
client = get_mongodb_client()
RAW_DATABASE = 'scrapy'
DES_DATABASE = 'real_estate'

ITEM_COLLECTION = 'item'
NEWS_COLLECTION = 'new'

PROJECT_COLLECTION = 'project'
COMMUNE_ADDRESS_COLLECTION = 'commune_address'
BASEPROJECT_COLLECTION = 'base_project'

In [19]:
item_collection = client[RAW_DATABASE][ITEM_COLLECTION]
news_collection = client[DES_DATABASE][NEWS_COLLECTION]

project_collection = client[DES_DATABASE][PROJECT_COLLECTION]
commune_address_collection = client[DES_DATABASE][COMMUNE_ADDRESS_COLLECTION]
base_project_collection = client[DES_DATABASE][BASEPROJECT_COLLECTION]

### Get latest date process

In [20]:
max_createdAt = news_collection.find_one({}, sort=[("createdAt", -1)])["createdAt"]
max_process_id = news_collection.find_one({}, sort=[("process_id", -1)])["process_id"]

In [22]:
this_process_id = max_process_id + 1

### Get data not process

In [24]:
import pandas as pd
item_response = item_collection.find({'createdAt': {
    "$gt": max_createdAt
}})
full_raw_data = list(item_response)

In [25]:
len(full_raw_data)

687

### Read Project data

In [28]:
projects_response = project_collection.find({})
full_project_df = pd.DataFrame(list(projects_response))

### Remove missing address data

In [29]:
trash_data = []
used_data = []


for news in full_raw_data:
    if ('project' in news and check_valid_string_field(news['project'])) or ('district' in news and check_valid_string_field(news['district'])):
        used_data.append(news)
    else:
        news['reason'] = "Can't find address from this item"
        trash_data.append(news)

print(len(used_data))

687


### Remove missing price or square

In [30]:
used_data_no_missing = []
unused_data = []
for item in used_data:
    if 'total_price' not in item or 'square' not in item or 'price_per_m2' not in item:
        print(item)
        item['reason'] = 'Missing price'
        unused_data.append(item)
        continue
    price = item['total_price']
    square = item['square']
    price_per_m2 = item['price_per_m2']

    if price == 0 or price is None or square == 0 or square is None or price_per_m2 == 0 or price_per_m2 is None:
        item['reason'] = 'Missing price'
        unused_data.append(item)
        continue
    used_data_no_missing.append(item)

In [31]:
len(used_data_no_missing)

669

### Remove outlier

In [None]:
import numpy as np
price_per_m2_values = np.array([d['price_per_m2'] for d in used_data_no_missing])
square_values = np.array([d['square'] for d in used_data_no_missing])

In [33]:
used_data_no_outlier = [d for d in used_data_no_missing if 3e6 < d['price_per_m2'] < 3e8 and d['square'] > 20]
len(used_data_no_outlier)

656

### Remove duplicates

In [36]:
from unidecode import unidecode
from tqdm import tqdm
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction

duplicate_item = []
tokens = [
    unidecode(d['title']).strip().lower().split() for d in used_data_no_outlier
]
smoothing_function = SmoothingFunction().method2
list_unique_item = []
processed_item = set()
for i in tqdm(range(len(used_data_no_outlier))):
    item = used_data_no_outlier[i]
    if i in processed_item:
        continue
    processed_item.add(i)
    unique_item = item

    reference_tokens = [tokens[i]]
    for j in range(len(used_data_no_outlier)):
        if j in processed_item:
            continue
        another_item = used_data_no_outlier[j]
        candidate_tokens = tokens[j]
        if item['square'] == another_item['square'] and is_approximately(unique_item['total_price'], another_item['total_price']):
            if sentence_bleu(reference_tokens, candidate_tokens, weights=(0.5, 0.5), smoothing_function=smoothing_function) > 0.9:
                processed_item.add(j)
                duplicate_item.append([unique_item['news_url'], another_item['news_url']])
                if unique_item['published_at'] < another_item['published_at']:
                    unique_item = another_item
    list_unique_item.append(unique_item)


100%|██████████| 656/656 [00:00<00:00, 5702.10it/s]


In [37]:
len(list_unique_item)

629

### Compare to old news in database

In [38]:
from datetime import datetime, timedelta
prev_date = datetime.now() - timedelta(days=30)

In [39]:
old_news = list(news_collection.find({
    'published_at': {
        '$gt': prev_date
    }
    },
    {
        'title': 1,
        'square': 1,
        'total_price': 1
    }
))

In [40]:
old_tokens = [
    d['title'].split() for d in old_news
]

list_unique_item_from_old = []
for i in tqdm(range(len(list_unique_item))):
    item = list_unique_item[i]

    reference_tokens = [tokens[i]]
    is_duplicate = False
    for j in range(len(old_news)):
        another_item = old_news[j]
        candidate_tokens = old_tokens[j]
        if item['square'] == another_item['square']:
            if sentence_bleu(reference_tokens, candidate_tokens, weights=(0.5, 0.5), smoothing_function=smoothing_function) > 0.9:
                if item['published_at'] - timedelta(days=15) <  another_item['published_at']:
                    is_duplicate = True
                    break
    if not is_duplicate:
        list_unique_item_from_old.append(item)  

100%|██████████| 629/629 [00:03<00:00, 157.38it/s]


In [41]:
len(list_unique_item_from_old)

629

### Add location

In [None]:
class GoongParser:
    _instance = None
    _connection = None

    def __new__(cls, *args, **kwargs):
        if not cls._instance:
            cls._instance = super(GoongParser, cls).__new__(cls, *args, **kwargs)
        return cls._instance

    def __init__(self):
        if not self._connection:
            self._connection = self._connect_to_server()

    def _connect_to_server(self):
        # Implement your connection logic here
        # This could involve creating a network connection, authenticating, etc.
        conn = http.client.HTTPSConnection('rsapi.goong.io')
        return conn

    def _reconnect(self):
        self._connection = self._connect_to_server()
        
    def parse_address(self, address):
        params = urllib.parse.urlencode({
            'address': address,
            'api_key': config.get(GOONG_CONGIG_NAME, 'key')
        })
        # Implement the geocoding logic using the server connection
        self._connection.request('GET', '/Geocode?{}'.format(params))

        res = self._connection.getresponse()
        data = res.read()
        return json.loads(data)

    def get_connection(self):
        return self._connection
    
    def finalize(self):
        # Implement any cleanup logic here (e.g., closing the connection)
        self._connection.close()




In [42]:
list_domain = {
    'alonhadat': 'https://alonhadat.com.vn/',
    'homedy': 'https://homedy.com/'
}

In [43]:
type_of_project = ['căn hộ chung cư', 'khu biệt thự', 'nhà phố', 'khu phức hợp', 'căn hộ dịch vụ', 'khu nghỉ dưỡng', 'cao ốc văn phòng', 'khu thương mại', 'khu dân cư', 'nhà ở xã hội', 'khu đô thị mới', 'khu tái định cư', 'khu đô thị', 'tòa nhà']

def extract_name_project(project: str):
    project = project.strip().lower()
    for kind in type_of_project:
        project = project.replace(kind, '')
    project = project.strip().title()
    return project

In [46]:
def get_loc_by_address(address):
    res = list(commune_address_collection.find({'formatted_address': address}))
    if len(res) > 0:
        return res[0]['loc']
    return None

In [47]:
list_project = list(full_project_df['formatted_name'])

In [None]:
unseen_project = set()
for news in tqdm(list_unique_item_from_old):
    if 'project' in news and check_valid_string_field(news['project']):
        name = news['project']
        formatted_address = get_formatted_address(news['commune'], news['district'], news['province'])

        url = None
        if 'project_url' in news and news['project_url'] is not None:
            domain = list_domain[news['source']]
            url =  get_absolute_path(news['project_url'], domain)

        reduced_name = extract_name_project(name)
        if get_formatted_string(reduced_name) not in formatted_address:
            name = reduced_name
        match_project = full_project_df[full_project_df['formatted_name'] == get_formatted_string(name)]

        if len(match_project) == 0:
            query = ', '.join([name, news['district'], news['province']])
            unseen_project.add((name, news['source'],url, query))

In [62]:
import time
list_unseen_project = []
parser = GoongParser()
i = 0
for prj in unseen_project:
    print(i)
    i += 1
    new_prj = {}
    new_prj['name'] = prj[0]
    new_prj['source'] = prj[1]
    new_prj['url'] = prj[2]
    new_prj['formatted_name'] = get_formatted_string(prj[0])

    try:
        res = parser.parse_address(prj[3])
        new_prj['parser_response'] = res
    except:
        parser._reconnect()
        res = parser.parse_address(prj[3])
        new_prj['parser_response'] = res
    time.sleep(1.5)
    list_unseen_project.append(new_prj)

0
1
2
3
4
5
6
7


In [None]:
list_new_project = []
list_match_project = []
for new_prj in list_unseen_project:
    name = new_prj['name']
    print(name)
    new_prj['loc'] = {
        'type': 'Point',
        'coordinates': [
            new_prj['parser_response']['results'][0]['geometry']['location']['lng'],
            new_prj['parser_response']['results'][0]['geometry']['location']['lat'],
        ]
    }

    new_prj['address'] = {
        'name':  new_prj['parser_response']['results'][0]['name'],
        'address': new_prj['parser_response']['results'][0]['address'],
        'compound':  new_prj['parser_response']['results'][0]['compound'],
        'formatted_compound': get_formatted_compound(new_prj['parser_response']['results'][0]['compound'])
    }
    is_match = False
    for id, row in full_project_df.iterrows():
        if string_equal(name, row['name']):
            if calculate_distance(new_prj['loc'], row['loc']) < 0.5:
                print(calculate_distance(new_prj['loc'], row['loc']))
                new_prj['base_project'] = row['base_project']
                list_match_project.append(new_prj)
                is_match = True
                break
    if not is_match:
        list_new_project.append(new_prj)

In [66]:
project_collection.insert_many(list_match_project)

<pymongo.results.InsertManyResult at 0x7f4c1f1b79d0>

In [69]:
base_project_collection = client['real_estate']['base_project']
result = base_project_collection.find_one({}, sort=[('project_id', -1)], projection={'project_id': 1})
max_project_id = result['project_id']

i = 1
for base in list_new_project:
    base['project_id'] = max_project_id + i
    i += 1

In [70]:
base_project_collection.insert_many(list_new_project)

<pymongo.results.InsertManyResult at 0x7f4c492298e0>

In [71]:
for new_prj in list_new_project:
    prj = {
        'name': new_prj['name'],
        'source': new_prj['source'],
        'url': new_prj['url'],
        'loc': new_prj['loc'],
        'parser_response': new_prj['parser_response'],
        'address': new_prj['address'],
        'formatted_name': new_prj['formatted_name'],
        'base_project': {
            'project_id': new_prj['project_id'],
            'name': new_prj['name'],
            'address': new_prj['address'],
            'loc': new_prj['loc'],
            'url': new_prj['url'],
            'source': new_prj['source']
        }
    }
    project_collection.insert_one(prj)

In [72]:
projects_response = project_collection.find({})
full_project_df = pd.DataFrame(list(projects_response))

In [73]:
used_data_with_loc = []
commune_address_collection = client['real_estate']['commune_address']
for news in tqdm(list_unique_item_from_old):
    if 'project' in news and check_valid_string_field(news['project']):
        name = news['project']
        reduced_name = extract_name_project(name)
        formatted_address = get_formatted_address(news['commune'], news['district'], news['province'])
        if get_formatted_string(reduced_name) not in formatted_address:
            name = reduced_name
        
        name = get_formatted_string(name)

        match_project = full_project_df[full_project_df['formatted_name'] == name]

        if len(match_project) == 0:
            print('Have project but not found')
            continue
        
        news['loc'] = match_project.iloc[0]['loc']
        news['base_project'] = match_project.iloc[0]['base_project']
        news['location_confidence'] = 1
    else:
        news['project'] = None
        news['project_url'] = None
        formatted_address = get_formatted_address(news['commune'], news['district'], news['province'])
        query = {
            'formatted_address': formatted_address
        }

        res = list(commune_address_collection.find(query))
        if len(res) == 0:
            print('Address not found')
            continue
        else:
            news['loc'] = res[0]['loc']
            
        if news['commune']:
            news['location_confidence'] = 0
        else:
            news['location_confidence'] = -1
            query_ = {
                'formatted_compound.district': unidecode(news['district']).strip().lower(),
                'formatted_compound.province': unidecode(news['province']).strip().lower()
            }   
            res_ = list(commune_address_collection.find(query_))
            news['list_commune_match'] = []
            for loc in res_:
                news['list_commune_match'].append(loc['loc'])
    
    used_data_with_loc.append(news)
print(len(used_data_with_loc))

 64%|██████▎   | 400/629 [00:19<00:03, 69.60it/s]

Address not found


100%|██████████| 629/629 [00:24<00:00, 25.42it/s]

628





In [74]:
from datetime import datetime
for i in range(len(used_data_with_loc)):
    new = used_data_with_loc[i]
    new['process_id'] = this_process_id
    new['transformAt'] = datetime.now() 
    

In [75]:
def is_valid_coordinates(coord):
    latitude = coord[1]
    longitude = coord[0]
    if latitude < -90 or latitude > 90:
        return False
    
    if longitude < -180 or longitude > 180:
        return False
    
    return True

In [76]:
for new in used_data_with_loc:
    coordinates = new['loc']['coordinates']
    if not is_valid_coordinates(coordinates):
        print(new)

In [77]:
new_collection = client['real_estate']['new']
new_collection.insert_many(used_data_with_loc)

<pymongo.results.InsertManyResult at 0x7f4c02c7ad90>

In [78]:
client.close()