# Minh - Price

Load data

In [1]:
import re
import unicodedata

import numpy as np
import pandas as pd
from tqdm.auto import tqdm

# Load the stage-9 export, replace missing values with '' and cast every column to str,
# so later cells can treat each cell as plain text.
origin_df = (
    pd.read_csv('data/stage_9.csv', low_memory=False)
    .fillna('')
    .astype('str')
)


In [2]:
# Work on a copy so origin_df keeps the values exactly as they were loaded.
df = origin_df.copy()

In [3]:
# Fraction of rows whose price cell is not the empty string.
df.price.ne('').mean()

1.0

In [4]:
# def extract_unit(x):
#     if isinstance(x, str):
#         return ' '.join(x.split(' ')[1:])
#     return ''

# units = df.price.copy().apply(extract_unit)

# units = list(units.unique())

# include_keywords = ['triệu', 'tỷ', 'thỷ', 'tyr', 'triỷ', 't?', 'ngàn']
# exclude_keywords = ['/ phòng', '/tháng', '/\xa0m,2,\xa0\xa0', 'm2', '/\xa0m,2,']

# filtered_units = []
# for u in units:
#     u = u.lower()
#     for k in include_keywords:
#         if k in u:
#             filtered_units.append(u.strip())
#             break

# units = []
# for u in filtered_units:
#     u = u.lower()
#     flag = True
#     for k in exclude_keywords:
#         if k in u:
#             flag = False
#             break
    
#     if flag:
#         units.append(u)

# set(units)

We can see that some values in the price column actually belong in the price_m2 column.

Only one value ends with 'đ' (in 1_5), and it equals 0, so it needs to be set to NaN.

Some field

So, before extracting data from the description, we need to move those values to the price_m2 column.



In [5]:
# correct field price_m2

# price_m2_keywords = ['/\xa0m,2,\xa0\xa0', 'm2', '/\xa0m,2,']

# count_move = 0
# for i, row in df.iterrows():
#     if row.price and not row.price_m2:
#         p = row.price.lower()
#         for k in price_m2_keywords:
#             if k in p:
#                 # move to price_m2
#                 df.at[i, 'price_m2'] = row.price
#                 df.at[i, 'price'] = ''
#                 count_move += 1

In [6]:
def _nfc(text):
    '''Return ``text`` in Unicode NFC form so composed and decomposed accents compare equal.'''
    return unicodedata.normalize('NFC', text)


# Unit keywords looked for in a price cell (compared against NFC, lower-cased text).
_BILLION_WORD = _nfc('tỷ')
_MILLION_WORD = _nfc('triệu')
_THOUSAND_WORD = _nfc('ngàn')

# First decimal-looking number in a price cell: '1,5', '1.5' or '15'.
_NUMBER_RE = re.compile(r'\d+,\d+|\d+\.\d+|\d+')
# Spellings of the unit searched for in free text. The pattern is NFC-normalised
# because it is matched against NFC-normalised text.
_UNIT_RE = re.compile(_nfc(r'tỷ|tỷ|tỉ|TỶ|Tỷ|Tỷ|Tỉ|TỈ|ty|Ty|TY'))
# Number that ends right before the unit / starts right after it (spaces allowed).
_NUMBER_BEFORE_UNIT_RE = re.compile(r'(?:\d+\.{1,2}\d+|\d+,\d+|\d+) *$')
_NUMBER_AFTER_UNIT_RE = re.compile(r'^ *(?:\d+\.{1,2}\d+|\d+,\d+|\d+)')
# Marker that the amount is a per-square-metre price rather than a total price.
_PER_M2_RE = re.compile(_nfc(r'm²|m2|mét vuông'))
# How many characters around a unit match are inspected.
_CHARS_BEFORE_UNIT = 15
_CHARS_AFTER_UNIT = 4


def _process_price_cell(x):
    '''
    Reduce one price cell to a plain decimal string.

    Input: cell text (str); falsy values are returned unchanged.
    Output: the number as a string with '.' as decimal separator, or '' when
            the cell contains no digits.

    Rules, in order:
      * contains both 'tỷ' and 'ngàn': every digit run is joined with '.';
      * contains 'triệu' but not 'tỷ': the first number is divided by 1000;
      * otherwise: the first number is returned as written (',' becomes '.').

    Keywords are detected on NFC-normalised, lower-cased text, so 'Triệu' or a
    decomposed spelling is recognised the same way as 'triệu'.
    '''
    if not x:
        return x

    text = _nfc(x).lower()
    has_billion = _BILLION_WORD in text

    if has_billion and _THOUSAND_WORD in text:
        # 'XXX tỷ YYY ngàn' -> 'XXX.YYY'; joining an empty list yields ''.
        return '.'.join(re.findall(r'\d+', text))

    found = _NUMBER_RE.search(text)
    if not found:
        return ''

    number = found.group().replace(',', '.')
    if _MILLION_WORD in text and not has_billion:
        try:
            return str(float(number) / 1000)
        except ValueError:
            # Best effort: an unparsable number is treated as missing.
            return ''
    return number


def _extract_price_from_description(description):
    '''
    Look for an '<number> tỷ' style price in free text.

    Input: free text (str).
    Output: '<number> tỷ' or '<number>.<fraction> tỷ' for the first unit
            mention that has a number directly in front of it, else None.

    Returns None as soon as such a mention is followed by an m² marker, since
    that is a per-square-metre price; later mentions are then not examined.
    '''
    text = _nfc(description)

    for unit in _UNIT_RE.finditer(text):
        before = text[max(0, unit.start() - _CHARS_BEFORE_UNIT): unit.start()]
        whole = _NUMBER_BEFORE_UNIT_RE.search(before)
        if not whole:
            continue

        # Slice from the end of the match, not start + 2: unit spellings differ in length.
        after = text[unit.end(): unit.end() + _CHARS_AFTER_UNIT]

        # NOTE(review): the window is only 4 characters long, so the 9-character
        # 'mét vuông' alternative cannot match as written.
        if _PER_M2_RE.search(after):
            return None

        # Strip the spaces the patterns allow around the unit; otherwise
        # '3 tỷ 5' becomes '3 . 5 tỷ' and is later parsed as '3'.
        price = whole.group().strip()
        fraction = _NUMBER_AFTER_UNIT_RE.search(after)
        if fraction:
            price += '.' + fraction.group().strip()

        return price + ' tỷ'

    return None


def process_price(df):
    '''
    Normalise the price column and refresh it from the free-text columns.

    Input: data frame with string columns `price`, `description` and `postTitle`.
    Output: the same data frame object, with `price` rewritten in place as
            decimal strings ('' when no number could be found).

    Steps:
      1. Every price cell is parsed with `_process_price_cell`.
      2. For each row, a price is searched in `description`, then in
         `postTitle`; when one is found it replaces the row's price.
      3. All prices are parsed once more so extracted text such as '3.5 tỷ'
         ends up in the same numeric form.

    A short summary (non-empty share before and after, number of rows taken
    from the text columns) is printed.

    NOTE(review): step 2 replaces a price even when the cell was already
    filled; confirm this is intended rather than filling only empty cells.
    '''
    total_rows = df.shape[0]
    metric1 = (df.price != '').sum() / total_rows

    df['price'] = df['price'].apply(_process_price_cell)

    # Collect updates positionally, so the result does not depend on the index labels.
    prices = df['price'].tolist()

    count_found = 0
    texts = zip(df['description'], df['postTitle'])
    for pos, (description, title) in enumerate(tqdm(texts, total=total_rows)):
        found = None
        if description:
            found = _extract_price_from_description(description)

        if not found and title:
            found = _extract_price_from_description(title)

        if found:
            count_found += 1
            prices[pos] = found

    metric2 = count_found

    df['price'] = [_process_price_cell(price) for price in prices]

    metric3 = (df.price != '').sum() / total_rows

    print("Summary: ")
    print(f"Not null 1: {metric1}")
    print(f"Extracted from description: {metric2}")
    print(f"Not null 2: {metric3}")

    return df


In [7]:
# Start again from a fresh copy of origin_df: process_price assigns to the price
# column of the frame it receives.
df = origin_df.copy()

In [8]:
# process_price rewrites df.price and returns the frame it was given, so new_df and df
# refer to the same DataFrame after this call.
new_df = process_price(df)


  0%|          | 0/205732 [00:00<?, ?it/s]

Summary: 
Not null 1: 1.0
Extracted from description: 183902
Not null 2: 1.0


In [9]:
# Fraction of rows whose processed price cell is not the empty string.
new_df.price.ne('').mean()

1.0

In [10]:
# Write the processed frame to CSV without the index column.
# NOTE(review): the input was read from 'data/stage_9.csv' but this writes 'stage_9.csv'
# relative to the working directory — confirm the output location is intended.
new_df.to_csv('stage_9.csv', index=False)