# Minh - Furniture

Load data

In [1]:
import pandas as pd
import numpy as np
import re
from tqdm.auto import tqdm

# Load the previous pipeline stage, replace missing cells with empty strings,
# then cast every column to text.
origin_df = (
    pd.read_csv('data/stage_5.csv', low_memory=False)
    .fillna('')
    .astype('str')
)


In [2]:
# Work on an independent deep copy of the loaded frame.
df = origin_df.copy(deep=True)

In [3]:
def process_furniture(df):
    '''
    Normalize the ``furniture`` column and back-fill blanks from ``description``.

    Steps:
      1. Every non-empty furniture value other than '0' / '1' is collapsed
         to '1'.
      2. Rows whose furniture is still empty but whose description is not
         are scanned (case-insensitively) for furniture keywords. A negative
         keyword yields '0' and takes precedence over a positive keyword,
         which yields '1'. Rows without any keyword stay empty.
      3. A short summary (filled ratio before, number of values extracted,
         filled ratio after) is printed.

    Input: data frame with ``furniture`` and ``description`` columns holding
        strings, where a missing value is the empty string ''. The frame is
        modified in place.
    Output: the same data frame object, with ``furniture`` updated.
    '''
    positive_keywords = (
        'nt đầy đủ', 'nt hiện đại', 'nt châu âu', 'nt vip', 'nt cơ bản',
        'nt: ', 'nt : ', 'toàn bộ nt', 'ntcc', 'nt sang trọng', 'nt cao cấp',
        'tbnt', 'nội thất', 'nt theo nhà', 'full nt', 'nt trong nhà',
        'nt tiền tỷ', 'nt tiền tỉ', 'tặng nt',
    )
    negative_keywords = ('không nt', 'không nội thất', 'ko nt')

    def process_furniture_cell(x):
        # Any non-empty value that is not already a '0'/'1' flag is treated
        # as "has furniture".
        if x and x not in ('0', '1'):
            return '1'
        return x

    def extract_furniture_from_description(description):
        dl = description.lower()
        # Negative phrases are checked first because they override positive
        # ones: 'không nội thất' also contains the positive 'nội thất'.
        if any(k in dl for k in negative_keywords):
            return '0'
        if any(k in dl for k in positive_keywords):
            return '1'
        return None

    row_count = df.shape[0]

    def filled_ratio(column):
        # An empty frame has no rows to divide by; report 0.0 instead.
        if not row_count:
            return 0.0
        return (column != '').sum() / row_count

    metric1 = filled_ratio(df['furniture'])

    normalized = df['furniture'].apply(process_furniture_cell)

    # Walk both columns in lockstep and collect the result positionally.
    # Writing back by index label would touch every row sharing that label
    # when the index contains duplicates.
    filled = []
    count_found = 0
    for current, description in zip(normalized, df['description']):
        if not current and description:
            value = extract_furniture_from_description(description)
            if value:
                current = value
                count_found += 1
        filled.append(current)

    # A list is assigned by position, independent of the frame's index.
    df['furniture'] = filled

    metric2 = count_found
    metric3 = filled_ratio(df['furniture'])

    print("Summary: ")
    print(f"Not null 1: {metric1}")
    print(f"Extracted from description: {metric2}")
    print(f"Not null 2: {metric3}")

    return df


In [4]:
# Run the furniture clean-up on the working frame and keep the returned frame.
df = process_furniture(df=df)

  0%|          | 0/311837 [00:00<?, ?it/s]

Summary: 
Not null 1: 0.0031394606797782174
Extracted from description: 70589
Not null 2: 0.2295045167828064


In [7]:
# Write the processed frame to CSV without the index column.
# NOTE(review): the input was read from 'data/stage_5.csv' but this writes to
# the working directory rather than 'data/' — confirm this is intended.
df.to_csv(path_or_buf='stage_6.csv', index=False)