# Concatenate Datasets

In [1]:
import sqlite3

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from daftpy.daftdata import get_db, db_dict, to_datetime, sale_dict_daily, drop_renewed, concatenate_dropping_renewed

In [2]:
# Show frames without truncation. Always slice (.head(), .sample(), ...) before
# displaying, otherwise a full 16k-row frame is rendered into the notebook.
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
# Matplotlib 3.6 renamed the bundled seaborn style sheet to 'seaborn-v0_8' and
# later releases dropped the old name, so use whichever one this install offers.
plt.style.use('seaborn' if 'seaborn' in plt.style.available else 'seaborn-v0_8')

In [3]:
!ls data

2021-09-25.db  2021-09-30.db  2021-10-05.db  2021-10-10.db  2021-10-15.db
2021-09-26.db  2021-10-01.db  2021-10-06.db  2021-10-11.db  2021-10-16.db
2021-09-27.db  2021-10-02.db  2021-10-07.db  2021-10-12.db  2021-10-17.db
2021-09-28.db  2021-10-03.db  2021-10-08.db  2021-10-13.db  2021-10-18.db
2021-09-29.db  2021-10-04.db  2021-10-09.db  2021-10-14.db  backup


------------------

-----------

In [4]:
sale_dict = db_dict()
sale_dict.keys()

dict_keys(['2021-10-05', '2021-09-29', '2021-10-16', '2021-10-09', '2021-10-04', '2021-10-01', '2021-09-28', '2021-09-27', '2021-10-06', '2021-10-15', '2021-10-18', '2021-10-03', '2021-09-30', '2021-10-11', '2021-10-08', '2021-10-12', '2021-10-02', '2021-10-14', '2021-10-13', '2021-09-26', '2021-10-10', '2021-10-17', '2021-10-07', '2021-09-25'])

In [5]:
sorted(sale_dict.keys())

['2021-09-25',
 '2021-09-26',
 '2021-09-27',
 '2021-09-28',
 '2021-09-29',
 '2021-09-30',
 '2021-10-01',
 '2021-10-02',
 '2021-10-03',
 '2021-10-04',
 '2021-10-05',
 '2021-10-06',
 '2021-10-07',
 '2021-10-08',
 '2021-10-09',
 '2021-10-10',
 '2021-10-11',
 '2021-10-12',
 '2021-10-13',
 '2021-10-14',
 '2021-10-15',
 '2021-10-16',
 '2021-10-17',
 '2021-10-18']

In [6]:
import collections

# Rebuild the mapping with its keys (ISO date strings) in ascending order so
# that later loops visit the scrapes chronologically.
sale_dict = collections.OrderedDict(
    (scrape_day, sale_dict[scrape_day]) for scrape_day in sorted(sale_dict)
)

sale_dict.keys()

odict_keys(['2021-09-25', '2021-09-26', '2021-09-27', '2021-09-28', '2021-09-29', '2021-09-30', '2021-10-01', '2021-10-02', '2021-10-03', '2021-10-04', '2021-10-05', '2021-10-06', '2021-10-07', '2021-10-08', '2021-10-09', '2021-10-10', '2021-10-11', '2021-10-12', '2021-10-13', '2021-10-14', '2021-10-15', '2021-10-16', '2021-10-17', '2021-10-18'])

Ahora construiremos una única tabla que contenga todos los datos que necesitamos. Como cada día se escrapean todos los anuncios de la web, muchos de ellos aparecen repetidos en varios datasets y tendremos que eliminarlos. Queremos una tabla en la que aparezcan todos los anuncios nuevos de cada día, pero no los renovados sin modificación en el precio.

Para obtener la tabla deseada tendremos que seguir los pasos siguientes:

1. Filtrar cada dataset de forma que obtengamos solo los anuncios cuyo `entered_renewed` se corresponda con el día de escrapeo, `scraping_date` (en la práctica se incluye también el día anterior; el motivo se explica más abajo). De esta forma tenemos los anuncios publicados o renovados de cada día.
2. Eliminar los anuncios renovados, ya que entendemos que un anuncio renovado no es un nuevo activo disponible para invertir. Si el precio del activo cambia al ser renovado, el anuncio sí se tiene en cuenta y no se elimina; en ese caso se eliminará el anuncio antiguo, ya que deja de reflejar el precio actual. (Nota: las funciones definidas más abajo solo descartan filas del dataset nuevo; la eliminación del anuncio antiguo con el precio anterior todavía no está implementada.)

--------

En primer lugar usamos la función `to_datetime` del módulo `daftdata` para convertir las columnas `entered_renewed` y `scraping_date` al tipo `datetime64`.

In [7]:
to_datetime(sale_dict)

In [8]:
sale_dict['2021-09-25'].info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16368 entries, 0 to 16367
Data columns (total 17 columns):
 #   Column                        Non-Null Count  Dtype         
---  ------                        --------------  -----         
 0   daft_id                       16368 non-null  object        
 1   item_id                       16368 non-null  object        
 2   url                           16368 non-null  object        
 3   name                          16368 non-null  object        
 4   price                         16368 non-null  object        
 5   info                          16368 non-null  object        
 6   sale_type                     16368 non-null  object        
 7   floor_area                    16368 non-null  object        
 8   psr                           16368 non-null  object        
 9   ber                           16368 non-null  object        
 10  entered_renewed               16368 non-null  datetime64[ns]
 11  views                       

In [9]:
sale_dict['2021-09-25']['scraping_date'].dt.date.value_counts().index[0] 

datetime.date(2021, 9, 25)

In [10]:
sale_dict['2021-09-25'].groupby('entered_renewed').agg('count')['daft_id'].sort_index(ascending=False).head()

entered_renewed
2021-09-25    2281
2021-09-24    4566
2021-09-23    1255
2021-09-22     781
2021-09-21     475
Name: daft_id, dtype: int64

La web se escrapea a partir de las 00:02 cada noche y el proceso dura aproximadamente 7 horas. Al empezar a las 00:02, la fecha de escrapeo es un día posterior a la de los anuncios publicados la víspera. Sin embargo, puede observarse que un gran número de anuncios ya tienen `entered_renewed` igual a la fecha de escrapeo: probablemente algunos anuncios se renuevan automáticamente durante el tiempo que dura el escrapeo. Por ello, al filtrar los anuncios trabajaremos con los dos últimos días (la fecha de escrapeo y el día anterior).

In [11]:
sale_dict['2021-09-25']['entered_renewed'].dt.date.values#[0]

array([datetime.date(2021, 9, 24), datetime.date(2021, 9, 13),
       datetime.date(2021, 9, 13), ..., datetime.date(2021, 9, 6),
       datetime.date(2021, 9, 16), datetime.date(2021, 9, 22)],
      dtype=object)

In [12]:
sale_dict['2021-09-25']['scraping_date'].dt.date.value_counts().index[0]

datetime.date(2021, 9, 25)

In [13]:
from datetime import timedelta

delta = timedelta(days=1)
delta

datetime.timedelta(days=1)

In [14]:
sale_dict['2021-09-25']['scraping_date'].dt.date.value_counts().index[0] - delta

datetime.date(2021, 9, 24)

In [15]:
(sale_dict['2021-09-25']['entered_renewed'].dt.date.values == sale_dict['2021-09-25']['scraping_date'].dt.date.value_counts().index[0]).sum()

2281

In [16]:
(sale_dict['2021-09-25']['entered_renewed'].dt.date.values == sale_dict['2021-09-25']['scraping_date'].dt.date.value_counts().index[0] - delta).sum()

4566

In [17]:
# Count the ads entered/renewed on the scrape day or on the day before
# (element-wise OR of the two boolean masks).
(
    (sale_dict['2021-09-25']['entered_renewed'].dt.date.values
     == sale_dict['2021-09-25']['scraping_date'].dt.date.value_counts().index[0])
    | (sale_dict['2021-09-25']['entered_renewed'].dt.date.values
       == sale_dict['2021-09-25']['scraping_date'].dt.date.value_counts().index[0] - delta)
).sum()

6847

**TODO:** ¿Debería coger todos los anuncios del primer dataset, o al menos más días?

Utilizamos la función `sale_dict_daily` para crear un diccionario cuya clave es la fecha de escrapeo y cuyo valor son los anuncios publicados o renovados ese día o el anterior. Luego llamamos a la función `concatenate_dropping_renewed` para concatenar todos los elementos del diccionario `daily_dict`.

In [18]:
from datetime import timedelta


def sale_dict_daily(dictionary):
    """Keep, for every scrape, only the ads entered or renewed in its last two days.

    Parameters
    ----------
    dictionary : dict
        Maps a scrape key to a DataFrame holding the datetime64 columns
        ``scraping_date`` and ``entered_renewed``.

    Returns
    -------
    dict
        Same keys in the same order. Each value is a new DataFrame (index
        reset to 0..n-1) with the rows whose ``entered_renewed`` date equals
        the scrape date or the day before it. The input frames are left
        untouched; an empty input frame yields an empty output frame.
    """
    one_day = timedelta(days=1)
    daily = {}
    for key, frame in dictionary.items():
        if frame.empty:
            # No rows means no scrape date to read; pass the empty frame through.
            daily[key] = frame.reset_index(drop=True)
            continue

        # The most frequent calendar date in the column is taken as the scrape day.
        scraping_date = frame['scraping_date'].dt.date.value_counts().index[0]
        entered_renewed_date = frame['entered_renewed'].dt.date.values

        in_last_two_days = ((entered_renewed_date == scraping_date)
                            | (entered_renewed_date == scraping_date - one_day))
        daily[key] = frame[in_last_two_days].reset_index(drop=True)
    return daily

In [19]:
daily_dict = sale_dict_daily(sale_dict)
daily_dict.keys()

dict_keys(['2021-09-25', '2021-09-26', '2021-09-27', '2021-09-28', '2021-09-29', '2021-09-30', '2021-10-01', '2021-10-02', '2021-10-03', '2021-10-04', '2021-10-05', '2021-10-06', '2021-10-07', '2021-10-08', '2021-10-09', '2021-10-10', '2021-10-11', '2021-10-12', '2021-10-13', '2021-10-14', '2021-10-15', '2021-10-16', '2021-10-17', '2021-10-18'])

In [20]:
daily_dict['2021-09-25'].shape # bien -> cada dataframe son dos dias

(6847, 17)

In [21]:
def drop_renewed(old_data, new_data):
    """Return the rows of ``new_data`` that are not renewals of ads in ``old_data``.

    A row counts as a renewal when ``old_data`` already holds a row with
    exactly the same ``url`` and the same ``price``. Ads whose url is unknown,
    or whose price differs from every recorded price for that url, are kept.

    URLs are compared for exact equality. They are not used as regular
    expressions or substrings, so characters such as ``.``, ``?`` or ``+`` in
    a url have no special meaning and ``.../1`` never matches ``.../12``.

    Parameters
    ----------
    old_data : pandas.DataFrame
        Ads already collected; must have ``url`` and ``price`` columns.
    new_data : pandas.DataFrame
        Candidate ads; must have ``url`` and ``price`` columns.

    Returns
    -------
    pandas.DataFrame
        A new frame with the surviving rows of ``new_data`` (original index
        labels preserved). ``new_data`` itself is not modified.
    """
    print(f'Shape before dropping: {new_data.shape}')

    # Hash the known (url, price) pairs once; each new ad is then a constant-time
    # lookup instead of a regex scan over both frames.
    known_ads = set(zip(old_data['url'], old_data['price']))
    is_new = np.fromiter(
        (ad not in known_ads for ad in zip(new_data['url'], new_data['price'])),
        dtype=bool,
        count=len(new_data),
    )
    # .copy() detaches the result from the caller's frame.
    new_data = new_data.loc[is_new].copy()

    print(f'Shape after dropping: {new_data.shape}')
    print('-' * 10)
    return new_data

In [22]:
from datetime import datetime

sale_dict['2021-09-25']['scraping_date'].dt.date.value_counts().index[0]

datetime.date(2021, 9, 25)

In [23]:
datetime.strptime('2021-09-25', '%Y-%m-%d').date()

datetime.date(2021, 9, 25)

In [24]:
sale_dict['2021-09-25']['scraping_date'].dt.date.value_counts().index[0] == datetime.strptime('2021-09-25', '%Y-%m-%d').date()

True

In [25]:
(sale_dict['2021-09-25']['entered_renewed'].dt.date.values == datetime.strptime('2021-09-25', '%Y-%m-%d').date()).sum()

2281

In [26]:
#sale_dict['2021-09-25'][sale_dict['2021-09-25']['entered_renewed'].dt.date.values == datetime.strptime('2021-09-25', '%Y-%m-%d').date()]

In [27]:
from datetime import timedelta

delta = timedelta(days=1)

def concatenate_dropping_renewed(initial_key, dictionary):
    """Stack the daily frames into one table, skipping renewed ads.

    The frame stored under ``initial_key`` seeds the table: its rows entered
    on the key's date are passed through ``drop_renewed`` against its rows
    entered the day before, and both groups are concatenated. Every other
    frame in ``dictionary`` is then visited in the dictionary's iteration
    order, filtered with ``drop_renewed`` against the table built so far, and
    placed on top of it.

    Parameters
    ----------
    initial_key : str
        Key of the first frame, formatted ``'%Y-%m-%d'`` (it is parsed as a
        date to split the first frame).
    dictionary : dict
        Maps keys to DataFrames with a datetime64 ``entered_renewed`` column
        plus whatever columns ``drop_renewed`` needs. The dictionary is not
        modified, so the function can be called again with the same input.

    Returns
    -------
    pandas.DataFrame
        The concatenated table with a fresh 0..n-1 index.
    """
    initial_date = datetime.strptime(initial_key, '%Y-%m-%d').date()
    first_frame = dictionary[initial_key]
    print(f'First full_data: {first_frame.shape}\n')

    # Filter first day: split it into "entered on the key date" and "entered the day before".
    entered_dates = first_frame['entered_renewed'].dt.date.values
    init_key_date_df = first_frame[entered_dates == initial_date]
    print(f'init_key_date_df.shape: {init_key_date_df.shape}')
    one_day_before_df = first_frame[entered_dates == initial_date - timedelta(days=1)]
    print(f'one_day_before_df.shape: {one_day_before_df.shape}\n')
    # A copy is handed over because drop_renewed may drop rows from its second
    # argument in place, and init_key_date_df is a slice of the caller's frame.
    init_key_date_df = drop_renewed(one_day_before_df, init_key_date_df.copy())
    full_data = pd.concat([init_key_date_df, one_day_before_df], axis=0, ignore_index=True)
    print('---> No hay anuncios repetidos en el primer dataset aunque coja los 2 ultimos dias a pesar de durar 7 horas.\n\n')

    print(f'First full_data after initial filtering: {full_data.shape}')
    print(f"\nLet's do it! ---------->")

    print(f'Initial shape: {full_data.shape}')
    print('-'*30)

    # Skip the seed frame by key instead of popping it, so the caller's
    # dictionary survives and any initial_key works.
    remaining_keys = [key for key in dictionary if key != initial_key]
    for i, key in enumerate(remaining_keys):
        print(f'Key: {key}')
        data_to_concat = drop_renewed(full_data, dictionary[key].copy())
        # Newest rows go first.
        full_data = pd.concat([data_to_concat, full_data], axis=0, ignore_index=True)
        print(f'Shape after concatenation {i}: {full_data.shape}')
        print('-'*20)
    print(f'Final shape: {full_data.shape}')
    return full_data

In [28]:
sale_data = concatenate_dropping_renewed('2021-09-25', daily_dict)

First full_data: (6847, 17)

init_key_date_df.shape: (2281, 17)
one_day_before_df.shape: (4566, 17)

Shape before dropping: (2281, 17)
Shape after dropping: (2281, 17)
----------
---> No hay anuncios repetidos en el primer dataset aunque coja los 2 ultimos dias a pesar de durar 7 horas.


First full_data after initial filtering: (6847, 17)

Let's do it! ---------->
Initial shape: (6847, 17)
------------------------------
Key: 2021-09-26
Shape before dropping: (4183, 17)
Shape after dropping: (160, 17)
----------
Shape after concatenation 0: (7007, 17)
--------------------
Key: 2021-09-27
Shape before dropping: (3965, 17)
Shape after dropping: (111, 17)
----------
Shape after concatenation 1: (7118, 17)
--------------------
Key: 2021-09-28
Shape before dropping: (7314, 17)
Shape after dropping: (1499, 17)
----------
Shape after concatenation 2: (8617, 17)
--------------------
Key: 2021-09-29
Shape before dropping: (7372, 17)
Shape after dropping: (1149, 17)
----------
Shape after concat

In [29]:
sale_data.sample()

Unnamed: 0,daft_id,item_id,url,name,price,info,sale_type,floor_area,psr,ber,entered_renewed,views,type_house,energy_performance_indicator,coordinates,type,scraping_date
5127,15065671,3260981,https://www.daft.ie/for-sale/detached-house-do...,"Donohill Cross, Donohill, Co. Tipperary","€90,000","1 Bed,1 Bath,100 m²,Detached",For Sale by Private Treaty,100 m²,none,SI_666,2021-10-06,7753,house,none,52.538664+-8.144277,buy,2021-10-07


In [30]:
sale_data.to_csv('data_available/sale_data.csv', sep=',', index=False)

In [31]:
#pd.read_csv('data_available/sale_data.csv', sep=',').shape