# Technical part

## Load data from csv to pandas dataframe

In [1]:
import pandas as pd

# Load the three raw CSV files into pandas dataframes.
# Paths are written with forward slashes so the notebook runs on Windows, Linux and macOS
# alike (backslash-separated relative paths only resolve on Windows).

# Load data_categories as dataframe (semicolon-separated file)
data_categories = pd.read_csv('./data/raw/data_categories.csv', index_col=False, sep=';')

# Load data_market_prices as dataframe (comma-separated file).
# price and price_per_sqm use the nullable Int64 dtype so missing values do not turn the
# columns into floats; low_memory=False parses the file in one pass instead of in chunks.
data_market_prices = pd.read_csv(
    './data/raw/data_market_prices.csv',
    index_col=False,
    low_memory=False,
    sep=',',
    dtype={'price': 'Int64', 'price_per_sqm': 'Int64'},
)

# Load data_vas_purchases as dataframe (semicolon-separated file)
data_vas_purchases = pd.read_csv('./data/raw/data_vas_purchases.csv', index_col=False, sep=';')

## Connect to MySQL server, create new database and necessary tables

In [2]:
import os

import pandas as pd
from sqlalchemy import create_engine

# Credentials to database connection.
# Each value can be overridden with an environment variable (DB_HOST, DB_NAME, DB_USER,
# DB_PASSWORD) so real secrets do not have to live in the notebook; the literals are only
# fallbacks that keep the previous local setup working unchanged.
hostname = os.environ.get("DB_HOST", "localhost")
dbname = os.environ.get("DB_NAME", "mydatabase")
uname = os.environ.get("DB_USER", "tomek")
pwd = os.environ.get("DB_PASSWORD", "tomek123")

# NOTE: Engine.execute() is the SQLAlchemy 1.x "connectionless" API (deprecated in 1.4,
# removed in 2.0), so this cell expects SQLAlchemy < 2.0.

# Connect to the server without selecting a schema - only used to (re)create the database
engine = create_engine("mysql+pymysql://{user}:{pw}@{host}"
                       .format(user=uname, pw=pwd, host=hostname))

# Remove database if exist
engine.execute("DROP DATABASE IF EXISTS {db}".format(db=dbname))

# Create database
engine.execute("CREATE DATABASE {db}".format(db=dbname))

# Re-bind the engine to the new database instead of issuing "USE {db}":
# every engine.execute()/to_sql() call borrows a connection from the pool, so a session-level
# USE is only honoured as long as the very same connection happens to be handed out again.
# Putting the schema in the URL makes every pooled connection start in the right database.
engine.dispose()
engine = create_engine("mysql+pymysql://{user}:{pw}@{host}/{db}"
                       .format(user=uname, pw=pwd, host=hostname, db=dbname))

# Drop tables if exist (a no-op right after CREATE DATABASE; kept as a safety net)
engine.execute("""
					DROP TABLE IF EXISTS data_categories,data_market_prices,data_vas_purchases;
				""")

# Convert dataframes to sql tables (dataframes come from the CSV-loading cell above)
data_categories.to_sql('data_categories', engine, index=False)
data_market_prices.to_sql('data_market_prices', engine, index=False)
data_vas_purchases.to_sql('data_vas_purchases', engine, index=False)

# Rename original columns to save original values and create new columns with date-type data.
# The percent signs are doubled (%%c/%%e/%%Y) because the statement passes through the
# driver's %-style parameter formatting before it reaches MySQL.
queries = ("ALTER TABLE data_market_prices RENAME COLUMN date_posted TO date_posted_old;",              # rename date_posted column to date_posted_old
           "ALTER TABLE data_market_prices RENAME COLUMN date_expired TO date_expired_old;",            # rename date_expired column to date_expired_old
           "ALTER TABLE data_market_prices ADD date_posted DATE AFTER date_posted_old;",                # create new date_posted column to store date-type data
           "ALTER TABLE data_market_prices ADD date_expired DATE AFTER date_expired_old;",              # create new date_expired column to store date-type data
           "UPDATE data_market_prices SET date_posted = STR_TO_DATE(date_posted_old,'%%c/%%e/%%Y');",   # fill new column with data 
           "UPDATE data_market_prices SET date_expired = STR_TO_DATE(date_expired_old,'%%c/%%e/%%Y');"  # fill new column with data
            )

# Run the schema/data migration statements in order
for query in queries:
    engine.execute(query)

## Clean "data_market_prices" data and export to CSV file

In [3]:
import os

import pandas as pd
from sqlalchemy import create_engine

# Credentials to database connection.
# Each value can be overridden with an environment variable (DB_HOST, DB_NAME, DB_USER,
# DB_PASSWORD); the literals are fallbacks that keep the previous local setup working.
hostname = os.environ.get("DB_HOST", "localhost")
dbname = os.environ.get("DB_NAME", "mydatabase")
uname = os.environ.get("DB_USER", "tomek")
pwd = os.environ.get("DB_PASSWORD", "tomek123")

# Connect to server (engine is bound directly to the working database)
engine = create_engine("mysql+pymysql://{user}:{pw}@{host}/{db}"
				.format(user=uname, pw=pwd, host=hostname, db=dbname))

# Clean data: - omit rows with empty cell in any column
#             - omit rows with unrealistically low price (<5000)
#             - omit rows with improperly inserted date values (date_posted is after date_expired)
#             - omit rows out of range February and March 2021 and 2022 (keep only 2-3/2021 and 2-3/2022)
#             - omit offers with the same ad_id (assumption: ad_id should be unique within dataset)
#             - omit offers where price = price_per_sqm (1 sqm properties)
#             - omit offers where price_per_sqm is too low (lower or equal 5 in this case)
# NOTE(review): the query selects non-aggregated columns together with GROUP BY ad_id, which
# MySQL only accepts when ONLY_FULL_GROUP_BY is disabled; for a duplicated ad_id the row that
# survives is then chosen by the server and is not guaranteed to be a specific one.
clean_data_market_price = pd.read_sql("""
										SELECT DISTINCT(ad_id), city_id, category_id, market, date_posted_old, date_expired_old, price_per_sqm, price, rooms_num
										FROM data_market_prices
										WHERE ad_id IS NOT NULL
											AND city_id IS NOT NULL
											AND category_id IS NOT NULL
											AND market IS NOT NULL
											AND date_posted_old IS NOT NULL
											AND date_expired_old IS NOT NULL
											AND price IS NOT NULL
											AND price >= 5000
											AND rooms_num IS NOT NULL
											AND price_per_sqm IS NOT NULL
											AND DATEDIFF(date_expired, date_posted) >= 0
											AND ((date_posted >= '2021-02-01' AND date_posted < '2021-04-01') OR (date_posted >= '2022-02-01' AND date_posted < '2022-04-01'))
											AND price != price_per_sqm
											AND price_per_sqm > 5
										GROUP BY ad_id;
									""", engine)

# Export clean_data_market_price to csv file (but keep the same format as in initial dataset)
# Rename original (suffix: _old) date_posted and date_expired columns back to their initial names
clean_data_market_price = clean_data_market_price.rename(columns={'date_posted_old':'date_posted', 'date_expired_old':'date_expired'})

# Make sure the output folder exists - to_csv does not create missing directories
os.makedirs('./data/processed', exist_ok=True)

# Export ready dataframe to csv file
clean_data_market_price.to_csv('./data/processed/clean_data_market_price.csv', index=False, encoding='utf-8', sep=',')

print("CSV file successfully created!")

CSV file successfully created!


# Analytical part

## Analysis of "data_market_prices" dataset

In [4]:
import os

import mysql.connector

# Open a mysql.connector connection for the analytical queries.
# Connection settings can be overridden with environment variables (DB_HOST, DB_USER,
# DB_PASSWORD, DB_NAME); the literals are fallbacks that keep the previous local setup working.
mydb = mysql.connector.connect(
  host=os.environ.get("DB_HOST", "localhost"),
  user=os.environ.get("DB_USER", "tomek"),
  password=os.environ.get("DB_PASSWORD", "tomek123"),
  database=os.environ.get("DB_NAME", "mydatabase")
)

mycursor = mydb.cursor()

# Define Common Table Expression (CTE) for clean data_market_prices to improve code readability.
# It applies the same cleaning filters as the export query and additionally joins the
# category/subcategory names. The text is prepended to later queries via str.format.
# mp: data_Market_Prices table
# cat: data_CATegories table
# NOTE(review): selecting non-aggregated columns with GROUP BY mp.ad_id requires MySQL to run
# without ONLY_FULL_GROUP_BY.
clean_data_market_price_CTE = """
								WITH clean_data_market_prices AS (SELECT DISTINCT (mp.ad_id), mp.city_id, mp.category_id, cat.category_name, cat.subcategory_name,
																		 mp.market, mp.date_posted, mp.date_expired, mp.price, mp.price_per_sqm, mp.rooms_num
													  			  FROM data_market_prices mp
													  			  INNER JOIN data_categories cat
													 			  ON mp.category_id = cat.category_id
													  			  WHERE mp.ad_id IS NOT NULL
																	AND mp.city_id IS NOT NULL
																	AND mp.category_id IS NOT NULL
																	AND mp.market IS NOT NULL
																	AND mp.date_posted_old IS NOT NULL
																	AND mp.date_expired_old IS NOT NULL
																	AND mp.price IS NOT NULL
																	AND mp.price >= 5000
																	AND mp.rooms_num IS NOT NULL
																	AND mp.price_per_sqm IS NOT NULL
																	AND DATEDIFF(mp.date_expired, mp.date_posted) >= 0
																	AND ((mp.date_posted >= '2021-02-01' AND mp.date_posted < '2021-04-01') OR (mp.date_posted >= '2022-02-01' AND mp.date_posted < '2022-04-01'))
																  	AND mp.price_per_sqm != mp.price
																	AND mp.price_per_sqm > 5
																  GROUP BY mp.ad_id)
							  """

# Check number of rows
mycursor.execute("""
					{CTE}

					SELECT COUNT(*)
					FROM clean_data_market_prices;
				""".format(CTE=clean_data_market_price_CTE))

# Get output
myresult = mycursor.fetchall()

# Print output, one result row per line
for row in myresult:
    print(row)

# OUTPUT COMMENT
# Cleaned data_market_prices table consists of 34985 offers from February to March 2021 and 2022

(34985,)


### Analyses related to data cleansing

In [5]:
# Verify ad_id uniqueness: count the distinct ad_id values in the cleaned data
sql = """
					{CTE}

					SELECT COUNT(DISTINCT ad_id)
					FROM clean_data_market_prices;
				""".format(CTE=clean_data_market_price_CTE)
mycursor.execute(sql)

# Fetch every returned row, then print them one per line
myresult = mycursor.fetchall()
for row in myresult:
    print(row)

# OUTPUT COMMENT
# The distinct ad_id count equals the total row count, so after cleansing all ad_ids are unique

(34985,)


In [6]:
# List the ten cheapest offers to spot unrealistically low prices
sql = """
					{CTE}

					SELECT *
					FROM clean_data_market_prices
                    ORDER BY price
					LIMIT 10;
				""".format(CTE=clean_data_market_price_CTE)
mycursor.execute(sql)

# Fetch every returned row, then print them one per line
myresult = mycursor.fetchall()
for row in myresult:
    print(row)

# OUTPUT COMMENT
# Real estate offers with prices below 5000 seem unrealistic; those entries are better filtered out.

(61506862, 26, 101, 'Sale', 'Apartments', 'secondary', datetime.date(2021, 2, 13), datetime.date(2021, 2, 13), 5000, 100, '2')
(62739850, 1004, 101, 'Sale', 'Apartments', 'secondary', datetime.date(2022, 2, 6), datetime.date(2022, 3, 8), 5000, 59, '2')
(62840215, 26, 101, 'Sale', 'Apartments', 'secondary', datetime.date(2022, 3, 4), datetime.date(2022, 4, 3), 5000, 93, '2')
(62928769, 26, 101, 'Sale', 'Apartments', 'secondary', datetime.date(2022, 3, 29), datetime.date(2022, 3, 29), 5500, 54, '3')
(62837906, 1004, 101, 'Sale', 'Apartments', 'secondary', datetime.date(2022, 3, 3), datetime.date(2022, 3, 18), 6000, 353, '1')
(61458533, 26, 101, 'Sale', 'Apartments', 'secondary', datetime.date(2021, 2, 1), datetime.date(2021, 2, 1), 6200, 69, '4')
(62924387, 26, 101, 'Sale', 'Apartments', 'secondary', datetime.date(2022, 3, 28), datetime.date(2022, 3, 31), 6500, 65, '3')
(62901809, 1004, 101, 'Sale', 'Apartments', 'secondary', datetime.date(2022, 3, 21), datetime.date(2022, 4, 7), 8700, 1

In [7]:
# Derive the property size (price / price_per_sqm, truncated to an integer) and list the
# ten largest values to locate outliers
sql = """
					{CTE}

					SELECT price, price_per_sqm, price/price_per_sqm DIV 1 as Total_sqms
					FROM clean_data_market_prices
                    ORDER BY Total_sqms DESC
					LIMIT 10;
				""".format(CTE=clean_data_market_price_CTE)
mycursor.execute(sql)

# Fetch every returned row, then print them one per line
myresult = mycursor.fetchall()
for row in myresult:
    print(row)

# OUTPUT COMMENT
# Offers with an implausibly small size (1 sqm, i.e. price equal to price_per_sqm) seem to be errors and should be omitted in further analysis
# Offers with price_per_sqm <= 5 (two offers with price_per_sqm 1 and 5) seem unrealistic and should be omitted in further analysis

(1100000, 253, 4347)
(6500000, 3023, 2150)
(3300000, 1650, 2000)
(3789000, 1895, 1999)
(15000000, 7700, 1948)
(3000000, 1603, 1871)
(950000, 679, 1399)
(1290000, 989, 1304)
(4500000, 3947, 1140)
(750000, 682, 1099)


### Other analyses

In [8]:
# Maximum, minimum and average price per category name (Sale / Rent)
sql = """
					{CTE}

					SELECT category_name, MAX(price), MIN(price), FORMAT(AVG(price), 2)
					FROM clean_data_market_prices
					GROUP BY category_name;
				""".format(CTE=clean_data_market_price_CTE)
mycursor.execute(sql)

# Fetch every returned row, then print them one per line
myresult = mycursor.fetchall()
for row in myresult:
    print(row)

# OUTPUT COMMENT
# The most expensive property for sale was priced at 19024800, the cheapest at 5000 and the average price was 773,058.21.
# The highest rent was 22000, the lowest 8700 and the average was 15,425.00.

('Sale', 19024800, 5000, '773,058.21')
('Rent', 22000, 8700, '15,425.00')


In [9]:
# Maximum, minimum and average price of Sale offers per subcategory name
sql = """
					{CTE}

					SELECT subcategory_name, MAX(price), MIN(price), FORMAT(AVG(price), 2)
					FROM clean_data_market_prices
                    WHERE category_name = 'Sale'
					GROUP BY subcategory_name;
				""".format(CTE=clean_data_market_price_CTE)
mycursor.execute(sql)

# Fetch every returned row, then print them one per line
myresult = mycursor.fetchall()
for row in myresult:
    print(row)

# OUTPUT COMMENT
# The most expensive Apartment for sale was priced at 19024800, the cheapest at 5000 and the average price was 701,116.43.
# The most expensive House for sale was priced at 17500000, the cheapest at 29700 and the average price was 1,804,402.53.

('Apartments', 19024800, 5000, '701,116.43')
('Houses', 17500000, 29700, '1,804,402.53')


In [10]:
# What differences do you see between the advertisement types in the data available?
# (max, min and mean prices for every category / subcategory combination)
sql = """
					{CTE}

					SELECT category_name, MAX(price), MIN(price), AVG(price) DIV 1, subcategory_name
					FROM clean_data_market_prices
                    GROUP BY category_name, subcategory_name;
				""".format(CTE=clean_data_market_price_CTE)
mycursor.execute(sql)

# Fetch every returned row, then print them one per line
myresult = mycursor.fetchall()
for row in myresult:
    print(row)

# OUTPUT COMMENT
# For the Apartments subcategory the max, min and average prices are 19024800, 5000, 701116 respectively for 'Sale' offers and 22000, 8700, 15425 for 'Rent' offers.
# For the Houses subcategory the max, min and average prices are 17500000, 29700, 1804402 respectively; 'Rent' offers are not present within the dataset.

('Sale', 19024800, 5000, 701116, 'Apartments')
('Rent', 22000, 8700, 15425, 'Apartments')
('Sale', 17500000, 29700, 1804402, 'Houses')


In [11]:
# Maximum, minimum and average price per square meter of Sale offers, per subcategory name
sql = """
					{CTE}

					SELECT subcategory_name, MAX(price_per_sqm), MIN(price_per_sqm), FORMAT(AVG(price_per_sqm), 2)
					FROM clean_data_market_prices
                    WHERE category_name = 'Sale'
					GROUP BY subcategory_name;
				""".format(CTE=clean_data_market_price_CTE)
mycursor.execute(sql)

# Fetch every returned row, then print them one per line
myresult = mycursor.fetchall()
for row in myresult:
    print(row)

# OUTPUT COMMENT
# For advertised Apartments the highest price per sqm was 241095, the lowest 54 and the average was 12,177.27.
# For advertised Houses the highest price per sqm was 85059, the lowest 253 and the average was 8,128.33.

('Apartments', 241095, 54, '12,177.27')
('Houses', 85059, 253, '8,128.33')


In [12]:
# Number of real estate ads per city_id
sql = """
					{CTE}

					SELECT city_id, COUNT(*)
					FROM clean_data_market_prices
					GROUP BY city_id;
				""".format(CTE=clean_data_market_price_CTE)
mycursor.execute(sql)

# Fetch every returned row, then print them one per line
myresult = mycursor.fetchall()
for row in myresult:
    print(row)

# OUTPUT COMMENT
# The ads cover two cities, with id 26 and 1004. The first one has far more offers (29442 ads, about 5.3 times as many) than the other one (5543 ads).

(26, 29442)
(1004, 5543)


In [13]:
# Number of real estate ads per category_name
sql = """
					{CTE}

					SELECT category_name, COUNT(*)
					FROM clean_data_market_prices
          			GROUP BY category_name;
				""".format(CTE=clean_data_market_price_CTE)
mycursor.execute(sql)

# Fetch every returned row, then print them one per line
myresult = mycursor.fetchall()
for row in myresult:
    print(row)

# OUTPUT COMMENT
# The majority of the offers (34981 out of 34985) relate to the sale of real estate; only a few (4) are rent offers.

('Sale', 34981)
('Rent', 4)


In [14]:
# Number of real estate ads per subcategory_name
sql = """
					{CTE}

					SELECT subcategory_name, COUNT(*)
					FROM clean_data_market_prices
          			GROUP BY subcategory_name;
				""".format(CTE=clean_data_market_price_CTE)
mycursor.execute(sql)

# Fetch every returned row, then print them one per line
myresult = mycursor.fetchall()
for row in myresult:
    print(row)

# OUTPUT COMMENT
# The majority (32704 out of 34985) of the advertised real estates are Apartments; 2281 are Houses.

('Apartments', 32704)
('Houses', 2281)


In [15]:
# Number of Houses ads per market, split by posting year and month
sql = """
					{CTE}

					SELECT market, subcategory_name, DATE_FORMAT(date_posted, '%m'), DATE_FORMAT(date_posted, '%Y'), COUNT(*)
					FROM clean_data_market_prices
                    WHERE subcategory_name = 'Houses'
          			GROUP BY market, subcategory_name, DATE_FORMAT(date_posted, '%Y'), DATE_FORMAT(date_posted, '%m');
				""".format(CTE=clean_data_market_price_CTE)
mycursor.execute(sql)

# Fetch every returned row, then print them one per line
myresult = mycursor.fetchall()
for row in myresult:
    print(row)

# OUTPUT COMMENT
#

('secondary', 'Houses', '03', '2022', 580)
('secondary', 'Houses', '02', '2022', 495)
('primary', 'Houses', '03', '2022', 267)
('primary', 'Houses', '02', '2022', 311)
('secondary', 'Houses', '02', '2021', 220)
('primary', 'Houses', '02', '2021', 191)
('primary', 'Houses', '03', '2021', 114)
('secondary', 'Houses', '03', '2021', 103)


In [16]:
# Number of real estate ads per market (primary / secondary) for offers posted in 2021
sql = """
					{CTE}

					SELECT market, COUNT(*)
					FROM clean_data_market_prices
                    WHERE DATE_FORMAT(date_posted, '%Y') = 2021
          			GROUP BY market;
				""".format(CTE=clean_data_market_price_CTE)
mycursor.execute(sql)

# Fetch every returned row, then print them one per line
myresult = mycursor.fetchall()
for row in myresult:
    print(row)

# OUTPUT COMMENT
# In both years (2021 and 2022) more offers were related to the secondary market.
# NOTE(review): the query as written filters on 2021 only; rerun it with 2022 to reproduce the 2022 part of this statement.

('primary', 3677)
('secondary', 6101)


In [17]:
# Number of real estate ads per posting year and month
sql = """
					{CTE}

					SELECT DATE_FORMAT(date_posted, '%Y/%m'), COUNT(*)
					FROM clean_data_market_prices 
                    GROUP BY YEAR(date_posted), MONTH(date_posted);
				""".format(CTE=clean_data_market_price_CTE)
mycursor.execute(sql)

# Fetch every returned row, then print them one per line
myresult = mycursor.fetchall()
for row in myresult:
    print(row)

# OUTPUT COMMENT
# 72% of the ads were posted in 2022, which is more than twice as many as in the corresponding months of 2021.
# Offers from 2022 are nearly equally distributed between February and March.
# Offers from 2021 cover mainly February (74% of offers).

('2022/03', 13206)
('2022/02', 12001)
('2021/02', 7243)
('2021/03', 2535)


In [18]:
# Number of offers per expiration year
sql = """
					{CTE}

					          SELECT DATE_FORMAT(date_expired, '%Y'), COUNT(*)
					          FROM clean_data_market_prices
                    		  GROUP BY YEAR(date_expired);
				""".format(CTE=clean_data_market_price_CTE)
mycursor.execute(sql)

# Fetch every returned row, then print them one per line
myresult = mycursor.fetchall()
for row in myresult:
    print(row)

# OUTPUT COMMENT
# More than twice as many offers expired in 2022 compared to 2021, even though only half of 2022 had passed.
# The data should be interpreted with caution, because an expired offer doesn't mean the real estate was successfully sold/rented.

('2022', 25911)
('2021', 9074)


In [19]:
# Gather amount of posted offers for each day of each month

# Count the posted offers for every calendar day (year / month / day columns).
# Percent signs are doubled (%%Y) because this query goes through the SQLAlchemy engine.
posted_by_day_sql = """
								{CTE}
					        	SELECT DATE_FORMAT(date_posted, '%%Y') AS Year, DATE_FORMAT(date_posted, '%%m') AS Month, DATE_FORMAT(date_posted, '%%d') AS Day, COUNT(*) AS Amount
					       	 	FROM clean_data_market_prices
								GROUP BY YEAR(date_posted), MONTH(date_posted), DAY(date_posted);
							""".format(CTE=clean_data_market_price_CTE)
posted_by_day = pd.read_sql(posted_by_day_sql, engine)

# Persist the daily counts as a semicolon-separated CSV for further use
posted_by_day.to_csv(
    './data/processed/posted_by_day.csv',
    sep=';',
    encoding='utf-8',
    index=False,
)

print("CSV file successfully created!")

# OUTPUT COMMENT
#

CSV file successfully created!


In [20]:
# Gather amount of avg price per sqm for each day of each month

# Average price per sqm (truncated to an integer) of Apartments offers for every calendar day.
# Percent signs are doubled (%%Y) because this query goes through the SQLAlchemy engine.
avg_price_per_sqm_sql = """
										{CTE}
					        			SELECT DATE_FORMAT(date_posted, '%%Y') AS Year, DATE_FORMAT(date_posted, '%%m') AS Month, DATE_FORMAT(date_posted, '%%d') AS Day, AVG(price_per_sqm) DIV 1 AS AVG_price_per_sqm
					        			FROM clean_data_market_prices
                            			WHERE subcategory_name = 'Apartments'
										GROUP BY YEAR(date_posted), MONTH(date_posted), DAY(date_posted);
									""".format(CTE=clean_data_market_price_CTE)
avg_price_per_sqm_by_day = pd.read_sql(avg_price_per_sqm_sql, engine)

# Persist the daily averages as a semicolon-separated CSV for further use
avg_price_per_sqm_by_day.to_csv(
    './data/processed/avg_price_per_sqm_by_day.csv',
    sep=';',
    encoding='utf-8',
    index=False,
)

print("CSV file successfully created!")

# OUTPUT COMMENT
#

CSV file successfully created!


### Data structure of "data_market_prices" dataset analysis using pandas package

In [21]:
from IPython.display import display

# Summary statistics of the cleaned market price data (all columns, numeric and non-numeric)
market_prices_summary = clean_data_market_price.describe(include='all')
display(market_prices_summary)

# Attach category / subcategory names via a left join on category_id, then summarise again
market_prices_with_categories = clean_data_market_price.merge(
    data_categories,
    how='left',
    on='category_id',
)
display(market_prices_with_categories.describe(include='all'))


Unnamed: 0,ad_id,city_id,category_id,market,date_posted,date_expired,price_per_sqm,price,rooms_num
count,34985.0,34985.0,34985.0,34985,34985,34985,34985.0,34985.0,34985.0
unique,,,,2,117,281,,,11.0
top,,,,secondary,3/22/2022,6/20/2022,,,2.0
freq,,,,21807,1285,748,,,13193.0
mean,62468180.0,180.953666,107.520051,,,,11911.94915,772971.6,
std,584579.2,357.124197,24.688057,,,,4617.425333,707729.8,
min,61455070.0,26.0,101.0,,,,54.0,5000.0,
25%,61610440.0,26.0,101.0,,,,8900.0,450000.0,
50%,62789420.0,26.0,101.0,,,,11455.0,600000.0,
75%,62863860.0,26.0,101.0,,,,14000.0,825000.0,


Unnamed: 0,ad_id,city_id,category_id,market,date_posted,date_expired,price_per_sqm,price,rooms_num,category_name,subcategory_name
count,34985.0,34985.0,34985.0,34985,34985,34985,34985.0,34985.0,34985.0,34985,34985
unique,,,,2,117,281,,,11.0,2,2
top,,,,secondary,3/22/2022,6/20/2022,,,2.0,Sale,Apartments
freq,,,,21807,1285,748,,,13193.0,34981,32704
mean,62468180.0,180.953666,107.520051,,,,11911.94915,772971.6,,,
std,584579.2,357.124197,24.688057,,,,4617.425333,707729.8,,,
min,61455070.0,26.0,101.0,,,,54.0,5000.0,,,
25%,61610440.0,26.0,101.0,,,,8900.0,450000.0,,,
50%,62789420.0,26.0,101.0,,,,11455.0,600000.0,,,
75%,62863860.0,26.0,101.0,,,,14000.0,825000.0,,,


## Analysis of “data_vas_purchases” and “data_categories” datasets

In [22]:
import os

import mysql.connector

# Open a mysql.connector connection for the data_vas_purchases analysis.
# Connection settings can be overridden with environment variables (DB_HOST, DB_USER,
# DB_PASSWORD, DB_NAME); the literals are fallbacks that keep the previous local setup working.
mydb = mysql.connector.connect(
  host=os.environ.get("DB_HOST", "localhost"),
  user=os.environ.get("DB_USER", "tomek"),
  password=os.environ.get("DB_PASSWORD", "tomek123"),
  database=os.environ.get("DB_NAME", "mydatabase")
)

mycursor = mydb.cursor()

# Define Common Table Expression (CTE) joining data_vas_purchases with data_categories
# (adds category_name and subcategory_name to every purchase) to improve code readability.
# The text is prepended to later queries via str.format.
# pur: data_vas_PURchases table
# cat: data_CATegories table
joined_data_vas_purchases_CTE = """
								    WITH joined_data_vas_purchases AS (SELECT pur.date_day, pur.ad_id, pur.city_id, pur.category_id, pur.amount_spend_to_promote_ads, cat.category_name, cat.subcategory_name
													  			  FROM data_vas_purchases pur
													  			  INNER JOIN data_categories cat
													 			  ON pur.category_id = cat.category_id)
							  	"""

# Check number of rows
mycursor.execute("""
					{CTE}

					SELECT COUNT(*)
					FROM joined_data_vas_purchases;
				""".format(CTE=joined_data_vas_purchases_CTE))

# Get output
myresult = mycursor.fetchall()

# Print output, one result row per line
for row in myresult:
    print(row)

# OUTPUT COMMENT
# Joined data_vas_purchases table consists of 9458 entries.

(9458,)


In [23]:
# Total amount spent on promotional products
sql = """
					{CTE}

					SELECT SUM(amount_spend_to_promote_ads)
					FROM joined_data_vas_purchases;
				""".format(CTE=joined_data_vas_purchases_CTE)
mycursor.execute(sql)

# Fetch every returned row, then print them one per line
myresult = mycursor.fetchall()
for row in myresult:
    print(row)

# OUTPUT COMMENT
# A total of 248,901 PLN was spent on promotional products.

(248901.72999998904,)


In [24]:
# Most promoted real estate category - number of promotions per category_name
sql = """
					{CTE}

					SELECT category_name, COUNT(*)
					FROM joined_data_vas_purchases
					GROUP BY category_name;
				""".format(CTE=joined_data_vas_purchases_CTE)
mycursor.execute(sql)

# Fetch every returned row, then print them one per line
myresult = mycursor.fetchall()
for row in myresult:
    print(row)

# OUTPUT COMMENT
# 6626 ad promotions were bought for Sale offers and 2832 for Rent offers.

('Rent', 2832)
('Sale', 6626)


In [25]:
# Most promoted real estate category - total promotion spend per category_name (truncated to an integer)
sql = """
					{CTE}

					SELECT category_name, SUM(amount_spend_to_promote_ads) DIV 1
					FROM joined_data_vas_purchases
					GROUP BY category_name;
				""".format(CTE=joined_data_vas_purchases_CTE)
mycursor.execute(sql)

# Fetch every returned row, then print them one per line
myresult = mycursor.fetchall()
for row in myresult:
    print(row)

# OUTPUT COMMENT
# Sale offers were promoted for 184,423 PLN whereas Rent offers for 64,477 PLN in total.

('Rent', 64477)
('Sale', 184423)


In [26]:
# Most promoted real estate subcategory - number of promotions per subcategory_name
sql = """
					{CTE}

					SELECT subcategory_name, COUNT(*)
					FROM joined_data_vas_purchases
					GROUP BY subcategory_name;
				""".format(CTE=joined_data_vas_purchases_CTE)
mycursor.execute(sql)

# Fetch every returned row, then print them one per line
myresult = mycursor.fetchall()
for row in myresult:
    print(row)

# OUTPUT COMMENT
# 8875 ad promotions were bought for Apartments offers and 583 for Houses offers.

('Apartments', 8875)
('Houses', 583)


In [27]:
# Most promoted real estate subcategory - total promotion spend per subcategory_name (truncated to an integer)
sql = """
					{CTE}

					SELECT subcategory_name, SUM(amount_spend_to_promote_ads) DIV 1
					FROM joined_data_vas_purchases
					GROUP BY subcategory_name;
				""".format(CTE=joined_data_vas_purchases_CTE)
mycursor.execute(sql)

# Fetch every returned row, then print them one per line
myresult = mycursor.fetchall()
for row in myresult:
    print(row)

# OUTPUT COMMENT
# Apartments offers were promoted for 231,824 PLN whereas Houses offers for 17,077 PLN in total.

('Apartments', 231824)
('Houses', 17077)


In [28]:
# City with the most promoted real estate - number of promotions per city_id
sql = """
					{CTE}

					SELECT city_id, COUNT(*)
					FROM joined_data_vas_purchases
					GROUP BY city_id;
				""".format(CTE=joined_data_vas_purchases_CTE)
mycursor.execute(sql)

# Fetch every returned row, then print them one per line
myresult = mycursor.fetchall()
for row in myresult:
    print(row)

# OUTPUT COMMENT
# 7870 (83%) ad promotions were bought for city_id 26 offers and 1588 for city_id 1004 offers.

(26, 7870)
(1004, 1588)


In [29]:
# City with the most promoted real estate - total promotion spend per city_id (truncated to an integer)
sql = """
					{CTE}

					SELECT city_id, SUM(amount_spend_to_promote_ads) DIV 1
					FROM joined_data_vas_purchases
					GROUP BY city_id;
				""".format(CTE=joined_data_vas_purchases_CTE)
mycursor.execute(sql)

# Fetch every returned row, then print them one per line
myresult = mycursor.fetchall()
for row in myresult:
    print(row)

# OUTPUT COMMENT
# 212,991 PLN was spent on promotional products for advertisements related to city 26.

(26, 212991)
(1004, 35910)


In [30]:
# When promotional products were most popular - number of purchases per category, year and month
sql = """
					{CTE}

					SELECT category_name, DATE_FORMAT(date_day, '%Y'), DATE_FORMAT(date_day, '%m'), COUNT(*)
					FROM joined_data_vas_purchases
					GROUP BY YEAR(date_day), MONTH(date_day), category_name
					ORDER BY category_name, date_day;
				""".format(CTE=joined_data_vas_purchases_CTE)
mycursor.execute(sql)

# Fetch every returned row, then print them one per line
myresult = mycursor.fetchall()
for row in myresult:
    print(row)

# OUTPUT COMMENT
#

('Rent', '2021', '02', 892)
('Rent', '2021', '03', 1063)
('Rent', '2022', '02', 452)
('Rent', '2022', '03', 425)
('Sale', '2021', '02', 1828)
('Sale', '2021', '03', 2216)
('Sale', '2022', '02', 1133)
('Sale', '2022', '03', 1449)


In [31]:
# When promotional products were most popular - number of purchases per year and month
sql = """
					{CTE}

					SELECT DATE_FORMAT(date_day, '%Y'), DATE_FORMAT(date_day, '%m'), COUNT(*)
					FROM joined_data_vas_purchases
					GROUP BY YEAR(date_day), MONTH(date_day)
					ORDER BY  date_day;
				""".format(CTE=joined_data_vas_purchases_CTE)
mycursor.execute(sql)

# Fetch every returned row, then print them one per line
myresult = mycursor.fetchall()
for row in myresult:
    print(row)

# OUTPUT COMMENT
#

('2021', '02', 2720)
('2021', '03', 3279)
('2022', '02', 1585)
('2022', '03', 1874)


In [32]:
# When promotional products were most popular - total spend per year and month (truncated to an integer)
sql = """
					{CTE}

					SELECT DATE_FORMAT(date_day, '%Y'), DATE_FORMAT(date_day, '%m'), SUM(amount_spend_to_promote_ads) DIV 1
					FROM joined_data_vas_purchases
					GROUP BY YEAR(date_day), MONTH(date_day)
					ORDER BY date_day;
				""".format(CTE=joined_data_vas_purchases_CTE)
mycursor.execute(sql)

# Fetch every returned row, then print them one per line
myresult = mycursor.fetchall()
for row in myresult:
    print(row)

# OUTPUT COMMENT
#

('2021', '02', 69526)
('2021', '03', 88921)
('2022', '02', 41007)
('2022', '03', 49446)
