In [1]:
import getpass

from sqlalchemy import create_engine, inspect

import pandas as pd
import numpy as np

from scipy import stats
import scipy.stats
from scipy.stats import ttest_1samp


In [2]:
password = getpass.getpass()

········


In [3]:
connection_string = 'mysql+pymysql://root:'+password+'@localhost/rental_prices_bcn'
engine = create_engine(connection_string)

In [4]:
query = '''SELECT *
    FROM
    rental_data;
    '''

df = pd.read_sql_query(query, engine)
df.head(5)

Unnamed: 0,year,quarter,district_code,district_name,neighborhood_code,neighborhood_name,euros_month,euros_m2_month,num_agreements
0,2014,1,1,Ciutat Vella,1,el Raval,589.55,10.76,356
1,2014,1,1,Ciutat Vella,2,el Barri Gòtic,712.79,10.58,135
2,2014,1,1,Ciutat Vella,3,la Barceloneta,540.71,14.4,130
3,2014,1,1,Ciutat Vella,4,"Sant Pere, Santa Caterina i la Ribera",673.44,11.01,196
4,2014,1,2,Eixample,5,el Fort Pienc,736.09,10.42,203


Generic hypotheses: 
- Rental prices have increased significantly over the last 10 years
- Special events, such as the pandemic, can be detected
- The mean price of rent in Spain is less than the mean price in Barcelona
- It is possible to identify consistent seasonal patterns


## Rental prices have increased significantly over the last 10 years

In [5]:
df_2014 = df[df['year'] == 2014]
df_2023 = df[df['year'] == 2023]


In [6]:
mean_price_2014 = df_2014['euros_month'].mean()
mean_price_2014

648.8260755336617

In [7]:
mean_price_2023 = df_2023['euros_month'].mean()
mean_price_2023

1013.2569008480104

I'm going to use a two-sample t-test for independent samples to compare the two datasets

In [8]:
# Set up the hypothesis
"h0 = 0"  # Assuming no change: The mean monthly rent price in 2023 is equal to the mean in 2014 (null hypothesis)
"h1: mean_price_2023 >= mean_price_2014" # The mean monthly rent price in 2023 is greater than or equal to the mean in 2014

# Perform a one-sample t-test
stat, p_value = stats.ttest_ind(a=df_2014['euros_month'], b=df_2023['euros_month'], equal_var=True)

print('Pvalue =  ', p_value)

Pvalue =   1.6124766446457925e-39


Interpretation:
- The p-value is extremely small (1.61e-39), suggesting that the observed change in average rental prices is statistically significant
- Conclusion: With a p-value less than 0.05, I have enough evidence to reject the null hypothesis. This suggests that the average rental prices have increased significantly, supporting my initial (alternative) hypothesis.

In [9]:
# Calculate the percentage change in average rental prices for check
percentage_change = ((mean_price_2023 - mean_price_2014) / mean_price_2014) * 100
print(percentage_change)

56.16772183741278


## There was an special event in 2020 that decrease the num of agreements

In [10]:
# Filter the data for the year 2020
year_data = df[df['year'] == 2020]['num_agreements']

# Filter the data to exclude the pandemic year (2020) and calculate the mean of the number of agreements across all other years
years_data = df[df['year'] != 2020]['num_agreements']

# Set up the hypothesis
"h0 = 0"  # Assuming no change in the mean number of agreements between 2020 and other years (null hypothesis)
"h1: num_agreements_2020 < num_agreements" # The mean number of agreements in 2020 is less than the mean in other years

# Perform a one-sample t-test
stat, p_value = stats.ttest_ind(a=df[df['year'] == 2020]['num_agreements'], b=df[df['year'] != 2020]['num_agreements'], equal_var=True)

print('Pvalue =  ', p_value)

Pvalue =   0.010050966945856863


Interpretation:

- The p-value is less than 0.05 (0.010), then we can reject the null hypothesis and there is statistical evidence to suggest that the mean number of agreements in 2020 is significantly less than the mean in other years

Possible implications or interpretations:
- This result may indicate a significant change in the number of agreements during the year 2020, potentially influenced by external factors such as the pandemic

In [11]:
# Calculate the percentage change in the number of contracts for check
years_data_mean = df[df['year'] != 2020]['num_agreements'].mean()

percentage_change = ((years_data_mean - year_data.mean()) / years_data_mean) * 100
print("Percentage Change: ", percentage_change)


Percentage Change:  14.509564669362382


### The mean price of rent in Spain is less than the mean price in Barcelona

In this point I perform a one-sample t-test to compare the mean monthly rent price in Barcelona for 2022 against a hypothetical population mean of 877 euros

In [12]:
mean_spain = 877

# Set up the hypothesis
"h0: Mean price rent bcn 2022 >= 1000" # The mean monthly rent price in Barcelona for 2022 is greater than or equal to 877 euros null hypothesis)
"h1: Mean price rent 2022 < 1000 "# The mean monthly rent price in Barcelona for 2022 is less than 1000 euros

stat, p_value = ttest_1samp(df_2023['euros_month'], mean_spain, alternative = "less")

print('Pvalue =  ', p_value)

Pvalue =   0.9999999465615678


Interpretation: 
- The p-value is approximately 1.0. This high p-value suggests weak evidence against the null hypothesis
- Based on the provided p-value, there isn't enough statistical evidence to reject the null hypothesis. The mean monthly rent price in Barcelona for 2022 is not significantly less than 1000 euros, according to the given significance level.

In [13]:
# Calculate the mean of prices in Barcelona to check
df_2022 = df[df['year'] == 2022]
mean_price_2022 = df_2022['euros_month'].mean()
mean_price_2022

925.5287043091209

### It is possible to identify consistent seasonal patterns


Finally, I have decided not to conduct tests on season patterns because I believe it would require the use of more specific and suitable test types to address this matter effectively.