In [1]:
import getpass

from sqlalchemy import create_engine, inspect

import pandas as pd
import numpy as np

from scipy import stats
import scipy.stats
from scipy.stats import ttest_1samp


In [2]:
password = getpass.getpass()

········


In [3]:
connection_string = 'mysql+pymysql://root:'+password+'@localhost/rental_prices_bcn'
engine = create_engine(connection_string)

In [4]:
query = '''SELECT *
    FROM
    rental_data;
    '''

df = pd.read_sql_query(query, engine)
df.head(5)

Unnamed: 0,year,quarter,district_code,district_name,neighborhood_code,neighborhood_name,euros_month,euros_m2_month,num_agreements
0,2014,1,1,Ciutat Vella,1,el Raval,589.55,10.76,356
1,2014,1,1,Ciutat Vella,2,el Barri Gòtic,712.79,10.58,135
2,2014,1,1,Ciutat Vella,3,la Barceloneta,540.71,14.4,130
3,2014,1,1,Ciutat Vella,4,"Sant Pere, Santa Caterina i la Ribera",673.44,11.01,196
4,2014,1,2,Eixample,5,el Fort Pienc,736.09,10.42,203


Generic hypotheses: 
- Rental prices have increased significantly over the last 10 years
- It is possible to identify consistent seasonal patterns
- Special events, such as the pandemic, can be detected


## Rental prices have increased significantly over the last 10 years

In [5]:
# grouped the dataset by the 'year' and calculate the av rental price for each year 
average_prices_by_year = df.groupby('year')['euros_month'].mean()

# Set up the hypothesis
h0 = 0  # Assuming no change (null hypothesis)
h1 = 50  # Expected mean change based on the hypothesis

# Perform a one-sample t-test
stat, p_value = ttest_1samp(average_prices_by_year, h1)

print('Stat_Score =  ', stat)
print('Pvalue =  ', p_value)

Stat_Score =   21.629378790060944
Pvalue =   4.545854810856916e-09


Interpretation:

- The t-statistic of 21.63 is substantial, indicating a significant deviation from the null hypothesis.
- The p-value is extremely small (4.55e-09), suggesting that the observed change in average rental prices is statistically significant
- Conclusion: With a p-value less than 0.05, I have enough evidence to reject the null hypothesis. This suggests that the average rental prices have increased significantly, supporting my initial (alternative) hypothesis of more than a 50% increase.

In [6]:
# Calculate the percentage change in average rental prices for check
percentage_change = ((average_prices_by_year.iloc[-1] - average_prices_by_year.iloc[0]) / average_prices_by_year.iloc[0]) * 100
percentage_change 

56.167721837412756

In [7]:
confidence_level = 0.95
degrees_freedom = len(average_prices_by_year) - 1  
sample_mean = np.mean(average_prices_by_year)
sample_standard_error = scipy.stats.sem(average_prices_by_year)

# Calculate the confidence interval
confidence_interval = scipy.stats.t.interval(confidence_level, 
                                             degrees_freedom, 
                                             sample_mean, 
                                             sample_standard_error)

print('The sample mean is', sample_mean)
print('Confidence interval is', confidence_interval)
print('Standard error of the sample mean is', sample_standard_error)


The sample mean is 828.3629938238668
Confidence interval is (746.9561518252287, 909.7698358225048)
Standard error of the sample mean is 35.986377666174


The confidence interval (746.96, 909.77) provides a range within which you are 95% confident the true mean change lies. As it doesn't include 0, it supports the evidence from the t-test.

## There was an special event in 2020 that decrease the num of agreements

In [8]:
# Filter the data for the year of the COVID pandemic
pandemic_year_data = df[df['year'] == 2020]['num_agreements']

# Filter the data to exclude the pandemic year (2020) and calculate the mean of the number of agreements across all other years
years_data = df[df['year'] != 2020]['num_agreements']
years_data_mean = years_data.mean()


# Set up the hypothesis
h0_percentage_change = 0  # Assuming no change (null hypothesis)
h1_percentage_change = -10  # Expected mean change based on the hypothesis

# Perform a one-sample t-test
stat_contracts, p_value_contracts = ttest_1samp(pandemic_year_data, h1_percentage_change)

print('Stat_Score =  ', stat_contracts)
print('Pvalue =  ', p_value_contracts)


Stat_Score =   19.08653642554241
Pvalue =   3.167520199307874e-53


Interpretation:

- The t-statistic of 19.08 is substantial, indicating a significant deviation from the null hypothesis.
- The p-value is extremely small (3.16e-53), suggesting that the observed change in the number of agreements is statistically significant.
- Conclusion: With a p-value less than 0.05, there is enough evidence to reject the null hypothesis. This indicates that the number of agreements decreased significantly in 2020, supporting the alternative hypothesis of a more than 10% decrease.

In [9]:
# Calculate the percentage change in the number of contracts for check
percentage_change = ((pandemic_year_data.mean() - years_data_mean) / years_data_mean) * 100
percentage_change 

-14.509564669362382

In [10]:
confidence_level = 0.95
degrees_freedom = len(pandemic_year_data) - 1  
sample_mean = np.mean(pandemic_year_data)
sample_standard_error = scipy.stats.sem(pandemic_year_data)

# Calculate the confidence interval
confidence_interval = scipy.stats.t.interval(confidence_level, 
                                             degrees_freedom, 
                                             sample_mean, 
                                             sample_standard_error)

print('The sample mean is', sample_mean)
print('Confidence interval is', confidence_interval)
print('Standard error of the sample mean is', sample_standard_error)


The sample mean is 133.94520547945206
Confidence interval is (119.10198376106267, 148.78842719784146)
Standard error of the sample mean is 7.541714340943416


### It is possible to identify consistent seasonal patterns

In [11]:
# Group the dataset by 'quarter' and calculate the average number of agreements for each quarter
average_agreements_by_quarter = df.groupby('quarter')['num_agreements'].mean()
print(average_agreements_by_quarter)

# Set up the hypothesis
h0 = 0  # Assuming no change (null hypothesis)
h1 = 10 # Expected mean change based on the hypothesis

# Perform a one-sample t-test
stat, p_value = ttest_1samp(average_agreements_by_quarter, h1)

print('Stat_Score =  ', stat)
print('Pvalue =  ', p_value)


quarter
1    151.082305
2    145.312757
3    158.172256
4    164.821612
Name: num_agreements, dtype: float64
Stat_Score =   34.170499873974435
Pvalue =   5.510352198218045e-05


In [12]:
confidence_level = 0.95
degrees_freedom = len(average_agreements_by_quarter) - 1  
sample_mean = np.mean(average_agreements_by_quarter)
sample_standard_error = scipy.stats.sem(average_agreements_by_quarter)

# Calculate the confidence interval
confidence_interval = scipy.stats.t.interval(confidence_level, 
                                             degrees_freedom, 
                                             sample_mean, 
                                             sample_standard_error)

print('The sample mean is', sample_mean)
print('Confidence interval is', confidence_interval)
print('Standard error of the sample mean is', sample_standard_error)

The sample mean is 154.84723254396755
Confidence interval is (141.35698388678324, 168.33748120115186)
Standard error of the sample mean is 4.238955621901475
