In [1]:
import pandas as pd
import numpy as np
import statsmodels.api as sm
import seaborn as sns
import matplotlib.pyplot as plt
import statsmodels.formula.api as smf
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.model_selection import train_test_split


In [2]:
# Load the PCA-augmented dataset from the project's data folder
csv_path = 'data/df_with_pca.csv'
df = pd.read_csv(csv_path)

In [3]:
# Summarise the loaded frame: column names, non-null counts and dtypes
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 55033 entries, 0 to 55032
Data columns (total 52 columns):
 #   Column                                                           Non-Null Count  Dtype  
---  ------                                                           --------------  -----  
 0   id_istat                                                         55033 non-null  int64  
 1   id_comune                                                        55032 non-null  object 
 2   cap                                                              55033 non-null  int64  
 3   Provincia                                                        55033 non-null  object 
 4   Regione                                                          55033 non-null  object 
 5   Ripartizione Geografica                                          55033 non-null  object 
 6   id_catastale                                                     55033 non-null  object 
 7   Latitudine                              

Remember to convert the `object`-dtype columns to numeric so that we can also study the relationships between these variables.

In [7]:
# Display the "Media Costo alla Vendita" column
# (pandas abbreviates the long Series when it is printed)
media_vendita = df['Media Costo alla Vendita']
print(media_vendita)

0         525.0
1         650.0
2         875.0
3        1250.0
4         525.0
          ...  
55028    4100.0
55029    3150.0
55030     800.0
55031     825.0
55032     875.0
Name: Media Costo alla Vendita, Length: 55033, dtype: float64


In [4]:
# Hold out 20% of the rows as a test set; the fixed random_state keeps
# the partition identical from one run to the next
splits = train_test_split(df, random_state=1, test_size=0.2)
train, test = splits


In [5]:
# Fit an OLS regression on the training set with the statsmodels formula API:
# "Media Costo alla Vendita" explained by "Indice di occupazione" and PC_Economica.
# Q("...") quotes column names that contain spaces.
formula_vendita = 'Q("Media Costo alla Vendita") ~ Q("Indice di occupazione") + PC_Economica'
model1 = smf.ols(formula=formula_vendita, data=train).fit()
model1.summary()


0,1,2,3
Dep. Variable:,"Q(""Media Costo alla Vendita"")",R-squared:,0.306
Model:,OLS,Adj. R-squared:,0.306
Method:,Least Squares,F-statistic:,9711.0
Date:,"Thu, 10 Apr 2025",Prob (F-statistic):,0.0
Time:,16:52:56,Log-Likelihood:,-375920.0
No. Observations:,44025,AIC:,751800.0
Df Residuals:,44022,BIC:,751900.0
Df Model:,2,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,-699.6722,48.747,-14.353,0.000,-795.217,-604.128
"Q(""Indice di occupazione"")",60.2670,1.047,57.582,0.000,58.216,62.318
PC_Economica,178.9352,1.590,112.555,0.000,175.819,182.051

0,1,2,3
Omnibus:,24516.795,Durbin-Watson:,1.987
Prob(Omnibus):,0.0,Jarque-Bera (JB):,255745.003
Skew:,2.507,Prob(JB):,0.0
Kurtosis:,13.69,Cond. No.,386.0


In [16]:
# Fit an OLS regression on the training set with the statsmodels formula API:
# "Media Costo alla Locazione" explained by the eight PC_Turistica_* components.
# Q("...") quotes column names that contain spaces.
formula_locazione = 'Q("Media Costo alla Locazione") ~ PC_Turistica_1 + PC_Turistica_2 + PC_Turistica_3 + PC_Turistica_4 + PC_Turistica_5 + PC_Turistica_6 + PC_Turistica_7 + PC_Turistica_8'
model1 = smf.ols(formula=formula_locazione, data=train).fit()
model1.summary()


0,1,2,3
Dep. Variable:,"Q(""Media Costo alla Locazione"")",R-squared:,0.558
Model:,OLS,Adj. R-squared:,0.557
Method:,Least Squares,F-statistic:,6933.0
Date:,"Wed, 09 Apr 2025",Prob (F-statistic):,0.0
Time:,16:51:48,Log-Likelihood:,-114180.0
No. Observations:,44026,AIC:,228400.0
Df Residuals:,44017,BIC:,228500.0
Df Model:,8,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,7.2328,0.015,468.847,0.000,7.203,7.263
PC_Turistica_1,0.4887,0.004,115.045,0.000,0.480,0.497
PC_Turistica_2,0.8791,0.006,158.608,0.000,0.868,0.890
PC_Turistica_3,-0.1338,0.007,-19.520,0.000,-0.147,-0.120
PC_Turistica_4,0.8597,0.007,115.236,0.000,0.845,0.874
PC_Turistica_5,-0.4675,0.010,-48.899,0.000,-0.486,-0.449
PC_Turistica_6,-0.2688,0.010,-27.383,0.000,-0.288,-0.250
PC_Turistica_7,0.1173,0.011,10.248,0.000,0.095,0.140
PC_Turistica_8,0.2120,0.012,17.472,0.000,0.188,0.236

0,1,2,3
Omnibus:,9511.431,Durbin-Watson:,1.987
Prob(Omnibus):,0.0,Jarque-Bera (JB):,51921.568
Skew:,0.935,Prob(JB):,0.0
Kurtosis:,7.981,Cond. No.,3.63


In [19]:
# Fit a single-regressor OLS model on the training set with the statsmodels formula API:
# "Costo Massimo alla Vendita" explained by "Indice di spopolamento".
# Q("...") quotes column names that contain spaces.
formula_spopolamento = 'Q("Costo Massimo alla Vendita") ~ Q("Indice di spopolamento")'
model1 = smf.ols(formula=formula_spopolamento, data=train).fit()
model1.summary()


0,1,2,3
Dep. Variable:,"Q(""Costo Massimo alla Vendita"")",R-squared:,0.04
Model:,OLS,Adj. R-squared:,0.04
Method:,Least Squares,F-statistic:,1841.0
Date:,"Wed, 09 Apr 2025",Prob (F-statistic):,0.0
Time:,16:56:37,Log-Likelihood:,-389590.0
No. Observations:,44025,AIC:,779200.0
Df Residuals:,44023,BIC:,779200.0
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,2429.1029,8.038,302.194,0.000,2413.348,2444.858
"Q(""Indice di spopolamento"")",46.6669,1.088,42.905,0.000,44.535,48.799

0,1,2,3
Omnibus:,18503.31,Durbin-Watson:,1.994
Prob(Omnibus):,0.0,Jarque-Bera (JB):,108257.838
Skew:,1.947,Prob(JB):,0.0
Kurtosis:,9.622,Cond. No.,7.39
