In [10]:
import pandas as pd
import numpy as np
from scipy import stats

# Load the dataset from a CSV file (the path points into the Colab sample-data folder).
df = pd.read_csv("/content/sample_data/podaci.csv")

# ------------------------
# DATASET OVERVIEW
# ------------------------
print("Osnovne informacije o skupu podataka:\n")
# DataFrame.info() writes its report to stdout itself and returns None, so it is
# called directly; wrapping it in print() would emit an extra "None" line.
df.info()
print("\nOpisna statistika:\n")
# include="all" also summarizes non-numeric columns, if the file has any.
print(df.describe(include="all"))

# ------------------------
# CORRELATION MATRIX
# ------------------------
print("\nMatrica korelacije (Pearson):\n")
# Restrict the correlation to numeric columns explicitly. Calling corr() on a
# frame that contains text columns raises in pandas >= 2.0, and this selection
# is the same one correlation_with_pvalues() applies, so the correlation table
# and the p-value table always cover the same columns.
corr_matrix = df.select_dtypes(include=[np.number]).corr()
print(corr_matrix)

# ------------------------
# STATISTIČKA ZNAČAJNOST (p-vrijednosti)
# ------------------------
def correlation_with_pvalues(df):
    """Build a symmetric table of Pearson-correlation p-values for numeric columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Input data. Non-numeric columns are ignored.

    Returns
    -------
    pandas.DataFrame
        Square frame indexed and labelled by the numeric column names. Entry
        ``[a, b]`` is the p-value reported by ``scipy.stats.pearsonr`` for the
        pair of columns, computed on the rows where BOTH columns are present.
        Off-diagonal entries are NaN when fewer than two complete rows exist
        for the pair. The diagonal is never tested and keeps its initial
        value of 1.0.
    """
    df_numeric = df.select_dtypes(include=[np.number])  # numeric columns only
    cols = df_numeric.columns
    n_cols = len(cols)
    pvals = pd.DataFrame(np.ones((n_cols, n_cols)), columns=cols, index=cols)

    for i in range(n_cols):
        for j in range(i + 1, n_cols):
            # Drop missing values pairwise (row-wise over the two columns).
            # Dropping them per column would shift the observations against
            # each other, or yield arrays of different lengths that pearsonr
            # rejects.
            pair = df_numeric.iloc[:, [i, j]].dropna()
            if len(pair) < 2:
                # pearsonr needs at least two observations; report "undefined".
                p = np.nan
            else:
                _, p = stats.pearsonr(pair.iloc[:, 0], pair.iloc[:, 1])
            # The test is symmetric, so mirror the value across the diagonal.
            pvals.iloc[i, j] = p
            pvals.iloc[j, i] = p
    return pvals

# Compute the p-value table and show it under its heading; sep="\n" puts the
# table on its own line, exactly as a separate print() call would.
p_values = correlation_with_pvalues(df)
print("\nP-vrijednosti za korelacije:\n", p_values, sep="\n")

# ------------------------
# EXPORT TO EXCEL
# ------------------------
# Both tables go into one workbook, each on its own sheet; the context manager
# saves and closes the file when the block exits.
with pd.ExcelWriter("rezultati_korelacije.xlsx") as writer:
    corr_matrix.to_excel(excel_writer=writer, sheet_name="Korelacija")
    p_values.to_excel(excel_writer=writer, sheet_name="P-vrijednosti")

print("\nRezultati su spremljeni u 'rezultati_korelacije.xlsx'")


Osnovne informacije o skupu podataka:

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3000 entries, 0 to 2999
Data columns (total 9 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   longitude           3000 non-null   float64
 1   latitude            3000 non-null   float64
 2   housing_median_age  3000 non-null   float64
 3   total_rooms         3000 non-null   float64
 4   total_bedrooms      3000 non-null   float64
 5   population          3000 non-null   float64
 6   households          3000 non-null   float64
 7   median_income       3000 non-null   float64
 8   median_house_value  3000 non-null   float64
dtypes: float64(9)
memory usage: 211.1 KB
None

Opisna statistika:

         longitude    latitude  housing_median_age   total_rooms  \
count  3000.000000  3000.00000         3000.000000   3000.000000   
mean   -119.589200    35.63539           28.845333   2599.578667   
std       1.994936     2.12967           12.

# New Section