In [20]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [21]:
import pandas as pd
import numpy as np

df = pd.read_csv("/content/drive/MyDrive/PAD/projektowa praca domowa/cleaned_data.csv")
df.head()

Unnamed: 0,carat,clarity,color,cut,x_dimension,y_dimension,z_dimension,depth,table,price
0,0.5,if,d,ideal,5.1,5.15,3.2,61.5,56.0,3000.0
1,0.7,vvs2,e,premium,5.7,5.49,3.52,62.0,59.0,4500.0
2,0.5,si2,h,good,4.3,4.31,3.9,62.3,56.0,700.0
3,1.2,if,d,ideal,5.9,6.82,4.2,61.7,58.0,10000.0
4,0.9,i1,j,fair,6.0,5.49,3.7,61.7,56.0,2400.0


In [22]:
from matplotlib import pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_val_score
import statsmodels.formula.api as smf
import patsy
import statsmodels.api as sm
import plotly.express as px

# Wizualizacja rozkładu zmiennych

## Heatmapa

In [23]:
corr_matrix = df.corr()

fig = px.imshow(corr_matrix, text_auto=True, aspect="auto",
                labels=dict(color="Współczynnik korelacji"),
                x=corr_matrix.columns, y=corr_matrix.columns, width=1000, height=1000)

fig.update_layout(title='Mapa ciepła korelacji między zmiennymi')
fig.show()





## Wykres pudełkowy przed

In [24]:
rows = 2
cols = 5

fig = make_subplots(rows=rows, cols=cols, subplot_titles=df.columns[:10])

for i, column in enumerate(df.columns[:10]):
    box = px.box(df, y=column)

    row = (i // cols) + 1
    col = (i % cols) + 1
    for trace in box.data:
        fig.add_trace(trace, row=row, col=col)

fig.update_layout(height=1000, width=1000, title_text="Box Plots for Each Column")
fig.show()


In [37]:
def remove_outliers(df, column_names):
    """Usuwa wartości odstające z określonych kolumn."""
    for column in column_names:
        Q1 = df[column].quantile(0.25)
        Q3 = df[column].quantile(0.75)
        IQR = Q3 - Q1
        lower_bound = Q1 - 1.5 * IQR
        upper_bound = Q3 + 1.5 * IQR
        df[column] = np.where(df[column] > upper_bound, Q3, df[column])
        df[column] = np.where(df[column] < lower_bound, Q1, df[column])
    return df

In [25]:
#y_dimension
Q1_y_dimension = np.percentile(df['y_dimension'], 25, method='linear')
Q3_y_dimension = np.percentile(df['y_dimension'], 75, method='linear')
IQR_y_dimension = Q3_y_dimension - Q1_y_dimension
upper_treshold_y_dimension = Q3_y_dimension + 1.5 * IQR_y_dimension
lower_treshold_y_dimension = Q1_y_dimension - 1.5 * IQR_y_dimension


#z_dimension
Q1_z_dimension = np.percentile(df['z_dimension'], 25, method='linear')
Q3_z_dimension = np.percentile(df['z_dimension'], 75, method='linear')
IQR_z_dimension = Q3_z_dimension - Q1_z_dimension
upper_treshold_z_dimension = Q3_z_dimension + 1.5 * IQR_z_dimension
lower_treshold_z_dimension = Q1_z_dimension - 1.5 * IQR_z_dimension

#depth
Q1_depth = np.percentile(df['depth'], 25, method='linear')
Q3_depth = np.percentile(df['depth'], 75, method='linear')
IQR_depth = Q3_depth - Q1_depth
upper_treshold_depth = Q3_depth + 1.5 * IQR_depth
lower_treshold_depth = Q1_depth - 1.5 * IQR_depth

#price
Q1_price = np.percentile(df['price'], 25, method='linear')
Q3_price = np.percentile(df['price'], 75, method='linear')
IQR_price = Q3_price - Q1_price
upper_treshold_price = Q3_price + 1.5 * IQR_price
lower_treshold_price = Q1_price - 1.5 * IQR_price

# usuwanie wartości odstających
df['y_dimension'] = df['y_dimension'].apply(lambda row: Q3_y_dimension if row > Q3_y_dimension else Q1_y_dimension if row < Q1_y_dimension else row)
df['z_dimension'] = df['z_dimension'].apply(lambda row: Q3_z_dimension if row > Q3_z_dimension else Q1_z_dimension if row < Q1_z_dimension else row)
df['depth'] = df['depth'].apply(lambda row: Q3_depth if row > Q3_depth else Q1_depth if row < Q1_depth else row)
df['price'] = df['price'].apply(lambda row: Q3_price if row > Q3_price else Q1_price if row < Q1_price else row)

## Wykres pudełkowy po usunięciu wartości odstających

In [42]:
rows = 2
cols = 5

fig = make_subplots(rows=rows, cols=cols, subplot_titles=df.columns[:10])

for i, column in enumerate(df.columns[:10]):

    box = px.box(df, y=column)
    row = (i // cols) + 1
    col = (i % cols) + 1

    for trace in box.data:
        fig.add_trace(trace, row=row, col=col)

fig.update_layout(height=1000, width=1000, title_text="Box Plots for Each Column")
fig.show()


## Wykres punktowy zależności ceny od innych zmiennych

In [45]:
cols = 5
rows = 2

fig = make_subplots(rows=rows, cols=cols, subplot_titles=[f'price({col})' for col in df.columns])

for i, col in enumerate(df.columns):
    row = (i // cols) + 1
    col_index = (i % cols) + 1
    fig.add_trace(go.Scatter(x=df[col], y=df['price'], mode='markers', name=col), row=row, col=col_index)

fig.update_layout(height=900, width=1200, title_text="Zależności ceny od poszczególnych atrybutów", showlegend=False)

fig.show()


## Histogram-liczebność

In [41]:
cols = 5
rows = 2

fig = make_subplots(rows=rows, cols=cols, subplot_titles=[f'{col}' for col in df.columns])

for i, column in enumerate(df.columns):
    row = (i // cols) + 1
    col = (i % cols) + 1
    fig.add_trace(go.Histogram(x=df[column], name=column), row=row, col=col)

fig.update_layout(height=900, width=1000, title_text="Liczebność kategorii dla poszczególnych atrybutów", showlegend=False)
#fig.update_traces(opacity=0.75)

fig.show()


#Model regresji

In [29]:
import plotly.graph_objects as go
from scipy import stats

## Liczenie statystyk dla price ~ z każdym atrybutem
Zwracam uwagę na wartość 'Adj. R-squared', wybieram atrybut dla którego wartość ta, będzie największa.

In [30]:
for column in df.columns:
    if column != 'price':
        formula = f"price ~ {column}"
        model = smf.ols(formula=formula, data=df).fit()

        print(f"Model: price ~ {column}")
        print(model.summary())
        print("\n---\n")

Model: price ~ carat
                            OLS Regression Results                            
Dep. Variable:                  price   R-squared:                       0.436
Model:                            OLS   Adj. R-squared:                  0.433
Method:                 Least Squares   F-statistic:                     152.9
Date:                Sat, 03 Feb 2024   Prob (F-statistic):           2.11e-26
Time:                        15:41:25   Log-Likelihood:                -1774.2
No. Observations:                 200   AIC:                             3552.
Df Residuals:                     198   BIC:                             3559.
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept   1459.5125    314.26

## price ~
Zaczynam od znalezienia pojedynczej zmiennej, która najlepiej przewiduje price

In [31]:
# do przechowywania wyników modeli
results = {}

# iteracja przez wszystkie kolumny
for column in df.columns:
    if column != 'price':

        formula = f"price ~ {column}"
        model = smf.ols(formula=formula, data=df).fit()
        # zapisanie w słowniku
        results[column] = model.rsquared_adj

# sortowanie
sorted_results = sorted(results.items(), key=lambda x: x[1], reverse=True)

for column, adj_r_squared in sorted_results:
    print(f"Model: price ~ {column}, Adj. R-squared: {adj_r_squared:.4f}")

Model: price ~ x_dimension, Adj. R-squared: 0.7048
Model: price ~ z_dimension, Adj. R-squared: 0.4531
Model: price ~ carat, Adj. R-squared: 0.4329
Model: price ~ y_dimension, Adj. R-squared: 0.2780
Model: price ~ table, Adj. R-squared: 0.0957
Model: price ~ clarity, Adj. R-squared: 0.0737
Model: price ~ cut, Adj. R-squared: 0.0186
Model: price ~ color, Adj. R-squared: 0.0172
Model: price ~ depth, Adj. R-squared: 0.0141


Wybieram X_DIMENSION = 0,70

Szukam następnych zmiennych.

In [13]:
results = {}

for column in df.columns:
    if column not in ['price', 'x_dimension']:
        formula = f"price ~ x_dimension + {column}"
        model = smf.ols(formula=formula, data=df).fit()
        results[f"price ~ x_dimension + {column}"] = model.rsquared_adj

sorted_results = sorted(results.items(), key=lambda x: x[1], reverse=True)

for formula, adj_r_squared in sorted_results:
    print(f"Model: {formula}, Adj. R-squared: {adj_r_squared:.4f}")

Model: price ~ x_dimension + clarity, Adj. R-squared: 0.7403
Model: price ~ x_dimension + z_dimension, Adj. R-squared: 0.7343
Model: price ~ x_dimension + cut, Adj. R-squared: 0.7182
Model: price ~ x_dimension + table, Adj. R-squared: 0.7148
Model: price ~ x_dimension + color, Adj. R-squared: 0.7110
Model: price ~ x_dimension + y_dimension, Adj. R-squared: 0.7091
Model: price ~ x_dimension + depth, Adj. R-squared: 0.7041
Model: price ~ x_dimension + carat, Adj. R-squared: 0.7033


In [14]:
results = {}

for column in df.columns:
    if column not in ['price', 'x_dimension', 'clarity']:
        formula = f"price ~ x_dimension + clarity + {column}"
        model = smf.ols(formula=formula, data=df).fit()
        results[f"price ~ x_dimension + clarity + {column}"] = model.rsquared_adj

sorted_results = sorted(results.items(), key=lambda x: x[1], reverse=True)

for formula, adj_r_squared in sorted_results:
    print(f"Model: {formula}, Adj. R-squared: {adj_r_squared:.4f}")

Model: price ~ x_dimension + clarity + z_dimension, Adj. R-squared: 0.7580
Model: price ~ x_dimension + clarity + cut, Adj. R-squared: 0.7485
Model: price ~ x_dimension + clarity + y_dimension, Adj. R-squared: 0.7448
Model: price ~ x_dimension + clarity + table, Adj. R-squared: 0.7432
Model: price ~ x_dimension + clarity + color, Adj. R-squared: 0.7421
Model: price ~ x_dimension + clarity + carat, Adj. R-squared: 0.7399
Model: price ~ x_dimension + clarity + depth, Adj. R-squared: 0.7392


## Budowa modelu regresji

Budowa modelu regresji ceny od pozostałych zmiennych. Zmienne wybrane selekcją postępującą.

Zaczynam od modelu z jedną zmienną, która ma największy współczynnik (Adj. R-squared) przy przewidywaniu price, a następnie dodaję do modelu kolejne zmienne, które najbardziej poprawiają Adj. R-squared, aż do wykorzystania wszystkich dostępnych atrybutów.

In [39]:
df = remove_outliers(df, ['y_dimension', 'z_dimension', 'depth', 'price'])

In [44]:
    import statsmodels.formula.api as smf

    vars = [col for col in df.columns if col != 'price']

    # Inicjalizacja zmiennych do przechowywania wyników
    best_adj_r_squared = -float("inf")
    best_formula = ""
    model_list = []  # Przechowywanie modeli
    while vars:
        best_current_var = None
        for var in vars:
            # Budowanie formuły z aktualnie najlepszym zestawem zmiennych
            formula = f"price ~ {best_formula} + {var}" if best_formula else f"price ~ {var}"
            model = smf.ols(formula=formula, data=df).fit()
            # Aktualizacja najlepszego modelu, jeśli obecny jest lepszy
            if model.rsquared_adj > best_adj_r_squared:
                best_adj_r_squared = model.rsquared_adj
                best_current_var = var
        # Jeśli znaleziono lepszą zmienną, aktualizujemy najlepszą formułę i kontynuujemy
        if best_current_var:
            best_formula = f"{best_formula} + {best_current_var}" if best_formula else best_current_var
            vars.remove(best_current_var)
            model_list.append((best_formula, best_adj_r_squared))
        else:
            break
    if best_formula:
        best_model_formula = f"price ~ {best_formula}"
        best_model = smf.ols(formula=best_model_formula, data=df).fit()
        print(f"Najlepszy model: {best_model_formula}, Adj. R-squared: {best_adj_r_squared:.4f}")
        print(best_model.summary())
    else:
        print('xd')

Najlepszy model: price ~ x_dimension + clarity + z_dimension + cut + y_dimension + table, Adj. R-squared: 0.7694
                            OLS Regression Results                            
Dep. Variable:                  price   R-squared:                       0.784
Model:                            OLS   Adj. R-squared:                  0.769
Method:                 Least Squares   F-statistic:                     52.08
Date:                Sat, 03 Feb 2024   Prob (F-statistic):           7.73e-55
Time:                        17:07:07   Log-Likelihood:                -1677.9
No. Observations:                 200   AIC:                             3384.
Df Residuals:                     186   BIC:                             3430.
Df Model:                          13                                         
Covariance Type:            nonrobust                                         
                        coef    std err          t      P>|t|      [0.025      0.975]
-----------

In [18]:
import statsmodels.formula.api as smf

vars = [col for col in df.columns if col != 'price']

# Inicjalizacja zmiennych do przechowywania wyników
best_adj_r_squared = -float("inf")
best_formula = ""

model_list = [] # przechowywanie modeli

while vars:
    best_current_var = None
    for var in vars:
        # Budowanie formuły z aktualnie najlepszym zestawem zmiennych
        formula = f"price ~ {best_formula} + {var}" if best_formula else f"price ~ {var}"
        model = smf.ols(formula=formula, data=df).fit()
        # Aktualizacja najlepszego modelu, jeśli obecny jest lepszy
        if model.rsquared_adj > best_adj_r_squared:
            best_adj_r_squared = model.rsquared_adj
            best_current_var = var

    # Jeśli znaleziono lepszą zmienną, aktualizujemy najlepszą formułę i kontynuujemy
    if best_current_var:
        best_formula = f"{best_formula} + {best_current_var}" if best_formula else best_current_var
        vars.remove(best_current_var)
        model_list.append((best_formula, best_adj_r_squared))
    else:
        break

for formula, adj_r_squared in model_list:
    print(f"Model: price ~ {formula}, Adj. R-squared: {adj_r_squared:.4f}")


Model: price ~ x_dimension, Adj. R-squared: 0.7048
Model: price ~ x_dimension + clarity, Adj. R-squared: 0.7403
Model: price ~ x_dimension + clarity + z_dimension, Adj. R-squared: 0.7581
Model: price ~ x_dimension + clarity + z_dimension + cut, Adj. R-squared: 0.7660
Model: price ~ x_dimension + clarity + z_dimension + cut + y_dimension, Adj. R-squared: 0.7683
Model: price ~ x_dimension + clarity + z_dimension + cut + y_dimension + table, Adj. R-squared: 0.7694


In [32]:
print("Model P Values:", model.pvalues.values)
print("Model Coef:", model.params.values)
print("Model Std Errs:", model.bse.values)

Model P Values: [9.89569686e-01 3.26882644e-03 3.05262902e-01 2.18444620e-01
 1.81335912e-02 2.94533685e-04 4.73196996e-01 4.16203263e-02
 1.69751602e-02 9.73938162e-02 5.60207331e-27 4.94294818e-04
 6.44332461e-02 1.84515935e-01 5.27806501e-01]
Model Coef: [ 494.44002739  968.78892224  309.75996398  328.89233117  691.22968898
 1039.1154963   182.48299876  582.68763703  657.86348712  454.41744194
 1956.78665195 1930.05951145  615.05800336   88.68115388 -380.99715607]
Model Std Errs: [37770.86621776   325.087909     301.30473056   266.33741039
   289.93569667   281.58348817   253.88663182   284.01984909
   273.0807504    272.74978959   154.08628556   544.18781303
   330.62667064    66.58005168   602.31639827]


## Wizualizacja modelu regresji

Fitted values do pierwotnych cen diamentów

In [37]:
import plotly.graph_objects as go

df["fitted"] = model.fittedvalues
fig = go.Figure()

fig.add_trace(go.Scatter(
    x=df["fitted"], y=df["price"], mode="markers", name="Rzeczywiste ceny",
    marker=dict(color="RoyalBlue")))

fig.add_trace(go.Scatter(
    x=df["fitted"], y=df["fitted"], mode="lines", name="Model regresji",
    line=dict(color="FireBrick")))

fig.update_layout(title="Linia regresji: fitted vs price",
                  xaxis_title="fitted",
                  yaxis_title="price",
                  legend=dict(yanchor="top", y=0.99, xanchor="left", x=0.01))

fig.show()
