In [37]:
import pandas as pd

df = pd.read_csv('cleaned_diamond_data.csv').rename(columns = {'Unnamed: 0' : 'id'})
df.drop('id',inplace=True,axis=1)
df.head(5)

Unnamed: 0,carat,clarity,color,cut,x,y,z,depth,table,price
0,0.5,IF,D,IDEAL,5.1,5.15,3.2,61.5,56.0,3000.0
1,0.7,VVS2,E,PREMIUM,5.7,5.49,3.52,62.0,59.0,4500.0
2,0.5,SI2,H,GOOD,4.3,4.31,3.9,62.3,56.0,700.0
3,1.2,IF,D,IDEAL,5.9,6.82,4.2,61.7,58.0,10000.0
4,0.9,I1,J,FAIR,6.0,5.49,3.7,61.7,56.0,2400.0


In [28]:
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
cols = df.columns.tolist()

for column in cols:
    if df[column].dtype == 'object':
        df[column] = le.fit_transform(df[column])

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.feature_selection import RFE
from sklearn.metrics import mean_squared_error

# Podział na zbiór treningowy i testowy
y = df['price']
X = df.drop(columns=['price'])
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Budowa modelu regresji
model = LinearRegression()

# Eliminacja wsteczna - RFE (Recursive Feature Elimination)
selector = RFE(model, step=1)
selector = selector.fit(X_train, y_train)

# Wybrane istotne zmienne
selected_features = X_train.columns[selector.support_]
X_train_selected = X_train[selected_features]
X_test_selected = X_test[selected_features]

# Dopasowanie modelu na wybranych zmiennych
model.fit(X_train_selected, y_train)

# Predykcja na zbiorze testowym
y_pred = model.predict(X_test_selected)

# Ocena modelu
mse = mean_squared_error(y_test, y_pred)
print(f'MSE: {mse}')

print("Współczynniki modelu:")
for feature, coef in zip(X.columns, model.coef_):
    print(f"{feature}: {coef}")


In [43]:
import statsmodels.formula.api as smf

model = smf.ols(formula="price ~ carat + C(clarity) + C(color) +  C(cut) + table + z + y + x ", data = df).fit()
model.summary()

0,1,2,3
Dep. Variable:,price,R-squared:,0.19
Model:,OLS,Adj. R-squared:,0.095
Method:,Least Squares,F-statistic:,1.992
Date:,"Mon, 29 Jan 2024",Prob (F-statistic):,0.00851
Time:,19:29:51,Log-Likelihood:,-2379.9
No. Observations:,200,AIC:,4804.0
Df Residuals:,178,BIC:,4876.0
Df Model:,21,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,-2.181e+05,1.32e+05,-1.647,0.101,-4.79e+05,4.31e+04
C(clarity)[T.IF],-8943.8253,1.12e+04,-0.797,0.426,-3.11e+04,1.32e+04
C(clarity)[T.SI1],-1.369e+04,1.03e+04,-1.325,0.187,-3.41e+04,6698.477
C(clarity)[T.SI2],-354.3201,9572.965,-0.037,0.971,-1.92e+04,1.85e+04
C(clarity)[T.VVS1],-9044.0733,1.03e+04,-0.880,0.380,-2.93e+04,1.12e+04
C(clarity)[T.VVS2],-2950.8978,1.01e+04,-0.293,0.770,-2.28e+04,1.69e+04
C(color)[T.D],3.557e+04,1.72e+04,2.073,0.040,1714.123,6.94e+04
C(color)[T.E],-449.1752,1.65e+04,-0.027,0.978,-3.31e+04,3.22e+04
C(color)[T.F],6300.9570,1.59e+04,0.397,0.692,-2.5e+04,3.76e+04

0,1,2,3
Omnibus:,232.76,Durbin-Watson:,1.975
Prob(Omnibus):,0.0,Jarque-Bera (JB):,7462.153
Skew:,4.854,Prob(JB):,0.0
Kurtosis:,31.305,Cond. No.,2860.0


In [10]:
import pandas as pd
import dash
import dash_core_components as dcc
import dash_html_components as html
from dash.dependencies import Input, Output
from dash import dash_table
import plotly.express as px
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.feature_selection import RFE

# Wczytanie danych
df = pd.read_csv('cleaned_diamond_data.csv').rename(columns={'Unnamed: 0': 'id'})
df.drop('id', inplace=True, axis=1)

# Label Encoding dla danych kategorycznych
le = LabelEncoder()
cols = df.columns.tolist()
for column in cols:
    if df[column].dtype == 'object':
        df[column] = le.fit_transform(df[column])

# Podział na zbiór treningowy i testowy
y = df['price']
X = df.drop(columns=['price'])
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Budowa modelu regresji
model = LinearRegression()

# Eliminacja wsteczna - RFE (Recursive Feature Elimination)
selector = RFE(model, step=1)
selector = selector.fit(X_train, y_train)

# Wybrane istotne zmienne
selected_features = X_train.columns[selector.support_]
X_train_selected = X_train[selected_features]
X_test_selected = X_test[selected_features]

# Dopasowanie modelu na wybranych zmiennych
model.fit(X_train_selected, y_train)

# Inicjalizacja aplikacji Dash
app = dash.Dash(__name__)

# Opcje dla kontrolki wyboru kategorii
category_options = ['Rozkład zmiennych', 'Zależność ceny od innych zmiennych', 'Liczebność kategorii', 'Wizualizacja modelu regresji']

# Opcje dla kontrolki wyboru zmiennej
variable_options = ['carat', 'clarity', 'color', 'cut', 'x dimension', 'y dimension', 'z dimension', 'depth', 'table']

# Układ strony
app.layout = html.Div(children=[
    html.H1(children='Diamond Data Dashboard'),

    # Kontrolka wyboru kategorii
    dcc.Dropdown(
        id='category-dropdown',
        options=[{'label': category, 'value': category} for category in category_options],
        value='Rozkład zmiennych',
        multi=False,
        style={'width': '50%'},
        clearable=False
    ),

    # Kontrolka wyboru zmiennej
    dcc.Dropdown(
        id='variable-dropdown',
        options=[{'label': variable, 'value': variable} for variable in variable_options],
        value='carat',
        multi=False,
        style={'width': '50%'},
        clearable=False
    ),

    # Wykres
    dcc.Graph(id='visualization-plot'),

    # Tabela z próbką danych
    html.Div([
        html.H3('Próbka danych'),
        dash_table.DataTable(
            id='sample-data-table',
            columns=[{'name': col, 'id': col} for col in df.columns],
            data=df.sample(10).to_dict('records')
        )
    ])
])


# Funkcja do aktualizacji wykresu i tabeli na podstawie wybranej kategorii i zmiennej
@app.callback(
    [Output('visualization-plot', 'figure'),
     Output('sample-data-table', 'data')],
    [Input('category-dropdown', 'value'),
     Input('variable-dropdown', 'value')]
)
def update_plots(selected_category, selected_variable):
    if selected_category == 'Rozkład zmiennych':
        # Aktualizacja wykresu - Rozkład zmiennej
        fig = px.histogram(df, x=selected_variable, nbins=50, title=f'Rozkład zmiennej "{selected_variable}"')
    elif selected_category == 'Zależność ceny od innych zmiennych':
        # Aktualizacja wykresu - Zależność ceny od innych zmiennych
        if selected_variable == 'carat':
            fig = px.scatter(x=X_test_selected[selected_variable], y=y_test / X_test_selected[selected_variable],
                             labels={'x': selected_variable, 'y': 'Cena / Carat'},
                             title=f'Zależność ceny od zmiennej "{selected_variable}"',
                             mode='markers', template='plotly_white')
        else:
            y_pred = model.predict(X_test_selected)
            fig = px.scatter(x=y_test, y=y_pred, labels={'x': 'Rzeczywista cena', 'y': 'Przewidziana cena'},
                             title='Zależność ceny od innych zmiennych', mode='markers', template='plotly_white')
            fig.add_trace(px.scatter(x=[min(y_test), max(y_test)], y=[min(y_test), max(y_test)], mode='lines',
                                     name='Idealna zależność', template='plotly_white').data[0])
    elif selected_category == 'Liczebność kategorii':
        # Aktualizacja wykresu - Liczebność kategorii
        fig = px.bar(df[selected_variable].value_counts(), x=df[selected_variable].unique(),
                     title=f'Liczebność kategorii "{selected_variable}"', labels={'x': selected_variable, 'y': 'Liczba'},
                     template='plotly_white')
    elif selected_category == 'Wizualizacja modelu regresji':
        # Aktualizacja wykresu - Wizualizacja modelu regresji
        y_pred = model.predict(X_test_selected)
        fig = px.scatter(x=y_test, y=y_pred, labels={'x': 'Rzeczywista cena', 'y': 'Przewidziana cena'},
                         title='Wizualizacja modelu regresji', mode='markers', template='plotly_white')
        fig.add_trace(px.scatter(x=[min(y_test), max(y_test)], y=[min(y_test), max(y_test)], mode='lines',
                                 name='Idealna zależność', template='plotly_white').data[0])

    # Aktualizacja tabeli
    sample_data = df.sample(10).to_dict('records')

    return fig, sample_data


# Uruchomienie aplikacji
if __name__ == '__main__':
    app.run_server(debug=True, use_reloader=False)


[1;31m---------------------------------------------------------------------------[0m
[1;31mTypeError[0m                                 Traceback (most recent call last)
[1;31mTypeError[0m: scatter() got an unexpected keyword argument 'mode'

[1;31m---------------------------------------------------------------------------[0m
[1;31mTypeError[0m                                 Traceback (most recent call last)
[1;31mTypeError[0m: scatter() got an unexpected keyword argument 'mode'

[1;31m---------------------------------------------------------------------------[0m
[1;31mTypeError[0m                                 Traceback (most recent call last)
[1;31mTypeError[0m: scatter() got an unexpected keyword argument 'mode'

