In [58]:
import pandas as pd

df = pd.read_csv('cleaned_diamond_data.csv').rename(columns = {'Unnamed: 0' : 'id'})
df.drop('id',inplace=True,axis=1)
df.head(5)

Unnamed: 0,carat,clarity,color,cut,x dimension,y dimension,z dimension,depth,table,price
0,0.5,IF,D,IDEAL,5.1,5.15,3.2,61.5,56.0,3000.0
1,0.7,VVS2,E,PREMIUM,5.7,5.49,3.52,62.0,59.0,4500.0
2,0.5,SI2,H,GOOD,4.3,4.31,3.9,62.3,56.0,700.0
3,1.2,IF,D,IDEAL,5.9,6.82,4.2,61.7,58.0,10000.0
4,0.9,I1,J,FAIR,6.0,5.49,3.7,61.7,56.0,2400.0


In [59]:
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
cols = df.columns.tolist()

for column in cols:
    if df[column].dtype == 'object':
        df[column] = le.fit_transform(df[column])

In [60]:
from mlxtend.feature_selection import SequentialFeatureSelector

X = df.iloc[:, 0:9]
y = df.iloc[:, -1]

In [61]:
X.head()

Unnamed: 0,carat,clarity,color,cut,x dimension,y dimension,z dimension,depth,table
0,0.5,1,1,2,5.1,5.15,3.2,61.5,56.0
1,0.7,5,2,3,5.7,5.49,3.52,62.0,59.0
2,0.5,3,5,1,4.3,4.31,3.9,62.3,56.0
3,1.2,1,1,2,5.9,6.82,4.2,61.7,58.0
4,0.9,0,7,0,6.0,5.49,3.7,61.7,56.0


In [62]:
y

0        3000.0
1        4500.0
2         700.0
3       10000.0
4        2400.0
         ...   
195      2300.0
196     10400.0
197    150000.0
198      6300.0
199      7500.0
Name: price, Length: 200, dtype: float64

In [63]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=100)


In [64]:
from sklearn.ensemble import RandomForestClassifier

forward_feature_selection = SequentialFeatureSelector(RandomForestClassifier(n_jobs=-1),
                                                      k_features = (1,9),
                                                      forward=True,
                                                      floating=False,
                                                      verbose=2,
                                                      scoring = "accuracy",
                                                      cv = 2).fit(X_train, y_train)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.3s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   9 out of   9 | elapsed:    2.8s finished

[2024-01-22 19:54:51] Features: 1/9 -- score: 0.12[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.2s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   8 out of   8 | elapsed:    2.5s finished

[2024-01-22 19:54:54] Features: 2/9 -- score: 0.1[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.2s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   7 out of   7 | elapsed:    2.0s finished

[2024-01-22 19:54:56] Features: 3/9 -- score: 0.11[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.2s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   6 out of   6 | elapsed:    1.9s finished

[202

In [65]:
forward_feature_selection.k_feature_names_

('carat', 'clarity', 'color', 'x dimension', 'y dimension', 'table')

In [66]:
forward_feature_selection.k_score_ 

0.14

In [67]:
pd.DataFrame.from_dict(forward_feature_selection.get_metric_dict()).T

Unnamed: 0,feature_idx,cv_scores,avg_score,feature_names,ci_bound,std_dev,std_err
1,"(4,)","[0.12, 0.12]",0.12,"(x dimension,)",0.0,0.0,0.0
2,"(0, 4)","[0.12, 0.08]",0.1,"(carat, x dimension)",0.086053,0.02,0.02
3,"(0, 4, 5)","[0.1, 0.12]",0.11,"(carat, x dimension, y dimension)",0.043027,0.01,0.01
4,"(0, 1, 4, 5)","[0.12, 0.1]",0.11,"(carat, clarity, x dimension, y dimension)",0.043027,0.01,0.01
5,"(0, 1, 2, 4, 5)","[0.12, 0.1]",0.11,"(carat, clarity, color, x dimension, y dimension)",0.043027,0.01,0.01
6,"(0, 1, 2, 4, 5, 8)","[0.14, 0.14]",0.14,"(carat, clarity, color, x dimension, y dimensi...",0.0,0.0,0.0
7,"(0, 1, 2, 4, 5, 6, 8)","[0.08, 0.14]",0.11,"(carat, clarity, color, x dimension, y dimensi...",0.12908,0.03,0.03
8,"(0, 1, 2, 4, 5, 6, 7, 8)","[0.12, 0.1]",0.11,"(carat, clarity, color, x dimension, y dimensi...",0.043027,0.01,0.01
9,"(0, 1, 2, 3, 4, 5, 6, 7, 8)","[0.06, 0.1]",0.08,"(carat, clarity, color, cut, x dimension, y di...",0.086053,0.02,0.02


In [68]:
import dash
import dash_core_components as dcc
import dash_html_components as html
from dash.dependencies import Input, Output
from dash import dash_table
import plotly.express as px
import pandas as pd


# Inicjalizacja aplikacji Dash
app = dash.Dash(__name__)

# Opcje dla kontrolki wyboru zmiennej
variable_options = ['carat', 'clarity', 'color', 'cut', 'x dimension', 'y dimension', 'z dimension', 'depth', 'table']

# Układ strony
app.layout = html.Div(children=[
    html.H1(children='Diamond Data Dashboard'),

    # Kontrolka wyboru zmiennej
    dcc.Dropdown(
        id='variable-dropdown',
        options=[{'label': variable, 'value': variable} for variable in variable_options],
        value='carat',
        multi=False,
        style={'width': '50%'},
        clearable=False
    ),

    # Wizualizacja rozkładu wybranej zmiennej
    dcc.Graph(id='distribution-plot'),

    # Tabela z próbką danych
    html.Div([
        html.H3('Próbka danych'),
        dash_table.DataTable(
            id='sample-data-table',
            columns=[{'name': col, 'id': col} for col in df.columns],
            data=df.sample(10).to_dict('records')
        )
    ])
])

# Funkcja do aktualizacji wykresu i tabeli na podstawie wybranej zmiennej
@app.callback(
    [Output('distribution-plot', 'figure'),
     Output('sample-data-table', 'data')],
    [Input('variable-dropdown', 'value')]
)
def update_plots(selected_variable):
    # Aktualizacja wykresu
    fig = px.histogram(df, x=selected_variable, nbins=50, title=f'Rozkład zmiennej "{selected_variable}"')

    # Aktualizacja tabeli
    sample_data = df.sample(10).to_dict('records')

    return fig, sample_data

# Uruchomienie aplikacji
if __name__ == '__main__':
    app.run_server(debug=True, use_reloader=False)


The dash_core_components package is deprecated. Please replace
`import dash_core_components as dcc` with `from dash import dcc`
  import dash_core_components as dcc
The dash_html_components package is deprecated. Please replace
`import dash_html_components as html` with `from dash import html`
  import dash_html_components as html


In [69]:
import matplotlib.pyplot as plt
import seaborn as sns

# Wizualizacja rozkładu zmiennej
plt.figure(figsize=(10, 6))
sns.histplot(df['carat'], kde=True, color='skyblue')
plt.title('Rozkład zmiennej "carat"')
plt.xlabel('Carat')
plt.ylabel('Liczebność')
plt.show()

# Wizualizacja zależności ceny od innych zmiennych
plt.figure(figsize=(12, 8))
sns.scatterplot(x='carat', y='price', data=df, alpha=0.5)
plt.title('Zależność ceny od carat')
plt.xlabel('Carat')
plt.ylabel('Price')
plt.show()

# Wizualizacja liczebności kategorii
plt.figure(figsize=(10, 6))
sns.countplot(x='cut', data=df, palette='viridis')
plt.title('Liczebność kategorii "cut"')
plt.xlabel('Cut')
plt.ylabel('Liczebność')
plt.show()


ModuleNotFoundError: No module named 'seaborn'

In [None]:
import dash
import dash_core_components as dcc
import dash_html_components as html
from dash.dependencies import Input, Output
import dash_table
import plotly.express as px
import pandas as pd
import statsmodels.api as sm

# Wczytaj dane

# Inicjalizacja aplikacji Dash
app = dash.Dash(__name__)

# Opcje dla kontrolki wyboru zmiennych
variable_options = ['carat', 'clarity', 'color', 'cut', 'x dimension', 'y dimension', 'z dimension', 'depth', 'table']

# Układ strony
app.layout = html.Div(children=[
    html.H1(children='Diamond Data Dashboard'),

    # Kontrolka wyboru zmiennych
    dcc.Dropdown(
        id='regression-variables',
        options=[{'label': variable, 'value': variable} for variable in variable_options],
        value=['carat'],  # Domyślne wartości
        multi=True,
        style={'width': '50%'},
        clearable=False
    ),

    # Wykres regresji
    dcc.Graph(id='regression-plot'),

    # Wyniki regresji w tabeli
    html.Div([
        html.H3('Wyniki regresji'),
        dash_table.DataTable(
            id='regression-results-table',
            columns=[{'name': 'Zmienna', 'id': 'Variable'},
                     {'name': 'Wartość współczynnika', 'id': 'Coefficient'},
                     {'name': 'P-value', 'id': 'P-value'}],
            data=[]
        )
    ])
])

# Funkcja do aktualizacji wykresu regresji i tabeli wyników
@app.callback(
    [Output('regression-plot', 'figure'),
     Output('regression-results-table', 'data')],
    [Input('regression-variables', 'value')]
)
def update_regression(selected_variables):
    # Dodaj stałą do danych
    X = sm.add_constant(df[selected_variables])

    # Dopasuj model regresji
    model = sm.OLS(df['price'], X)
    results = model.fit()

    # Wygeneruj wykres regresji
    fig = px.scatter(x=df[selected_variables[0]], y=results.fittedvalues, labels={'x': selected_variables[0], 'y': 'Predicted Price'})
    fig.update_layout(title=f'Regresja ceny od {selected_variables[0]}', xaxis_title=selected_variables[0], yaxis_title='Predicted Price')

    # Przygotuj dane do tabeli wyników
    results_data = [{'Variable': variable, 'Coefficient': results.params[variable], 'P-value': results.pvalues[variable]} for variable in results.params.index]

    return fig, results_data

# Uruchomienie aplikacji
if __name__ == '__main__':
    app.run_server(debug=True, use_reloader=False)


In [None]:
import dash
import dash_core_components as dcc
import dash_html_components as html
from dash.dependencies import Input, Output
import dash_table
import plotly.express as px
import pandas as pd
import statsmodels.api as sm

# Wczytaj dane

# Inicjalizacja aplikacji Dash
app = dash.Dash(__name__)

# Opcje dla kontrolki wyboru zmiennych
variable_options = ['carat', 'clarity', 'color', 'cut', 'x dimension', 'y dimension', 'z dimension', 'depth', 'table']

# Układ strony
app.layout = html.Div(children=[
    html.H1(children='Diamond Data Dashboard'),

    # Kontrolka wyboru zmiennych
    dcc.Dropdown(
        id='regression-variables',
        options=[{'label': variable, 'value': variable} for variable in variable_options],
        value=['carat'],  # Domyślne wartości
        multi=True,
        style={'width': '50%'},
        clearable=False
    ),

    # Wykres regresji
    dcc.Graph(id='regression-plot'),

    # Wyniki regresji w tabeli
    html.Div([
        html.H3('Wyniki regresji'),
        dash_table.DataTable(
            id='regression-results-table',
            columns=[{'name': 'Zmienna', 'id': 'Variable'},
                     {'name': 'Wartość współczynnika', 'id': 'Coefficient'},
                     {'name': 'P-value', 'id': 'P-value'}],
            data=[]
        )
    ])
])

# Funkcja do aktualizacji wykresu regresji i tabeli wyników
@app.callback(
    [Output('regression-plot', 'figure'),
     Output('regression-results-table', 'data')],
    [Input('regression-variables', 'value')]
)
def update_regression(selected_variables):
    # Dodaj stałą do danych
    X = sm.add_constant(df[selected_variables])

    # Dopasuj model regresji
    model = sm.OLS(df['price'], X)
    results = model.fit()

    # Wygeneruj wykres regresji
    fig = px.scatter(x=df[selected_variables[0]], y=results.fittedvalues, labels={'x': selected_variables[0], 'y': 'Predicted Price'})
    fig.update_layout(title=f'Regresja ceny od {selected_variables[0]}', xaxis_title=selected_variables[0], yaxis_title='Predicted Price')

    # Przygotuj dane do tabeli wyników
    results_data = [{'Variable': variable, 'Coefficient': results.params[variable], 'P-value': results.pvalues[variable]} for variable in results.params.index]

    return fig, results_data

# Uruchomienie aplikacji
if __name__ == '__main__':
    app.run_server(debug=True, use_reloader=False)
