In [1]:
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
from scipy import stats

# Load the dataset
df = pd.read_csv('amazon_products.csv')

# Coerce 'sales_volume' to a numeric dtype if needed; values that cannot be
# parsed as numbers become NaN (errors='coerce').
sales_volume_raw = df['sales_volume']
df['sales_volume'] = pd.to_numeric(sales_volume_raw, errors='coerce')

# 4.1. Distribution of each variable
# 4.1.1. Categorical variables: one bar chart of value frequencies for every
# object-dtype column.
categorical_cols = df.select_dtypes(include=['object']).columns
for col in categorical_cols:
    frequencies = df[col].value_counts()
    # No DataFrame is passed to px.bar here, so the axes are keyed 'x' and 'y'.
    axis_labels = {'x': col, 'y': 'Number of Observations'}
    fig = px.bar(x=frequencies.index, y=frequencies.values, labels=axis_labels)
    fig.update_layout(title=f'Distribution of {col}')
    fig.show()

# 4.1.2. Numeric variables
# For every numeric column: draw a histogram, list extreme outliers and run
# a normality test.
numeric_cols = df.select_dtypes(include=['number']).columns
# scipy.stats.normaltest builds on skewtest, which needs at least 8 observations.
MIN_NORMALTEST_SAMPLES = 8
for col in numeric_cols:
    # Histogram. No `labels` argument is needed: when a DataFrame is passed,
    # the x-axis title already defaults to the column name.
    fig = px.histogram(df, x=col, nbins=50)
    fig.update_layout(title=f'Histogram of {col}')
    fig.show()

    # Outliers: rows lying more than 5 standard deviations from the column mean
    mean = df[col].mean()
    std_dev = df[col].std()
    outliers = df[np.abs(df[col] - mean) > 5 * std_dev]
    print(f'Outliers for {col}:')
    print(outliers)

    # Normality test. A column can be (almost) entirely NaN, e.g. after the
    # errors='coerce' conversion, so skip it instead of letting the test fail
    # and abort the whole cell.
    observed = df[col].dropna()
    if len(observed) < MIN_NORMALTEST_SAMPLES:
        print(f'Normality test for {col}: skipped (only {len(observed)} non-null observations)')
        continue
    _, p = stats.normaltest(observed)
    print(f'Normality test for {col}: p-value = {p}')

# 4.2. Plot of each variable's relationship with 'sales_volume'
# When a DataFrame is passed, Plotly Express matches the keys of `labels`
# against column names, so the axis title must be keyed by 'sales_volume'
# (keys such as 'x'/'y' are silently ignored).
sales_labels = {'sales_volume': 'Sales Volume'}

# 4.2.1. Categorical variables: box plot of sales volume per category
for col in categorical_cols:
    fig = px.box(df, x=col, y='sales_volume', labels=sales_labels)
    fig.update_layout(title=f'Boxplot of Sales Volume by {col}')
    fig.show()

# 4.2.2. Numeric variables: scatter plot of sales volume against each column
for col in numeric_cols:
    if col == 'sales_volume':
        # Plotting the target against itself only yields the identity line.
        continue
    fig = px.scatter(df, x=col, y='sales_volume', labels=sales_labels)
    fig.update_layout(title=f'Scatter Plot of Sales Volume vs {col}')
    fig.show()

# 4.3. Correlation matrix
# Restrict to numeric columns explicitly: since pandas 2.0, DataFrame.corr()
# no longer drops non-numeric columns silently and raises when the frame
# still contains object (string) columns.
correlation_matrix = df.select_dtypes(include=['number']).corr()
fig = px.imshow(correlation_matrix, text_auto=True, color_continuous_scale='RdBu_r', aspect='auto')
fig.update_layout(title='Correlation Matrix')
fig.show()

# 4.3.1. Most important variables for explaining the variability of sales_volume
# Order by absolute correlation so the strongest relationships come first
# (the first entry is sales_volume itself, with correlation 1; NaNs go last).
correlation_with_sales = correlation_matrix['sales_volume']
ranking = correlation_with_sales.abs().sort_values(ascending=False).index
correlation_with_sales = correlation_with_sales.reindex(ranking)
print('Correlation with sales_volume:')
print(correlation_with_sales)

# 4.3.2. Create dummy variables and compute the correlation matrix again
# NOTE(review): every object column is one-hot encoded here; a high-cardinality
# column (free text, identifiers) would make this matrix very wide -- confirm
# the categorical columns are low-cardinality before running.
df_dummies = pd.get_dummies(df, drop_first=True)
correlation_matrix_dummies = df_dummies.corr()
fig = px.imshow(correlation_matrix_dummies, text_auto=True, color_continuous_scale='RdBu_r', aspect='auto')
fig.update_layout(title='Correlation Matrix with Dummy Variables')
fig.show()

# Categorical variable with the highest correlation
# Only the columns that get_dummies added (those absent from df) are dummy
# variables. Matching on '_' would also pick up 'sales_volume' and any numeric
# column with an underscore, and taking the maximum over the full matrix would
# include each column's self-correlation of 1. Compare the dummies against
# sales_volume only.
dummy_cols = [name for name in df_dummies.columns if name not in df.columns]
dummy_corr_with_sales = correlation_matrix_dummies.loc[dummy_cols, 'sales_volume'].abs().dropna()
if dummy_corr_with_sales.empty:
    # No categorical columns, or no dummy with a defined correlation.
    max_corr_categorical = None
else:
    max_corr_categorical = dummy_corr_with_sales.idxmax()
print(f'Variable categórica con mayor correlación: {max_corr_categorical}')


ModuleNotFoundError: No module named 'plotly'

In [None]:
pip install pandas numpy plotly scipy statsmodels
