In [2]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder, StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import Ridge
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
import category_encoders as ce
# Load the raw e-commerce dataset.
# NOTE(review): this is an absolute, machine-specific path; adjust DATA_PATH
# when running the notebook on another machine.
DATA_PATH = "C:/Users/Owner/Desktop/diversified_ecommerce_dataset.csv"
data = pd.read_csv(DATA_PATH)

# Build the working DataFrame used by the following cells.
df = pd.DataFrame(data)

# Structural summary (columns, non-null counts, dtypes).
# DataFrame.info() writes its report itself and returns None, so wrapping it
# in print() would append a stray "None" line to the cell output.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000000 entries, 0 to 999999
Data columns (total 16 columns):
 #   Column              Non-Null Count    Dtype  
---  ------              --------------    -----  
 0   Product ID          1000000 non-null  object 
 1   Product Name        1000000 non-null  object 
 2   Category            1000000 non-null  object 
 3   Price               1000000 non-null  float64
 4   Discount            1000000 non-null  int64  
 5   Tax Rate            1000000 non-null  int64  
 6   Stock Level         1000000 non-null  int64  
 7   Supplier ID         1000000 non-null  object 
 8   Customer Age Group  1000000 non-null  object 
 9   Customer Location   1000000 non-null  object 
 10  Customer Gender     1000000 non-null  object 
 11  Shipping Cost       1000000 non-null  float64
 12  Shipping Method     1000000 non-null  object 
 13  Return Rate         1000000 non-null  float64
 14  Seasonality         1000000 non-null  object 
 15  Popularity Index

In [3]:
# Descriptive statistics
numeric_summary = df.describe()
print(numeric_summary)

                Price        Discount        Tax Rate     Stock Level  \
count  1000000.000000  1000000.000000  1000000.000000  1000000.000000   
mean      1005.120742       12.516955       10.002052      250.028536   
std        574.451223        8.539929        3.406026      144.676275   
min         10.000000        0.000000        5.000000        0.000000   
25%        507.860000        5.000000        8.000000      125.000000   
50%       1005.430000       15.000000       10.000000      250.000000   
75%       1502.310000       20.000000       12.000000      375.000000   
max       2000.000000       25.000000       15.000000      500.000000   

        Shipping Cost     Return Rate  Popularity Index  
count  1000000.000000  1000000.000000    1000000.000000  
mean        24.985224       10.492896         49.970211  
std         14.431730        5.484849         29.164875  
min          0.000000        1.000000          0.000000  
25%         12.490000        5.740000         25.000

In [4]:
# Number of missing values in each column (isna is the same check as isnull).
missing_per_column = df.isna().sum()
print(missing_per_column)

Product ID            0
Product Name          0
Category              0
Price                 0
Discount              0
Tax Rate              0
Stock Level           0
Supplier ID           0
Customer Age Group    0
Customer Location     0
Customer Gender       0
Shipping Cost         0
Shipping Method       0
Return Rate           0
Seasonality           0
Popularity Index      0
dtype: int64


In [5]:
# Drop the Product ID, Supplier ID and Product Name columns in one call.
columns_to_discard = ['Product ID', 'Supplier ID', 'Product Name']
df = df.drop(columns=columns_to_discard)

In [6]:
# Inspect the categorical variables
categorical_columns = [
    'Category',
    'Customer Age Group',
    'Customer Location',
    'Customer Gender',
    'Shipping Method',
    'Seasonality',
]
print("\n카테고리형 변수의 고유값:")
for col in categorical_columns:
    distinct_values = df[col].unique()
    print(f"{col}: {distinct_values}")


카테고리형 변수의 고유값:
Category: ['Apparel' 'Electronics' 'Footwear' 'Books' 'Home Appliances']
Customer Age Group: ['35-44' '25-34' '18-24' '55+' '45-54']
Customer Location: ['New York, USA' 'London, UK' 'Tokyo, Japan' 'Paris, France' 'Singapore'
 'Sydney, Australia' 'Phoenix, USA' 'Cape Town, South Africa'
 'Houston, USA' 'Toronto, Canada' 'Chicago, USA' 'Berlin, Germany'
 'Dubai, UAE' 'Mumbai, India' 'Los Angeles, USA']
Customer Gender: ['Male' 'Female' 'Non-Binary']
Shipping Method: ['Standard' 'Overnight' 'Express']
Seasonality: ['Yes' 'No']


In [7]:
# Names of the categorical variables
categorical_columns = (
    ['Category']
    + ['Customer Age Group', 'Customer Location', 'Customer Gender']
    + ['Shipping Method', 'Seasonality']
)

# Print the distinct values found in each of those columns
print("\n카테고리형 변수의 고유값:")
for col in categorical_columns:
    print(f"{col}: {df[col].unique()}")


카테고리형 변수의 고유값:
Category: ['Apparel' 'Electronics' 'Footwear' 'Books' 'Home Appliances']
Customer Age Group: ['35-44' '25-34' '18-24' '55+' '45-54']
Customer Location: ['New York, USA' 'London, UK' 'Tokyo, Japan' 'Paris, France' 'Singapore'
 'Sydney, Australia' 'Phoenix, USA' 'Cape Town, South Africa'
 'Houston, USA' 'Toronto, Canada' 'Chicago, USA' 'Berlin, Germany'
 'Dubai, UAE' 'Mumbai, India' 'Los Angeles, USA']
Customer Gender: ['Male' 'Female' 'Non-Binary']
Shipping Method: ['Standard' 'Overnight' 'Express']
Seasonality: ['Yes' 'No']


In [8]:
# 1. Label encoding: one dedicated encoder object per categorical column.
le_category = LabelEncoder()
le_location = LabelEncoder()
le_shipping = LabelEncoder()
le_seasonality = LabelEncoder()
le_age_group = LabelEncoder()

# (source column, encoder) pairs, listed in the order in which the encoded
# columns are appended to the frame.
label_encoding_plan = [
    ('Category', le_category),
    ('Customer Location', le_location),
    ('Shipping Method', le_shipping),
    ('Seasonality', le_seasonality),
    ('Customer Age Group', le_age_group),
]
for source_column, encoder in label_encoding_plan:
    df[f"{source_column} (Label Encoded)"] = encoder.fit_transform(df[source_column])

# 2. Binary encoding: Customer Gender (skipped with a message when the column
#    is not present in the frame).
if 'Customer Gender' not in df.columns:
    print("Customer Gender column is missing or already encoded.")
else:
    binary_encoder = ce.BinaryEncoder(cols=['Customer Gender'])
    df = binary_encoder.fit_transform(df)

In [9]:
# Remove the original string columns
original_string_columns = [
    'Category',
    'Customer Age Group',
    'Customer Location',
    'Shipping Method',
    'Seasonality',
]
df = df.drop(columns=original_string_columns)

In [10]:
# Show the dtype of every column currently in the frame.
column_dtypes = df.dtypes
print(column_dtypes)

Price                                 float64
Discount                                int64
Tax Rate                                int64
Stock Level                             int64
Customer Gender_0                       int64
Customer Gender_1                       int64
Shipping Cost                         float64
Return Rate                           float64
Popularity Index                        int64
Category (Label Encoded)                int64
Customer Location (Label Encoded)       int64
Shipping Method (Label Encoded)         int64
Seasonality (Label Encoded)             int64
Customer Age Group (Label Encoded)      int64
dtype: object


In [11]:
# Correlation matrix (numeric columns only)
numeric_columns = df.select_dtypes(include=[np.number]).columns
correlation_matrix = df.loc[:, numeric_columns].corr()


In [12]:
# Correlation of every variable with the popularity index, highest value first
popularity_column = correlation_matrix.loc[:, 'Popularity Index']
popularity_corr = popularity_column.sort_values(ascending=False)
print("\nPopularity Index와 다른 변수의 상관관계:", popularity_corr, sep="\n")



Popularity Index와 다른 변수의 상관관계:
Popularity Index                      1.000000
Return Rate                           0.001648
Customer Gender_1                     0.001251
Discount                              0.001250
Customer Age Group (Label Encoded)    0.001081
Category (Label Encoded)              0.000716
Tax Rate                              0.000660
Shipping Cost                         0.000519
Stock Level                           0.000303
Customer Gender_0                     0.000275
Shipping Method (Label Encoded)       0.000060
Seasonality (Label Encoded)           0.000042
Price                                -0.000233
Customer Location (Label Encoded)    -0.000696
Name: Popularity Index, dtype: float64


In [13]:
# Data scaling
def scale_columns(df, columns_to_scale, scaler_type='standard'):
    """Return a new frame in which the given columns are replaced by scaled ones.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame. It is not modified; a new frame is returned.
    columns_to_scale : list of str
        Columns to scale. They are removed from the result and re-added at the
        end under the name ``"<column>_<scaler_type>_scaled"``.
    scaler_type : {'standard', 'minmax'}, default 'standard'
        ``'standard'`` uses ``StandardScaler``; ``'minmax'`` uses ``MinMaxScaler``.

    Returns
    -------
    pandas.DataFrame
        The untouched columns followed by the scaled columns, carrying the same
        index as ``df``.

    Raises
    ------
    ValueError
        If ``scaler_type`` is neither ``'standard'`` nor ``'minmax'``.
    """
    if scaler_type == 'standard':
        scaler = StandardScaler()
    elif scaler_type == 'minmax':
        scaler = MinMaxScaler()
    else:
        raise ValueError("Unsupported scaler_type. Use 'standard' or 'minmax'.")

    scaled_data = scaler.fit_transform(df[columns_to_scale])

    # Reuse the caller's index: pd.concat(axis=1) aligns on index labels, so a
    # default RangeIndex here would pair scaled values with the wrong rows (and
    # add NaN-filled rows) whenever df has been filtered, shuffled or split.
    scaled_df = pd.DataFrame(
        scaled_data,
        columns=[f"{col}_{scaler_type}_scaled" for col in columns_to_scale],
        index=df.index,
    )

    remaining = df.drop(columns=columns_to_scale)  # drop the unscaled originals
    return pd.concat([remaining, scaled_df], axis=1)

In [14]:
# Apply scaling: every continuous feature is standard-scaled.
# (An earlier variant min-max scaled Discount, Tax Rate, Shipping Cost and
# Return Rate instead, via scaler_type='minmax'; it is currently not used.)
columns_standard = [
    'Price',
    'Stock Level',
    'Discount',
    'Tax Rate',
    'Shipping Cost',
    'Return Rate',
]
df = scale_columns(df, columns_standard, 'standard')

In [15]:
# Check the result of the scaling step on the first rows
scaled_preview = df.head()
print("\n스케일링 결과 데이터프레임:", scaled_preview, sep="\n")


스케일링 결과 데이터프레임:
   Customer Gender_0  Customer Gender_1  Popularity Index  \
0                  0                  1                56   
1                  1                  0                79   
2                  1                  1                40   
3                  1                  0                93   
4                  0                  1                56   

   Category (Label Encoded)  Customer Location (Label Encoded)  \
0                         0                                  8   
1                         2                                  5   
2                         3                                 13   
3                         1                                  9   
4                         2                                 13   

   Shipping Method (Label Encoded)  Seasonality (Label Encoded)  \
0                                2                            1   
1                                1                            0   
2                 

In [16]:
# Keep only the numeric data
numeric_frame = df.select_dtypes(include=['number'])
numeric_columns = numeric_frame.columns

# Compute the correlation coefficients
correlation_matrix = numeric_frame.corr()

# Extract the coefficients against 'Popularity Index', highest value first
popularity_correlation = (
    correlation_matrix['Popularity Index']
    .sort_values(ascending=False)
)

# Print the result
print("\n--- 'Popularity Index'와의 상관 계수 ---")
print(popularity_correlation)


--- 'Popularity Index'와의 상관 계수 ---
Popularity Index                      1.000000
Return Rate_standard_scaled           0.001648
Customer Gender_1                     0.001251
Discount_standard_scaled              0.001250
Customer Age Group (Label Encoded)    0.001081
Category (Label Encoded)              0.000716
Tax Rate_standard_scaled              0.000660
Shipping Cost_standard_scaled         0.000519
Stock Level_standard_scaled           0.000303
Customer Gender_0                     0.000275
Shipping Method (Label Encoded)       0.000060
Seasonality (Label Encoded)           0.000042
Price_standard_scaled                -0.000233
Customer Location (Label Encoded)    -0.000696
Name: Popularity Index, dtype: float64


In [17]:
# Modelling
# Separate the target column from the feature matrix
target_column = 'Popularity Index'
y = df[target_column]
X = df.drop(columns=[target_column])

In [18]:
# Split the data: 20% of the rows are held out for testing, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [19]:
# Initialise the RandomForest, Ridge and SVR models
models = {
    # n_jobs=-1 builds the trees on all CPU cores; with a fixed random_state
    # the fitted forest is the same as in a single-threaded run.
    "RandomForest": RandomForestRegressor(random_state=42, n_jobs=-1),
    "Ridge": Ridge(alpha=1.0, random_state=42),
    # NOTE: the fit time of kernel SVR grows more than quadratically with the
    # number of training samples, so it is costly on large training sets.
    "SVR": SVR(kernel='rbf')
}

In [20]:
# Dictionary that collects the evaluation metrics of each model
results = dict()

In [21]:
# Final check of the column dtypes before training.
dtypes_before_training = df.dtypes
print(dtypes_before_training)

Customer Gender_0                       int64
Customer Gender_1                       int64
Popularity Index                        int64
Category (Label Encoded)                int64
Customer Location (Label Encoded)       int64
Shipping Method (Label Encoded)         int64
Seasonality (Label Encoded)             int64
Customer Age Group (Label Encoded)      int64
Price_standard_scaled                 float64
Stock Level_standard_scaled           float64
Discount_standard_scaled              float64
Tax Rate_standard_scaled              float64
Shipping Cost_standard_scaled         float64
Return Rate_standard_scaled           float64
dtype: object


In [22]:
# Train and evaluate each model.
#
# Kernel SVR fit time grows more than quadratically with the number of training
# rows, so fitting it on the full training split does not finish in practical
# time (the recorded run of this cell ended in a KeyboardInterrupt). SVR is
# therefore fitted on a reproducible random subsample of at most
# SVR_MAX_TRAIN_ROWS rows; every other model uses the full training split.
SVR_MAX_TRAIN_ROWS = 10_000

for model_name, model in models.items():
    # Select the rows this model is fitted on.
    if isinstance(model, SVR) and len(X_train) > SVR_MAX_TRAIN_ROWS:
        # Positional sampling keeps X and y paired regardless of index labels.
        sampled_positions = np.random.default_rng(42).choice(
            len(X_train), size=SVR_MAX_TRAIN_ROWS, replace=False
        )
        X_fit = X_train.iloc[sampled_positions]
        y_fit = y_train.iloc[sampled_positions]
    else:
        X_fit, y_fit = X_train, y_train

    # Fit the model
    model.fit(X_fit, y_fit)

    # Predict; train metrics are measured on the rows the model was fitted on
    y_pred_train = model.predict(X_fit)
    y_pred_test = model.predict(X_test)

    # Evaluate
    train_rmse = np.sqrt(mean_squared_error(y_fit, y_pred_train))
    test_rmse = np.sqrt(mean_squared_error(y_test, y_pred_test))
    train_r2 = r2_score(y_fit, y_pred_train)
    test_r2 = r2_score(y_test, y_pred_test)

    # Store the results
    results[model_name] = {
        "Train RMSE": train_rmse,
        "Test RMSE": test_rmse,
        "Train R2": train_r2,
        "Test R2": test_r2
    }

KeyboardInterrupt: 