In [1]:
import torch
from torch.utils.data import Dataset
import numpy as np
from torch.utils.data import DataLoader
import torch.nn as nn
import pandas as pd
from tqdm import tqdm
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device

device(type='cpu')

In [63]:
import pandas as pd
train12_data = pd.read_csv('train.csv')
submit = pd.read_csv('sample_submission.csv')


In [64]:
import pandas as pd
import itertools

# Helper function to calculate consecutive zeros
def calculate_consecutive_zeros(s):
    return max((len(list(group)) for val, group in itertools.groupby(s) if val == 0), default=0)

# Function to create features for each product with additional features
def create_features_for_product_v2(product_id, product_data):
    product_features = []
    for i in range(len(product_data) - 28):
        window_data = product_data.iloc[i:i+28]
        features = {
            'ID': product_id,
            'date': window_data['date'].iloc[-1],
            '1_28_day_sales': window_data['sales'].sum(),
            '1_28_day_sales_change': window_data['sales'].iloc[-1] - window_data['sales'].iloc[0],
            '1_28_day_total_sales': window_data['sales'].sum(),
            '1_28_day_zero_count': (window_data['sales'] == 0).sum(),
            '1_28_day_max_consecutive_zeros': calculate_consecutive_zeros(window_data['sales']),
            '1_21_day_moving_avg': window_data['sales'].iloc[-21:].mean(),
            '1_21_day_moving_median': window_data['sales'].iloc[-21:].median(),
            '1_21_day_moving_std': window_data['sales'].iloc[-21:].std(),
            'day_of_week': window_data['date'].iloc[-1].dayofweek,
            'weekend': 1 if window_data['date'].iloc[-1].dayofweek in [5, 6] else 0
        }
        
        # Adding 1_day_ago_sales, 2_day_ago_sales, ..., 28_day_ago_sales features
        for j in range(1, 29):
            features[f'{j}_day_ago_sales'] = window_data['sales'].iloc[-j]
        
        # Adding 28_to_27_day_ago_sales_diff, 27_to_26_day_ago_sales_diff, ..., 2_to_1_day_ago_sales_diff features
        for j in range(28, 1, -1):
            features[f'{j}_to_{j-1}_day_ago_sales_diff'] = window_data['sales'].iloc[-j] - window_data['sales'].iloc[-(j-1)]
        
        # Adding targets (future 1-21 day sales)
        future_21_day_sales = product_data['sales'].iloc[i+28:i+49].values
        for j, target in enumerate(future_21_day_sales, start=1):
            features[f'target_{j}'] = target
        
        product_features.append(features)
    return product_features

# Function to create features for a single day (used for test set)
def create_features_for_single_day(product_data):
    window_data = product_data.iloc[-28:]
    features = {
        '1_28_day_sales': window_data['sales'].sum(),
        '1_28_day_sales_change': window_data['sales'].iloc[-1] - window_data['sales'].iloc[0],
        '1_28_day_total_sales': window_data['sales'].sum(),
        '1_28_day_zero_count': (window_data['sales'] == 0).sum(),
        '1_28_day_max_consecutive_zeros': calculate_consecutive_zeros(window_data['sales']),
        '1_21_day_moving_avg': window_data['sales'].iloc[-21:].mean(),
        '1_21_day_moving_median': window_data['sales'].iloc[-21:].median(),
        '1_21_day_moving_std': window_data['sales'].iloc[-21:].std(),
        'day_of_week': window_data['date'].iloc[-1].dayofweek,
        'weekend': 1 if window_data['date'].iloc[-1].dayofweek in [5, 6] else 0
    }

    # Adding 1_day_ago_sales, 2_day_ago_sales, ..., 28_day_ago_sales features
    for j in range(1, 29):
        features[f'{j}_day_ago_sales'] = window_data['sales'].iloc[-j]

    # Adding 28_to_27_day_ago_sales_diff, 27_to_26_day_ago_sales_diff, ..., 2_to_1_day_ago_sales_diff features
    for j in range(28, 1, -1):
        features[f'{j}_to_{j-1}_day_ago_sales_diff'] = window_data['sales'].iloc[-j] - window_data['sales'].iloc[-(j-1)]

    return features

def prepare_train_test_data(train_data):
    # Melting the sales data to have one row per day per product
    melted_sales_data = train_data.melt(id_vars=['ID', '제품', '대분류', '중분류', '소분류', '브랜드'], 
                                        var_name='date', value_name='sales')
    melted_sales_data['date'] = pd.to_datetime(melted_sales_data['date'])

    # List to collect product features for training
    all_product_features = []

    # Applying the feature creation function to each product in the melted data (excluding 2023-04-04)
    for product_id, product_data in melted_sales_data[melted_sales_data['date'] < '2023-04-04'].groupby('ID'):
        product_features = create_features_for_product_v2(product_id, product_data)
        all_product_features.extend(product_features)

    # Convert the list of dictionaries to a DataFrame for training
    train_df = pd.DataFrame(all_product_features)

    # Merging with the original data to get the additional product information for training
    train_df = train_df.merge(train_data[['ID', '제품', '대분류', '중분류', '소분류', '브랜드']].drop_duplicates(), on='ID', how='left')

    # Create test set for 2023-04-04
    test_features = []
    for product_id, product_data in melted_sales_data.groupby('ID'):
        # Selecting the last 28 days including 2023-04-04
        window_data = product_data.iloc[-28:]
        single_day_features = create_features_for_single_day(window_data)
        single_day_features['ID'] = product_id
        single_day_features['date'] = '2023-04-04'
        test_features.append(single_day_features)

    # Convert the list of dictionaries to a DataFrame for testing
    test_df = pd.DataFrame(test_features)

    # Merging with the original data to get the additional product information for testing
    test_df = test_df.merge(train_data[['ID', '제품', '대분류', '중분류', '소분류', '브랜드']].drop_duplicates(), on='ID', how='left')

    return train_df, test_df

# Prepare the training and test data
train_df, test_df = prepare_train_test_data(train12_data)

# Displaying the first few rows of the training and test data
train_df.head(), test_df.head()


MemoryError: 

In [None]:
train_df = train_df.dropna()
train_df

Unnamed: 0,ID,date,1_28_day_sales,1_28_day_sales_change,1_28_day_total_sales,1_28_day_zero_count,1_28_day_max_consecutive_zeros,1_21_day_moving_avg,1_21_day_moving_median,1_21_day_moving_std,...,target_17,target_18,target_19,target_20,target_21,제품,대분류,중분류,소분류,브랜드
0,1,2022-01-28,1,1,1,27,27,0.047619,0.0,0.218218,...,0.0,0.0,0.0,0.0,0.0,B002-00002-00001,B002-C001-0003,B002-C002-0008,B002-C003-0044,B002-00002
1,1,2022-01-29,2,1,2,26,26,0.095238,0.0,0.300793,...,0.0,0.0,0.0,0.0,0.0,B002-00002-00001,B002-C001-0003,B002-C002-0008,B002-C003-0044,B002-00002
2,1,2022-01-30,3,1,3,25,25,0.142857,0.0,0.358569,...,0.0,0.0,0.0,0.0,0.0,B002-00002-00001,B002-C001-0003,B002-C002-0008,B002-C003-0044,B002-00002
3,1,2022-01-31,3,0,3,25,24,0.142857,0.0,0.358569,...,0.0,0.0,0.0,0.0,0.0,B002-00002-00001,B002-C001-0003,B002-C002-0008,B002-C003-0044,B002-00002
4,1,2022-02-01,3,0,3,25,23,0.142857,0.0,0.358569,...,0.0,0.0,0.0,0.0,0.0,B002-00002-00001,B002-C001-0003,B002-C002-0008,B002-C003-0044,B002-00002
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
835,2,2023-03-09,0,0,0,28,28,0.000000,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.0,B002-00002-00002,B002-C001-0003,B002-C002-0008,B002-C003-0044,B002-00002
836,2,2023-03-10,0,0,0,28,28,0.000000,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.0,B002-00002-00002,B002-C001-0003,B002-C002-0008,B002-C003-0044,B002-00002
837,2,2023-03-11,0,0,0,28,28,0.000000,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.0,B002-00002-00002,B002-C001-0003,B002-C002-0008,B002-C003-0044,B002-00002
838,2,2023-03-12,0,0,0,28,28,0.000000,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.0,B002-00002-00002,B002-C001-0003,B002-C002-0008,B002-C003-0044,B002-00002


In [None]:
train_df.drop(['제품','대분류','중분류','소분류','브랜드'],axis=1,inplace=True)
test_df.drop(['제품','대분류','중분류','소분류','브랜드'],axis=1,inplace=True)

In [None]:
from tqdm.notebook import tqdm
import lightgbm as lgb
from sklearn.metrics import mean_squared_error

# Function to train and predict for a single target column
def train_predict_single_target(train_features, train_target, test_features):
    # Train-test split
    dtrain = lgb.Dataset(train_features, label=train_target)
    params = {
        'objective': 'regression',
        'metric': 'mse',
        'verbosity': -1,
    }
    model = lgb.train(params, dtrain, 100)
    predictions = model.predict(test_features)
    return predictions

# Selecting the features for training
train_features = train_df.drop(['ID', 'date'] + [f'target_{i}' for i in range(1, 22)], axis=1)
test_features = test_df.drop(['ID', 'date'], axis=1)

# Predictions DataFrame to collect all predictions
predictions_df = pd.DataFrame()
predictions_df['ID'] = test_df['ID']

# Training and predicting for each future day (1 to 21)
for i in tqdm(range(1, 22), desc='Predicting for future days'):
    target_column = f'target_{i}'
    train_target = train_df[target_column]
    predictions = train_predict_single_target(train_features, train_target, test_features)
    predictions_df[f'2023-04-{i+4:02d}'] = predictions

# Merging with the sample submission to ensure correct format
final_submission = submit[['ID']].merge(predictions_df, on='ID', how='left')
final_submission.head()


In [None]:
# Replacing negative values with 0 and rounding all values
final_submission.iloc[:, 1:] = final_submission.iloc[:, 1:].applymap(lambda x: round(max(0, x)))

# Merging with the sample submission to ensure correct format
final_submission = submit[['ID']].merge(predictions_df, on='ID', how='left')
final_submission.head()


In [None]:
final_submission.to_csv('ML_test1.csv',index=False)

In [61]:
import lightgbm as lgb
from sklearn.metrics import mean_squared_error

# Function to train and predict for a single target column
def train_predict_single_target(train_features, train_target, test_features):
    # Train-test split
    dtrain = lgb.Dataset(train_features, label=train_target)
    params = {
        'objective': 'regression',
        'metric': 'mse',
        'verbosity': -1,
    }
    model = lgb.train(params, dtrain, 100)
    predictions = model.predict(test_features)
    return predictions

# Selecting the features for training
train_features = train_df.drop(['ID', 'date'] + [f'target_{i}' for i in range(1, 22)], axis=1)
test_features = test_df.drop(['ID', 'date'], axis=1)

# Predictions DataFrame to collect all predictions
predictions_df = pd.DataFrame()
predictions_df['ID'] = test_df['ID']

# Training and predicting for each future day (1 to 21)
for i in range(1, 22):
    target_column = f'target_{i}'
    train_target = train_df[target_column]
    predictions = train_predict_single_target(train_features, train_target, test_features)
    predictions_df[f'2023-04-{i+4:02d}'] = predictions

# Merging with the sample submission to ensure correct format
final_submission = submit[['ID']].merge(predictions_df, on='ID', how='left')
final_submission.head()


Unnamed: 0,ID,2023-04-05,2023-04-06,2023-04-07,2023-04-08,2023-04-09,2023-04-10,2023-04-11,2023-04-12,2023-04-13,...,2023-04-16,2023-04-17,2023-04-18,2023-04-19,2023-04-20,2023-04-21,2023-04-22,2023-04-23,2023-04-24,2023-04-25
0,0,,,,,,,,,,...,,,,,,,,,,
1,1,0.148702,0.275333,0.917341,0.438457,1.341073,-0.046835,0.961902,1.023224,0.814738,...,0.36081,-0.260019,-0.355058,0.872212,0.168857,1.375324,1.337829,1.18919,0.465059,-0.539207
2,2,0.310192,0.413102,0.541925,0.445826,0.444468,0.31134,0.544407,0.93503,1.258861,...,0.922013,0.688389,1.08784,1.414225,1.595011,1.312915,1.156542,1.03523,0.839387,1.221253
3,3,,,,,,,,,,...,,,,,,,,,,
4,4,,,,,,,,,,...,,,,,,,,,,


In [29]:
import pandas as pd
import itertools

# Helper function to calculate consecutive zeros
def calculate_consecutive_zeros(s):
    return max((len(list(group)) for val, group in itertools.groupby(s) if val == 0), default=0)

# Function to create features for each product with additional features
def create_features_for_product_v2(product_id, product_data):
    product_features = []
    for i in range(len(product_data) - 28):
        window_data = product_data.iloc[i:i+28]
        features = {
            'ID': product_id,
            'date': window_data['date'].iloc[-1],
            '1_28_day_sales': window_data['sales'].sum(),
            '1_28_day_sales_change': window_data['sales'].iloc[-1] - window_data['sales'].iloc[0],
            '1_28_day_total_sales': window_data['sales'].sum(),
            '1_28_day_zero_count': (window_data['sales'] == 0).sum(),
            '1_28_day_max_consecutive_zeros': calculate_consecutive_zeros(window_data['sales']),
            '1_21_day_moving_avg': window_data['sales'].iloc[-21:].mean(),
            '1_21_day_moving_median': window_data['sales'].iloc[-21:].median(),
            '1_21_day_moving_std': window_data['sales'].iloc[-21:].std(),
            'day_of_week': window_data['date'].iloc[-1].dayofweek,
            'weekend': 1 if window_data['date'].iloc[-1].dayofweek in [5, 6] else 0
        }
        
        # Adding 1_day_ago_sales, 2_day_ago_sales, ..., 28_day_ago_sales features
        for j in range(1, 29):
            features[f'{j}_day_ago_sales'] = window_data['sales'].iloc[-j]
        
        # Adding 28_to_27_day_ago_sales_diff, 27_to_26_day_ago_sales_diff, ..., 2_to_1_day_ago_sales_diff features
        for j in range(28, 1, -1):
            features[f'{j}_to_{j-1}_day_ago_sales_diff'] = window_data['sales'].iloc[-j] - window_data['sales'].iloc[-(j-1)]
        
        # Adding targets (future 1-21 day sales)
        future_21_day_sales = product_data['sales'].iloc[i+28:i+49].values
        for j, target in enumerate(future_21_day_sales, start=1):
            features[f'target_{j}'] = target
        
        product_features.append(features)
    return product_features

# Melting the sales data to have one row per day per product
melted_sales_data12 = train12_data.melt(id_vars=['ID', '제품', '대분류', '중분류', '소분류', '브랜드'], 
                                        var_name='date', value_name='sales')
melted_sales_data12['date'] = pd.to_datetime(melted_sales_data12['date'])

# Creating a DataFrame to hold the final features
final_features_columns12 = ['ID', 'date', '1_28_day_sales', '1_28_day_sales_change', '1_28_day_total_sales', 
                          '1_28_day_zero_count', '1_28_day_max_consecutive_zeros', '1_21_day_moving_avg', 
                          '1_21_day_moving_median', '1_21_day_moving_std', 'day_of_week', 'weekend'] + [f'target_{i}' for i in range(1, 22)]
final_features_df12 = pd.DataFrame(columns=final_features_columns12)

# List to collect product features
all_product_features12_v2 = []

# Applying the feature creation function to each product in the melted data
for product_id, product_data in melted_sales_data12.groupby('ID'):
    product_features = create_features_for_product_v2(product_id, product_data)
    all_product_features12_v2.extend(product_features)

# Convert the list of dictionaries to a DataFrame
final_features_df12_v2 = pd.DataFrame(all_product_features12_v2)

# Merging with the original data to get the additional product information
final_features_df12_v2 = final_features_df12_v2.merge(train12_data[['ID', '제품', '대분류', '중분류', '소분류', '브랜드']].drop_duplicates(), on='ID', how='left')

# Displaying the first few rows
final_features_df12_v2.head()

Unnamed: 0,ID,date,1_28_day_sales,1_28_day_sales_change,1_28_day_total_sales,1_28_day_zero_count,1_28_day_max_consecutive_zeros,1_21_day_moving_avg,1_21_day_moving_median,1_21_day_moving_std,...,target_17,target_18,target_19,target_20,target_21,제품,대분류,중분류,소분류,브랜드
0,1,2022-01-28,1,1,1,27,27,0.047619,0.0,0.218218,...,0.0,0.0,0.0,0.0,0.0,B002-00002-00001,B002-C001-0003,B002-C002-0008,B002-C003-0044,B002-00002
1,1,2022-01-29,2,1,2,26,26,0.095238,0.0,0.300793,...,0.0,0.0,0.0,0.0,0.0,B002-00002-00001,B002-C001-0003,B002-C002-0008,B002-C003-0044,B002-00002
2,1,2022-01-30,3,1,3,25,25,0.142857,0.0,0.358569,...,0.0,0.0,0.0,0.0,0.0,B002-00002-00001,B002-C001-0003,B002-C002-0008,B002-C003-0044,B002-00002
3,1,2022-01-31,3,0,3,25,24,0.142857,0.0,0.358569,...,0.0,0.0,0.0,0.0,0.0,B002-00002-00001,B002-C001-0003,B002-C002-0008,B002-C003-0044,B002-00002
4,1,2022-02-01,3,0,3,25,23,0.142857,0.0,0.358569,...,0.0,0.0,0.0,0.0,0.0,B002-00002-00001,B002-C001-0003,B002-C002-0008,B002-C003-0044,B002-00002


In [30]:
# Function to create features for a single day (used for test set)
def create_features_for_single_day(product_data):
    window_data = product_data.iloc[-28:]
    features = {
        '1_28_day_sales': window_data['sales'].sum(),
        '1_28_day_sales_change': window_data['sales'].iloc[-1] - window_data['sales'].iloc[0],
        '1_28_day_total_sales': window_data['sales'].sum(),
        '1_28_day_zero_count': (window_data['sales'] == 0).sum(),
        '1_28_day_max_consecutive_zeros': calculate_consecutive_zeros(window_data['sales']),
        '1_21_day_moving_avg': window_data['sales'].iloc[-21:].mean(),
        '1_21_day_moving_median': window_data['sales'].iloc[-21:].median(),
        '1_21_day_moving_std': window_data['sales'].iloc[-21:].std(),
        'day_of_week': window_data['date'].iloc[-1].dayofweek,
        'weekend': 1 if window_data['date'].iloc[-1].dayofweek in [5, 6] else 0
    }

    # Adding 1_day_ago_sales, 2_day_ago_sales, ..., 28_day_ago_sales features
    for j in range(1, 29):
        features[f'{j}_day_ago_sales'] = window_data['sales'].iloc[-j]

    # Adding 28_to_27_day_ago_sales_diff, 27_to_26_day_ago_sales_diff, ..., 2_to_1_day_ago_sales_diff features
    for j in range(28, 1, -1):
        features[f'{j}_to_{j-1}_day_ago_sales_diff'] = window_data['sales'].iloc[-j] - window_data['sales'].iloc[-(j-1)]

    return features

# Create test set for 2023-04-04
test_features_2023_04_04 = []
for product_id, product_data in melted_sales_data12[melted_sales_data12['date'] <= '2023-04-04'].groupby('ID'):
    single_day_features = create_features_for_single_day(product_data)
    single_day_features['ID'] = product_id
    single_day_features['date'] = '2023-04-04'
    test_features_2023_04_04.append(single_day_features)

test_df_2023_04_04 = pd.DataFrame(test_features_2023_04_04)

# Merging with the original data to get the additional product information
test_df_2023_04_04 = test_df_2023_04_04.merge(train12_data[['ID', '제품', '대분류', '중분류', '소분류', '브랜드']].drop_duplicates(), on='ID', how='left')

# Create training set excluding 2023-04-04
train_df_excluding_2023_04_04 = final_features_df12_v2[final_features_df12_v2['date'] < '2023-04-04']

# Displaying the first few rows of the test set
test_df_2023_04_04.head()


Unnamed: 0,1_28_day_sales,1_28_day_sales_change,1_28_day_total_sales,1_28_day_zero_count,1_28_day_max_consecutive_zeros,1_21_day_moving_avg,1_21_day_moving_median,1_21_day_moving_std,day_of_week,weekend,...,4_to_3_day_ago_sales_diff,3_to_2_day_ago_sales_diff,2_to_1_day_ago_sales_diff,ID,date,제품,대분류,중분류,소분류,브랜드
0,8,0,8,24,21,0.380952,0.0,0.86465,1,0,...,0,-2,2,1,2023-04-04,B002-00002-00001,B002-C001-0003,B002-C002-0008,B002-C003-0044,B002-00002
1,0,0,0,28,28,0.0,0.0,0.0,1,0,...,0,0,0,2,2023-04-04,B002-00002-00002,B002-C001-0003,B002-C002-0008,B002-C003-0044,B002-00002


In [19]:
future_predictions_by_id

{0: array([0.21332073, 0.12340939, 0.20749056, 0.2702606 , 0.24365175,
        0.28887573, 0.20540333, 0.3280228 , 0.35516036, 0.30956155,
        0.41412935, 0.36331022, 0.35734424, 0.35320085, 0.37792277,
        0.4274581 , 0.40100732, 0.4611228 , 0.4885825 , 0.5402522 ,
        0.5643487 ], dtype=float32),
 1: array([1.039295  , 0.87748766, 0.88476104, 0.7042916 , 0.7000917 ,
        0.52954316, 0.63303256, 0.4686969 , 0.41959673, 0.60071135,
        0.70014846, 0.5271128 , 0.48363966, 0.70143735, 0.77308875,
        0.87690175, 0.9548194 , 1.0957456 , 0.9660872 , 0.9085319 ,
        0.9497354 ], dtype=float32),
 2: array([0.22093344, 0.3963951 , 0.5567422 , 0.7317163 , 0.79051447,
        0.8294132 , 0.79702616, 0.7869375 , 0.76207864, 0.78718865,
        0.87680125, 0.9199629 , 0.99168885, 1.0344228 , 1.0642807 ,
        1.0785923 , 1.1180254 , 1.135438  , 1.130678  , 1.1287845 ,
        1.140805  ], dtype=float32),
 3: array([0.5189915, 0.8967273, 1.3034382, 1.5406865, 1.6212593

In [13]:
# Filling the submission DataFrame with the predicted values for 21 days (converted to float64)
for id_val, predictions in future_predictions_by_id.items():
    rounded_predictions = np.round(predictions).astype(np.float64)
    submit.loc[submit['ID'] == id_val, '2023-04-05':'2023-04-25'] = rounded_predictions.astype(np.float64)

# Displaying the first few rows of the filled submission file
submit.head(20)

Unnamed: 0,ID,2023-04-05,2023-04-06,2023-04-07,2023-04-08,2023-04-09,2023-04-10,2023-04-11,2023-04-12,2023-04-13,...,2023-04-16,2023-04-17,2023-04-18,2023-04-19,2023-04-20,2023-04-21,2023-04-22,2023-04-23,2023-04-24,2023-04-25
0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
2,2,0.0,1.0,1.0,1.0,1.0,2.0,1.0,1.0,2.0,...,2.0,2.0,2.0,2.0,1.0,1.0,1.0,1.0,1.0,1.0
3,3,1.0,2.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,...,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0
4,4,1.0,1.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,...,1.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0
5,5,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
6,6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
7,7,1.0,2.0,3.0,3.0,4.0,4.0,4.0,4.0,4.0,...,5.0,5.0,5.0,6.0,6.0,6.0,7.0,7.0,7.0,8.0
8,8,1.0,1.0,1.0,0.0,1.0,1.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,9,1.0,1.0,1.0,2.0,2.0,2.0,2.0,3.0,3.0,...,3.0,4.0,4.0,4.0,4.0,5.0,5.0,5.0,5.0,5.0


In [14]:
submit.iloc[:, 1:] = submit.iloc[:, 1:].applymap(lambda x: 0 if x < 0 else x)
submit

Unnamed: 0,ID,2023-04-05,2023-04-06,2023-04-07,2023-04-08,2023-04-09,2023-04-10,2023-04-11,2023-04-12,2023-04-13,...,2023-04-16,2023-04-17,2023-04-18,2023-04-19,2023-04-20,2023-04-21,2023-04-22,2023-04-23,2023-04-24,2023-04-25
0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
2,2,0.0,1.0,1.0,1.0,1.0,2.0,1.0,1.0,2.0,...,2.0,2.0,2.0,2.0,1.0,1.0,1.0,1.0,1.0,1.0
3,3,1.0,2.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,...,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0
4,4,1.0,1.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,...,1.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15885,15885,4.0,8.0,11.0,12.0,12.0,12.0,12.0,12.0,13.0,...,13.0,13.0,12.0,12.0,12.0,12.0,12.0,12.0,11.0,11.0
15886,15886,3.0,4.0,4.0,4.0,5.0,5.0,4.0,4.0,4.0,...,4.0,4.0,4.0,4.0,4.0,5.0,5.0,4.0,5.0,5.0
15887,15887,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
15888,15888,2.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,2.0,1.0,1.0,1.0,1.0,1.0


In [15]:
submit = submit.fillna(0)

In [16]:
submit.to_csv('submit_v1_ws7_ks3_fillna0.csv',index=False)
submit

Unnamed: 0,ID,2023-04-05,2023-04-06,2023-04-07,2023-04-08,2023-04-09,2023-04-10,2023-04-11,2023-04-12,2023-04-13,...,2023-04-16,2023-04-17,2023-04-18,2023-04-19,2023-04-20,2023-04-21,2023-04-22,2023-04-23,2023-04-24,2023-04-25
0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
2,2,0.0,1.0,1.0,1.0,1.0,2.0,1.0,1.0,2.0,...,2.0,2.0,2.0,2.0,1.0,1.0,1.0,1.0,1.0,1.0
3,3,1.0,2.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,...,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0
4,4,1.0,1.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,...,1.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15885,15885,4.0,8.0,11.0,12.0,12.0,12.0,12.0,12.0,13.0,...,13.0,13.0,12.0,12.0,12.0,12.0,12.0,12.0,11.0,11.0
15886,15886,3.0,4.0,4.0,4.0,5.0,5.0,4.0,4.0,4.0,...,4.0,4.0,4.0,4.0,4.0,5.0,5.0,4.0,5.0,5.0
15887,15887,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
15888,15888,2.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,2.0,1.0,1.0,1.0,1.0,1.0


In [16]:
import json

# File path
file_path = "loss_history.txt"

# Open the file for writing
with open(file_path, 'w') as file:
    # Iterate through the loss_history dictionary
    for id_val, loss_list in loss_history.items():
        # Write the ID and corresponding loss values
        file.write(f"ID: {id_val}, Losses: {', '.join(map(str, loss_list))}\n")
        # Add 5 spaces as a separator
        file.write('     \n')

print(f"Loss history saved to {file_path}")


Loss history saved to loss_history.txt


In [9]:
# Load the loss_history dictionary from a JSON file
with open(file_path, 'r') as file:
    loaded_loss_history = json.load(file)


JSONDecodeError: Expecting value: line 1 column 1 (char 0)

# PSFA

In [None]:
import pandas as pd
train_df = pd.read_csv('train.csv')
submit = pd.read_csv('sample_submission.csv')

# Class Definitions

class moving_avg(torch.nn.Module):
    def __init__(self, kernel_size, stride):
        super(moving_avg, self).__init__()
        self.kernel_size = kernel_size
        self.avg = torch.nn.AvgPool1d(kernel_size=kernel_size, stride=stride, padding=0)

    def forward(self, x):
        front = x[:, 0:1, :].repeat(1, (self.kernel_size - 1) // 2, 1)
        end = x[:, -1:, :].repeat(1, (self.kernel_size - 1) // 2, 1)
        x = torch.cat([front, x, end], dim=1)
        x = self.avg(x.permute(0, 2, 1))
        x = x.permute(0, 2, 1)
        return x

class series_decomp(torch.nn.Module):
    def __init__(self, kernel_size):
        super(series_decomp, self).__init__()
        self.moving_avg = moving_avg(kernel_size, stride=1)

    def forward(self, x):
        moving_mean = self.moving_avg(x)
        residual = x - moving_mean
        return moving_mean, residual

class LTSF_DLinear(torch.nn.Module):
    def __init__(self, window_size, forecast_size, kernel_size, individual, feature_size):
        super(LTSF_DLinear, self).__init__()
        self.window_size = window_size
        self.forecast_size = forecast_size
        self.decomposition = series_decomp(kernel_size)
        self.individual = individual
        self.channels = feature_size
        if self.individual:
            self.Linear_Seasonal = torch.nn.ModuleList()
            self.Linear_Trend = torch.nn.ModuleList()
            for i in range(self.channels):
                self.Linear_Trend.append(torch.nn.Linear(self.window_size, self.forecast_size))
                self.Linear_Trend[i].weight = torch.nn.Parameter((1 / self.window_size) * torch.ones([self.forecast_size, self.window_size]))
                self.Linear_Seasonal.append(torch.nn.Linear(self.window_size, self.forecast_size))
                self.Linear_Seasonal[i].weight = torch.nn.Parameter((1 / self.window_size) * torch.ones([self.forecast_size, self.window_size]))
        else:
            self.Linear_Trend = torch.nn.Linear(self.window_size, self.forecast_size)
            self.Linear_Trend.weight = torch.nn.Parameter((1 / self.window_size) * torch.ones([self.forecast_size, self.window_size]))
            self.Linear_Seasonal = torch.nn.Linear(self.window_size, self.forecast_size)
            self.Linear_Seasonal.weight = torch.nn.Parameter((1 / self.window_size) * torch.ones([self.forecast_size, self.window_size]))

    def forward(self, x):
        trend_init, seasonal_init = self.decomposition(x)
        trend_init, seasonal_init = trend_init.permute(0, 2, 1), seasonal_init.permute(0, 2, 1)
        if self.individual:
            trend_output = torch.zeros([trend_init.size(0), trend_init.size(1), self.forecast_size], dtype=trend_init.dtype).to(trend_init.device)
            seasonal_output = torch.zeros([seasonal_init.size(0), seasonal_init.size(1), self.forecast_size], dtype=seasonal_init.dtype).to(seasonal_init.device)
            for idx in range(self.channels):
                trend_output[:, idx, :] = self.Linear_Trend[idx](trend_init[:, idx, :])
                seasonal_output[:, idx, :] = self.Linear_Seasonal[idx](seasonal_init[:, idx, :])
        else:
            trend_output = self.Linear_Trend(trend_init)
            seasonal_output = self.Linear_Seasonal(seasonal_init)
        x = seasonal_output + trend_output

        ################
        #x = torch.relu(x) # 음수 값 제거
        ################
        return x.permute(0, 2, 1)

class Data(Dataset):
    def __init__(self, X, Y):
        self.X = X
        self.Y = Y

    def __len__(self):
        return len(self.Y)
    
    def __getitem__(self, idx):
        return self.X[idx], self.Y[idx]
# Function to reshape the data into a time series format for each ID
def reshape_data(df):
    time_series_data = []
    for idx, row in df.iterrows():
        sales_data = row[6:].values.astype(float)
        time_series_data.append(sales_data)
    return np.array(time_series_data)

# Modified time_slide_df function to work with the current data format
def time_slide_df(data, window_size, forecast_size):
    data_list = []
    dap_list = []
    for idx in range(0, len(data) - window_size - forecast_size + 1):
        x = data[idx:idx + window_size].reshape(window_size, 1)
        y = data[idx + window_size:idx + window_size + forecast_size]
        data_list.append(x)
        dap_list.append(y)
    return np.array(data_list, dtype='float32'), np.array(dap_list, dtype='float32')

# Function to create DataLoader for each ID
def create_dataloader(data, window_size, forecast_size, batch_size):
    X, Y = time_slide_df(data, window_size, forecast_size)
    ds = Data(X, Y)
    return DataLoader(ds, batch_size=batch_size, shuffle=True)

# Reshape the data
time_series_data = reshape_data(train_df)

# Define the window size, forecast size, and batch size
window_size = 105   # Considering the last 30 days for prediction
forecast_size = 21 # Predicting the next 21 days
batch_size = 128
epoch_count = 777
lr = 0.001
min_delta = 0.001
patience = 10

future_predictions_by_id = {}
loss_history = {}

# Iterate through the data by ID
for idx, (id_val, data) in tqdm(enumerate(zip(train_df["ID"], time_series_data)), total=len(train_df["ID"])):
    # Standardizing the data
    mean_ = np.mean(data)
    std_ = np.std(data)
    standardized_data = (data - mean_) / std_
    individual_loss_history = []
    # Create DataLoader
    train_dl = create_dataloader(standardized_data, window_size, forecast_size, batch_size)
    best_loss = float('inf')
    no_improvement_count = 0

    # Training the model
    DLinear_model = LTSF_DLinear(window_size=window_size, forecast_size=forecast_size, kernel_size=15, individual=False, feature_size=1)
    criterion = nn.MSELoss()
    optimizer = torch.optim.Adam(DLinear_model.parameters(), lr=lr)
    for epoch in range(1, epoch_count + 1):
        loss_list = []
        DLinear_model.train()
        for batch_idx, (data, target) in enumerate(train_dl):
            optimizer.zero_grad()
            output = DLinear_model(data)
            loss = criterion(output, target.unsqueeze(-1))
            loss.backward()
            optimizer.step()
            loss_list.append(loss.item())
        if((epoch % 10) == 0):
            avg_loss = np.mean(loss_list)
            print(f"Id {idx}, Epoch {epoch}: Loss = {avg_loss}")
            individual_loss_history.append(avg_loss)
            if avg_loss + min_delta < best_loss:
                best_loss = avg_loss
                no_improvement_count = 0
            else:
                no_improvement_count += 1
                if no_improvement_count >= patience:
                    print(f"Early stopping at epoch {epoch} for ID {id_val}")
                    break


    loss_history[id_val] = individual_loss_history

    # Predicting the future 15 days using the last window of data
    last_window_data = torch.tensor(standardized_data[-window_size:]).unsqueeze(0).unsqueeze(-1).float()
    future_prediction = DLinear_model(last_window_data)

    # Converting the prediction back to the original scale
    future_prediction = future_prediction.squeeze().detach().numpy() * std_ + mean_

    # Store the prediction
    future_predictions_by_id[id_val] = future_prediction

# Future predictions for each ID from 2023-04-05 to 2023-04-25
# Filling the submission DataFrame with the predicted values for 21 days (converted to float64)
for id_val, predictions in future_predictions_by_id.items():
    rounded_predictions = np.round(predictions).astype(np.float64)
    submit.loc[submit['ID'] == id_val, '2023-04-05':'2023-04-25'] = rounded_predictions.astype(np.float64)

submit.iloc[:, 1:] = submit.iloc[:, 1:].applymap(lambda x: 0 if x < 0 else x)
submit.to_csv('submit_v1_ws105_ks15.csv',index=False)
submit

In [None]:
from statsmodels.graphics.tsaplots import plot_acf
import matplotlib.pyplot as plt
best_window_sizes = {}

# Iterate through the data by ID
for idx, (id_val, data) in enumerate(zip(train_df["ID"], time_series_data)):
    # Analyze the autocorrelation function
    plot_acf(data)
    plt.show()

    # Set the window size based on the analysis
    best_window_size = int(input(f"Enter the best window size for ID {id_val}: "))
    best_window_sizes[id_val] = best_window_size
