In [4]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import math
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense


KeyboardInterrupt: 

In [None]:
####### Variant with leading zeros removed. The variant without removal is in the 3rd cell.
# Function to split a series into sliding-window inputs (X) and next-step targets (Y)
def create_dataset(dataset, look_back=1):
    """Build sliding-window inputs and next-step targets from a sequence.

    Parameters
    ----------
    dataset : sequence or numpy array
        Ordered observations; anything that supports ``len`` and slicing.
    look_back : int, default 1
        Number of consecutive observations in each input window.

    Returns
    -------
    tuple of numpy.ndarray
        ``(X, Y)`` where ``X[k]`` is ``dataset[k:k + look_back]`` and ``Y[k]``
        is the observation immediately after that window. When the sequence
        has ``look_back`` entries or fewer, both arrays are empty.
    """
    n_windows = len(dataset) - look_back
    windows = [dataset[start:start + look_back] for start in range(n_windows)]
    targets = [dataset[start + look_back] for start in range(n_windows)]
    return np.array(windows), np.array(targets)

# Function to train the LSTM model and make forecasts
def forecast_next_values_lstm(data_series, look_back=1, row_label=None):
    """Train a one-step-ahead LSTM on a series and score it on a hold-out tail.

    The first 80% of the series is the training partition and the rest is the
    test partition. A small LSTM is fitted on sliding windows of the training
    partition, the test partition is predicted, a plot of actual vs. predicted
    values is shown, and the test RMSE is reported.

    Parameters
    ----------
    data_series : array-like
        One-dimensional sequence of numeric observations.
    look_back : int, default 1
        Number of past observations fed to the model for each prediction.
    row_label : optional
        Label used in the plot title and the printed RMSE line. When omitted
        the label is read from the notebook globals as ``df.iloc[i, 0]``,
        which is what this function did before the parameter existed.

    Returns
    -------
    tuple
        ``(data_series, error)`` where ``error`` is the test RMSE divided by
        ``np.std(data_series)``.

    Raises
    ------
    ValueError
        If the training or test partition has ``look_back`` observations or
        fewer, i.e. not even one (window, target) pair can be formed.
    """
    if row_label is None:
        # Legacy fallback to notebook globals. Resolved up front so a missing
        # global fails before the (slow) training instead of after it.
        row_label = df.iloc[i, 0]

    values = np.asarray(data_series, dtype=float).reshape(-1, 1)

    # Chronological 80/20 split, done BEFORE scaling.
    train_size = int(len(values) * 0.8)
    train_raw, test_raw = values[:train_size], values[train_size:]
    if len(train_raw) <= look_back or len(test_raw) <= look_back:
        raise ValueError(
            f"Series of length {len(values)} is too short for look_back={look_back}: "
            f"train has {len(train_raw)} and test has {len(test_raw)} observations"
        )

    # Fit the scaler on the training partition only so that the test values
    # cannot influence the transformation the model is trained with.
    scaler = MinMaxScaler(feature_range=(0, 1))
    train = scaler.fit_transform(train_raw)
    test = scaler.transform(test_raw)

    # Windows of `look_back` values (X) and the value that follows each (Y)
    trainX, trainY = create_dataset(train, look_back)
    testX, testY = create_dataset(test, look_back)

    # Reshape input to be [samples, time steps, features]
    trainX = np.reshape(trainX, (trainX.shape[0], 1, trainX.shape[1]))
    testX = np.reshape(testX, (testX.shape[0], 1, testX.shape[1]))

    # Create and fit the LSTM network
    model = Sequential()
    model.add(LSTM(50, activation='relu', input_shape=(1, look_back)))
    model.add(Dense(1))
    model.compile(loss='mean_squared_error', optimizer='adam')
    model.fit(trainX, trainY, epochs=50, batch_size=1, verbose=0)

    # Predict the test partition and bring everything back to original units
    testPredict = scaler.inverse_transform(model.predict(testX))
    trainY = scaler.inverse_transform(trainY.reshape(-1, 1))
    testY = scaler.inverse_transform(testY.reshape(-1, 1))

    # Both curves share the training targets; they differ only over the test
    # span, where one shows the predictions and the other the actual values.
    fig, ax = plt.subplots(figsize=(10, 6))
    ax.plot(np.concatenate([trainY.ravel(), testPredict.ravel()]), label='Predicted', color='red')
    ax.plot(np.concatenate([trainY.ravel(), testY.ravel()]), label='Actual', color='blue')
    ax.set_title(f"Row Index: {row_label}")
    ax.set_xlabel("Time")
    ax.set_ylabel("Sales")
    ax.legend()
    plt.show()
    # This function is called once per row; release the figure explicitly.
    plt.close(fig)

    # Calculate the RMSE for the testing data
    rmse = math.sqrt(mean_squared_error(testY, testPredict))
    print(f"Testing RMSE for Row Index {row_label}: {rmse}")

    # Normalise by the spread of the whole series so rows are comparable
    error = rmse / np.std(data_series)
    print(error)
    return data_series, error

df = pd.read_csv('cleansalesdata.csv')

# Keep only the columns from positional index 5 onward
df_clean = df.iloc[:, 5:]

# One plain Python list of values per DataFrame row
list_per_row = df_clean.values.tolist()

# Fraction of the trailing columns inspected by the filter below (10%)
percentage_to_sum = 0.1

# Number of trailing elements that fraction corresponds to
num_elements_to_sum = int(percentage_to_sum * len(list_per_row[0]))

# Drop every row whose trailing elements add up to exactly 0
list_per_row_filtered = list(
    filter(lambda values: sum(values[-num_elements_to_sum:]) != 0, list_per_row)
)
def remove_leading_zeros(rows):
    """Return a copy of each row with its leading zeros stripped.

    Parameters
    ----------
    rows : iterable of sliceable sequences
        Rows to trim; the inputs themselves are not modified.

    Returns
    -------
    list
        One entry per input row, holding the slice of that row that starts at
        its first non-zero element. A row that is empty or all zeros yields an
        empty slice.
    """
    trimmed_rows = []
    for row in rows:
        # Find the first non-zero position in a single pass and slice once,
        # rather than re-slicing the row for every leading zero.
        first_nonzero = next(
            (pos for pos, value in enumerate(row) if value != 0),
            len(row),
        )
        trimmed_rows.append(row[first_nonzero:])
    return trimmed_rows

# Remove leading zeros from each row
list_per_row_filtered_no_zeros = remove_leading_zeros(list_per_row_filtered)

# Positions in `df` of the rows that survived the trailing-sum filter. The
# filtered list is shorter than `df`, so its enumerate() index cannot be used
# to look up the label in `df`.
kept_positions = [
    position
    for position, values in enumerate(list_per_row)
    if sum(values[-num_elements_to_sum:]) != 0
]
assert len(kept_positions) == len(list_per_row_filtered_no_zeros), \
    "filtered rows no longer line up with their positions in df"

# Start from empty result lists so this cell works on a fresh kernel and does
# not append to results left behind by another cell.
errors = []
row_indexes = []
original_data = []

# Iterate over each kept row. The loop variable must be named `i` and hold the
# row's position in `df`: forecast_next_values_lstm reads the global `i` to
# build its plot title.
for i, row in zip(kept_positions, list_per_row_filtered_no_zeros):
    # With the default look_back=1 and the 80/20 split used by
    # forecast_next_values_lstm, each partition needs at least two points to
    # form one (input, target) pair. Trimming can leave fewer; skip those rows
    # instead of aborting the whole run.
    train_len = int(len(row) * 0.8)
    if train_len < 2 or len(row) - train_len < 2:
        print(f"Skipping row {df.iloc[i, 0]}: only {len(row)} points after removing leading zeros")
        continue

    # Forecast the next values and get the error
    _, error = forecast_next_values_lstm(np.array(row))

    # Record the error, the row's label (first column of df) and the data used
    errors.append(error)
    row_indexes.append(df.iloc[i, 0])
    original_data.append(row)

# Create a new DataFrame to store errors, row indexes, and original data
error_df = pd.DataFrame({'Row Index': row_indexes, 'Normalized RMSE': errors, 'Original Data': original_data})

# Export the DataFrame to a new CSV file
error_df.to_csv('errors1.csv', index=False)
    

In [None]:
#### Variant without leading zeros removed

# Function to split a series into sliding-window inputs (X) and next-step targets (Y)
def create_dataset(dataset, look_back=1):
    """Build sliding-window inputs and next-step targets from a sequence.

    Parameters
    ----------
    dataset : sequence or numpy array
        Ordered observations; anything that supports ``len`` and slicing.
    look_back : int, default 1
        Number of consecutive observations in each input window.

    Returns
    -------
    tuple of numpy.ndarray
        ``(X, Y)`` where ``X[k]`` is ``dataset[k:k + look_back]`` and ``Y[k]``
        is the observation immediately after that window. When the sequence
        has ``look_back`` entries or fewer, both arrays are empty.
    """
    n_windows = len(dataset) - look_back
    windows = [dataset[start:start + look_back] for start in range(n_windows)]
    targets = [dataset[start + look_back] for start in range(n_windows)]
    return np.array(windows), np.array(targets)

# Function to train the LSTM model and make forecasts
def forecast_next_values_lstm(data_series, look_back=1, row_label=None):
    """Train a one-step-ahead LSTM on a series and score it on a hold-out tail.

    The first 80% of the series is the training partition and the rest is the
    test partition. A small LSTM is fitted on sliding windows of the training
    partition, the test partition is predicted, a plot of actual vs. predicted
    values is shown, and the test RMSE is reported.

    Parameters
    ----------
    data_series : array-like
        One-dimensional sequence of numeric observations.
    look_back : int, default 1
        Number of past observations fed to the model for each prediction.
    row_label : optional
        Label used in the plot title and the printed RMSE line. When omitted
        the label is read from the notebook globals as ``df.iloc[i, 0]``,
        which is what this function did before the parameter existed.

    Returns
    -------
    tuple
        ``(data_series, error)`` where ``error`` is the test RMSE divided by
        ``np.std(data_series)``.

    Raises
    ------
    ValueError
        If the training or test partition has ``look_back`` observations or
        fewer, i.e. not even one (window, target) pair can be formed.
    """
    if row_label is None:
        # Legacy fallback to notebook globals. Resolved up front so a missing
        # global fails before the (slow) training instead of after it.
        row_label = df.iloc[i, 0]

    values = np.asarray(data_series, dtype=float).reshape(-1, 1)

    # Chronological 80/20 split, done BEFORE scaling.
    train_size = int(len(values) * 0.8)
    train_raw, test_raw = values[:train_size], values[train_size:]
    if len(train_raw) <= look_back or len(test_raw) <= look_back:
        raise ValueError(
            f"Series of length {len(values)} is too short for look_back={look_back}: "
            f"train has {len(train_raw)} and test has {len(test_raw)} observations"
        )

    # Fit the scaler on the training partition only so that the test values
    # cannot influence the transformation the model is trained with.
    scaler = MinMaxScaler(feature_range=(0, 1))
    train = scaler.fit_transform(train_raw)
    test = scaler.transform(test_raw)

    # Windows of `look_back` values (X) and the value that follows each (Y)
    trainX, trainY = create_dataset(train, look_back)
    testX, testY = create_dataset(test, look_back)

    # Reshape input to be [samples, time steps, features]
    trainX = np.reshape(trainX, (trainX.shape[0], 1, trainX.shape[1]))
    testX = np.reshape(testX, (testX.shape[0], 1, testX.shape[1]))

    # Create and fit the LSTM network
    model = Sequential()
    model.add(LSTM(50, activation='relu', input_shape=(1, look_back)))
    model.add(Dense(1))
    model.compile(loss='mean_squared_error', optimizer='adam')
    model.fit(trainX, trainY, epochs=50, batch_size=1, verbose=0)

    # Predict the test partition and bring everything back to original units
    testPredict = scaler.inverse_transform(model.predict(testX))
    trainY = scaler.inverse_transform(trainY.reshape(-1, 1))
    testY = scaler.inverse_transform(testY.reshape(-1, 1))

    # Both curves share the training targets; they differ only over the test
    # span, where one shows the predictions and the other the actual values.
    fig, ax = plt.subplots(figsize=(10, 6))
    ax.plot(np.concatenate([trainY.ravel(), testPredict.ravel()]), label='Predicted', color='red')
    ax.plot(np.concatenate([trainY.ravel(), testY.ravel()]), label='Actual', color='blue')
    ax.set_title(f"Row Index: {row_label}")
    ax.set_xlabel("Time")
    ax.set_ylabel("Sales")
    ax.legend()
    plt.show()
    # This function is called once per row; release the figure explicitly.
    plt.close(fig)

    # Calculate the RMSE for the testing data
    rmse = math.sqrt(mean_squared_error(testY, testPredict))
    print(f"Testing RMSE for Row Index {row_label}: {rmse}")

    # Normalise by the spread of the whole series so rows are comparable
    error = rmse / np.std(data_series)
    print(error)
    return data_series, error

df = pd.read_csv('cleansalesdata.csv')

# Keep only the columns from positional index 5 onward
df_clean = df.iloc[:, 5:]

# Extract each row's values and store them as a plain Python list
list_per_row = [row.tolist() for row in df_clean.values]

# Fraction of the trailing columns inspected by the filter below (10%)
percentage_to_sum = 0.1

# Number of trailing elements that fraction corresponds to
num_elements_to_sum = int(len(list_per_row[0]) * percentage_to_sum)

# Filter out rows whose trailing elements sum to 0, remembering each kept
# row's position in `df`. The filtered list is shorter than `df`, so its own
# enumerate() index cannot be used to look up the label in `df`.
kept_rows = [
    (position, values)
    for position, values in enumerate(list_per_row)
    if sum(values[-num_elements_to_sum:]) != 0
]
list_per_row_filtered = [values for _, values in kept_rows]


# Create lists to store errors, row indexes, and original data
errors = []
row_indexes = []
original_data = []

# Iterate over each kept row. The loop variable must be named `i` and hold the
# row's position in `df`: forecast_next_values_lstm reads the global `i` to
# build its plot title.
for i, row in kept_rows:
    # Forecast the next values and get the error
    _, error = forecast_next_values_lstm(np.array(row))

    # Record the error, the row's label (first column of df) and the data used
    errors.append(error)
    row_indexes.append(df.iloc[i, 0])
    original_data.append(row)

# Create a new DataFrame to store errors, row indexes, and original data
error_df = pd.DataFrame({'Row Index': row_indexes, 'Normalized RMSE': errors, 'Original Data': original_data})

# Export the DataFrame to a new CSV file
error_df.to_csv('errors.csv', index=False)