In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

**Import and Read Files**

In [7]:
# Load the four semicolon-delimited source files, one DataFrame per file.
customer, product, store, transaction = (
    pd.read_csv(csv_path, sep=';')
    for csv_path in ('Customer.csv', 'Product.csv', 'Store.csv', 'Transaction.csv')
)

In [11]:
# (rows, columns) of each loaded frame, in load order.
tuple(frame.shape for frame in (customer, product, store, transaction))

((447, 5), (10, 3), (14, 6), (5020, 8))

In [18]:
# DataFrame.info() prints its report and returns None, so call it once per
# frame as a plain statement; wrapping the calls in a tuple only makes the
# cell echo a meaningless (None, None) result.
customer.info()
product.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 447 entries, 0 to 446
Data columns (total 5 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   CustomerID      447 non-null    int64 
 1   Age             447 non-null    int64 
 2   Gender          447 non-null    int64 
 3   Marital Status  444 non-null    object
 4   Income          447 non-null    object
dtypes: int64(3), object(2)
memory usage: 17.6+ KB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10 entries, 0 to 9
Data columns (total 3 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   ProductID     10 non-null     object
 1   Product Name  10 non-null     object
 2   Price         10 non-null     int64 
dtypes: int64(1), object(2)
memory usage: 368.0+ bytes


(None, None)

In [19]:
# DataFrame.info() prints its report and returns None, so call it once per
# frame as a plain statement; wrapping the calls in a tuple only makes the
# cell echo a meaningless (None, None) result.
store.info()
transaction.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14 entries, 0 to 13
Data columns (total 6 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   StoreID     14 non-null     int64 
 1   StoreName   14 non-null     object
 2   GroupStore  14 non-null     object
 3   Type        14 non-null     object
 4   Latitude    14 non-null     object
 5   Longitude   14 non-null     object
dtypes: int64(1), object(5)
memory usage: 800.0+ bytes
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5020 entries, 0 to 5019
Data columns (total 8 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   TransactionID  5020 non-null   object
 1   CustomerID     5020 non-null   int64 
 2   Date           5020 non-null   object
 3   ProductID      5020 non-null   object
 4   Price          5020 non-null   int64 
 5   Qty            5020 non-null   int64 
 6   TotalAmount    5020 non-null   int64 
 7   StoreID        5020 non-n

(None, None)

In [12]:
# Number of missing values in each customer column.
customer.isna().sum()

CustomerID        0
Age               0
Gender            0
Marital Status    3
Income            0
dtype: int64

Drop the rows that contain missing values

In [21]:
# Keep only the customer rows in which no field is missing.
customer = customer.dropna(axis=0, how='any')

Replace the decimal comma ',' with '.' so that Income can be converted to float

In [34]:
# Swap the decimal comma in Income for a dot and cast the column to float.
# The column is rebuilt with assign() instead of
# `customer.loc[:, 'Income'] = ...`: on pandas >= 2.0 the .loc form writes the
# converted values into the existing object column in place, so the dtype
# would stay `object` even though the values are floats. assign() returns a
# new frame whose Income column really is float64, and the cell stays safe to
# re-run because Series.replace is a no-op once the column is numeric.
customer = customer.assign(
    Income=customer['Income'].replace(',', '.', regex=True).astype('float')
)
customer

Unnamed: 0,CustomerID,Age,Gender,Marital Status,Income
0,1,55,1,Married,5.12
1,2,60,1,Married,6.23
2,3,32,1,Married,9.17
3,4,31,1,Married,4.87
4,5,58,1,Married,3.57
...,...,...,...,...,...
441,442,42,1,Married,14.88
443,444,53,0,Married,15.31
444,445,51,0,Married,14.48
445,446,57,0,Married,7.81


In [25]:
# Number of missing values in each product column.
product.isna().sum()

ProductID       0
Product Name    0
Price           0
dtype: int64

In [29]:
# NOTE(review): this cell repeats the null check on `product` from the previous
# cell; a check on another frame (e.g. `store`) may have been intended — confirm.
product.isna().sum()

ProductID       0
Product Name    0
Price           0
dtype: int64

In [30]:
# Peek at one randomly chosen store row.
store.sample(n=1)

Unnamed: 0,StoreID,StoreName,GroupStore,Type,Latitude,Longitude
6,7,Buana Indah,Buana,General Trade,3316694,114590111


In [35]:
# Both coordinate columns use a decimal comma; swap it for a dot and cast
# each column to float.
for coord_col in ('Latitude', 'Longitude'):
    store[coord_col] = store[coord_col].replace(',', '.', regex=True).astype('float')

# Peek at one randomly chosen row to check the conversion.
store.sample()

Unnamed: 0,StoreID,StoreName,GroupStore,Type,Latitude,Longitude
8,9,Lingga,Lingga,Modern Trade,-3.654703,128.190643


In [36]:
# Number of missing values in each transaction column.
transaction.isna().sum()

TransactionID    0
CustomerID       0
Date             0
ProductID        0
Price            0
Qty              0
TotalAmount      0
StoreID          0
dtype: int64

In [38]:
# Peek at one randomly chosen transaction row.
transaction.sample(n=1)

Unnamed: 0,TransactionID,CustomerID,Date,ProductID,Price,Qty,TotalAmount,StoreID
1770,TR89483,411,07/05/2022,P10,15000,1,15000,1


In [42]:
# Dates are written day/month/year; parse them into datetime values.
parsed_dates = pd.to_datetime(transaction['Date'], format='%d/%m/%Y')
transaction['Date'] = parsed_dates

# Peek at one randomly chosen row to check the conversion.
transaction.sample()

Unnamed: 0,TransactionID,CustomerID,Date,ProductID,Price,Qty,TotalAmount,StoreID
4278,TR28308,42,2022-11-08,P6,18000,3,54000,8


In [43]:
# Join customers -> transactions -> stores -> products (default inner joins),
# then order the combined rows chronologically with a fresh 0..n-1 index.
# Product's Price column is dropped before the join because the transaction
# table already has a Price column.
df_merge = (
    customer
    .merge(transaction, on=['CustomerID'])
    .merge(store, on=['StoreID'])
    .merge(product.drop(columns=['Price']), on=['ProductID'])
    .sort_values(by='Date')
    .reset_index(drop=True)
)
df_merge.head()

Unnamed: 0,CustomerID,Age,Gender,Marital Status,Income,TransactionID,Date,ProductID,Price,Qty,TotalAmount,StoreID,StoreName,GroupStore,Type,Latitude,Longitude,Product Name
0,328,36,0,Married,10.53,TR11369,2022-01-01,P3,7500,4,30000,12,Prestasi Utama,Prestasi,General Trade,-2.990934,104.756554,Crackers
1,386,33,0,Married,6.95,TR41231,2022-01-01,P9,10000,1,10000,4,Gita Ginara,Gita,General Trade,-6.966667,110.416664,Yoghurt
2,123,34,0,Married,4.36,TR99839,2022-01-01,P2,3200,6,19200,1,Prima Tendean,Prima,Modern Trade,-6.2,106.816666,Ginger Candy
3,283,19,1,Single,0.0,TR51675,2022-01-01,P10,15000,1,15000,5,Bonafid,Gita,General Trade,-7.250445,112.768845,Cheese Stick
4,51,36,0,Married,7.95,TR54287,2022-01-01,P8,16000,2,32000,2,Prima Kelapa Dua,Prima,Modern Trade,-6.914864,107.608238,Oat


**ARIMA**