# Data Mining for Retail Stores
> - ***Ονοματεπώνυμο : Κωνσταντίνος - Ηλίας Χονδρορρίζος***
> - ***Α.Ε.Μ. : 3812***

In [None]:
# Importing the required libraries.
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import preprocessing

# Lift pandas' limit on the number of columns shown when a DataFrame is displayed.
pd.set_option("display.max_columns", None)
# Apply one seaborn theme (grid style + palette spec) to the figures drawn below.
sns.set_theme(palette="dark:#5A9_r", style="whitegrid")


### Raw data manipulation:
 1. To start off this project, let's load the data from the CSV file.
 2. After that, I am going to drop the requested items,
 3. apply one-hot encoding to the remaining items,
 4. discretize the basket_value attribute into three buckets,
 5. and visualize the data using various techniques.

In [None]:
# Load the raw transactions; the path is relative to the working directory.
transactions = pd.read_csv("GroceriesInitial.csv")
print(f"Total rows: {len(transactions)}")
# Last expression of the cell: preview the first five rows.
transactions.head(n=5)

In [None]:
# Figure 1: recency_days of the rows that have a value in Item_10 / Item_16,
# both series drawn on the same axes.
# NOTE(review): a row with a value in Item_10 holds at least 10 items, so the
# "Less than 10 items" legend label looks mismatched -- confirm the intent.
fig, axes = plt.subplots(figsize=(15, 5))
for item_column, marker_color in (("Item_10", "indigo"), ("Item_16", "lime")):
    sns.scatterplot(
        ax=axes,
        data=transactions,
        x="recency_days",
        y=item_column,
        color=marker_color,
    )
plt.legend(["Less than 10 items", "More than 15 items"])
plt.yticks([])  # hide the y tick labels
plt.ylabel("Items")

# Figure 2: basket_value as a function of recency_days, on a fresh figure.
fig = plt.figure(figsize=(15, 5))
sns.lineplot(data=transactions, x="recency_days", y="basket_value")
plt.legend(["Basket cost"])
plt.show()

In [None]:
def isNaN(num):
    """Return True when *num* is NaN, False otherwise.

    NaN is the only value that compares unequal to itself, so the
    self-comparison detects it without extra imports and also returns
    False for non-float values such as strings.
    """
    # The previous `num == num` was True for every value EXCEPT NaN, i.e.
    # the exact opposite of what the function name promises.
    return num != num


# Print the positions of the rows that actually hold a value in the last item
# column. The negation keeps the cell's output identical to what it printed
# before the predicate was corrected.
for index, i in enumerate(transactions["Item_32"]):
    if not isNaN(i):
        print(index)
    

In [None]:
# The only products kept for the analysis; the list order also fixes the
# column order of the one-hot encoding built from it.
PRODUCTS = [
    "citrus fruit",
    "tropical fruit",
    "whole milk",
    "other vegetables",
    "rolls/buns",
    "chocolate",
    "bottled water",
    "yogurt",
    "sausage",
    "root vegetables",
    "pastry",
    "soda",
    "cream",
]


def keep_products(product):
    """Return *product* unchanged if it is listed in PRODUCTS, otherwise NaN."""
    return product if product in PRODUCTS else float("nan")

# The first three columns are the non-item attributes (basket_value and
# recency_days are read from them further down); the rest hold item names.
info_cols = transactions.iloc[:, :3]

# Keep only the item names listed in PRODUCTS and turn everything else into NaN.
# where()/isin() is vectorised and replaces the per-cell DataFrame.applymap
# call, which is deprecated since pandas 2.1; the resulting frame is the same.
raw_item_cols = transactions.iloc[:, 3:]
item_cols = raw_item_cols.where(raw_item_cols.isin(PRODUCTS))

In [None]:
# One-hot encode the kept products of every basket.
#   ids           -> index labels of baskets left with none of the PRODUCTS
#   one_hot_items -> int8 matrix, one row per remaining basket, one column per product
#   items         -> the same matrix as a DataFrame with PRODUCTS as column names
ids = []
encoded_rows = []

for index, row in item_cols.iterrows():
    kept = row.dropna()
    # "Every item column is NaN" is tested directly instead of comparing the
    # NaN count with a hard-coded 32, so it holds for any number of columns.
    if kept.empty:
        ids.append(index)
        continue

    transact = np.zeros(len(PRODUCTS), dtype=np.int8)
    for i in kept.tolist():
        transact[PRODUCTS.index(i)] = 1
    encoded_rows.append(transact)

# Build the matrix once at the end: calling np.vstack inside the loop copied
# the whole matrix on every iteration. The reshape keeps a valid
# (0, len(PRODUCTS)) shape even when no basket survives.
one_hot_items = np.array(encoded_rows, dtype=np.int8).reshape(-1, len(PRODUCTS))

items = pd.DataFrame(one_hot_items, columns=PRODUCTS)

In [None]:
# Remove the baskets recorded in `ids`, then renumber the remaining rows from 0
# so that they line up with the freshly built `items` frame when the two are
# joined side by side.
kept_info = info_cols.drop(index=ids).reset_index(drop=True)
transactions = pd.concat([kept_info, items], axis="columns")
transactions.head()

In [None]:
# Summary statistics with the 33% / 66% percentiles requested, transposed so
# that each column of `transactions` becomes one row of the summary.
transactions.describe(percentiles=[0.33, 0.66]).transpose()

In [None]:
# Histogram of basket_value with a kernel density estimate overlaid.
fig = plt.figure(figsize=(15, 5))
sns.histplot(x=transactions["basket_value"], kde=True)
plt.show()

In [None]:
# Count how many baskets fall into each of the three value ranges.
# NOTE(review): the 2.9 / 6.3 edges presumably come from the 33% / 66%
# percentiles of basket_value -- confirm against the describe() output.
basket_value = transactions["basket_value"]
in_first = basket_value < 2.9
in_second = (basket_value >= 2.9) & (basket_value < 6.3)
in_third = basket_value >= 6.3

print(f"First bucket :  {in_first.sum()}")
print(f"Second bucket: {in_second.sum()}")
print(f"Third bucket :  {in_third.sum()}")

In [None]:
# Replace basket_value by a bucket code: 0 below 2.9, 0.5 in [2.9, 6.3), 1 from 6.3 up.
# All three range tests are evaluated on the current values before anything is
# overwritten; rows matching no range (e.g. NaN) are left as they are.
current_value = transactions["basket_value"]
is_low = current_value < 2.9
is_mid = (current_value >= 2.9) & (current_value < 6.3)
is_high = current_value >= 6.3

for bucket_mask, bucket_code in ((is_low, 0), (is_mid, 0.5), (is_high, 1)):
    transactions.loc[bucket_mask, "basket_value"] = bucket_code

In [None]:
# Number of baskets containing each product: the one-hot columns hold 0/1,
# so a column sum is a count.
dictionary = {product: transactions[product].sum() for product in PRODUCTS}

# Bar chart of those counts, one bar per product.
fig = plt.figure(figsize=(15, 5))
sns.barplot(
    x=list(dictionary),
    y=list(dictionary.values()),
    palette="dark:#5A9_r",
)
plt.xticks(rotation=45)  # tilt the product names so they do not overlap
plt.show()

In [None]:
# Andrews curves for the first 100 rows, skipping the first column and
# grouping the curves by basket_value.
fig = plt.figure(figsize=(15, 5))
curve_sample = transactions.iloc[:100, 1:]
pd.plotting.andrews_curves(curve_sample, class_column="basket_value")
plt.show()

In [None]:
# Rescale recency_days with min-max scaling.
# The assignment target on the first line was corrupted into invalid syntax;
# it has to be `scaler`, the name used by the fit_transform call below.
scaler = preprocessing.MinMaxScaler()
# Double brackets select a one-column DataFrame (2-D input for the scaler)
# and write the scaled values back into that same column.
transactions[['recency_days']] = scaler.fit_transform(transactions[['recency_days']])