In [1]:
import pandas as pd
import seaborn as sns
import os
import matplotlib.pyplot as plt

In [2]:
# The CSV exports live one directory above this notebook
data_dir = os.pardir

# Build the three input paths from the shared data directory
file_facts, file_preference, file_packages = (
    os.path.join(data_dir, csv_name)
    for csv_name in (
        'PostNL_account_delivery_facts_anonymized.csv',
        'PostNL_account_delivery_preference_anonymized.csv',
        'PostNL_collo_packages_anonymized.csv',
    )
)

# Read each CSV into its own data frame
df_facts, df_preference, df_packages = (
    pd.read_csv(csv_path)
    for csv_path in (file_facts, file_preference, file_packages)
)

# PostNL account delivery facts
* The analysis shows that there are no NaN values or abnormal values present in the data.
* The total number of parcels varies month to month, with some months showing higher variability.
* There's a difference between the number of parcels delivered and those successfully delivered to the home on the first try.
* Certain months may have anomalies or outliers in the number of parcels delivered which could be due to various factors (seasonal demand, promotions, shipping delays, etc.).
* The scatter plot with jitter effectively shows the distribution and density of the data points for each month, which the violin plot abstracts into a density shape.

In [3]:
# Preview the first rows of the delivery facts
df_facts.head()

In [4]:
# Column types of the delivery facts as inferred by read_csv
df_facts.dtypes

In [5]:
# Summary statistics of the delivery facts columns
df_facts.describe()

In [6]:
# Compare the spread of the total parcel count with the first-attempt home deliveries
boxplot_columns = ['number_of_parcels', 'parcels_home_1st']
boxplot_ax = sns.boxplot(data=df_facts[boxplot_columns])
boxplot_ax.set_title('Boxplot of number_of_parcels and parcels_home_1st')
boxplot_ax.set_xlabel('Parcel type')
boxplot_ax.set_ylabel('Amount of parcels')
plt.show()

In [7]:
# Monthly totals of both parcel counts; the hashed account id is left out before summing
monthly_columns = ['number_of_parcels', 'parcels_home_1st']
df_facts_grouped_date = (
    df_facts[['month_id'] + monthly_columns]
    .groupby('month_id')
    .sum()
    .reset_index()
)

# One line per parcel count, drawn against the month id
plt.figure(figsize=(10, 6))
for monthly_column, legend_label in zip(monthly_columns,
                                        ('Number of Parcels', 'Parcels Home 1st')):
    plt.plot(df_facts_grouped_date['month_id'], df_facts_grouped_date[monthly_column],
             marker='o', label=legend_label)
plt.title('Number of Parcels and Parcels Home 1st Over Time')
plt.xlabel('Month ID')
plt.ylabel('Count')
plt.xticks(rotation=45)
plt.legend()
plt.grid(True)
plt.tight_layout()
plt.show()

In [8]:
# Subset holding only the month id and the number of parcels, with the month id
# rewritten as "YYYY-MM" so the axis labels are readable.
# .assign() returns a new frame, so no column is written onto a slice of df_facts
# (assigning to the slice directly triggers pandas' SettingWithCopyWarning).
df_month_parcels = df_facts[['month_id', 'number_of_parcels']].assign(
    month_id=lambda frame: pd.to_datetime(frame['month_id'], format='%Y%m').dt.strftime('%Y-%m')
)

# Violin plot: one distribution of parcel counts per month
plt.figure(figsize=(15, 6))
sns.violinplot(x='month_id', y='number_of_parcels', data=df_month_parcels)
plt.xticks(rotation=45)  # rotated so the month labels do not overlap
plt.title('Number of Parcels per Month')
plt.xlabel('Month')
plt.ylabel('Number of Parcels')
plt.tight_layout()
plt.show()

In [9]:
# Same per-month data as individual points; jitter spreads overlapping observations
plt.figure(figsize=(14, 7))
strip_ax = sns.stripplot(x='month_id', y='number_of_parcels', data=df_month_parcels, jitter=True)
plt.xticks(rotation=45)
strip_ax.set_title('Scatter Plot with Jitter for Number of Parcels per Month')
strip_ax.set_xlabel('Month')
strip_ax.set_ylabel('Number of Parcels')
plt.tight_layout()
plt.show()

# Post NL preferences
* The columns that contain a date have type object; datetime might be more suitable for analysis.
* This data **cannot** provide you information whether a customer changed their delivery preference since there are no duplicate accounts present. 
* Most accounts have OriginalDeliveryLocation as their preference, which I assume is their house.
* There are some NaN delivery preferences which should be changed to NULL values in the DB.
* I assume that every row where "datelastupdated" is not NaN belongs to an account that has changed its delivery preference, but this should be validated.
* Around 70% of the accounts do not have information about when the delivery preference was created or updated; therefore I assume that 70% of the users have not changed their delivery preference.
* Most packages are delivered on Tuesdays

In [10]:
# Preview the first rows of the delivery preferences
df_preference.head()

In [11]:
# Column types of the delivery preferences as loaded from the CSV
df_preference.dtypes

In [12]:
# Parse both date columns (loaded as object) into datetime
for date_column in ('datelastupdated', 'datecreated'):
    df_preference[date_column] = pd.to_datetime(df_preference[date_column])

In [13]:
# Column types again, to confirm the result of the date conversion
df_preference.dtypes

In [14]:
# Distinct delivery preferences that occur in the data
unique_preferences = df_preference['deliverypreference'].unique()
print(unique_preferences)

In [15]:
# Count account ids that repeat an earlier row's id
is_repeated_account = df_preference['account_id_hashed'].duplicated()
duplicate_count = is_repeated_account.sum()
print("Amount of duplicate account ids: {}".format(duplicate_count))

In [16]:
# Number of accounts without a known delivery preference
preference_column = df_preference['deliverypreference']
nan_count = preference_column.isna().sum()
print("Amount of accounts for which there is no delivery preference known:", nan_count)

# Label the missing preferences as 'Unknown'
df_preference['deliverypreference'] = preference_column.fillna('Unknown')
df_preference

In [17]:
# How often each delivery preference occurs
preference_counts = df_preference['deliverypreference'].value_counts()

# Bar chart of those counts
plt.figure(figsize=(10, 6))
preference_ax = preference_counts.plot.bar()
preference_ax.set_xlabel('Delivery Preference')
preference_ax.set_ylabel('Count')
preference_ax.set_title('Delivery Preference Distribution')
plt.xticks(rotation=45)
plt.show()

In [18]:
# Share of accounts that have a creation date
total_accounts = len(df_preference)
nan_count = df_preference['datecreated'].isna().sum()  # accounts without a date created
not_nan = total_accounts - nan_count  # accounts with a date created
# The percentage is taken over ALL accounts. Dividing by nan_count gives the ratio of
# dated to undated accounts, which is not the percentage the message reports.
# The guard avoids a division by zero on an empty frame.
percent = round((not_nan / total_accounts) * 100, 1) if total_accounts else 0.0

print("Amount of accounts for which there is no date created:", nan_count)
print("So amount of account for which there is a date created:", not_nan)
print("Therefore {}% of the accounts have a date created".format(percent))

In [19]:
# Share of accounts that have a last-updated date
total_accounts = len(df_preference)
nan_count = df_preference['datelastupdated'].isna().sum()  # accounts without a date updated
not_nan = total_accounts - nan_count  # accounts with a date updated
# The percentage is taken over ALL accounts. Dividing by nan_count gives the ratio of
# dated to undated accounts, which is not the percentage the message reports.
# The guard avoids a division by zero on an empty frame.
percent = round((not_nan / total_accounts) * 100, 1) if total_accounts else 0.0

print("Amount of accounts for which there is no date updated:", nan_count)
print("So amount of account for which there is a date date updated:", not_nan)
print("Therefore {}% of the accounts have a date updated".format(percent))

In [20]:
# Preview the first rows of the collo packages
df_packages.head()

In [21]:
# Column names of the collo packages
df_packages.columns

In [22]:
# Column types of the collo packages as loaded from the CSV
df_packages.dtypes

In [23]:
# Parse the date/time columns (selected by position: columns 2 up to and including 13)
# so they can be used in further analysis
date_time_columns_list = df_packages.columns[2:14].tolist()

for column in date_time_columns_list:
    # errors='coerce' turns out-of-bounds or unparseable values into NaT
    parsed_values = pd.to_datetime(df_packages[column], errors='coerce')
    df_packages[column] = parsed_values

    # Columns with 'tijd' in their name keep only the time-of-day part
    holds_time_only = 'tijd' in column.lower()
    if holds_time_only:
        df_packages[column] = df_packages[column].dt.time

df_packages

In [24]:
# Rows that repeat an earlier row in every column
is_repeated_row = df_packages.duplicated()
duplicate_rows = df_packages.loc[is_repeated_row]
duplicate_rows

**Account ID**

In [25]:
# Count the packages that have no account id, and their share of all packages
missing_account = df_packages['account_id_hashed'].isna()
packages_without_account = missing_account.sum()
share_not_linked = packages_without_account / len(df_packages)
percentage_not_linked = round(share_not_linked * 100, 1)

print("The total amont of packages which are not linked to a specific acount:", packages_without_account)
print("Precentage of packages not linked to an account: {}%".format(percentage_not_linked))

In [26]:
# Keep only the packages that are linked to an account
has_account = df_packages['account_id_hashed'].notna()
df_cleaned = df_packages[has_account]

# Every row whose account id occurs more than once (keep=False marks all occurrences)
duplicated_mask = df_cleaned['account_id_hashed'].duplicated(keep=False)
duplicated_account_ids = df_cleaned.loc[duplicated_mask]
duplicated_account_ids

In [27]:
# Account id that occurs most often in df_packages
packages_per_account = df_packages['account_id_hashed'].value_counts()
most_frequent_account_id_hashed = packages_per_account.idxmax()
print(most_frequent_account_id_hashed)

In [28]:
# All packages of the most frequent account. The id computed in the previous cell is
# used instead of a pasted hash, so this cell stays correct when the data changes.
df_packages[df_packages['account_id_hashed'] == most_frequent_account_id_hashed]

**Barcodes**

In [29]:
# Distinct barcodes that occur on more than one row
barcodes = df_packages['dn_barcode']
occurs_more_than_once = barcodes.duplicated(keep=False)
duplicate_barcodes = barcodes[occurs_more_than_once].unique()
print("Amount of duplicate barcodes:", len(duplicate_barcodes))

**Landcode**

In [30]:
# Are any packages delivered to a country other than NL? List the distinct values.
df_packages['da_landcode_gea'].unique()

**Eindstatus**

In [31]:
# Account id together with the date and time of the final status
eindstatus_columns = ["account_id_hashed", "da_datum_eindstatus", "da_tijd_eindstatus"]
df_eindstatus = df_packages.loc[:, eindstatus_columns]
df_eindstatus

In [32]:
# Weekday name of the final-status date
df_packages['da_datum_eindstatus'] = pd.to_datetime(df_packages['da_datum_eindstatus'])
df_packages['weekday'] = df_packages['da_datum_eindstatus'].dt.day_name()

# Hour of the final-status time. The column is turned into text first and cut to
# 'HH:MM:SS', so the parse works for time strings as well as datetime.time objects.
# Values that still cannot be parsed (e.g. missing ones) become NaT.
end_time_text = df_packages['da_tijd_eindstatus'].astype(str).str[:8]
df_packages['hour'] = pd.to_datetime(end_time_text, format='%H:%M:%S', errors='coerce').dt.hour

# Count occurrences per hour (rows) and weekday (columns)
heatmap_data = df_packages.groupby(['hour', 'weekday']).size().unstack(fill_value=0)

# Put the weekdays in calendar order. reindex() adds an all-zero column for a weekday
# that never occurs, where plain column selection would raise a KeyError.
sorter = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
heatmap_data = heatmap_data.reindex(columns=sorter, fill_value=0)

# Plot the heatmap
plt.figure(figsize=(12, 8))
sns.heatmap(heatmap_data, cmap="YlGnBu", annot=False, cbar_kws={'label': 'Frequency'})
plt.title('Frequency of Status End Times by Day of Week and Hour')
plt.xlabel('Day of the Week')
plt.ylabel('Hour of the Day')
plt.xticks(rotation=45)
plt.show()


**Waarnemings sequence**

Analysis shows that waarnemings sequence cannot be derived from the columns in the data

In [33]:
# Number of packages per waarnemingsequence: tag every row with 1 and sum per sequence
df_waarneming = df_packages[['da_waarnemingsequence']].assign(amount=1)
df_waarneming_grouped = df_waarneming.groupby('da_waarnemingsequence')[['amount']].sum()
df_waarneming_grouped.reset_index()

In [34]:
# Summary statistics of the number of packages per waarnemingsequence
df_waarneming_grouped['amount'].describe()

In [35]:
# Order the sequences from most to least frequent
sorted_df = df_waarneming_grouped.sort_values('amount', ascending=False)

# The ten most frequent sequences
top_10 = sorted_df.iloc[:10]
top_10

In [36]:
def find_similarities(df, da_waarnemingsequence_values=None):
    """
    Find, per 'da_waarnemingsequence' group, the columns whose values are all identical.

    The goal is to check whether rows with the same 'da_waarnemingsequence' have any
    other column values in common.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame that contains a 'da_waarnemingsequence' column.
    da_waarnemingsequence_values : scalar or list-like, optional
        Restrict the analysis to these sequence values. A single value or any
        list-like (list, tuple, set, Series, ...) is accepted. None analyses all groups.

    Returns
    -------
    dict
        Maps each group key to a dict {column name: shared value}. Groups without any
        fully identical column are left out.

    Notes
    -----
    A column only counts as identical when it has no missing values in the group.
    Series.nunique() ignores NaN, so without that check a column holding one distinct
    value plus NaNs would be reported, possibly with NaN as its "shared" value.
    """
    # If specific values are provided, keep only the rows of those groups
    if da_waarnemingsequence_values is not None:
        if pd.api.types.is_list_like(da_waarnemingsequence_values):
            wanted_values = list(da_waarnemingsequence_values)
            df = df[df['da_waarnemingsequence'].isin(wanted_values)]
        else:
            df = df[df['da_waarnemingsequence'] == da_waarnemingsequence_values]

    results = {}
    for name, group in df.groupby('da_waarnemingsequence'):
        # Columns that hold one and the same value on every row of this group
        similarities = {}
        for column in group.columns:
            # Skip the grouping column, it is identical by construction
            if column == 'da_waarnemingsequence':
                continue
            values = group[column]
            # All rows present AND exactly one distinct value
            if values.notna().all() and values.nunique() == 1:
                similarities[column] = values.iloc[0]
        # Only report groups that have something in common
        if similarities:
            results[name] = similarities

    return results

# Example: inspect a single 'da_waarnemingsequence' value
example_sequence = 'A98A01A95B01A96J01J40A19A19J05I01'
similarities = find_similarities(df_packages, da_waarnemingsequence_values=example_sequence)
print(similarities)

# Several values can be inspected at once by passing a list:
# similarities = find_similarities(df_packages, ['value1', 'value2'])
# print(similarities)

# Delivery facts & collo packages

Here I analyse whether the person that received the most packages in the delivery facts also appears as many times in the collo packages because this should be consistent.

Analysis shows that it is not consistent; further validation is needed to find out why.

In [37]:
# Check whether both data sets cover a comparable period
package_end_dates = df_packages['da_datum_eindstatus']
fact_months = df_facts['month_id']

# Earliest and latest value in each data set
min_date_packages, max_date_packages = package_end_dates.min(), package_end_dates.max()
min_date_facts, max_date_facts = fact_months.min(), fact_months.max()

print(f"The date range from df_packages is from {min_date_packages} to {max_date_packages}")
print(f"The date range from df_facts is from {min_date_facts} to {max_date_facts}")

In [38]:
# Who received the most packages according to the delivery facts?

# Total number of parcels per account
df_total_parcels_per_id = df_facts.groupby('account_id_hashed')[['number_of_parcels']].sum()

# Largest totals first, with the account id back as a regular column
sorted_df = (
    df_total_parcels_per_id
    .sort_values('number_of_parcels', ascending=False)
    .reset_index()
)
sorted_df

In [39]:
# Months in which the account with the most parcels received packages
account_most_packages = sorted_df['account_id_hashed'].iat[0]
is_top_account = df_facts['account_id_hashed'] == account_most_packages
df_facts.loc[is_top_account]

In [40]:
# Does this account appear just as often in the collo packages?
is_top_account_package = df_packages['account_id_hashed'] == account_most_packages
df_packages.loc[is_top_account_package]

# Possibility for data enrichment 
We could add additional data to prove that the model can perform with newly added data, in this case 4 digit postal code information in the Netherlands.
Github repo: https://github.com/bobdenotter/4pp