## PROJECT TITLE: CUSTOMER TRANSACTIONS

#### Data Analysis by: Kwabena Boateng

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import plotly.express as px
import matplotlib.pyplot as plt

from datetime import datetime

from sklearn.preprocessing import LabelEncoder 

### LOAD DATASET

In [None]:
# Location of the raw transactions CSV file.
DATASET_PATH = "C:/Users/KWABENABOATENG/Desktop/DATA ANALYSIS/PERSONAL/CUSTOMER TRANSACTIONS PROJECT/sample_dataset.csv"

# Read the whole file into the DataFrame that the rest of the notebook works on.
customer_transactions = pd.read_csv(DATASET_PATH)

# Ending the cell with the bare name makes the notebook render the table.
customer_transactions

### DATA UNDERSTANDING

The dataset used for this project is stored in the data/ directory. The data consists of customer transaction records with the following columns:

* customer_id: Unique identifier for each customer.
* Name: First name of the customer.
* Surname: Last name of the customer.
* Gender: the gender of the customer.
* Birthdate: the day, month and year the customer was born.
* Transaction Amount: Amount of transaction made.
* Date: Date of the transaction.
* Merchant Name: the name of the merchant involved in the transaction.
* Category: Product category.

### DATA PREPARATION

#### CHECK NUMBER OF COLUMNS AND ROWS IN THE DATASET.

In [None]:
# `shape` is a (rows, columns) tuple; it is printed as one value inside the sentence.
dataset_shape = customer_transactions.shape
print('The number of rows and columns in this dataset is', dataset_shape, 'respectively')

#### CHECK THE STRUCTURE OF THE DATASET

In [None]:
# Show the labels of all the columns in this dataset.

customer_transactions.columns

In [None]:
# Give the name and birthdate columns more descriptive titles.
column_titles = {
    'Name': 'First Name',
    'Surname': 'Last Name',
    'Birthdate': 'Date of Birth',
}
customer_transactions = customer_transactions.rename(columns=column_titles)

In [None]:
# Preview the first five records of the dataset.

customer_transactions.head()

In [None]:
# Print a concise summary of the DataFrame (columns, dtypes, non-null counts).
customer_transactions.info()

#### CHECK FOR MISSING VALUES

In [None]:
# Count the missing values in each column.
customer_transactions.isna().sum()

In [None]:
# Add the per-column missing-value counts together into one grand total.
missing_per_column = customer_transactions.isna().sum()
Total_MV = missing_per_column.sum()
print('The total number of missing values in this dataset is', Total_MV)

In [None]:
# Work out what share of the rows has no Gender value.

gender_column = customer_transactions['Gender']
total_rows = len(customer_transactions)
missing_gender = gender_column.isnull().sum()

# Express the missing count as a percentage of all rows.
missing_percentage = (missing_gender / total_rows) * 100

print(f"Missing Gender Values: {missing_gender} ({missing_percentage:.2f}%)")

#### INSIGHTS

The results show that there are 5,047 missing values in the Gender column of the dataset. This amounts to 10.09% of all rows, which is a substantial share.


To deal with this, I will replace all the missing values in the Gender column with "Undefined".

This assumes that a suitable gender option was not available for these customers to select, hence they left that field empty.

#### DEALING WITH THE MISSING VALUES IN THE GENDER COLUMN

In [None]:
# Check the distinct values present in the Gender column.

customer_transactions['Gender'].unique()

In [None]:
# Count how many rows carry each value of the Gender column.

customer_transactions['Gender'].value_counts()

In [None]:
# Replace missing values in the Gender column with 'Undefined'.
#
# The filled column is assigned back to the DataFrame instead of calling
# `fillna(..., inplace=True)` on the selected column. That chained in-place form
# acts on an intermediate object: it is deprecated in pandas 2.x and no longer
# updates the DataFrame once copy-on-write is enabled.

customer_transactions['Gender'] = customer_transactions['Gender'].fillna('Undefined')

In [None]:
# Spell out the gender initials as full words in a single pass.

gender_full_words = {'M': 'Male', 'F': 'Female'}
customer_transactions['Gender'] = customer_transactions['Gender'].replace(gender_full_words)

In [None]:
# Re-check the distinct Gender values after filling the missing entries and
# replacing the initials with full words.
customer_transactions['Gender'].unique()

#### CHECKING THE CATEGORY COLUMNS

In [None]:
# List the distinct values present in the Category column.
customer_transactions['Category'].unique()

In [None]:
# Count how many rows fall under each category.
customer_transactions['Category'].value_counts()

#### CHECK FOR DUPLICATE ROWS

In [None]:
# `duplicated()` flags each row that repeats an earlier row; summing the flags counts them.
duplicate_flags = customer_transactions.duplicated()
Total_dup = duplicate_flags.sum()
print('The total number of duplicate rows in this dataset is', Total_dup)

In [None]:
# Summary statistics with the default column selection, transposed so each column becomes a row.
customer_transactions.describe().T

In [None]:
# Summary statistics for the object (text) columns, transposed so each column becomes a row.
customer_transactions.describe(include='object').T

### EXPLORATORY DATA ANALYSIS, (E.D.A)

#### MAKE A COPY OF THE ORIGINAL DATASET.

In [None]:
# Keep an independent snapshot of the cleaned dataset before columns are dropped.
# `copy` must be called: without the parentheses the name is bound to the method
# object and no copy of the data is made.
customer_transactions_copy = customer_transactions.copy()

In [None]:
# Columns that are not used in the exploratory analysis below.
Cols_drop = [
    'Customer ID',
    'Last Name',
]
customer_transactions = customer_transactions.drop(Cols_drop, axis=1)

In [None]:
# Draw a box plot of the DataFrame's columns as a quick visual check of their spread.
customer_transactions.boxplot()

### INSIGHTS TO BE DERIVED FROM THIS ANALYSIS

#### QUESTIONS
 - What percentage of the records falls under each gender in the Gender column?
 - What is the total amount generated by each category?
 - What is the total amount generated by each gender within each category?
 - What are the top 3 highest transactions in each category and their merchant names?
 - What are the age ranges of customers who patronize the various categories?
 - What is the trend in the transactions?

#### QUESTION ONE

###### - What percentage of the records falls under each gender in the Gender column?

In [None]:
# Get the number of rows for each gender.

customer_transactions['Gender'].value_counts()

In [None]:
# Show the share of each gender as a pie chart.

total_count = len(customer_transactions)
gender_counts = customer_transactions['Gender'].value_counts()

# Percentage of all rows per gender; its index supplies the legend entries.
gender_percentages = (gender_counts / total_count) * 100

plt.figure(figsize=(7, 7))
plt.pie(
    gender_counts,
    labels=gender_counts.index,
    autopct='%1.1f%%',
    startangle=140,
)
plt.title('Gender Distribution Percentage')
plt.legend(gender_percentages.index, title="Gender Percentage")
plt.show()

#### INSIGHTS

There are six (6) categories in this dataset namely, Cosmetic, Travel, Clothing, Electronics, Restaurant and Market.

Below are their outputs / results in descending order:

- Restaurant  - 8413 - 16.8%
- Market      - 8382 - 16.8%
- Travel      - 8377 - 16.8%
- Electronics - 8324 - 16.6%
- Clothing    - 8261 - 16.5%
- Cosmetic    - 8243 - 16.5%

#### QUESTION TWO

###### - What is the total amount generated by each category? 

In [None]:
# Total transaction amount per category, as a two-column table.
category_totals = (
    customer_transactions
    .groupby('Category')['Transaction Amount']
    .sum()
    .reset_index()
)
print(category_totals)

# Bar chart of the totals, one bar per category.
category_axis = category_totals['Category']
amount_axis = category_totals['Transaction Amount']

plt.figure(figsize=(10, 6))
plt.bar(category_axis, amount_axis)
plt.title('Total Amount by Category')
plt.xlabel('Category')
plt.ylabel('Transaction Amount')
# Tilt the category names so they stay readable.
plt.xticks(rotation=45)
plt.show()

#### QUESTION THREE

###### - What is the total amount generated by each gender within each category?

In [None]:
# Total transaction amount for every (Gender, Category) pair, as a flat table.
grouped_amounts = customer_transactions.groupby(['Gender', 'Category'])['Transaction Amount']
gender_category_totals = grouped_amounts.sum().reset_index()
gender_category_totals

In [None]:
# Grouped bar chart of the total amount per category, one bar per gender.
#
# The totals are first pivoted into a Category x Gender table so every gender
# has exactly one value per category (0 where a pair has no transactions).
# Plotting the filtered rows directly fails with a shape mismatch as soon as
# one gender is missing a category.
amount_table = gender_category_totals.pivot(
    index='Category', columns='Gender', values='Transaction Amount'
).fillna(0)

category_names = amount_table.index
genders = amount_table.columns

# Share 0.9 of each category slot between the genders so bars never overlap,
# however many gender values the data contains (0.30 each for three genders).
width = 0.9 / max(len(genders), 1)
x = range(len(category_names))

plt.figure(figsize=(12, 6))

for i, gender in enumerate(genders):
    plt.bar([pos + i * width for pos in x], amount_table[gender], width, label=gender)

plt.xlabel('Category')
plt.ylabel('Total Amount')
plt.title('Total Amount by Gender and Category')

# Centre each tick under its group of bars and rotate the labels for readability.
tick_offset = width * (len(genders) - 1) / 2
plt.xticks([pos + tick_offset for pos in x], category_names, rotation=45)
plt.legend(title='Gender')

plt.show()

#### QUESTION FOUR

###### - What are the top 3 highest transactions in each category and their merchant names?

In [None]:

# Find the top 3 highest transactions in each category.
#
# Rows are ordered by category and then by descending amount, so the first
# three rows of each category group are its largest transactions. This replaces
# `groupby(...).apply(nlargest)`, whose handling of the grouping column
# ('Category') inside the applied frame is deprecated in recent pandas releases.
# The earlier `pd.DataFrame(customer_transactions)` re-wrap was dropped because
# the object is already a DataFrame.
top_columns = ['Merchant Name', 'Category', 'Transaction Amount']
result = (
    customer_transactions
    .sort_values(['Category', 'Transaction Amount'], ascending=[True, False])
    .groupby('Category')
    .head(3)[top_columns]
    .reset_index(drop=True)
)

print(result)


In [None]:
# Bar plot of the top transactions, one colour-coded series per category.
plt.figure(figsize=(10, 8))

for category_name, top_rows in result.groupby('Category'):
    merchants = top_rows['Merchant Name']
    amounts = top_rows['Transaction Amount']
    plt.bar(merchants, amounts, label=f'Category {category_name}')

# Title and axis labels.
plt.title('Top 3 Highest Transactions in Each Category')
plt.xlabel('Merchant Name')
plt.ylabel('Transaction Amount')

# Merchant names are long, so turn them almost vertical.
plt.xticks(rotation=80)
plt.legend()
plt.show()

#### QUESTION FIVE

###### - What are the age ranges of customers who patronize the various categories?

In [None]:
# Convert the 'Date of Birth' column to datetime
customer_transactions['Date of Birth'] = pd.to_datetime(customer_transactions['Date of Birth'])

# Calculate age in completed years as of today.
# Dividing the elapsed days by 365 ignores leap days and overstates the age of
# customers close to a birthday, so the age is derived from the calendar
# fields: the year difference, minus one if this year's birthday has not
# happened yet.
current_date = datetime.now()
birth_dates = customer_transactions['Date of Birth']
birthday_passed = (birth_dates.dt.month < current_date.month) | (
    (birth_dates.dt.month == current_date.month) & (birth_dates.dt.day <= current_date.day)
)
customer_transactions['Age'] = (
    current_date.year - birth_dates.dt.year - (~birthday_passed).astype(int)
)

customer_transactions

#### CONVERT THE BIRTHDATE INTO A DATETIME TO PLOT IT OUT AS A GRAPH

In [None]:
# Define age bins and labels.
# pd.cut is left at its default right=True, so each bin is (lower, upper]:
# (20, 25] holds ages 21-25, which matches its label. With right=False the bins
# were [20, 25), [25, 30), ... and an age of 25 was filed under '26-30'.
age_bins = [20, 25, 30, 35, 40, 45, 50, 55, 60, 65, 70, 75, 80, 85, 90, 100]

# The last bin spans (90, 100], so it is labelled '91-100' rather than '91-95'.
age_labels = ['21-25', '26-30', '31-35', '36-40', '41-45', '46-50', '51-55', '56-60', '61-65', '66-70', '71-75', '76-80', '81-85', '86-90', '91-100']

# Apply age grouping
customer_transactions['Age Group'] = pd.cut(customer_transactions['Age'], bins=age_bins, labels=age_labels)

# Histogram of the raw ages with a density curve on top.
plt.figure(figsize=(10, 8))

sns.histplot(data=customer_transactions, x='Age', kde=True)

plt.title('Age Distribution')

# The plotted variable is 'Age', so the axis is labelled accordingly.
plt.xlabel('Age')

plt.ylabel('Frequency')

plt.show()

In [None]:
# Preview the first five rows of the dataset.
customer_transactions.head()

In [None]:

customer_transactions = pd.DataFrame(customer_transactions)

# Four broad age bands; this overwrites any earlier 'Age Group' column.
age_bins = [20, 40, 60, 80, 100]
age_labels = ['Youth', 'Adult', 'elderly', 'Aged']
customer_transactions['Age Group'] = pd.cut(
    customer_transactions['Age'],
    bins=age_bins,
    labels=age_labels,
)

# Contingency table: number of transactions per (age band, category) pair.
cross_tab = pd.crosstab(
    customer_transactions['Age Group'],
    customer_transactions['Category'],
)

# Grouped bars: one cluster per age band, one bar per category.
cross_tab.plot(kind='bar')
plt.title('Bivariate Analysis: Age Group vs. Category')
plt.xlabel('Age Group')
plt.ylabel('Count')
plt.legend(title='Category')
plt.show()


#### QUESTION SIX

###### - Which date had the highest transactions?


In [None]:
# Parse the 'Date' column into pandas datetime values.
customer_transactions['Date'] = pd.to_datetime(customer_transactions['Date'])

In [None]:
# Move 'Date' into the index (in place) so the rows can be resampled by time;
# after this call 'Date' is no longer available as a regular column.
customer_transactions.set_index('Date', inplace=True)

In [None]:
# First and last transaction dates.
# 'Date' was moved into the index by the preceding set_index call, so the range
# is read from the index; customer_transactions['Date'] would raise a KeyError.
date_min = customer_transactions.index.min()
date_max = customer_transactions.index.max()

In [None]:
# Report the period covered by the transactions, one boundary per line.
print(f"Start Date: {date_min}\nEnd Date: {date_max}")

In [None]:
# Daily totals of the numeric columns.
# numeric_only=True keeps the text, datetime and categorical columns (names,
# gender, date of birth, age group) out of the sum; these cannot be added up
# meaningfully and can make the aggregation raise.
resampled_data = customer_transactions.resample('D').sum(numeric_only=True)

In [None]:
# Line chart of the daily transaction amount, with a marker on every day.
daily_dates = resampled_data.index
daily_amounts = resampled_data['Transaction Amount']

plt.figure(figsize=(12, 6))
plt.plot(daily_dates, daily_amounts, marker='o')
plt.xlabel('Date')
plt.ylabel('Transaction Amount')
plt.title('Transaction Activity Over Time')
plt.grid(True)
plt.show()

In [None]:
# Find the day with the largest total transaction amount.
# resampled_data holds daily sums and has no 'TransactionCount' column, so the
# lookup is done on 'Transaction Amount' and the messages say "amount".
daily_amounts = resampled_data['Transaction Amount']
max_transaction_date = daily_amounts.idxmax()
max_transaction_amount = daily_amounts.max()

print("Date with the highest total transaction amount:", max_transaction_date)
print("Highest daily transaction amount:", max_transaction_amount)


In [None]:
#ax = sns.countplot(customer_transactions, x='Gender')

#for bars in ax.containers:
    
      #ax.bar_label(bars)
        
#plt.title('Distribution of Gender')

#plt.ylabel('Gender Count')

#plt.show()

In [None]:
#gender_counts = customer_transactions['Gender'].value_counts()

# Create a pie chart
#plt.figure(figsize=(6, 6))
#plt.pie(gender_counts, labels=gender_counts.index, autopct='%1.1f%%', startangle=90)

# Add a title and legend
#plt.title('Percentage of Gender Column')
#plt.legend(gender_counts.index, loc='upper right', bbox_to_anchor=(1.2, 1))

# Show the pie chart
#plt.axis('equal')  # Equal aspect ratio ensures that pie is drawn as a circle.
#plt.show()

In [None]:
# Convert the 'Date of Birth' column to datetime
customer_transactions['Date of Birth'] = pd.to_datetime(customer_transactions['Date of Birth'])

# Calculate age in completed years as of today.
# Dividing the elapsed days by 365 ignores leap days and overstates the age of
# customers close to a birthday, so the age is the year difference, minus one
# if this year's birthday has not happened yet.
# The earlier `pd.DataFrame(customer_transactions)` re-wrap was dropped because
# the object is already a DataFrame.
current_date = datetime.now()
birth_dates = customer_transactions['Date of Birth']
birthday_passed = (birth_dates.dt.month < current_date.month) | (
    (birth_dates.dt.month == current_date.month) & (birth_dates.dt.day <= current_date.day)
)
customer_transactions['Age'] = (
    current_date.year - birth_dates.dt.year - (~birthday_passed).astype(int)
)

customer_transactions

In [None]:
# Define age bins and labels.
# pd.cut is left at its default right=True, so each bin is (lower, upper]:
# (20, 25] holds ages 21-25, which matches its label. With right=False the bins
# were [20, 25), [25, 30), ... and an age of 25 was filed under '26-30'.
age_bins = [20, 25, 30, 35, 40, 45, 50, 55, 60, 65, 70, 75, 80, 85, 90, 100]

# The last bin spans (90, 100], so it is labelled '91-100' rather than '91-95'.
age_labels = ['21-25', '26-30', '31-35', '36-40', '41-45', '46-50', '51-55', '56-60', '61-65', '66-70', '71-75', '76-80', '81-85', '86-90', '91-100']

# Apply age grouping
customer_transactions['AgeGroup'] = pd.cut(customer_transactions['Age'], bins=age_bins, labels=age_labels)

# Histogram of the number of rows per age group.
# NOTE(review): kde=True smooths over a categorical axis here; confirm the
# density curve is wanted for grouped ages.
plt.figure(figsize=(10, 8))

sns.histplot(data=customer_transactions, x='AgeGroup', kde=True)

plt.title('Age Distribution')

plt.xlabel('Age Group')

plt.ylabel('Frequency')

plt.show()

In [None]:
# Convert 'Birthdate' to datetime
#customer_transactions['Date of Birth'] = pd.to_datetime(customer_transactions['Date of Birth'])

# Calculate age
#current_date = pd.Timestamp.now()

#customer_transactions['Age'] = (current_date - customer_transactions['Date of Birth']).dt.days // 365  # Calculate age in years

# Define age bins and labels
#age_bins = [20, 25, 30, 35, 40, 45, 50, 55, 60, 65, 70, 75, 80, 85, 90, 100]

#age_labels = ['21-25', '26-30', '31-35', '36-40', '41-45', '46-50', '51-55', '56-60', '61-65', '66-70', '71-75', '76-80', '81-85', '86-90', '91-95']

# Apply age grouping
#customer_transactions['AgeGroup'] = pd.cut(customer_transactions['Age'], bins=age_bins, labels=age_labels, right=False)

# Histogram
#plt.figure(figsize=(10, 8))

#sns.histplot(data=customer_transactions, x='AgeGroup', kde=True)

#plt.title('Age Distribution')

#plt.xlabel('Age Group')

#plt.ylabel('Frequency')

#plt.show()

In [None]:
# Count the transactions for every (category, age) pair.
# Rows of the table are categories and columns are individual ages.
cross_tab = pd.crosstab(customer_transactions['Category'], customer_transactions['Age'])

# Create a heatmap to visualize the cross-tabulation
plt.figure(figsize=(15, 8))

sns.heatmap(cross_tab, annot=True, fmt='d', cmap='YlGnBu', cbar=True)

# The table uses the raw 'Age' column, so the title names Age, not Age Group.
plt.title('Bivariate Analysis: Age vs. Category')

# A heatmap draws the table's columns along x and its rows along y, so x is
# Age and y is Category (the labels were previously swapped).
plt.xlabel('Age')

plt.ylabel('Category')

plt.show()