In [59]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import matplotlib as mpl

# Notebook-wide plotting defaults: apply the style sheet, then set the default
# figure size and DPI in a single rcParams update.
mpl.style.use("seaborn-v0_8-deep")
mpl.rcParams.update({"figure.figsize": (14, 5), "figure.dpi": 100})

### Loading the datafile for cleaning and quick view

In [60]:
# Load the raw transactions file (relative path) into the untouched reference frame.
data = pd.read_csv('../data/raw/nigeria_telecom_transactions_messy_actual_cities.csv')

# DataFrame.info() prints its summary itself and returns None, so it is called
# directly; wrapping it in print() would add a stray "None" line to the output.
data.info()
data.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 55000 entries, 0 to 54999
Data columns (total 15 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   Transaction ID       49431 non-null  object 
 1   Customer ID          49515 non-null  object 
 2   Transaction Date     49445 non-null  object 
 3   Operator Name        55000 non-null  object 
 4   Transaction Type     55000 non-null  object 
 5   Transaction Amount   49566 non-null  object 
 6   Customer Age         49602 non-null  float64
 7   Customer Gender      49534 non-null  object 
 8   Customer Location    49584 non-null  object 
 9   Service Plan         55000 non-null  object 
 10  Data Usage (MB)      49533 non-null  object 
 11  Call Duration (min)  49587 non-null  object 
 12  SMS Sent             49467 non-null  float64
 13  Internet Package     55000 non-null  object 
 14  Transaction Status   55000 non-null  object 
dtypes: float64(2), object(13)
memory usa

Unnamed: 0,Transaction ID,Customer ID,Transaction Date,Operator Name,Transaction Type,Transaction Amount,Customer Age,Customer Gender,Customer Location,Service Plan,Data Usage (MB),Call Duration (min),SMS Sent,Internet Package,Transaction Status
0,66d1c0da-4597-4458-b4db-b2e94fe9c69f,f5754858-fe7d-4cc9-a1ff-80ce6d64ca90,,Airtel,Ar1tel,,27.0,,Minna,Postpaid,4826.08,214.72,51.0,Daily,Pending
1,21ca8795-8ad9-47bc-a16b-e4654697ee52,,2023-10-29T02:48:20.227058,9mobile,Bill Payment,4804.1,39.0,Other,Kaduna,Postpaid,3440.0,234.3,13.0,Weekly,Failed
2,ea6478bf-6a8c-4d91-bd63-b4464ebc5ae0,3052574b-ae63-4e67-8d86-5f8418ef6f66,2024-04-03T02:22:20.359918,9mobile,Bill Payment,19303.03,,male,Kaduna,Prepaid,,0.9,0.0,Weekly,Completed
3,,15ad55e7-2c9e-4a84-9b9d-9e7d7747a177,2024-02-07T00:22:30.827156,9mobil3,Bill Payment,4094.0,,Male,Owerri,Prepaid,2250.0,$16.38,20.0,Daily,Completed
4,a1165572-b0f4-42e2-9890-eb16ef0b8741,a4adc3b5-2b36-4ba5-b7d9-1a103bfb247a,2008-10-31 21:12:40,Glo,Data Purchase,,36.0,female,Sokoto,Prepaid,3307.97,$92.61,58.0,Weekly,Failed


In [61]:
# Work on an independent deep copy so the raw `data` frame stays available for
# before/after comparisons once cleaning is done
df = data.copy(deep=True)

### Checking each Feature for errors in entry and fixing the datatype if need be

**Reusable Number of Missing Values Function**

In [62]:
def missing_values(column_name, frame=None):
    """
    Report how many missing values a column contains.

    Parameters
    ----------
    column_name : str
        Label of the column to inspect.
    frame : pandas.DataFrame, optional
        DataFrame to inspect. When omitted, the notebook-level ``df`` is used
        (looked up at call time), which keeps the original one-argument usage working.

    Returns
    -------
    str
        Message of the form
        "Number of Missing values in the <column_name> column: <count>".

    Raises
    ------
    KeyError
        If ``column_name`` is not a column of the inspected DataFrame.
    """
    # Fall back to the global working frame only when no frame is supplied
    target = df if frame is None else frame
    missing_count = target[column_name].isnull().sum()
    return f"Number of Missing values in the {column_name} column: {missing_count}"

In [63]:
# Share of missing entries per column, expressed as a percentage
df.isnull().mean().mul(100)

Transaction ID         10.125455
Customer ID             9.972727
Transaction Date       10.100000
Operator Name           0.000000
Transaction Type        0.000000
Transaction Amount      9.880000
Customer Age            9.814545
Customer Gender         9.938182
Customer Location       9.847273
Service Plan            0.000000
Data Usage (MB)         9.940000
Call Duration (min)     9.841818
SMS Sent               10.060000
Internet Package        0.000000
Transaction Status      0.000000
dtype: float64

Ten of the fifteen columns contain missing values, and each of those columns is missing roughly 10% of its entries.
The decision to either remove the missing values or impute them will be made later.

**Transaction Date Column**

In [65]:
# Peek at a random handful of raw values to see which formats occur in the column
df.loc[:, 'Transaction Date'].sample(n=8)

5730     2024-01-03T12:32:32.828789
32737           1982-05-02 22:07:21
23329    2024-06-23T22:31:54.610206
11877    2024-03-01T12:35:45.459189
4544                            NaN
34258    2024-04-15T06:41:26.782709
29593    2024-07-10T10:52:22.314394
23391    2024-05-21T10:27:27.460447
Name: Transaction Date, dtype: object

In [66]:
# Normalise the timestamp text (ISO 'T' separator -> space, everything after the
# first '.' dropped, i.e. the fractional seconds) and parse the result to datetime
df['Transaction Date'] = (
    df['Transaction Date']
    .str.replace('T', ' ', regex=False)
    .str.split('.')
    .str.get(0)
    .pipe(pd.to_datetime)
)


In [67]:
# Spot-check a few parsed timestamps
df.loc[:, 'Transaction Date'].sample(n=5)

34013   2024-02-16 21:05:50
22279   2023-11-25 16:27:58
4537    1980-02-08 02:51:49
15606   2024-02-07 15:04:44
35451   2024-02-13 11:31:54
Name: Transaction Date, dtype: datetime64[ns]

In [68]:
# The conversion must neither create nor lose missing values: the NaT count after
# parsing has to equal the NaN count of the untouched raw column.
num_nulls_original = data['Transaction Date'].isnull().sum()
num_nulls_modified = df['Transaction Date'].isnull().sum()
assert num_nulls_original == num_nulls_modified, "The number of null values in 'Transaction Date' must be the same before and after modifications."

# A datetime64 column carries no textual format, so comparing its strftime() output
# against the Timestamps themselves checks nothing useful. Instead verify that the
# column really is datetime-typed and that no sub-second component survived.
assert pd.api.types.is_datetime64_any_dtype(df['Transaction Date']), "'Transaction Date' was not converted to a datetime dtype"
assert (df['Transaction Date'].dropna() == df['Transaction Date'].dropna().dt.floor('s')).all(), "Not all non-NaT dates follow the '%Y-%m-%d %H:%M:%S' format"

**Operator Name Column**

- Getting the types of values in the column

In [70]:
# Frequency of each distinct operator label (reveals misspelled variants)
df.loc[:, 'Operator Name'].value_counts()

Operator Name
MTN        12560
Airtel     12340
9mobile    12322
Glo        12284
Ar1tel      1413
G1o         1397
9mobil3     1387
MNT         1297
Name: count, dtype: int64

- Correcting the spelling errors

In [71]:
# (misspelled label, canonical operator name) pairs
correction_map = dict([
    ('Ar1tel', 'Airtel'),
    ('G1o', 'Glo'),
    ('9mobil3', '9mobile'),
    ('MNT', 'MTN'),
])

# Swap every misspelled label for its canonical name; other values are left untouched
df['Operator Name'] = df['Operator Name'].replace(to_replace=correction_map)

In [72]:
# Re-count the operator labels after the corrections
df.loc[:, 'Operator Name'].value_counts()

Operator Name
MTN        13857
Airtel     13753
9mobile    13709
Glo        13681
Name: count, dtype: int64

**Transaction Type Column**

In [74]:
# Frequency of each distinct transaction type label
df.loc[:, 'Transaction Type'].value_counts()

Transaction Type
Bill Payment        16548
Airtime Purchase    16536
Data Purchase       16455
9mobil3              1430
Ar1tel               1345
G1o                  1343
MNT                  1343
Name: count, dtype: int64

It seems there is a data-entry error where (misspelled) operator names were recorded as the transaction type.
We will have to remove these rows (entries), since there is no way to know how to map them to actual transaction types.

In [75]:
# Drop the rows whose transaction type is actually a (misspelled) operator name.
# .copy() makes the filtered result an independent frame, so the column assignment
# below does not operate on a slice of the previous frame (SettingWithCopyWarning).
df = df[~df['Transaction Type'].isin(['9mobil3', 'Ar1tel', 'G1o', 'MNT'])].copy()

# Only a few distinct labels remain, so store the column as a categorical
df['Transaction Type'] = df['Transaction Type'].astype('category')

**Transaction Amount**

In [77]:
# Random look at the raw amount strings
df.loc[:, 'Transaction Amount'].sample(n=10)

20699        16913
47355    $19126.68
32931      14369.0
23278        15447
26871          NaN
42978    $17145.78
1292      10145.79
54630     10146.44
25930    $13292.14
37902      $580.22
Name: Transaction Amount, dtype: object

We can see that some of the entries have $ signs in front and possibly whitespaces.
So we have to deal with those first before converting the column to float

In [78]:
# Strip surrounding whitespace and the currency sign, then convert to float.
# regex=False is required for a literal match: '$' is the end-of-string anchor in a
# regular expression, and the default of `regex` differs between pandas versions.
df['Transaction Amount'] = (
    df['Transaction Amount']
    .str.strip()
    .str.replace('$', '', regex=False)
    .astype('float')
)

In [79]:
# Making sure we did not create more missing values when cleaning the column:
# compare against the NaN count of the same rows in the untouched raw data instead
# of a hard-coded number, so the check stays valid if the input file or the
# earlier row filtering changes. (df keeps the original index labels of `data`.)
assert df['Transaction Amount'].isna().sum() == data.loc[df.index, 'Transaction Amount'].isna().sum(), 'There are more missing values than expected'

**Customer Age**

In [81]:
# Random look at the age values
df.loc[:, 'Customer Age'].sample(n=10)

12170    30.0
39914    28.0
51274    56.0
52260    61.0
22901    43.0
23341    48.0
71       43.0
35830    18.0
46214    31.0
10867    31.0
Name: Customer Age, dtype: float64

In [82]:
# Summary statistics, to check whether there are outliers that might be entry errors
df.loc[:, 'Customer Age'].describe()

count    44705.000000
mean        44.062141
std         15.297023
min         18.000000
25%         31.000000
50%         44.000000
75%         57.000000
max         70.000000
Name: Customer Age, dtype: float64

The min, max and mean ages are well within the expected range, so there appear to be no outliers or entry errors in the age column.

**Customer Gender**

In [84]:
# Frequency of each distinct gender label (shows the capitalisation variants)
df.loc[:, 'Customer Gender'].value_counts()

Customer Gender
male      8979
Female    8978
Other     8965
female    8867
Male      8812
Name: count, dtype: int64

It appears that there are spelling inconsistencies in the gender column and also a gender type "Other". The "Other" entries are probably people who did not want to indicate their gender, so we will leave them as is and only correct the spelling inconsistency.

In [90]:
# Map each capitalised label to its lower-case spelling so every gender has one form
gender_map = {label: label.lower() for label in ('Male', 'Female', 'Other')}
df['Customer Gender'] = df['Customer Gender'].replace(to_replace=gender_map, regex=False)

np.int64(4938)

Index(['Transaction ID', 'Customer ID', 'Transaction Date', 'Operator Name',
       'Transaction Type', 'Transaction Amount', 'Customer Age',
       'Customer Gender', 'Customer Location', 'Service Plan',
       'Data Usage (MB)', 'Call Duration (min)', 'SMS Sent',
       'Internet Package', 'Transaction Status'],
      dtype='object')