In [31]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns


In [41]:
# Read the interim dataset from its pickle file into a DataFrame.
# NOTE(review): unpickling can execute arbitrary code - only load pickle files from a trusted source.
df = pd.read_pickle(
    filepath_or_buffer='../data/interim/1.0-ji-initial-data-cleaned.pkl'
)

In [42]:
# DataFrame.info() writes its summary directly to stdout and returns None,
# so it is called bare: wrapping it in print() would add a stray "None" line.
df.info()
# Last expression of the cell: rich preview of the first five rows.
df.head()

<class 'pandas.core.frame.DataFrame'>
Index: 36081 entries, 1 to 54998
Data columns (total 15 columns):
 #   Column               Non-Null Count  Dtype         
---  ------               --------------  -----         
 0   Transaction ID       32466 non-null  object        
 1   Customer ID          32503 non-null  object        
 2   Transaction Date     32473 non-null  datetime64[ns]
 3   Operator Name        36081 non-null  category      
 4   Transaction Type     36081 non-null  category      
 5   Transaction Amount   32507 non-null  float64       
 6   Customer Age         32625 non-null  float64       
 7   Customer Gender      32501 non-null  category      
 8   Customer Location    32537 non-null  object        
 9   Service Plan         36081 non-null  category      
 10  Data Usage (MB)      32427 non-null  float64       
 11  Call Duration (min)  32548 non-null  float64       
 12  SMS Sent             32441 non-null  float64       
 13  Internet Package     36081 non-null 

Unnamed: 0,Transaction ID,Customer ID,Transaction Date,Operator Name,Transaction Type,Transaction Amount,Customer Age,Customer Gender,Customer Location,Service Plan,Data Usage (MB),Call Duration (min),SMS Sent,Internet Package,Transaction Status
1,21ca8795-8ad9-47bc-a16b-e4654697ee52,,2023-10-29 02:48:20,9mobile,Bill Payment,4804.1,39.0,other,Kaduna,Postpaid,3440.0,234.3,13.0,Weekly,Failed
2,ea6478bf-6a8c-4d91-bd63-b4464ebc5ae0,3052574b-ae63-4e67-8d86-5f8418ef6f66,2024-04-03 02:22:20,9mobile,Bill Payment,19303.03,,male,Kaduna,Prepaid,,0.9,0.0,Weekly,Completed
3,,15ad55e7-2c9e-4a84-9b9d-9e7d7747a177,2024-02-07 00:22:30,9mobile,Bill Payment,4094.0,,male,Owerri,Prepaid,2250.0,16.38,20.0,Daily,Completed
4,a1165572-b0f4-42e2-9890-eb16ef0b8741,a4adc3b5-2b36-4ba5-b7d9-1a103bfb247a,2008-10-31 21:12:40,Glo,Data Purchase,,36.0,female,Sokoto,Prepaid,3307.97,92.61,58.0,Weekly,Failed
7,07c57156-2d68-433b-92c2-8839db514bc0,,NaT,Glo,Data Purchase,4499.6,49.0,male,Ilorin,Prepaid,3706.69,3.42,59.0,Daily,Pending


**Checking for duplicates in the dataset**

In [20]:
# Number of rows that are exact repeats of an earlier row (the first occurrence is not counted).
df.duplicated(keep='first').sum()

np.int64(3272)

In [21]:
# Preview the duplicated rows: keep=False flags every member of a duplicate
# group, and sorting by date puts identical records next to each other.
(
    df.loc[df.duplicated(keep=False)]
    .sort_values(by='Transaction Date')
    .head()
)

Unnamed: 0,Transaction ID,Customer ID,Transaction Date,Operator Name,Transaction Type,Transaction Amount,Customer Age,Customer Gender,Customer Location,Service Plan,Data Usage (MB),Call Duration (min),SMS Sent,Internet Package,Transaction Status
24877,4c31d765-c7e4-4b50-b3ff-11320291d8dc,07220b76-0cf5-4fab-b850-b81835fd8a37,1970-01-08 18:47:21,Airtel,Data Purchase,6877.0,64.0,female,Makurdi,Prepaid,1580.41,197.24,96.0,Monthly,Failed
10172,4c31d765-c7e4-4b50-b3ff-11320291d8dc,07220b76-0cf5-4fab-b850-b81835fd8a37,1970-01-08 18:47:21,Airtel,Data Purchase,6877.0,64.0,female,Makurdi,Prepaid,1580.41,197.24,96.0,Monthly,Failed
20352,3a934b73-e4bc-476a-8e22-36cf06972dbc,47bd1cee-56ad-4a0e-915b-0c52bfa08d88,1970-01-21 12:02:36,9mobile,Data Purchase,,28.0,female,Awka,Prepaid,1169.26,222.92,31.0,Monthly,Failed
34114,3a934b73-e4bc-476a-8e22-36cf06972dbc,47bd1cee-56ad-4a0e-915b-0c52bfa08d88,1970-01-21 12:02:36,9mobile,Data Purchase,,28.0,female,Awka,Prepaid,1169.26,222.92,31.0,Monthly,Failed
26818,399910a7-4d07-4910-8e1c-01a336b7c131,6895bb13-8211-4faf-9e99-c98472754f58,1970-03-15 10:36:24,9mobile,Data Purchase,16996.59,19.0,female,Aba,Postpaid,4242.63,277.34,47.0,Monthly,Failed


In [22]:
# Remove exact duplicate rows, retaining the first occurrence of each.
# The original index labels are kept (the index is not reset).
df = df.drop_duplicates(keep='first')

In [23]:
# (rows, columns) of the current frame, to confirm how many rows remain.
df.shape

(32809, 15)

**Dropping unnecessary columns**

The `Transaction ID` and `Customer ID` columns do not appear to be valuable for future analytics, so we can drop them.

In [24]:
# Count the distinct non-null values in each identifier column.
df.loc[:, ['Transaction ID', 'Customer ID']].nunique()

Transaction ID    29524
Customer ID       29536
dtype: int64

In [25]:
# Discard the two identifier columns; every other column is kept as is.
df = df.drop(['Transaction ID', 'Customer ID'], axis='columns')

**Dealing with missing values**

In [26]:
# Missing-value count per column (summing the boolean mask down the rows).
df.isna().sum(axis=0)

Transaction Date       3321
Operator Name             0
Transaction Type          0
Transaction Amount     3249
Customer Age           3134
Customer Gender        3259
Customer Location      3247
Service Plan              0
Data Usage (MB)        3310
Call Duration (min)    3231
SMS Sent               3305
Internet Package          0
Transaction Status        0
dtype: int64

The question here is whether to impute the missing values or simply drop the affected rows. I would much rather drop them: we will still have a substantial dataset afterwards, whereas imputing values means creating our own data, which is not commensurate with the real data.

In [27]:
# Listwise deletion: discard every row that has a missing value in any column.
df = df.dropna(axis='index', how='any')

In [28]:
# (rows, columns) of the current frame, to see how much data is left.
df.shape

(14133, 13)

**Getting an overview of the statistical spread of the dataset**

In [29]:
# Summary statistics (count, mean, std, min, quartiles, max).
# With no arguments, describe() reports only the numeric columns here.
df.describe()

Unnamed: 0,Transaction Amount,Customer Age,Data Usage (MB),Call Duration (min),SMS Sent
count,14133.0,14133.0,14133.0,14133.0,14133.0
mean,9998.714838,44.039624,2491.107208,149.450257,49.406495
std,5735.762858,15.277015,1450.120789,86.418473,29.049537
min,101.03,18.0,0.0,0.0,0.0
25%,5014.55,31.0,1222.66,75.0,24.0
50%,9997.5,44.0,2478.16,148.86,49.0
75%,14927.73,57.0,3754.0,224.77,74.0
max,19998.6,70.0,5000.0,300.0,100.0


In [30]:
# Show only the columns stored with the pandas 'category' dtype.
df.select_dtypes(include='category')

0
5
6
9
13
...
36072
36075
36076
36077
36078
