In [1]:
import pandas as pd
import numpy as np

## 1. Downloading the data to dataframes 

In [2]:
df_customer = pd.read_excel('./Datasets/Credit Card Data.xlsx',sheet_name='Customer Details')
df_customer.head()

Unnamed: 0,No,Customer,Age,City,Product,Limit,Company,Segment
0,1,A1,47,BANGALORE,Gold,1500000,C1,Self Employed
1,2,A2,56,CALCUTTA,Silver,300000,C2,Salaried_MNC
2,3,A3,30,COCHIN,Platimum,540000,C3,Salaried_Pvt
3,4,A4,22,BOMBAY,Platimum,840084,C4,Govt
4,5,A5,59,BANGALORE,Platimum,420084,C5,Normal Salary


In [3]:
df_spend = pd.read_excel('./Datasets/Credit Card Data.xlsx',sheet_name='Spend')
df_spend.head()

Unnamed: 0,Sl No:,Customer,Month,Type,Amount
0,1,A1,2004-01-12,JEWELLERY,344054.980813
1,2,A1,2004-01-03,PETRO,935.495203
2,3,A1,2004-01-15,CLOTHES,8687.895474
3,4,A1,2004-01-25,FOOD,341.159711
4,5,A1,2005-01-17,CAMERA,3406.639477


In [4]:
df_repayment = pd.read_excel('./Datasets/Credit Card Data.xlsx',sheet_name='Repayment')
df_repayment.head()

Unnamed: 0,SL No:,Customer,Month,Amount
0,1,A1,2006-05-15,230847.3
1,2,A1,2005-08-27,1835.124
2,3,A1,2004-03-07,4858.701
3,4,A1,2005-03-01,1360527.0
4,5,A1,2004-02-14,190232.2


## 2. Checking for data types and missing values

In [5]:
df_customer.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 8 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   No        100 non-null    int64 
 1   Customer  100 non-null    object
 2   Age       100 non-null    int64 
 3   City      100 non-null    object
 4   Product   100 non-null    object
 5   Limit     100 non-null    int64 
 6   Company   100 non-null    object
 7   Segment   100 non-null    object
dtypes: int64(3), object(5)
memory usage: 6.4+ KB


All the data types looks to be okay for customer

In [6]:
df_customer.isnull().sum()

No          0
Customer    0
Age         0
City        0
Product     0
Limit       0
Company     0
Segment     0
dtype: int64

No missing values found, lets explore other data frames.

In [7]:
df_spend.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1500 entries, 0 to 1499
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype         
---  ------    --------------  -----         
 0   Sl No:    1500 non-null   int64         
 1   Customer  1500 non-null   object        
 2   Month     1500 non-null   datetime64[ns]
 3   Type      1500 non-null   object        
 4   Amount    1500 non-null   float64       
dtypes: datetime64[ns](1), float64(1), int64(1), object(2)
memory usage: 58.7+ KB


All the data types looks to be okay for spend dataframe

In [8]:
df_spend.isnull().sum()

Sl No:      0
Customer    0
Month       0
Type        0
Amount      0
dtype: int64

No missing values found, lets explore repayment.

In [9]:
df_repayment.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 807 entries, 0 to 806
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype         
---  ------    --------------  -----         
 0   SL No:    807 non-null    int64         
 1   Customer  807 non-null    object        
 2   Month     803 non-null    datetime64[ns]
 3   Amount    807 non-null    float64       
dtypes: datetime64[ns](1), float64(1), int64(1), object(1)
memory usage: 25.3+ KB


In [10]:
df_repayment.isnull().sum()

SL No:      0
Customer    0
Month       4
Amount      0
dtype: int64

4 rows have misisng date columns.

In [11]:
df_repayment.isnull().sum()*100/df_repayment.shape[0]

SL No:      0.000000
Customer    0.000000
Month       0.495663
Amount      0.000000
dtype: float64

The number rows with missing values is less than one percent. So lets drop the rows.

In [12]:
df_repayment[df_repayment.Month.isnull()] ## Checking the rows before dropping.

Unnamed: 0,SL No:,Customer,Month,Amount
274,275,A29,NaT,139343.954497
299,300,A30,NaT,75520.69509
418,419,A40,NaT,4203.395579
789,790,A9,NaT,20150.63674


In [13]:
df_repayment = df_repayment[~df_repayment.Month.isnull()] ## Dropping the rows with null values.

### Task 2: 
Age of credit card customer can't be less than 18. In case age is less than 18, replace it
with mean of age.


In [14]:
mask = df_customer.Age<18

3 Customers have the age below 18, and it has to be replaced with mean.

In [15]:
df_customer.loc[mask,'Age'] = round(df_customer.Age.mean())

### Task 3: Data Exploration

a. How many unique customers are there?

In [16]:
print('Number of unique customers are ',df_customer['Customer'].nunique())

Number of unique customers are  100


b. How many spend categories are there?

In [17]:
print("Number of spend categories are ", len(df_spend['Type'].unique()))

Number of spend categories are  15


c. Which category has the highest value spend

In [18]:
df_spend.groupby(['Type'])['Amount'].mean()

Type
AIR TICKET      254632.194082
AUTO             27320.763977
BIKE            210701.269489
BUS TICKET       12681.030717
CAMERA           21499.802175
CAR             409143.467018
CLOTHES          25140.160688
FOOD               341.168321
JEWELLERY       239218.687225
MOVIE TICKET      1875.298919
PETRO              549.483953
RENTAL           13106.511138
SANDALS           2516.628761
SHOPPING          7394.332039
TRAIN TICKET      1627.490364
Name: Amount, dtype: float64

In [19]:
print(df_spend.groupby(['Type'])['Amount'].mean().idxmax()," has the highest spend in the category")

CAR  has the highest spend in the category


d. What is the average monthly spent by product categories?


In [20]:
print('Average monthly spend by product categories')
df_spend.groupby(['Type'])['Amount'].mean()

Average monthly spend by product categories


Type
AIR TICKET      254632.194082
AUTO             27320.763977
BIKE            210701.269489
BUS TICKET       12681.030717
CAMERA           21499.802175
CAR             409143.467018
CLOTHES          25140.160688
FOOD               341.168321
JEWELLERY       239218.687225
MOVIE TICKET      1875.298919
PETRO              549.483953
RENTAL           13106.511138
SANDALS           2516.628761
SHOPPING          7394.332039
TRAIN TICKET      1627.490364
Name: Amount, dtype: float64

e. Which customers are reaching 90% or more of their spending limit?

In [21]:
customer_spend = df_spend.groupby(['Customer'])['Amount'].sum()

In [22]:
df_customer_spend = pd.DataFrame({'Customer': customer_spend.index,'Amount_spend':customer_spend.values})

In [23]:
pd.options.display.float_format = '{:,.2f}'.format
df_customer_spend=pd.merge(df_customer,df_customer_spend,how='inner',on='Customer')
df_customer_spend.head()

Unnamed: 0,No,Customer,Age,City,Product,Limit,Company,Segment,Amount_spend
0,1,A1,47,BANGALORE,Gold,1500000,C1,Self Employed,2202653.18
1,2,A2,56,CALCUTTA,Silver,300000,C2,Salaried_MNC,175463.27
2,3,A3,30,COCHIN,Platimum,540000,C3,Salaried_Pvt,533443.09
3,4,A4,22,BOMBAY,Platimum,840084,C4,Govt,904660.07
4,5,A5,59,BANGALORE,Platimum,420084,C5,Normal Salary,923534.79


In [24]:
df_customer_spend['Spending Limit Percentage']= round(df_customer_spend['Amount_spend']*100/df_customer_spend['Limit'],2)

In [25]:
print("Below customers are spending more than 90% limit:")
list(df_customer_spend[df_customer_spend['Spending Limit Percentage']>90]['Customer'])

Below customers are spending more than 90% limit:


['A1',
 'A3',
 'A4',
 'A5',
 'A6',
 'A7',
 'A8',
 'A9',
 'A11',
 'A12',
 'A13',
 'A14',
 'A15',
 'A16',
 'A17',
 'A18',
 'A19',
 'A20',
 'A21',
 'A22',
 'A23',
 'A24',
 'A25',
 'A26',
 'A27',
 'A28',
 'A30',
 'A31',
 'A33',
 'A35',
 'A36',
 'A37',
 'A38',
 'A39',
 'A40',
 'A41',
 'A42',
 'A44',
 'A45',
 'A46',
 'A47',
 'A48',
 'A49',
 'A50',
 'A51',
 'A52',
 'A53',
 'A54',
 'A56',
 'A57',
 'A58',
 'A59',
 'A60',
 'A61',
 'A62',
 'A69',
 'A70',
 'A71',
 'A96',
 'A99']

f. Which city has the maximum number of spenders, each month? Is there a need to run
campaigns in specific cities?


In [47]:
df_spend['month_name']=pd.DatetimeIndex(df_spend['Month']).month_name()
df_customer_spend.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 100 entries, 0 to 99
Data columns (total 10 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   No                         100 non-null    int64  
 1   Customer                   100 non-null    object 
 2   Age                        100 non-null    int64  
 3   City                       100 non-null    object 
 4   Product                    100 non-null    object 
 5   Limit                      100 non-null    int64  
 6   Company                    100 non-null    object 
 7   Segment                    100 non-null    object 
 8   Amount_spend               100 non-null    float64
 9   Spending Limit Percentage  100 non-null    float64
dtypes: float64(2), int64(3), object(5)
memory usage: 8.6+ KB


In [48]:
df_spend['month_name']

0        January
1        January
2        January
3        January
4        January
          ...   
1495    February
1496       March
1497       March
1498       March
1499       March
Name: month_name, Length: 1500, dtype: object

In [55]:
df_customer_city = pd.merge(df_customer,df_spend,how='inner',on='Customer')

In [56]:
df_customer_city.head()

Unnamed: 0,No,Customer,Age,City,Product,Limit,Company,Segment,Sl No:,Month,Type,Amount,month_s,year_s,month_year,month_name
0,1,A1,47,BANGALORE,Gold,1500000,C1,Self Employed,1,2004-01-12,JEWELLERY,344054.98,January,2004,1_2004,January
1,1,A1,47,BANGALORE,Gold,1500000,C1,Self Employed,2,2004-01-03,PETRO,935.5,January,2004,1_2004,January
2,1,A1,47,BANGALORE,Gold,1500000,C1,Self Employed,3,2004-01-15,CLOTHES,8687.9,January,2004,1_2004,January
3,1,A1,47,BANGALORE,Gold,1500000,C1,Self Employed,4,2004-01-25,FOOD,341.16,January,2004,1_2004,January
4,1,A1,47,BANGALORE,Gold,1500000,C1,Self Employed,5,2005-01-17,CAMERA,3406.64,January,2005,1_2005,January


In [64]:
by_city_spenders = df_customer_city.groupby(['month_name','City'])['Customer'].count()

In [73]:
print('City with maximum spenders by month')
by_city_spenders.unstack().idxmax(axis=1)

City with maximum spenders by month


month_name
April           COCHIN
August        CALCUTTA
December     BANGALORE
February     BANGALORE
January         COCHIN
July         BANGALORE
June          CALCUTTA
March        BANGALORE
May          BANGALORE
November        BOMBAY
October      BANGALORE
September       COCHIN
dtype: object

In [78]:
print('City with minimum spenders')
set(by_city_spenders.unstack().idxmin(axis=1).values)

City with minimum spenders by month


{'CHENNAI', 'DELHI', 'PATNA', 'TRIVANDRUM'}

Above cities have minimum spenders hence would need to run campign on these cities

g. Which age group spends the most?

In [89]:
bins=[0,30,40,50,60,70,80]
age_group = ['18-30','31-40','41-50','51-60','61-70','71-80']
df_customer_spend['age_group'] = pd.cut(x=df_customer_spend['Age'],
                     bins=bins,
                     labels=age_group)
df_customer_spend.sample(10)

Unnamed: 0,No,Customer,Age,City,Product,Limit,Company,Segment,Amount_spend,Spending Limit Percentage,age_group
88,89,A89,74,CALCUTTA,Gold,500000,C12,Govt,69396.31,13.88,71-80
34,35,A35,38,COCHIN,Platimum,600006,C17,Govt,747883.12,124.65,31-40
60,61,A61,63,COCHIN,Gold,1500000,C23,Govt,4113751.63,274.25,61-70
50,51,A51,39,BOMBAY,Platimum,300003,C13,Govt,567223.6,189.07,31-40
45,46,A46,29,PATNA,Silver,600000,C8,Govt,1722370.98,287.06,18-30
16,17,A17,26,BOMBAY,Gold,500000,C17,Self Employed,653450.66,130.69,18-30
4,5,A5,59,BANGALORE,Platimum,420084,C5,Normal Salary,923534.79,219.85,51-60
0,1,A1,47,BANGALORE,Gold,1500000,C1,Self Employed,2202653.18,146.84,41-50
14,15,A15,41,CALCUTTA,Gold,500000,C15,Govt,832219.37,166.44,41-50
44,45,A45,65,COCHIN,Gold,1500000,C7,Salaried_Pvt,1645941.65,109.73,61-70


In [91]:
print("The age group which spends the most is ",df_customer_spend.groupby(['age_group'])['Amount_spend'].sum().idxmax())

The age group which spends the most is  61-70
