### Importing the dependencies!

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
%matplotlib inline

ModuleNotFoundError: No module named 'sklearn'

### Importing the dataset-

In [None]:
# Load the UCI credit-card default dataset; the CSV is read from the working directory.
CC_data = pd.read_csv(filepath_or_buffer='UCI_Credit_Card.csv')

### Exploring the data-

In [None]:
# Print column names, non-null counts and dtypes to check for missing values.
CC_data.info()

    - 25 columns
    - No null/missing values
    - Target variable => default.payment.next.month (Column 25) 

In [None]:
# Summary statistics for every column except the first one (ID),
# transposed so that each feature is shown as a row.
CC_data.iloc[:, 1:].describe().transpose()

### Takeaways: 
    - 30,000 rows in the dataset. Target variable has two values- 0, and 1.
    - Age varies between 21 and 79
### Questions to answer:
    - Does age play a role in credit card default?
    - Do more men default than women, or is the gender split the same among defaulters?
    - Does education lead to fewer defaults?
    - Prediction of future defaults based on given data.

In [None]:
# Peek at the first five rows.
CC_data.head(n=5)

In [None]:
# Peek at the last five rows.
CC_data.tail(n=5)

### Visited the website to read more about the dataset:
    - Gender: 1-Male, and 2-Female
    - LIMIT_BAL: Amount of given credit in NT dollars
    - Education: 1-Graduate School, 2-University, 3-High School, 4-Others, 5, and 6- Unknown
    - Marriage: 1- Married, 2-Single, 3-Others
    - Pay_X (Repayment status): -1-Pay Duly, 1-Payment delay for one month, 2- Delay for two months, ..., 9-Payment delay for 9 months
    - Bill_AMTX: Bill amount generated
    - Pay_AMTX: Amount of previous payment
    - Default payment: 1-Yes, and 0-No   

### Cleaning the dataset

In [None]:
# Frequency of each EDUCATION code before cleaning.
CC_data.loc[:, 'EDUCATION'].value_counts()

In [None]:
# EDUCATION uses both 5 and 6 for "unknown" and also contains an undocumented 0.
# Fold 0 and 6 into 5 so that there is a single "unknown" code.
#
# The result is assigned back to the column instead of calling
# replace(..., inplace=True) on the selection CC_data["EDUCATION"]: an inplace
# method on a column selection is chained assignment, which pandas warns about
# and which does not modify the DataFrame once Copy-on-Write is active.
CC_data['EDUCATION'] = CC_data['EDUCATION'].replace([0, 6], 5)
CC_data['EDUCATION'].value_counts()

In [None]:
# Frequency of each MARRIAGE code before cleaning.
CC_data.loc[:, 'MARRIAGE'].value_counts()

In [None]:
# MARRIAGE contains an undocumented 0; merge it into 3 ("Others").
#
# The result is assigned back to the column instead of calling
# replace(..., inplace=True) on the selection CC_data['MARRIAGE']: an inplace
# method on a column selection is chained assignment, which pandas warns about
# and which does not modify the DataFrame once Copy-on-Write is active.
CC_data['MARRIAGE'] = CC_data['MARRIAGE'].replace(0, 3)
CC_data['MARRIAGE'].value_counts()

In [None]:
# Double-check for missing values: per-column count of nulls.
CC_data.isna().sum()

### Exploratory Data Analysis

In [None]:
# Pairwise relationships between the demographic columns and the target.
sns.pairplot(
    data=CC_data.loc[:, ['SEX', 'EDUCATION', 'MARRIAGE', 'AGE', 'default.payment.next.month']]
)
plt.show()

In [None]:
# Pairwise Pearson correlation between all columns (used to spot multi-collinearity).
CC_data.corr(method='pearson')

    - Checking for multi-collinearity.
    - The bill amounts are highly correlated. This makes some sense! Keeping the data as it is, can use feature engineering to get better results.
    - Rest of the data looks good.

In [None]:
# Rename the first repayment-status column so the series starts at PAY_1,
# and give the target variable a shorter name.
CC_data.rename(
    columns={
        'PAY_0': 'PAY_1',
        'default.payment.next.month': 'default_pay',
    },
    inplace=True,
)

    - Is there any relation between age, and the account balance.
    - The distribution of dataset, how many defaulters.

In [None]:
# Percentage of clients in each target class (0 = no default, 1 = default).
defaut_count = CC_data['default_pay'].value_counts(normalize=True) * 100
defaut_count.plot(kind='bar', figsize=(6, 6), title='Dataset Distribution')
plt.xticks(rotation=0)
plt.ylabel('% of clients')
# The bars sit at positions 0..n-1 in the order of the Series, so enumerate keeps
# every label on its own bar without hard-coding the number of classes.
# Percentages are rounded so floating-point noise does not show up in the label.
for position, percent in enumerate(defaut_count):
    plt.text(position, percent, f'{percent:.2f}%', ha='center', va='bottom')
plt.show()

In [None]:
# LIMIT_BAL against AGE as a line plot.
sns.relplot(x='AGE', y='LIMIT_BAL', kind='line', data=CC_data)
plt.title('Account Balance related to Age')
plt.show()

    - More than 70% of population did not default.
    - The credit limit (LIMIT_BAL, used here as a rough stand-in for income) seems to grow with age, with an exception around age 60.

In [None]:
# Bucket AGE into decade-wide groups. right=True makes every bin (low, high],
# so e.g. age 30 falls into '21-30'.
bins = [20, 30, 40, 50, 60, 70, 80]
names = ['21-30', '31-40', '41-50', '51-60', '61-70', '71-80']
CC_data['Age_distribution'] = pd.cut(x=CC_data['AGE'], bins=bins, labels=names, right=True)

# Clients per age group for each target class.
# value_counts() orders its result by frequency, not by age group, so reindex to
# `names`; otherwise the bars and the annotations below (which follow `names`)
# only line up when the counts happen to decrease with age.
age_0 = CC_data.loc[CC_data['default_pay'] == 0, 'Age_distribution'].value_counts().reindex(names)
age_1 = CC_data.loc[CC_data['default_pay'] == 1, 'Age_distribution'].value_counts().reindex(names)

plt.subplots(figsize=(8, 5))
# The two series are drawn on top of each other (overlaid, not stacked).
plt.bar(names, age_0.values, label='0')
plt.bar(names, age_1.values, label='1')

# Write the client count above each bar.
for counts in (age_0, age_1):
    for group, count in zip(names, counts):
        plt.text(group, count, count, ha='center', va='bottom')

plt.title('Number of clients in each age group')
plt.legend(loc='upper right')
plt.show()


    - The 21-30 age group has the most clients, closely followed by the second age group, 31-40. The number of clients who will default on next month's payment decreases with each older age group. AGE is therefore an important feature for predicting default payment next month.

In [None]:
# LIMIT_BAL per age group, split by default status.
sns.barplot(
    x='Age_distribution',
    y='LIMIT_BAL',
    hue='default_pay',
    data=CC_data,
    errorbar=('ci', 0),
)
plt.legend(loc='upper left')
plt.show()

In [None]:
# AGE per default status, split by marital status.
sns.barplot(x='default_pay', y='AGE', hue='MARRIAGE', data=CC_data)
plt.show()

In [None]:
# Grid of subplots: one row per default status, one column per marital status.
graph = sns.FacetGrid(data=CC_data, col='MARRIAGE', row='default_pay')
# Draw an AGE histogram in every cell of the grid.
graph.map(plt.hist, 'AGE', color='orange')
plt.show()

    - Married clients between 30 and 50 and unmarried clients in the 20-30 age group tend to default on payment, with unmarried clients having a higher probability of default. Hence, including MARRIAGE when estimating the probability of default next month can be useful.

In [None]:
# Grid of subplots: one row per default status, one column per gender.
graph = sns.FacetGrid(data=CC_data, col='SEX', row='default_pay')
# Draw an AGE histogram in every cell of the grid.
graph.map(plt.hist, 'AGE', color='orange')
plt.show()

    - Females in the 20-30 age group have a very high tendency to default on payment, as opposed to males in all age brackets.

In [None]:
plt.figure(figsize=(10, 4))
# Tie every EDUCATION code to its display label explicitly and pass the codes as
# `order`, so the tick labels cannot drift out of sync with the bars if the set
# or order of codes present in the data changes.
education_labels = {
    1: 'School_Grad',
    2: 'University_Grad',
    3: 'High School_Grad',
    4: 'Other',
    5: 'UNKNOWN',
}
edu_count = sns.countplot(
    data=CC_data,
    x='EDUCATION',
    hue='default_pay',
    order=list(education_labels),
)
# Fix the tick positions before naming them.
edu_count.set_xticks(range(len(education_labels)))
edu_count.set_xticklabels(list(education_labels.values()))
plt.show()

    - In absolute counts, university graduates have the most clients who did not default, but also the most clients who did default, compared with people at other education levels. It might mean that university graduates have more credit cards than other people. This relation still has to be checked.

In [None]:
plt.figure(figsize=(12, 4))
sns.set(font_scale=1)
# Kernel density estimate of LIMIT_BAL, drawn once per target class.
sns.kdeplot(
    CC_data.loc[CC_data['default_pay'].eq(0), 'LIMIT_BAL'],
    fill=True,
    label='Did not default',
)
sns.kdeplot(
    CC_data.loc[CC_data['default_pay'].eq(1), 'LIMIT_BAL'],
    fill=True,
    label='Default',
)
# Show plain numbers on the x axis instead of scientific notation.
plt.ticklabel_format(axis='x', style='plain')
plt.ylabel('')
plt.legend()
plt.show()

    - Most of the population has less than 200K credit limit.
    - Most of the defaults also fall within this part of the population (credit limit below 200K).

In [None]:
# Bucket LIMIT_BAL into credit-limit ranges; each bin is (low, high].
CC_data['LimitBin'] = pd.cut(
    x=CC_data['LIMIT_BAL'],
    bins=[5_000, 50_000, 100_000, 150_000, 200_000, 300_000, 400_000, 500_000, 1_100_000],
)

In [None]:
plt.figure(figsize=(14, 4))
sns.set(font_scale=1)
# The bins are matched against the string labels in LimitBin_order below,
# so cast the interval column to str.
CC_data['LimitBin'] = CC_data['LimitBin'].astype('str')

LimitBin_order = ['(5000, 50000]', '(50000, 100000]', '(100000, 150000]', '(150000, 200000]',
                  '(200000, 300000]', '(300000, 400000]', '(400000, 500000]', '(500000, 1100000]']

ax = sns.countplot(data=CC_data, x='LimitBin', hue="default_pay", order=LimitBin_order)

plt.xlabel("Amount of Given Credit", fontsize=12)
plt.ylabel("# of Clients", fontsize=12)
plt.ylim(0, 8000)
ax.tick_params(axis="x", labelsize=9.5)

# Label the bars through their containers rather than by walking ax.patches:
# ax.patches can hold rectangles that are not data bars (e.g. legend proxies in
# newer seaborn releases - NOTE(review): confirm for the installed version), and
# the raw float heights would be printed as "5000.0" instead of "5000".
for container in ax.containers:
    ax.bar_label(container, fmt='%g', padding=3)

plt.show()

In [None]:
plt.figure(figsize=(14, 4))
sns.set(font_scale=1)
# Bar height is the default rate within each credit-limit bin
# (default_pay is 0/1); bins are shown in LimitBin_order.
ax = sns.barplot(
    data=CC_data,
    x="LimitBin",
    y="default_pay",
    order=LimitBin_order,
    errorbar=None,
)

plt.xlabel("Amount of Given Credit", fontsize=12)
plt.ylabel("% of Default", fontsize=12)
plt.ylim(0, 0.4)

# Write the rate, rounded to two decimals, slightly above each bar.
for bar in ax.patches:
    bar_height = bar.get_height()
    ax.annotate(f"{bar_height:.2f}", (bar.get_x() + 0.25, bar_height + 0.03), fontsize=13)

plt.show()

    - The higher the credit limit, the lower the chance of default.
    - Over 30% of users with a credit limit of 50K or less defaulted.

### Data Preparation!

In [None]:
# Normalise every column name to lower case.
CC_data.rename(columns=str.lower, inplace=True)

In [None]:
# Education: 0/1 indicator columns for codes 1-3; every other code is the all-zero case.
CC_data['grad_school'] = CC_data['education'].eq(1).astype(int)
CC_data['university'] = CC_data['education'].eq(2).astype(int)
CC_data['high_school'] = CC_data['education'].eq(3).astype(int)

# Gender: 1 where sex == 1 (male), else 0.
CC_data['male'] = CC_data['sex'].eq(1).astype(int)

# Married: 1 where marriage == 1 (married), else 0.
CC_data['married'] = CC_data['marriage'].eq(1).astype(int)

# The original categorical columns are replaced by the indicators above.
CC_data.drop(columns=['education', 'sex', 'marriage'], inplace=True)

# Repayment status: any value <= 0 means the payment was not delayed,
# so collapse all of those values to 0.
pay_features = [f'pay_{month}' for month in range(1, 7)]
CC_data[pay_features] = CC_data[pay_features].clip(lower=0)

In [None]:
# Inspect the first ten rows after feature engineering.
CC_data.head(n=10)

In [None]:
# Remove the plotting helper columns and the row identifier before modelling.
CC_data.drop(columns=['limitbin', 'age_distribution', 'id'], inplace=True)

In [None]:
# Target vector: the default flag. Feature matrix: every remaining column.
y = CC_data['default_pay']
x = CC_data.drop(columns='default_pay')

In [None]:
# First five rows of the feature matrix.
x.head(n=5)

In [None]:
# First five values of the target vector.
y.head(n=5)