In [None]:
## Import our Modules
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
import seaborn as sns

In [None]:
## Load in Our Data
# Read the raw CSV into a DataFrame; the path is relative to the notebook's working directory.
df = pd.read_csv("AER_mess.csv")

In [None]:
## Inspect the Data
# Preview the first five rows to spot obvious formatting problems
df.head(n=5)

In [None]:
# Summary statistics (count, mean, std, quartiles) for the columns pandas parsed as numeric
df.describe()

In [None]:
# Column dtypes: an `object` dtype on a column that should be numeric signals non-numeric entries
df.dtypes

In [None]:
# A lone "." appears to be the missing-value placeholder in `months` (TODO confirm against
# the raw file). Swap it for NaN so the column can later be parsed as numeric.
# The pattern is a raw string (a plain '\.' is an invalid escape sequence) and is anchored:
# with a non-string replacement pandas replaces the WHOLE cell whenever the regex matches
# anywhere in it, so an unanchored pattern would also wipe legitimate decimals like "12.5".
df['months'] = df['months'].replace(r'^\s*\.\s*$', np.nan, regex=True)

In [None]:
# Parse the cleaned column as numbers; NaN entries stay NaN
df['months'] = pd.to_numeric(df['months'])

In [None]:
# Re-check the column now that it is numeric; `count` excludes NaN entries
df['months'].describe()

In [None]:
# Distribution of `months`; its shape informs the choice between mean and median as fill value
plt.hist(x=df['months'])
plt.show()

In [None]:
## Fill in the missing data. The mean is a reasonable fill value when the histogram looks symmetrical:
# df.fillna(df.mean(numeric_only=True))

## The median is more robust when the distribution looks skewed, so use it for `months`
df['months'] = df['months'].fillna(value=df['months'].median())

In [None]:
# Confirm the fill: `count` should now cover every row of `months`
df['months'].describe()

In [None]:
## One Dimensional Stats
# Frequency of each distinct value in `card`
df['card'].value_counts()

In [None]:
# Frequency of each distinct value in `owner`
df['owner'].value_counts()

In [None]:
# Frequency of each distinct value in `selfemp`
df['selfemp'].value_counts()

In [None]:
## Some `selfemp` entries appear to carry extra characters after "no".
## Collapse anything matching "no" + trailing characters back to plain "no" so the column is uniform
df['selfemp'] = df['selfemp'].replace(to_replace='no.+', value='no', regex=True)

In [None]:
## Looks better: re-count the categories after the cleanup
df['selfemp'].value_counts()

In [None]:
## One last look at the summary statistics after cleaning. This looks fine
df.describe()

In [None]:
## Two and Multi-Dimensional Stats
## Correlations give a good idea of how the variables move together. Good way to notice any patterns
# Restrict to numeric columns first: since pandas 2.0 `DataFrame.corr` no longer drops
# non-numeric columns silently and raises on string columns such as `card` and `selfemp`.
# `select_dtypes` gives the same table on older pandas versions as well.
df.select_dtypes(include='number').corr()

In [None]:
# Pairwise plots of the main numeric variables; corner=True draws only the lower triangle
sns.pairplot(
    data=df,
    x_vars=['reports', 'age', 'income', 'expenditure', 'dependents'],
    y_vars=['reports', 'age', 'income', 'expenditure', 'dependents'],
    corner=True,
)
plt.show()

In [None]:
## Transforming Data

## We may want to create new factors from existing data. Here: annual credit-card expenditure
## expressed as a percentage of a person's total income
# NOTE(review): the scaling assumes `expenditure` is monthly and `income` is in units of
# 10,000 -- confirm against the data dictionary
df['exp_income_ratio'] = (
    (df['expenditure'] * 12)    # 12 months of expenditure
    / (df['income'] * 10000)    # income rescaled by 10,000
    * 100                       # fraction -> percent
)

In [None]:
# Sanity-check the range of the new ratio (it is expressed in percent)
df['exp_income_ratio'].describe()

In [None]:
## Standardizing (z-scoring) rescales a variable to mean 0 and standard deviation 1.
## It changes the scale only; the shape of the distribution stays the same
df['norm_income'] = df['income'].sub(df['income'].mean()).div(df['income'].std(ddof=1))

In [None]:
## A natural-log transform is another popular choice for highly skewed data
df['ln_income'] = np.log(df['income'])

In [None]:
# Histogram of raw `income`
plt.hist(x=df['income'])
plt.show()

In [None]:
# Histogram of standardized income (`norm_income`)
plt.hist(x=df['norm_income'])
plt.show()

In [None]:
# Histogram of log-transformed income (`ln_income`)
plt.hist(x=df['ln_income'])
plt.show()

In [None]:
## Some models only take numerical values as inputs. We often need to transform categorical variables
## into 1/0 to be used in these models
# dtype=int forces 1/0 indicator columns; pandas 2.0+ would otherwise return True/False
pd.get_dummies(df['card'], dtype=int)

In [None]:
# `approved` is 1 where `card` is 'yes' and 0 otherwise.
# dtype=int keeps the indicator numeric; pandas 2.0+ defaults to boolean dummy columns.
df['approved'] = pd.get_dummies(df['card'], dtype=int)['yes']

In [None]:
## Sometimes we may want to make categorical variables from numeric ones
## This turns income into different brackets: (0, 5] -> low, (5, 13] -> middle, above 13 -> upper
df['bracket'] = pd.cut(
    x=df['income'],
    bins=[0, 5, 13, np.inf],
    labels=['low', 'middle', 'upper'],
)

In [None]:
# Number of rows that fall in each income bracket
df['bracket'].value_counts()

In [None]:
## Principal Component Analysis
features = ['reports', 'age', 'income', 'expenditure', 'dependents', 'months', 'majorcards', 'active']
# Pull the selected columns out as a plain NumPy array and standardize each one
# (zero mean, unit variance) so that no feature dominates purely through its scale
x = StandardScaler().fit_transform(df[features].to_numpy())

In [None]:
# Project the standardized features onto their first two principal components
pca = PCA(n_components=2)
principalComponents = pca.fit_transform(x)
principalDf = pd.DataFrame(
    principalComponents,
    columns=['principal component 1', 'principal component 2'],
)

In [None]:
# Display the two-component projection (one row per row of `x`)
principalDf