# Foreseeing Variable Problems When Building ML Models

In [None]:
import numpy as np
import pandas as pd
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats as stats

# display all the columns of a dataframe instead of truncating wide output
pd.set_option('display.max_columns', None)

## Identifying numerical and categorical variables

In [None]:
def get_first_cabin(row):
    """Return the first whitespace-separated token of a cabin value.

    For example, ``'C23 C25 C27'`` yields ``'C23'``.

    Parameters
    ----------
    row : object
        A single value of the cabin column; expected to be a string, but
        missing values (e.g. a ``NaN`` float or ``None``) are tolerated.

    Returns
    -------
    str or float
        The first token of ``row``, or ``np.nan`` when ``row`` has no
        ``split`` method or contains no token at all.
    """
    try:
        return row.split()[0]
    except (AttributeError, IndexError):
        # AttributeError: the value is not a string (NaN float, None, ...).
        # IndexError: the string is empty or whitespace-only.
        # Anything else (e.g. KeyboardInterrupt) is left to propagate.
        return np.nan

In [None]:
# download the dataset as CSV from OpenML
data = pd.read_csv('https://www.openml.org/data/get_csv/16826755/phpMYEkMl')
# '?' entries are treated as missing values and converted to NaN
data = data.replace('?', np.nan)
# keep only the first token of each cabin entry (NaN when there is none)
data['cabin'] = data['cabin'].apply(get_first_cabin)
# store a local copy that the following cells reload
# NOTE(review): assumes the data/ folder already exists - confirm
data.to_csv('data/titanic.csv', index=False)

In [None]:
data.head()

In [None]:
data.shape

In [None]:
data = pd.read_csv('data/titanic.csv')
data.dtypes

In [None]:
data['sibsp'].unique()

In [None]:
data['fare'].unique()[0:20]

In [None]:
data.nunique()

In [None]:
data['embarked'].unique()

In [None]:
# inspect unique values - mixed variable
data['cabin'].unique()[0:20]

In [None]:
data['sibsp'].hist(bins=20)
plt.show()

In [None]:
data['fare'].hist(bins=50)
plt.show()

In [None]:
# bar plot for a categorical variable: number of passengers per
# category of 'embarked'
data['embarked'].value_counts().plot.bar()
plt.xticks(rotation=0)  # keep the category labels horizontal
plt.ylabel('Number of passengers')
plt.title('embarked - port')
plt.show()

## Quantifying missing data

- Visit the following website: https://archive.ics.uci.edu/ml/machine-learning-databases/kddcup98-mld/epsilon_mirror/.
- Click the `cup98lrn.zip` link to begin the download.
- Unzip the file and save `cup98LRN.txt` in a `data` folder next to this notebook; the recipes load it from `data/cup98LRN.txt`.

In [None]:
cols = ['AGE', 'NUMCHLD', 'INCOME', 'WEALTH1', 'MBCRAFT', 'MBGARDEN', 'MBBOOKS', 'MBCOLECT', 'MAGFAML','MAGFEM', 'MAGMALE']
# load the dataset
data = pd.read_csv('data/cup98LRN.txt', usecols=cols)

# let's inspect the first 5 rows
data.head()

In [None]:
data.shape

In [None]:
data.isnull().sum()

In [None]:
data.isnull().mean() * 100

In [None]:
# bar plot of the percentage of missing values per variable;
# mean() of the boolean mask is a fraction between 0 and 1, so it is
# scaled by 100 to match the 'Percentage' label on the y axis
(data.isnull().mean() * 100).plot.bar(figsize=(8,4))
plt.ylabel('Percentage of missing values')
plt.xlabel('Variables')
plt.title('Quantifying missing data')
plt.show()

## Determining cardinality in categorical variables

**The number of unique categories in a variable is called cardinality.**

In [None]:
# categorical variables to load from the dataset (each column listed once)
cols = ['GENDER', 'RFA_2', 'MDMAUD_A', 'DOMAIN', 'RFA_15']
# read only the selected columns
data = pd.read_csv('data/cup98LRN.txt', usecols=cols)
data.head()

In [None]:
data = data.replace(' ', np.nan)
data.head()

In [None]:
data.nunique()

In [None]:
data['GENDER'].unique()

In [None]:
data.nunique().plot.bar(figsize=(8,4))
plt.ylabel('Number of unique categories')
plt.xlabel('Variables')
plt.title('Cardinality')
plt.show()

## Pinpointing rare categories in categorical variables

In [None]:
data = pd.read_csv('data/car.data', header=None)
data.columns = ['buying', 'maint', 'doors', 'persons', 'lug_boot', 'safety', 'class']
data.head()

In [None]:
data['class'].unique()

In [None]:
label_freq = data['class'].value_counts() / len(data) * 100
print(label_freq)

In [None]:
# bar plot of the category frequencies, most frequent first; despite its
# name, `fig` holds the object returned by plot.bar() (a matplotlib Axes),
# which is then used to draw the line and set the labels
fig = label_freq.sort_values(ascending=False).plot.bar()
# horizontal reference line at 5 %
# NOTE(review): presumably the threshold below which a category counts as rare - confirm
fig.axhline(y=5, color='red')
fig.set_ylabel('percentage of cars within each category')
fig.set_xlabel('Variable: class')
fig.set_title('Identifying Rare Categories')
plt.show()

## Identifying a linear relationship

In [None]:
from sklearn.linear_model import LinearRegression

In [None]:
np.random.seed(29)
x = np.random.randn(200)

In [None]:
y = x * 10 + np.random.randn(200) * 2

In [None]:
# put the two simulated variables side by side as columns 'x' and 'y'
data = pd.DataFrame({'x': x, 'y': y})

In [None]:
# we used the seaborn lmplot() method, which allows us to plot the data and fit and display a linear model on top of it
sns.lmplot(x="x", y="y", data=data, order=1)
plt.ylabel('Target')
plt.xlabel('Independent variable')
plt.show()

In [None]:
linreg = LinearRegression()
linreg.fit(data['x'].to_frame(), data['y'])

In [None]:
# Make predictions of y using the fitted linear model:
predictions = linreg.predict(data['x'].to_frame())

# Calculate the residuals, that is, the difference between the predictions and the real outcome, y:
residuals = data['y'] - predictions

# Make a scatter plot of the independent variable x and the residuals:
plt.scatter(y=residuals, x=data['x'])
plt.ylabel('Residuals')
plt.xlabel('Independent variable x')
plt.show()

In [None]:
sns.histplot(residuals, bins=30, kde=True)
plt.xlabel('Residuals')
plt.show()

### Example: Boston House price data

In [None]:
# load the Boston House price data from its original source (CMU StatLib)
data_url = "http://lib.stat.cmu.edu/datasets/boston"
# raw string so that the regex \s+ (any run of whitespace) reaches pandas
# untouched instead of being parsed as an invalid string escape;
# NOTE(review): skiprows=22 presumably skips the descriptive header - confirm
raw_df = pd.read_csv(data_url, sep=r"\s+", skiprows=22, header=None)
# every observation is spread over two consecutive rows of the raw file:
# all columns of the even rows plus the first two columns of the odd rows
# are the features ...
data = np.hstack([raw_df.values[::2, :], raw_df.values[1::2, :2]])
# ... and the third column of the odd rows is the target
target = raw_df.values[1::2, 2]
feature_names = ['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD', 'TAX', 'PTRATIO', 'B', 'LSTAT']
boston = pd.DataFrame(data, columns=feature_names)
# add the target
boston['MEDV'] = target
# store a local copy
# NOTE(review): assumes the data/ folder already exists - confirm
boston.to_csv("data/boston.csv", index=False)
boston.head()

In [None]:
# we plot the variable LSTAT (% lower status of the population)
# vs the target MEDV (median value of the house);
# order=1 fits and draws a straight line on top of the scatter
sns.lmplot(x="LSTAT", y="MEDV", data=boston, order=1)
plt.show()

In [None]:
# now we plot CRIM (per capita crime rate by town)
# vs the target MEDV (median value of the house)

sns.lmplot(x="CRIM", y="MEDV", data=boston, order=1)
plt.show()

In [None]:
# now we do the same for the variable LSTAT of the Boston
# house price dataset loaded into `boston`

# instantiate the linear model from sklearn
linreg = LinearRegression()

# fit the model (to_frame() turns the single column into the 2-D input that fit() is given here)
linreg.fit(boston['LSTAT'].to_frame(), boston['MEDV'])

# make the predictions
pred = linreg.predict(boston['LSTAT'].to_frame())

# calculate the residuals: real outcome minus prediction
error = boston['MEDV'] - pred

In [None]:
# Residuals plot

# if the relationship is linear, the noise should be
# random, centered around zero, and follow a normal distribution

plt.scatter(y=error, x=boston['LSTAT'])
plt.ylabel('Residuals')
plt.xlabel('LSTAT')
plt.show()

In [None]:
# plot a histogram of the residuals
# they should follow a gaussian distribution
sns.histplot(error, bins=30, kde=True)
plt.show()

## Identifying a normal distribution

**Linear models assume that the independent variables are normally distributed. Failure to
meet this assumption may produce algorithms that perform poorly.**

In [None]:
np.random.seed(29)
x = np.random.randn(200)

In [None]:
data = pd.DataFrame([x]).T
data.columns = ['x']

In [None]:
sns.histplot(data['x'], bins=30, kde=True)
plt.show()

In [None]:
stats.probplot(data['x'], dist="norm", plot=plt)
plt.show()

### Example: Boston House price data

In [None]:
boston.head()

In [None]:
# Q-Q plot of the variable RM from the Boston
# house price dataset against a normal distribution
# RM is the average number of rooms per dwelling

stats.probplot(boston['RM'], dist="norm", plot=plt)
plt.show()

In [None]:
# just for comparison, let's go ahead and plot CRIM
stats.probplot(boston['CRIM'], dist="norm", plot=plt)
plt.show()

In [None]:
sns.histplot(boston['CRIM'], bins=30, kde=True)
plt.show()

## Distinguishing variable distribution

**A probability distribution is a function that describes the likelihood of obtaining the
possible values of a variable.**



In [None]:
boston.hist(bins=30, figsize=(12,12), density=True)
plt.show()

## Highlighting outliers

**An outlier is a data point that is significantly different from the remaining data.**



In [None]:
plt.figure(figsize=(3,6))
sns.boxplot(y=boston['RM'])
plt.title('Boxplot')
plt.show()

In [None]:
def find_boundaries(df, variable, distance):
    """Compute inter-quartile-range limits for one column of a dataframe.

    The limits lie ``distance`` times the inter-quartile range (75th minus
    25th percentile of ``df[variable]``) above the 75th percentile and
    below the 25th percentile.

    Parameters
    ----------
    df : dataframe holding the column.
    variable : label of the column to examine.
    distance : multiplier applied to the inter-quartile range.

    Returns
    -------
    tuple
        ``(upper_boundary, lower_boundary)`` - note the upper limit is first.
    """
    column = df[variable]
    first_quartile = column.quantile(0.25)
    third_quartile = column.quantile(0.75)
    # width of the band added on each side of the quartiles
    margin = (third_quartile - first_quartile) * distance
    return third_quartile + margin, first_quartile - margin

In [None]:
upper_boundary, lower_boundary = find_boundaries(boston, 'RM', 1.5)
upper_boundary, lower_boundary

In [None]:
# boolean numpy array: True where RM lies above the upper or below the lower boundary
outliers = ((boston['RM'] > upper_boundary) | (boston['RM'] < lower_boundary)).to_numpy()

In [None]:
outliers_df = boston.loc[outliers, 'RM']
outliers_df.head()

## Comparing feature magnitude

In [None]:
boston.describe()

In [None]:
boston.max() - boston.min()