# Foreseeing Variable Problems When Building ML Models

In [None]:
import numpy as np
import pandas as pd
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats as stats

# Show every column when a DataFrame is displayed (None removes the column limit)
pd.set_option('display.max_columns', None)

## Identifying numerical and categorical variables

In [None]:
# Load the listings CSV into a DataFrame (path is relative to the working directory)
file_name = "data/boston_listings.csv"
airbnb_boston = pd.read_csv(file_name)

In [None]:
# Preview the first rows of the listings table
airbnb_boston.head()

In [None]:
# (number of rows, number of columns)
airbnb_boston.shape

In [None]:
# Print a concise summary of the DataFrame (columns, non-null counts, dtypes)
airbnb_boston.info()

In [None]:
# dtype of every column
airbnb_boston.dtypes

In [None]:
# How many columns there are of each dtype
airbnb_boston.dtypes.value_counts()

In [None]:
# Distinct values taken by 'bedrooms'
airbnb_boston['bedrooms'].unique()

In [None]:
# First 20 distinct raw values of 'price'
airbnb_boston['price'].unique()[0:20]

In [None]:
# Number of distinct values per column
airbnb_boston.nunique()

In [None]:
# Distinct values taken by 'host_response_time'
airbnb_boston['host_response_time'].unique()

In [None]:
# Histogram of 'number_of_reviews' using 20 bins
airbnb_boston['number_of_reviews'].hist(bins=20)
plt.show()

In [None]:
# Bar chart of how many rows carry each 'bedrooms' value
bedroom_counts = airbnb_boston['bedrooms'].value_counts()
bedroom_counts.plot(kind='bar')
plt.show()

In [None]:
airbnb_boston['price'][:5] # the problem is the dollar sign, which we have to remove

In [None]:
# Turn the raw 'price' / 'cleaning_fee' strings into floats by stripping the
# "$" and "," characters. regex=False is passed explicitly because "$" is a
# regex metacharacter and the default of `regex` changed between pandas 1.x
# and 2.0; relying on the default is version-dependent (and pandas 1.2-1.5
# emit a FutureWarning for it).
airbnb_boston['price_usd'] = (
    airbnb_boston['price']
    .str.replace("$", "", regex=False)
    .str.replace(",", "", regex=False)
    .astype("float")
)
airbnb_boston['cleaning_fee_usd'] = (
    airbnb_boston['cleaning_fee']
    .str.replace("$", "", regex=False)
    .str.replace(",", "", regex=False)
    .astype("float")
)
# The raw string columns are replaced by their numeric *_usd counterparts.
# NOTE: because of this in-place drop the cell cannot be re-run without reloading the data.
airbnb_boston.drop(columns=["price", "cleaning_fee"], inplace=True)
# Distribution of the cleaned price
sns.histplot(airbnb_boston['price_usd'], bins=50)
plt.show()

In [None]:
# Bar chart for a categorical variable: row count per 'host_response_time' value
response_counts = airbnb_boston['host_response_time'].value_counts()
ax = response_counts.plot.bar(rot=0)  # rot=0 keeps the tick labels horizontal
ax.set_ylabel('Number of hosts')
ax.set_title('Response time')
plt.show()

## Quantifying missing data

In [None]:
# Number of missing values per column
airbnb_boston.isnull().sum()

In [None]:
# Percentage of missing values per column (mean of the boolean mask, times 100)
airbnb_boston.isnull().mean() * 100

In [None]:
# Bar chart of missing data per column. mean() of the boolean mask is a
# fraction in [0, 1], so it is scaled by 100 to match the 'Percentage' y-label.
(airbnb_boston.isnull().mean() * 100).plot.bar(figsize=(8,4))
plt.ylabel('Percentage of missing values')
plt.xlabel('Variables')
plt.title('Quantifying missing data')
plt.show()

In [None]:
# Drop the listed columns in place, then preview the result.
# NOTE: the in-place drop means this cell raises if re-run without reloading the data.
columns_to_remove = ["id", "name", "summary", "access", "interaction", "host_id", "host_verifications", "license"]
airbnb_boston.drop(columns=columns_to_remove, inplace=True)
airbnb_boston.head()

## Determining cardinality in categorical variables

In [None]:
# Cardinality: number of distinct values per column
airbnb_boston.nunique()

In [None]:
# Distinct labels of 'property_type'
airbnb_boston['property_type'].unique()

In [None]:
# Bar chart of the number of distinct values in each column
distinct_counts = airbnb_boston.nunique()
ax = distinct_counts.plot.bar(figsize=(8,4))
ax.set_ylabel('Number of unique categories')
ax.set_xlabel('Variables')
ax.set_title('Cardinality')
plt.show()

## Pinpointing rare categories in categorical variables

In [None]:
# Distinct labels of 'property_type', the variable inspected for rare categories
airbnb_boston['property_type'].unique()

In [None]:
# Share of rows (in %) held by each 'property_type' label, relative to all rows
category_counts = airbnb_boston['property_type'].value_counts()
label_freq = category_counts.div(len(airbnb_boston)).mul(100)
print(label_freq)

In [None]:
# Bar chart of the label frequencies, most frequent first.
# NOTE: despite its name, `fig` holds the Axes returned by plot.bar().
fig = label_freq.sort_values(ascending=False).plot.bar()
# Red reference line at 5 %
fig.axhline(y=5, color='red')
fig.set(
    ylabel='percentage of each category',
    xlabel='Variable: property_type',
    title='Identifying Rare Categories',
)
plt.show()

## Identifying a linear relationship

In [None]:
from sklearn.linear_model import LinearRegression

In [None]:
# Fix the NumPy seed for reproducibility, then draw 200 standard-normal samples
np.random.seed(29)
x = np.random.randn(200)

In [None]:
# y depends linearly on x (slope 10) plus standard-normal noise scaled by 2
y = x * 10 + np.random.randn(200) * 2

In [None]:
# Two-column frame holding the simulated predictor 'x' and target 'y'
data = pd.DataFrame({'x': x, 'y': y})

In [None]:
# seaborn's lmplot() plots the data and overlays a fitted regression line; order=1 requests a first-degree (linear) fit
sns.lmplot(x="x", y="y", data=data, order=1)
plt.ylabel('Target')
plt.xlabel('Independent variable')
plt.show()

In [None]:
# Fit a linear regression of y on x; to_frame() turns the 'x' Series into the one-column DataFrame passed as the feature matrix
linreg = LinearRegression()
linreg.fit(data['x'].to_frame(), data['y'])

In [None]:
# Predict y from x with the fitted linear model
predictions = linreg.predict(data[['x']])

# Residuals: the real outcome y minus the model's prediction
residuals = data['y'].sub(predictions)

# Scatter the residuals against the independent variable x
plt.scatter(data['x'], residuals)
plt.xlabel('Independent variable x')
plt.ylabel('Residuals')
plt.show()

In [None]:
# Histogram of the residuals (30 bins) with a KDE curve overlaid
sns.histplot(residuals, bins=30, kde=True)
plt.xlabel('Residuals')
plt.show()

### Example: Boston Airbnb listings data

In [None]:
# we plot the variable 'bedrooms' against the target 'price_usd'
# with a first-order (linear) fit drawn on top
sns.lmplot(x="bedrooms", y="price_usd", data=airbnb_boston, order=1)
plt.show()

In [None]:
# Same linear-fit plot for 'latitude' against 'price_usd'
sns.lmplot(x="latitude", y="price_usd", data=airbnb_boston, order=1)
plt.show()

## Identifying a normal distribution

In [None]:
# Fix the NumPy seed for reproducibility, then draw 200 standard-normal samples
np.random.seed(29)
x = np.random.randn(200)

In [None]:
# Single-column frame holding the simulated variable 'x'
data = pd.DataFrame({'x': x})

In [None]:
# Histogram of x (30 bins) with a KDE curve overlaid
sns.histplot(data['x'], bins=30, kde=True)
plt.show()

In [None]:
# Probability plot of x against the theoretical normal distribution
stats.probplot(data['x'], dist="norm", plot=plt)
plt.show()

### Example: Boston Airbnb listings data

In [None]:
# Probability plot of 'latitude' against the theoretical normal distribution
stats.probplot(airbnb_boston['latitude'], dist="norm", plot=plt)
plt.show()

In [None]:
# Probability plot of 'number_of_reviews' against the theoretical normal distribution
stats.probplot(airbnb_boston['number_of_reviews'], dist="norm", plot=plt)
plt.show()

In [None]:
# Histogram of 'number_of_reviews' (30 bins) with a KDE curve overlaid
sns.histplot(airbnb_boston['number_of_reviews'], bins=30, kde=True)
plt.show()

## Distinguishing variable distribution

In [None]:
# One histogram per column plotted by DataFrame.hist (30 bins each); density=True normalizes the bar heights
airbnb_boston.hist(bins=30, figsize=(12,12), density=True)
plt.show()

## Highlighting outliers

In [None]:
# Vertical boxplot of 'reviews_per_month' on a narrow (3x6) figure
plt.figure(figsize=(3,6))
sns.boxplot(y=airbnb_boston['reviews_per_month'])
plt.title('Boxplot')
plt.show()

In [None]:
def find_boundaries(df, variable, distance):
    """Return the IQR-based outlier limits for one column.

    The limits lie ``distance`` inter-quartile ranges outside the first and
    third quartiles of ``df[variable]``.

    Args:
        df: DataFrame that holds the column.
        variable: Name of the column to analyse.
        distance: Multiplier applied to the inter-quartile range.

    Returns:
        A ``(upper_boundary, lower_boundary)`` tuple; note the upper limit
        comes first.
    """
    column = df[variable]
    first_quartile = column.quantile(0.25)
    third_quartile = column.quantile(0.75)
    margin = (third_quartile - first_quartile) * distance
    upper = third_quartile + margin
    lower = first_quartile - margin
    return upper, lower

In [None]:
# Outlier limits for 'price_usd' at 3 x IQR; the bare tuple on the last line displays both values in the notebook
upper_boundary, lower_boundary = find_boundaries(airbnb_boston, 'price_usd', 3)
upper_boundary, lower_boundary

In [None]:
# Boolean array: True where 'price_usd' lies above the upper or below the lower boundary
outliers = (
    airbnb_boston['price_usd'].gt(upper_boundary)
    | airbnb_boston['price_usd'].lt(lower_boundary)
).to_numpy()

In [None]:
# Select the 'price_usd' values flagged by the mask.
# NOTE: despite the `_df` suffix, selecting a single column with .loc yields a Series.
outliers_df = airbnb_boston.loc[outliers, 'price_usd']
outliers_df.head()

## Comparing feature magnitude

In [None]:
# Summary statistics, used here to compare the magnitudes of the features
airbnb_boston.describe()

In [None]:
# Value range (max - min) of every non-object column
non_object = airbnb_boston.select_dtypes(exclude="object")
non_object.max() - non_object.min()