# Prediction of Housing Affordability
## Based on Wage Growth and Inflation in the EU
This project implements data engineering, EDA, and predictive modeling as outlined in the project proposal.

In [2]:
# Install the third-party packages used by this notebook into the kernel's environment.
# NOTE(review): versions are unpinned, so a fresh install may pull releases with
# different APIs; xgboost is installed but not imported in the visible cells —
# confirm it is still needed.
%pip install pandas numpy matplotlib seaborn scikit-learn xgboost

Note: you may need to restart the kernel to use updated packages.


## Step 1: Data Engineering
### Load the Data

In [3]:
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix

In [None]:
# Load the dataset into `data`.
# NOTE(review): this URL points at a loan-training CSV (Loan_Data/loan_train.csv),
# which does not look like the inflation / GDP / price-to-income data the later
# cells work with. The frames those cells read (`inflation_rates`,
# `gdp_per_capita`, `price_to_income_ration`) are not created here — confirm the
# intended source files and where they are loaded.
data = pd.read_csv('https://raw.githubusercontent.com/dphi-official/Datasets/master/Loan_Data/loan_train.csv')


In [6]:
# Preview the first rows of each raw dataframe, one labelled section per frame
for heading, frame in (
    ("Inflation Rates: ", inflation_rates),
    ("\nGDP per Capita: ", gdp_per_capita),
    ("\nPrice to Income Ratio: ", price_to_income_ration),
):
    print(heading)
    print(frame.head())

Inflation Rates: 
                         Unnamed: 0 Unnamed: 1 Unnamed: 2 Unnamed: 3  \
0                               NaN        NaN        NaN        NaN   
1                               NaN        NaN        NaN        NaN   
2                               NaN        NaN        NaN        NaN   
3                               NaN        NaN        NaN        NaN   
4  HICP - inflation rate [tec00118]        NaN        NaN        NaN   

  Unnamed: 4  
0        NaN  
1        NaN  
2        NaN  
3        NaN  
4        NaN  

GDP per Capita: 
                        Unnamed: 0 Unnamed: 1 Unnamed: 2 Unnamed: 3 Unnamed: 4
0                              NaN        NaN        NaN        NaN        NaN
1                              NaN        NaN        NaN        NaN        NaN
2                              NaN        NaN        NaN        NaN        NaN
3                              NaN        NaN        NaN        NaN        NaN
4  Real GDP per capita [sdg_08_10]        NaN 

In [7]:
# Report the number of missing values per column for each dataframe
for heading, frame in (
    ("Inflation Rates: ", inflation_rates),
    ("\nGDP per Capita: ", gdp_per_capita),
    ("\nPrice to Income Ratio: ", price_to_income_ration),
):
    print(heading)
    print(frame.isnull().sum())

Inflation Rates: 
Unnamed: 0    14
Unnamed: 1    13
Unnamed: 2    14
Unnamed: 3    14
Unnamed: 4    17
dtype: int64

GDP per Capita: 
Unnamed: 0    15
Unnamed: 1    13
Unnamed: 2    14
Unnamed: 3    14
Unnamed: 4    17
dtype: int64

Price to Income Ratio: 
Unnamed: 0    12
Unnamed: 1    10
Unnamed: 2    12
Unnamed: 3    11
dtype: int64


In [8]:
# Drop missing values
# (dropna() with its default arguments removes every row containing at least one NaN)
inflation_rates = inflation_rates.dropna()
gdp_per_capita = gdp_per_capita.dropna()
price_to_income_ration = price_to_income_ration.dropna()

# Check for the same countries in all dataframes
# NOTE(review): the recorded run of this cell ended in KeyError: 'Country', and the
# earlier previews show columns named 'Unnamed: 0', 'Unnamed: 1', ... — presumably
# the header row has to be parsed/renamed when the files are loaded before a
# 'Country' column exists; confirm against the loading step.
print(inflation_rates['Country'].unique())
print(gdp_per_capita['Country'].unique())
print(price_to_income_ration['Country'].unique())

KeyError: 'Country'

In [None]:
# Drop the countries that are not in all dataframes.
# The shared set is computed once and applied to every frame, so all three
# dataframes (including price_to_income_ration) end up with the same countries.
common_countries = (
    set(inflation_rates['Country'])
    & set(gdp_per_capita['Country'])
    & set(price_to_income_ration['Country'])
)
inflation_rates = inflation_rates[inflation_rates['Country'].isin(common_countries)]
gdp_per_capita = gdp_per_capita[gdp_per_capita['Country'].isin(common_countries)]
price_to_income_ration = price_to_income_ration[
    price_to_income_ration['Country'].isin(common_countries)
]

# Check for missing values
print(inflation_rates.isnull().sum())
print(gdp_per_capita.isnull().sum())
print(price_to_income_ration.isnull().sum())

In [None]:
# Fill missing values with the mean of each numeric column.
# numeric_only=True restricts the means to numeric columns: without it,
# pandas >= 2.0 raises a TypeError when a frame also holds text columns
# (e.g. a country name). Non-numeric columns are left unchanged by fillna
# because the mean Series has no entry for them.
inflation_rates = inflation_rates.fillna(inflation_rates.mean(numeric_only=True))
gdp_per_capita = gdp_per_capita.fillna(gdp_per_capita.mean(numeric_only=True))
price_to_income_ration = price_to_income_ration.fillna(
    price_to_income_ration.mean(numeric_only=True)
)

In [2]:
# Confirm the cleaning is complete: print the per-column NaN count of each frame
for frame in (inflation_rates, gdp_per_capita, price_to_income_ration):
    remaining_nulls = frame.isnull().sum()
    print(remaining_nulls)

In [None]:
# Join the three frames on 'Country' (default inner join), then discard the
# 'Country' key column from the merged result
data = (
    inflation_rates
    .merge(gdp_per_capita, on='Country')
    .merge(price_to_income_ration, on='Country')
    .drop(columns='Country')
)

## Step 2: Exploratory Data Analysis (EDA)

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Pairwise correlations between the columns of the merged frame, drawn as an
# annotated heatmap
combined_data = data
correlation_matrix = combined_data.corr()

fig, ax = plt.subplots()
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', ax=ax)
ax.set_title('Correlation Matrix')
plt.show()

## Step 3: Predictive Modeling

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error

# Placeholder: Define features and target
# Every column except 'Inflation Rate' is used as a feature.
X = data.drop('Inflation Rate', axis=1)
y = data['Inflation Rate']

# Train-test split: 20% held out, fixed seed so the split is reproducible
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train a Random Forest model; seeded so repeated runs give the same forest
model = RandomForestRegressor(random_state=42)
model.fit(X_train, y_train)

# Evaluate the model on the held-out rows
y_pred = model.predict(X_test)
# RMSE is computed as sqrt(MSE): the `squared=False` argument of
# mean_squared_error was deprecated in scikit-learn 1.4 and removed in 1.6,
# so this form works across versions.
print('RMSE:', np.sqrt(mean_squared_error(y_test, y_pred)))
print('R^2 Score:', r2_score(y_test, y_pred))


## Step 4: Visualizations
### Trends and Disparities

In [None]:
# Example: Line plot for trends
# Each series is drawn against 'Year' on one shared axis; the legend label is
# the column name.
fig, ax = plt.subplots(figsize=(10, 6))
for column in ('Inflation Rate', 'GDP per Capita', 'Price to Income Ratio'):
    ax.plot(data['Year'], data[column], label=column)
ax.set_xlabel('Year')
ax.set_ylabel('Value')
ax.set_title('Trends')
ax.legend()

plt.show()

# Example: Scatter plot for relationships
# One point per row: GDP per capita on x, inflation rate on y.
fig, ax = plt.subplots(figsize=(10, 6))
ax.scatter(data['GDP per Capita'], data['Inflation Rate'])
ax.set(
    xlabel='GDP per Capita',
    ylabel='Inflation Rate',
    title='Relationship between GDP per Capita and Inflation Rate',
)
plt.show()


# Example: Bar plot for comparison
# One bar per year showing the inflation rate.
fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(data=data, x='Year', y='Inflation Rate', ax=ax)
ax.set(xlabel='Year', ylabel='Inflation Rate', title='Inflation Rate by Year')
plt.show()

# Example: Histogram for distribution
# Histogram of the inflation rate with a kernel density estimate overlaid.
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(data['Inflation Rate'], kde=True, ax=ax)
ax.set(xlabel='Inflation Rate', ylabel='Frequency', title='Inflation Rate Distribution')
plt.show()

# Example: Box plot for comparison
# One box per year summarising the inflation-rate values.
fig, ax = plt.subplots(figsize=(10, 6))
sns.boxplot(data=data, x='Year', y='Inflation Rate', ax=ax)
ax.set(xlabel='Year', ylabel='Inflation Rate', title='Inflation Rate by Year')
plt.show()

# Example: Violin plot for comparison
# One violin per year showing the inflation-rate values.
fig, ax = plt.subplots(figsize=(10, 6))
sns.violinplot(data=data, x='Year', y='Inflation Rate', ax=ax)
ax.set(xlabel='Year', ylabel='Inflation Rate', title='Inflation Rate by Year')
plt.show()