In [None]:
# Importing libraries

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Regression modelling
from sklearn import linear_model
from sklearn.metrics import mean_squared_error, r2_score
import statsmodels.api as sm

### Simple linear regression
In the first part, we will see how to perform linear regression with a single predictor variable. The task is to predict the market value of a property, using its home size as the only predictor.

In [None]:
# Load the real-estate dataset; the CSV file uses ';' as its field separator
house_data = pd.read_csv('estate.csv', delimiter=';')

# Preview the first rows to check that the file was parsed as expected
house_data.head()

In [None]:
# Visual check of the relationship between the predictor and the target.
# The predictor (Home_size) is placed on the x-axis and the target
# (Market_Value) on the y-axis, so the plot has the same orientation as the
# regression fitted in this notebook. A roughly straight point cloud supports
# the use of a linear model.
# The trailing ';' suppresses the textual repr of the returned JointGrid.

sns.jointplot(x='Home_size', y='Market_Value', data=house_data);

In [None]:
# scikit-learn takes the predictors and the target as two separate objects:
# the predictor(s) as a DataFrame (hence the double brackets) and the target
# as a Series

house_X = house_data[['Home_size']]
house_y = house_data['Market_Value']

# Instantiate the linear regression estimator and fit it to the data in one
# step; .fit() returns the fitted estimator itself

regr = linear_model.LinearRegression().fit(house_X, house_y)

# The fitted parameters are exposed as the attributes intercept_ and coef_;
# print the intercept first, then the coefficient array, one per line
print(regr.intercept_, regr.coef_, sep='\n')

In [None]:
# In-sample predictions: apply the fitted model to the same predictor data
house_y_pred = regr.predict(house_X)

# Put the first five observed values next to the corresponding predictions
# (cast to integers for readability) to see how close individual predictions
# are to the original values

print(house_y.head())
print(pd.Series(house_y_pred).head().astype('int'))

In [None]:
# Mean squared error of the in-sample predictions. Least-squares regression
# minimises exactly this quantity, so no other linear model fitted on the same
# predictor and data can reach a lower value

house_mse = mean_squared_error(house_y, house_y_pred)
print(house_mse)

# Coefficient of determination (R^2): a value of 1 means perfect prediction

house_r2 = r2_score(house_y, house_y_pred)
print(house_r2)

In [None]:
# To get the results of the statistical tests it is more convenient to use
# the statsmodels library. A constant column is added to the predictors
# explicitly with add_constant so that the model contains an intercept.

X2 = sm.add_constant(house_X)
est = sm.OLS(endog=house_y, exog=X2)
est2 = est.fit()

# Text report of the fitted model
print(est2.summary())

In [None]:
# Creating the residual plot, i.e. the difference between the original values
# and the predictions, plotted against the predictor

residuals = house_y - house_y_pred

# x and y are passed by keyword: recent seaborn releases no longer accept them
# as positional arguments.
ax = sns.scatterplot(x=house_data['Home_size'], y=residuals)

# Reference line at zero makes it easier to see whether the residuals are
# scattered evenly around it.
ax.axhline(0, color='grey', linestyle='--')

# `residuals` inherits the name of house_y, so label the y-axis explicitly;
# otherwise it would be labelled with the target's column name.
ax.set(xlabel='Home_size', ylabel='Residual');

In [None]:
# Residual histogram with a kernel density estimate on top.
# A roughly bell-shaped, symmetric distribution centred on zero is what we
# expect if the residuals are approximately normally distributed.
# histplot replaces the deprecated distplot; stat='density' keeps the
# normalised y-axis that distplot used when drawing the density curve.

sns.histplot(residuals, kde=True, stat='density');

### Multiple linear regression
In this section we will look at how to build regression models with multiple predictor variables.

In [None]:
# Load the bank dataset: fields are separated by ';' and numbers use ','
# as the decimal mark
bank_data = pd.read_csv('bank.csv', sep=';', decimal=',')

# Preview the first rows
bank_data.head()

In [None]:
# The scikit-learn workflow is the same as for simple regression; the only
# difference is that the predictor DataFrame now has several columns

# Predictors: the first five columns of the dataset; target: Balance
predictor_cols = bank_data.columns[:5]
bank_X = bank_data[predictor_cols]
bank_y = bank_data['Balance']

# Instantiate the estimator and fit it on the full dataset
# (.fit() returns the fitted estimator)
regr_bank = linear_model.LinearRegression().fit(bank_X, bank_y)

# Intercept first, then one coefficient per predictor column
print(regr_bank.intercept_, regr_bank.coef_, sep='\n')

In [None]:
# The same model with statsmodels, built as in the simple regression case.
# The summary gives more detail than scikit-learn; in particular it makes it
# easy to spot predictors whose coefficients are not significant.

X2 = sm.add_constant(bank_X)
est_b = sm.OLS(endog=bank_y, exog=X2)
est2_b = est_b.fit()

print(est2_b.summary())

In [None]:
# Try to build a model without 'Home Value', which the notebook identified as
# not significant in the previous summary. Compare the fit statistics with the
# full model: the notebook's observation is that the quality stays the same.

X2 = sm.add_constant(bank_X.drop(columns=['Home Value']))
est_b = sm.OLS(endog=bank_y, exog=X2)
est2_b = est_b.fit()

print(est2_b.summary())

In [None]:
# Pairwise (Pearson) correlation between the predictors.
# The notebook's observation: the correlation between Wealth and Income is
# very high, which suggests that one of the two could be removed.

bank_X.corr(method='pearson')

In [None]:
# Reduced model: drop 'Home Value' and, of the correlated pair Wealth/Income,
# drop 'Wealth' (Income is kept)

X2 = sm.add_constant(bank_X.drop(columns=['Home Value', 'Wealth']))
est_b = sm.OLS(endog=bank_y, exog=X2)
est2_b = est_b.fit()

print(est2_b.summary())

### Regression with categorical variables

In [None]:
# Load the salary dataset: fields are separated by ';' and numbers use ','
# as the decimal mark
salary_data = pd.read_csv('salary.csv', sep=';', decimal=',')

# Preview the first rows
salary_data.head()

In [None]:
# One-hot encode the categorical MBA column into a 0-1 indicator column.
# drop_first=True drops the first category level, so only the indicator
# column(s) for the remaining level(s) are kept and prefixed with 'MBA_'.
# dtype=int forces numeric 0/1 values: recent pandas versions otherwise create
# boolean columns, and a predictor frame that mixes bool and numeric columns
# is turned into an object array that statsmodels' OLS rejects.
# NOTE: this cell replaces salary_data and consumes the 'MBA' column, so it
# must be run exactly once after loading the data.

salary_data = pd.get_dummies(
    salary_data,
    columns=['MBA'],
    drop_first=True,
    prefix='MBA',
    dtype=int,
)

salary_data.head()

In [None]:
# Build the model with the same steps as before, using Age and the encoded
# MBA indicator as predictors and Salary as the target

salary_X = salary_data[['Age', 'MBA_Yes']]
salary_y = salary_data['Salary']

X2 = sm.add_constant(salary_X)
est_s = sm.OLS(endog=salary_y, exog=X2)
est2_s = est_s.fit()

print(est2_s.summary())