# Chapter 10: How to make predictions with a linear regression model

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

## Get the data

In [None]:
# Path to the housing CSV, relative to the notebook's working directory.
housingPath = 'housingData.csv'
# Load the file into the DataFrame that the following cells explore.
housing = pd.read_csv(housingPath)

In [None]:
# Count how many rows fall under each distinct value of the floors column.
vc = housing['floors'].value_counts()
# Show what kind of object value_counts() handed back.
type(vc)

In [None]:
# Build a one-column DataFrame from the floors column.
NEW_Df = pd.DataFrame({"floors":housing.floors})

In [None]:
# Two-column frame: each distinct floors value and the number of rows having it.
vc_df = pd.DataFrame({
    "floor_values": vc.index,
    "row_count": vc.values,
})
# Summary statistics of the per-value row counts.
vc_df['row_count'].describe()

In [None]:
# A bare `pd.` is an incomplete expression and raises SyntaxError on Run All;
# show the installed pandas version instead so the cell executes cleanly.
pd.__version__

In [None]:
# Show the type of the object returned by read_csv.
type(housing)

In [None]:
# Summary statistics, transposed so each data column becomes one row.
housing.describe().T

In [None]:
# One histogram per numeric column. Non-numeric columns are skipped:
# binning a column of strings gives no meaningful distribution and is slow
# when the column has many distinct values.
for c in housing.select_dtypes(include='number').columns:
    plt.title("Plot of "+c,fontsize=15)
    plt.hist(housing[c],bins=20)
    plt.show()

In [None]:
# (rows, columns) of the frame at this point in the notebook.
housing.shape

In [None]:
# Share of rows with sqft_living >= 8000 or price >= 1000000.
housing.query('sqft_living >= 8000 or price >= 1000000').shape[0] / housing.shape[0]

In [None]:
# Share of rows that pass the filter: sqft_living < 8000 and 0 < price < 1000000.
housing.query('sqft_living < 8000 and price < 1000000 and price > 0').shape[0] / housing.shape[0]

In [None]:
# Summary statistics (one row per column) for the rows where floors equals 3.5.
housing.query('floors == 3.5').describe().T

In [None]:
# Price distribution for each floors value, using only rows that pass the
# size/price filter.
ax = sns.boxplot(
    data=housing.query('sqft_living < 8000 and price < 1000000 and price > 0'),
    x='floors',
    y='price',
)

In [None]:
# Keep only rows with sqft_living < 8000 and 0 < price < 1000000.
# .copy() makes the result an independent frame, so adding columns to it in
# later cells does not raise SettingWithCopyWarning.
housing = housing.query('sqft_living < 8000 and price < 1000000 and price > 0').copy()
housing.shape

In [None]:
#non-lambda
def nonlambda(variable) -> bool:
    """Return True when `variable` is greater than zero, otherwise False.

    The comparison already yields the truth value, so no conditional
    expression is needed; bool() keeps the result a plain Python bool.
    """
    return bool(variable > 0)

# Flag each row by running the named function over every sqft_basement value.
housing['has_basement'] = housing['sqft_basement'].apply(nonlambda)
housing.head(10)

In [None]:
# Same flag, written with an inline lambda instead of a named function.
housing['has_basement'] = housing['sqft_basement'].apply(lambda x: bool(x > 0))
housing.head(10)

In [None]:
# Price distribution with and without a basement, using only rows that pass
# the size/price filter.
ax = sns.boxplot(
    data=housing.query('sqft_living < 8000 and price < 1000000 and price > 0'),
    x='has_basement',
    y='price',
)

In [None]:
# Distribution of sqft_living in 20 bins.
housing.sqft_living.hist(bins=20)

In [None]:
# Column names, dtypes and non-null counts before any columns are dropped.
housing.info()

In [None]:
# Remove the columns that later cells do not use.
# errors='ignore' keeps the cell re-runnable: on a second run the columns are
# already gone, and the default errors='raise' would stop with a KeyError.
housing = housing.drop(
    columns=['date', 'street', 'city', 'statezip', 'country',
             'sqft_lot', 'yr_renovated', 'sqft_basement'],
    errors='ignore',
)

## The Housing dataset

In [None]:
# Column names, dtypes and non-null counts of the frame used from here on.
housing.info()

In [None]:
# Preview the first rows.
housing.head()

## How to identify correlations with a scatterplot

In [None]:
# Scatterplot of price against sqft_living, coloured by the floors value.
sns.relplot(
    data=housing,
    x='sqft_living',
    y='price',
    hue="floors",
)

In [None]:
# Scatterplot of bathrooms against sqft_living.
sns.relplot(
    data=housing,
    x='sqft_living',
    y='bathrooms',
)

## How to identify correlations with a grid of scatterplots

In [None]:
# Grid of scatterplots for every pairing of price, sqft_living and sqft_above.
sns.pairplot(
    data=housing,
    x_vars=['price', 'sqft_living', 'sqft_above'],
    y_vars=['price', 'sqft_living', 'sqft_above'],
    # diag_kind='kde'
)

## How to identify correlations with r-values

In [None]:
# r = coefficient of correlation
# corr() returns the pairwise correlation between every pair of columns.

housing.corr()

In [None]:
""" 
price	1.000000
sqft_living	0.607379
sqft_above	0.517395
bathrooms	0.460155
bedrooms	0.305046
floors	0.272151
view	0.196460
has_basement	0.152591
waterfront	0.073671
yr_built	0.053012
condition	0.044926
"""

In [None]:
# Columns ranked by the strength (absolute value) of their correlation with condition.
housing.corr()[['condition']].abs().sort_values(by='condition', ascending=False)

In [None]:
# Columns ranked by their correlation with price, strongest positive first.
(
    housing.corr()[['price']]
    .sort_values(by='price', ascending=False)
)

## How to identify correlations with a heatmap

In [None]:
# Heatmap of the full correlation matrix.
sns.heatmap(data=housing.corr(), cmap='viridis')

In [None]:
# Single-column heatmap: each column's correlation with price, sorted from
# highest to lowest and annotated with two decimals.
sns.heatmap(
    data=housing.corr()[['price']].sort_values(by='price', ascending=False),
    annot=True,
    fmt='.2f',
    cmap='Blues',
    cbar=False,
)

## How to create, validate, and use a linear regression model

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression

In [None]:
# r-squared = coefficient of determination; here, the square of r = 0.61
0.61 ** 2

In [None]:
# Column names as a plain Python list.
housing.columns.tolist()

In [None]:
# 95th percentile of price.
housing.price.quantile(0.95)

In [None]:
# Show the documentation for train_test_split. help() is plain Python, so the
# cell also works outside IPython, unlike the `name?` shortcut.
help(train_test_split)

In [None]:
# (rows, columns) of the frame that goes into the train/test split.
housing.shape

In [None]:
# ML CODE LINE 1: split the data
# x - features/influencers/predictors/independent variables
# y - target/response/label/dependent variable
# NOTE(review): 1184050 is a hard-coded price cap whose origin is not shown
# here; rows at or above it are excluded from modelling. Confirm the source.
housing_sample = housing.query("price < 1184050")

# Single-feature variant of the same split, kept for reference:
# x_train, x_test, y_train, y_test = train_test_split(
#     housing_sample[[ 'sqft_living']], housing_sample[['price']],
#     test_size=0.33, random_state=1234)
x_train, x_test, y_train, y_test = train_test_split(
    # training features = "X"
    housing_sample[[
        'bedrooms',
        'bathrooms',
        'sqft_living',
        'floors',
        'waterfront',
        'view',
        'condition',
        'sqft_above',
        'yr_built',
        'has_basement',
    ]],
    # training target = "y", kept as a one-column DataFrame
    housing_sample[['price']],
    # hold out 33% of the rows for testing; the fixed seed makes the split repeatable
    test_size=0.33, random_state=1234)
print(x_train.shape)
print(x_test.shape)
print(y_train.shape)
print(y_test.shape)

In [None]:
# Density curve of every training feature.
x_train.plot.kde()

In [None]:
# Density curve of every test feature, for comparison with the training set.
x_test.plot.kde()

In [None]:

# create the model from the training dataset
linearModel = LinearRegression() # ML CODE LINE 2: instantiate the estimator class for the chosen algorithm
linearModel.fit(x_train, y_train) # ML CODE LINE 3: fit the linear regression to the training features and target

# NOTE(review): the figures below look like results recorded from earlier runs
# with different feature sets; the metric they represent is not shown — confirm.
# 2 var - 0.06796236042762027
# 1 var - 0.06848429889803354
# all var - 0.07475527718481156

In [None]:
y_pred =linearModel.predict(x_train) # ML CODE LINE 4: use the model to generate predictions
# Note: these predictions are for the TRAINING rows (x_train), not the test rows.


In [None]:
#ML CODE LINE 5: getting a metric for the performance of the model
from sklearn.metrics import r2_score
# R squared of the training predictions against the actual training prices.
price_r2 = r2_score(y_train.to_numpy(), y_pred)
print("{} R squared is {}% different than predicted square on TRAINING DATA of R".format(round(price_r2,4),round(100 *(0.3721 - price_r2)/0.3721,2))) # compares 0.3721, the square of r (0.61**2), with the model's R squared
# one variable 0.3693 R squared 

In [None]:
#ML CODE LINE 4a: generate predictions for the held-out test data
#ML CODE LINE 5a: getting a metric for the performance of the model - test data
y_pred_test = linearModel.predict(x_test)
price_r2_test = r2_score(y_test.to_numpy(), y_pred_test)
# Compare 0.3721, the square of r (0.61**2), with the TEST R squared. The test
# score is used in both places so the percentage matches the printed value.
print("{} R squared is {}% different than predicted square on TEST DATA of R".format(
    round(price_r2_test, 4),
    round(100 * (0.3721 - price_r2_test) / 0.3721, 2)))

In [None]:
from sklearn.metrics import mean_absolute_percentage_error
# Mean absolute percentage error of the test predictions (returned as a fraction, not a percent).
mean_absolute_percentage_error(y_test.to_numpy(), y_pred_test)
# 33.48% MAPE

In [None]:
# Display the actual prices of the test rows.
y_test

In [None]:
# Fitted coefficients, in the column order of the training features.
print(linearModel.coef_ )
# Fitted intercept.
print(linearModel.intercept_)

In [None]:
# Relative change from the "1 var" figure to the "all var" figure recorded in the model-fitting cell.
(0.07475527718481156 - 0.06848429889803354)/0.06848429889803354

In [None]:
# use the model to make predictions for the test rows
y_predicted = linearModel.predict(x_test)
y_predicted

In [None]:
# y = m1*x1 + ... + mn*xn + b, i.e. target = sum(coefficient * input) + intercept
# Pair every coefficient with the feature it belongs to. coef_[0][0] alone is
# the coefficient of the FIRST training column, which is only sqft_living when
# the model was fitted on that single feature.
terms = " + ".join(
    "{0} * {1}".format(coef, name)
    for name, coef in zip(x_train.columns, linearModel.coef_[0])
)
print("price = {0} + {1}".format(terms, linearModel.intercept_[0]))

In [None]:
# Manual prediction from sqft_living alone with a hard-coded slope and intercept.
# NOTE(review): these constants presumably come from an earlier single-feature fit; confirm they still match the current model.
housing["price_linreg"] = 156.08006337157076  * housing["sqft_living"] + 162066.9462386825

In [None]:
# Predict with the same feature columns, in the same order, that the model was
# fitted on; passing only sqft_living fails once the model has more features.
# ravel() flattens the (n, 1) prediction array into one value per row.
housing["price_predict"] = linearModel.predict(housing[x_train.columns]).ravel()

In [None]:
# Compare the model's predictions with the hand-computed formula values.
housing.plot.scatter( x= 'price_predict', y = "price_linreg")

In [None]:
# Preview the first rows, including the two prediction columns.
housing.head(5)

In [None]:
# Long format: one row per (sqft_living, price type), stacking the actual
# price and the model prediction into a single value column.
melted = housing.melt(
    id_vars=['sqft_living'],
    value_vars=['price', 'price_predict'],
    var_name='price_type',
    value_name='price_value',
)
melted

## How to plot the predicted data

In [None]:
# Show the test features renumbered from 0; the result is displayed only, x_test itself is unchanged.
x_test.reset_index(drop=True)

In [None]:
# Wrap the test-set predictions in a one-column frame.
predicted = pd.DataFrame(y_predicted, columns=['price_predicted'])

# Renumber the test features and actual prices from 0 so they line up with the
# predictions by position, then join all three side by side.
combined = predicted.join([
    x_test.reset_index(drop=True),
    y_test.reset_index(drop=True),
])

# Long format: stack price and price_predicted into a single value column.
melted = combined.melt(
    id_vars=['sqft_living'],
    value_vars=['price', 'price_predicted'],
    var_name='price_type',
    value_name='price_value',
)

melted.head()

In [None]:
# Actual and predicted prices against sqft_living, coloured by price type.
sns.relplot(
    data=melted,
    x='sqft_living',
    y='price_value',
    hue='price_type',
)

## How to plot the residuals

In [None]:
# Residual = actual price minus predicted price (positive means under-predicted).
combined['residual'] = combined['price'] - combined['price_predicted']
combined.head()

In [None]:
# Absolute percentage error per row, measured against the ACTUAL price. This
# matches the definition behind mean_absolute_percentage_error, so the mean of
# this column is comparable with the MAPE computed for the test data.
combined['abs_pct_err'] = (combined.residual / combined.price).abs()

In [None]:
# Summary statistics of the per-row absolute percentage error.
combined.abs_pct_err.describe()

In [None]:
# Largest residual, i.e. the biggest under-prediction (actual minus predicted).
combined.residual.max()

In [None]:
# Scatterplot of the residuals against sqft_living.
g = sns.relplot(data=combined, x='sqft_living', y='residual')

# draw a dashed red horizontal line at residual = 0 on every axes of the grid
for ax in g.axes.flat:    
    ax.axhline(0, ls='--',color = "red")

In [None]:
# Distribution of sqft_living among the test rows, in 20 bins.
combined.sqft_living.plot.hist(bins=20)

In [None]:
# Cut sqft_living into 20 equal-count (quantile) bins, then draw one
# horizontal box of residuals per bin.
combined["sqft_living_bin"] = pd.qcut(combined['sqft_living'], q=20)
sns.catplot(
    data=combined,
    kind='box',
    x="residual",
    y="sqft_living_bin",
    orient='h',
)

In [None]:
# Cut sqft_above into 20 equal-count (quantile) bins, then draw one
# horizontal box of residuals per bin.
combined["sqft_above_bin"] = pd.qcut(combined['sqft_above'], q=20)
sns.catplot(
    data=combined,
    kind='box',
    x="residual",
    y="sqft_above_bin",
    orient='h',
)

## How to plot a linear regression

In [None]:
# Scatter of price against sqft_living with a fitted straight line in red;
# ci=None turns off the confidence band.
sns.lmplot(
    data=housing,
    x='sqft_living',
    y='price',
    ci=None,
    scatter_kws={'s': 5},
    line_kws={'color': 'red'},
)

In [None]:
!pip install statsmodels

## How to plot a logistic regression

In [None]:
# Distribution of price in 20 bins.
housing.price.hist(bins=20)

In [None]:
# Price at every 5% quantile level from 0.00 to 0.95 (range(20) stops before 1.0).
mort_dist = housing[['price']].quantile([i / 20 for i in range(20)])

# Copy the quantile level (the index) into a regular column so it can be plotted.
mort_dist["quantile"] = mort_dist.index
mort_dist

In [None]:
# Quantile level as a function of price.
mort_dist.plot.line(x="price",y="quantile")

In [None]:
# Logistic regression of the has_basement flag on price, curve drawn in red.
sns.lmplot(
    data=housing,
    x='price',
    y='has_basement',
    logistic=True,
    ci=None,
    scatter_kws={'s': 1},
    line_kws={'color': 'red'},
)

## How to plot a polynomial regression

In [None]:
# get the data
# NOTE(review): unpickling can execute arbitrary code while loading, so only
# open a mortality_data.pkl that comes from a trusted source.
mortality_data = pd.read_pickle('mortality_data.pkl')
mortality_data.head()

In [None]:
# Third-order polynomial fit of DeathRate over Year, one curve per AgeGroup;
# the points are binned into 20 groups along x and drawn as diamonds.
sns.lmplot(
    data=mortality_data,
    x='Year',
    y='DeathRate',
    hue='AgeGroup',
    order=3,
    x_bins=20,
    markers='d',
    ci=None,
)

## How to plot a lowess regression

In [None]:
# LOWESS-smoothed fit of DeathRate over Year, one curve per AgeGroup;
# the points are binned into 20 groups along x and drawn as diamonds.
sns.lmplot(
    data=mortality_data,
    x='Year',
    y='DeathRate',
    hue='AgeGroup',
    lowess=True,
    x_bins=20,
    markers='d',
    ci=None,
)

## How to plot residuals with Seaborn

In [None]:
# Residuals of a linear fit of price on sqft_living, drawn with small markers.
sns.residplot(
    data=housing,
    x='sqft_living',
    y='price',
    scatter_kws={'s': 5},
)

In [None]:
# Residuals of a third-order polynomial fit of DeathRate on Year.
sns.residplot(
    data=mortality_data,
    x='Year',
    y='DeathRate',
    order=3,
    scatter_kws={'s': 5},
)

In [4]:
import os
import pandas as pd
# Working directory of the running kernel.
cwd = os.getcwd()
# The column must be given as a dict of list-likes: `"path":cwd` without braces
# is a SyntaxError, and a bare scalar value would need an explicit index.
dataset = pd.DataFrame({"path": [cwd]})



'c:\\2023-02-DSI-WE\\instructor\\ml'

In [6]:
import os
import pandas as pd
# One-row, one-column frame recording the kernel's working directory.
cwd = os.getcwd()
dataset = pd.DataFrame([cwd], columns=["path"])
dataset

Unnamed: 0,path
0,c:\2023-02-DSI-WE\instructor\ml


In [7]:
# Write the frame to export.csv in the working directory; by default to_csv
# also writes the row index as an unnamed first column.
dataset.to_csv('export.csv')

In [8]:
# Python integers have arbitrary precision, so this does not overflow.
2 ** 64

18446744073709551616