# King County House Price Prediction

## Setup

### Import Dependencies

In [None]:
import warnings
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error, root_mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

warnings.filterwarnings('ignore')

### Load Data

In [None]:
# Build the path from its components so it resolves on Windows, macOS and
# Linux alike (a backslash-separated literal is only a valid path on Windows).
# NOTE(review): the file name contains spaces after the underscores — confirm
# it matches the file on disk.
path = Path("..") / "dataset" / "king_ country_ houses_aa.csv"

# Keep the raw frame untouched and do all further work on a copy
data = pd.read_csv(path)
df = data.copy()

df.head()

### Column rename

In [None]:
# Print the column names
print(df.columns)

In [None]:
# Normalise every header: trim surrounding whitespace, lower-case it and
# replace inner spaces with underscores.
new_cols = [name.strip().lower().replace(" ", "_") for name in df.columns]

# Apply the cleaned headers, then display them to confirm the change
df.columns = new_cols
df.columns

In [None]:
# Data dictionary: maps each (renamed, lower-case) column name to a short
# human-readable description.
# NOTE(review): in the public King County data dictionary 'view' is usually
# documented as a 0-4 rating of the property's view, not a visit count —
# confirm and update the text if so.
description = {
    'id': 'A unique identifier for a house',
    'date': 'The date on which the house was sold',
    'price': 'The sale price of the house (prediction target)',
    'bedrooms': 'Number of bedrooms in the house',
    'bathrooms': 'Number of bathrooms in the house, per bedroom',
    'sqft_living': 'Square footage of the interior living space',
    'sqft_lot': 'Square footage of the land space',
    'floors': 'Number of floors (levels) in the house',
    'waterfront': 'Whether the house has a waterfront view',
    'view': 'Number of times the house has been viewed',
    'condition': 'The overall condition of the house',
    'grade': 'The overall grade given to the house, based on the King County grading system',
    'sqft_above': 'Square footage of the house apart from the basement',
    'sqft_basement': 'Square footage of the basement',
    'yr_built': 'The year the house was built',
    'yr_renovated': 'The year the house was renovated',
    'zipcode': 'ZIP code area',
    'lat': 'Latitude coordinate',
    'long': 'Longitude coordinate',
    'sqft_living15': 'The interior living space for the nearest 15 neighbors in 2015',
    'sqft_lot15': 'The land spaces for the nearest 15 neighbors in 2015'
}


def all_cols_description():
    """Return the full column-name -> description mapping.

    The module-level ``description`` dict itself is returned (not a copy).
    """
    return description


# Backward-compatible alias: the original, misspelled name is kept so that
# existing calls continue to work.
all_cols_descrition = all_cols_description


def col_description(col_name):
    """Print the description of a single column.

    Parameters
    ----------
    col_name : str
        A key of ``description`` (a column name after renaming).

    Raises
    ------
    KeyError
        If ``col_name`` is not a known column; the message lists the valid names.
    """
    try:
        text = description[col_name]
    except KeyError:
        known = ", ".join(description)
        # `from None` hides the bare lookup error so only the helpful message is shown
        raise KeyError(f"Unknown column {col_name!r}. Known columns: {known}") from None
    print(text)

In [None]:
col_description('sqft_lot15')

## Data Overview

In [None]:
df.head()

In [None]:
df.shape

In [None]:
df.dtypes

In [None]:
df.info()

In [None]:
# Store the sale date as a generic Python-object column and show the dtype.
# NOTE(review): a text column read from CSV is typically already `object`,
# so this cast may be a no-op — confirm whether datetime parsing was intended.
df['date'] = df['date'].astype('object')
print(df['date'].dtypes)

### Duplicate records

In [None]:
def duplicate_records(datafame):
    """Report how many fully duplicated rows a DataFrame contains.

    Counts the rows flagged by ``DataFrame.duplicated()`` and prints a
    one-line summary. Nothing is returned.
    """
    n_dupes = datafame.duplicated().sum()

    if n_dupes != 0:
        print(f"The dataset has duplicate records: {n_dupes}")
        return

    print(f"No duplicate records found: {n_dupes}")

duplicate_records(df)

### Missing values

In [None]:
def missing_values(dataframe):
    """Print whether the DataFrame has any missing values and return the counts.

    Returns
    -------
    pandas.Series
        Number of NaN/None entries per column (returned in both cases).
    """
    per_column = dataframe.isna().sum()

    if per_column.sum() == 0:
        message = "No missing values found in the dataset"
    else:
        message = "Missing values found in the dataset"

    print(message)
    return per_column
    
missing_values(df)

In [None]:
df.describe()

Question: What can we remove for the first baseline model?
- We have already checked the data: it has no missing values and no duplicate records.

- However, the ``id`` and ``date`` columns can be dropped for our first baseline model.

After baseline model
- Column ``bedrooms`` has a maximum value of 33, but 75% of the houses have 4 bedrooms or fewer.

- Treat columns  ``zipcode``, ``lat`` and ``long``. 

## Linear Regression

### Baseline Model (version_0.0)

- This baseline model is trained on all columns except ``id`` and ``date``, without any scaling or feature engineering.

In [None]:
# The identifier and sale-date columns are left out of the baseline feature set
df_baseline = df.drop(['id', 'date'], axis='columns')
df_baseline.head()

In [None]:
df_baseline.shape

#### Split Dataset

In [None]:
# Separate the prediction target from the feature matrix
y = df_baseline['price']
X = df_baseline.drop(columns=['price'])

print(f"Total feature shape: {X.shape}")
print(f"Target feature shape: {y.shape}")

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=45,
)

print(
    f"X_train size: {X_train.shape}",
    f"X_test size: {X_test.shape}",
    f"y_train size: {y_train.shape}",
    f"y_test size: {y_test.shape}",
    sep="\n",
)

#### Train Model

In [None]:
# Initialise and fit the baseline estimator. `fit` returns the estimator
# itself, so `lr_baseline_model` and `lr` refer to the same object.
lr = LinearRegression()
lr.fit(X_train, y_train)
lr_baseline_model = lr

# Predictions on both splits (train first, then test)
y_pred_train = lr_baseline_model.predict(X_train)
y_pred_test = lr_baseline_model.predict(X_test)

print(len(y_pred_test))


#### Model Evaluation

In [None]:
# Training-split metrics
r2_train_baseline = r2_score(y_train, y_pred_train)
rmse_train_baseline = root_mean_squared_error(y_train, y_pred_train)

# Test-split metrics
r2_test_baseline = r2_score(y_test, y_pred_test)
rmse_test_baseline = root_mean_squared_error(y_test, y_pred_test)

# Emit the whole report in one call, one entry per line
print(
    "R2 Score",
    f"R2 score train: {r2_train_baseline:.2f}",
    f"R2 score test: {r2_test_baseline:.2f}",
    "\nRMSE Score",
    f"RMSE train : {rmse_train_baseline:.2f}",
    f"RMSE test : {rmse_test_baseline:.2f}",
    sep="\n",
)

**Interpretation**

- The model is stable, with no sign of overfitting.

- The model explains about 70% of the variance in ``price`` (R² ≈ 0.70).

### Scaled Model (version_0.1)
- We implement the scaling on the dataset that we have used to train the baseline model.

- After scaling we train the model again on the scaled dataset and observe any changes in model performance. 

In [None]:
def model_evaluation(ytrain, ytrainpred, ytest,  ytestpred, step):
    """Print R2 and RMSE for the train and test splits.

    Parameters
    ----------
    ytrain, ytrainpred
        True and predicted target values for the training split.
    ytest, ytestpred
        True and predicted target values for the test split.
    step
        Label appended to the two report headings (e.g. ``'scaled'``).

    Nothing is returned; the scores are only printed, rounded to 2 decimals.
    """
    splits = {"train": (ytrain, ytrainpred), "test": (ytest, ytestpred)}

    # Same evaluation order as before: R2 (train, test), then RMSE (train, test)
    r2 = {name: r2_score(actual, predicted) for name, (actual, predicted) in splits.items()}
    rmse = {
        name: root_mean_squared_error(actual, predicted)
        for name, (actual, predicted) in splits.items()
    }

    print(f"R2 Score {step}")
    print(f"R2 score train: {r2['train']:.2f}")
    print(f"R2 score test: {r2['test']:.2f}")

    print(f"\nRMSE Score {step}")
    print(f"RMSE train : {rmse['train']:.2f}")
    print(f"RMSE test : {rmse['test']:.2f}")

#### Scaling

In [None]:
# Fit the scaler on the training split only, then apply that same fitted
# transformation to the test split so both are scaled consistently.
scaler = StandardScaler()

X_train_scaled = scaler.fit_transform(X_train)

X_test_scaled = scaler.transform(X_test)

#### Train Model

In [None]:
# Fit a *separate* estimator on the scaled features. Re-using `lr` here would
# refit the very object that `lr_baseline_model` refers to (`fit` returns the
# estimator itself), silently overwriting the baseline model's coefficients.
lr_baseline_model_scaled = LinearRegression().fit(X_train_scaled, y_train)

# Predictions
y_train_pred_scaled = lr_baseline_model_scaled.predict(X_train_scaled)
y_test_pred_scaled = lr_baseline_model_scaled.predict(X_test_scaled)

In [None]:
y_test

#### Model Evaluation

In [None]:
model_evaluation(y_train, y_train_pred_scaled, y_test, y_test_pred_scaled, 'scaled')