<a href="https://colab.research.google.com/github/kessingtonosazee/GCP_Project_1/blob/master/Linear_Regression1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Predicting Price with Neighborhood

# Libraries
import warnings
from glob import glob

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import wqet_grader
from category_encoders import OneHotEncoder
from IPython.display import VimeoVideo
from sklearn.linear_model import LinearRegression, Ridge  # noqa F401
from sklearn.metrics import mean_absolute_error
from sklearn.pipeline import make_pipeline
from sklearn.utils.validation import check_is_fitted

# Suppress every FutureWarning for the rest of the session so library
# deprecation notices do not clutter the output.
warnings.simplefilter(action="ignore", category=FutureWarning)
# NOTE(review): wqet_grader is an external package; presumably this call sets
# up the grader for the "Project 2 Assessment" — confirm against its docs.
wqet_grader.init("Project 2 Assessment")

# Prepare Data
# Create a wrangle function to import and prepare data:
def wrangle(filepath):
    """Load one real-estate CSV and return the cleaned rows used for modeling.

    The result keeps only apartments whose "place_with_parent_names" contains
    "Capital Federal" and whose "price_aprox_usd" is below 400,000. Of those,
    it keeps only rows whose "surface_covered_in_m2" lies between the 10th
    and 90th percentiles (inclusive). The "lat-lon" column is replaced by
    float "lat" and "lon" columns, and "place_with_parent_names" is replaced
    by a "neighborhood" column holding its fourth "|"-separated field.

    Parameters
    ----------
    filepath : str, path object or file-like
        Anything accepted by ``pandas.read_csv``.

    Returns
    -------
    pandas.DataFrame
        A new, independent DataFrame with the cleaned rows.
    """
    df = pd.read_csv(filepath)

    # Subset data: apartments in "Capital Federal", less than 400,000.
    # na=False makes rows with a missing place name count as "no match"
    # instead of putting NaN into the boolean mask.
    mask_ba = df["place_with_parent_names"].str.contains("Capital Federal", na=False)
    mask_apt = df["property_type"] == "apartment"
    mask_price = df["price_aprox_usd"] < 400_000
    # Copy so the column assignments below act on an independent frame rather
    # than on a slice of the raw data (avoids chained-assignment warnings).
    df = df[mask_ba & mask_apt & mask_price].copy()

    # Subset data: remove outliers for "surface_covered_in_m2"
    low, high = df["surface_covered_in_m2"].quantile([0.1, 0.9])
    df = df[df["surface_covered_in_m2"].between(low, high)].copy()

    # Split "lat-lon" into two float columns
    df[["lat", "lon"]] = df["lat-lon"].str.split(",", expand=True).astype(float)

    # Extract the neighborhood; .str[3] yields NaN for names with fewer than
    # four fields instead of raising when that field is absent everywhere.
    df["neighborhood"] = df["place_with_parent_names"].str.split("|").str[3]

    # Drop the raw columns that have been replaced by derived ones
    return df.drop(columns=["lat-lon", "place_with_parent_names"])

# Assemble a list of file path names.
# glob() returns matches in arbitrary order, so sort them to keep the row
# order of the combined DataFrame reproducible from run to run.
files = sorted(glob("data/buenos-aires-real-estate-*.csv"))
files

# Clean every file with wrangle() and collect the resulting DataFrames
frames = [wrangle(file) for file in files]

# Concatenate the cleaned DataFrames into one, renumbering the index from 0
df = pd.concat(frames, ignore_index=True)
df.head()

# Split the data into a feature matrix (X) and a target vector (y)
target = "price_aprox_usd"
features = ["neighborhood"]
X_train, y_train = df[features], df[target]

# Instantiate the one-hot encoder and fit it to the training features
ohe = OneHotEncoder(use_cat_names=True).fit(X_train)

# Encode the training features with the fitted transformer
XT_train = ohe.transform(X_train)

print(XT_train.shape)
XT_train.head()

# Build a scikit-learn pipeline: one-hot encode the features, then fit a
# ridge regression on the encoded columns.
model = make_pipeline(
    OneHotEncoder(use_cat_names=True),
    Ridge(),
)

# Fit the model to the training data
model.fit(X_train, y_train)

# Evaluate model performance: mean absolute error of the predictions on the
# training set (predicted once; the result is reused for the metric).
y_pred_training = model.predict(X_train)
mae_training = mean_absolute_error(y_train, y_pred_training)
print("Training MAE:", round(mae_training, 2))

# Communicate Results

# Access the fitted regression step of the pipeline and read its parameters
intercept = model.named_steps["ridge"].intercept_
coefficients = model.named_steps["ridge"].coef_

print("coefficients len:", len(coefficients))
print(coefficients[:5])  # First five coefficients

# Access feature names from the fitted encoder step.
# Newer category_encoders releases provide get_feature_names_out() and
# deprecate get_feature_names(); the FutureWarning filter set at the top of
# the file would hide that notice. Prefer the new method and fall back to the
# old one so the code runs on both old and new releases.
encoder = model.named_steps["onehotencoder"]
if hasattr(encoder, "get_feature_names_out"):
    feature_names = encoder.get_feature_names_out()
else:
    feature_names = encoder.get_feature_names()
print("features len:", len(feature_names))
print(feature_names[:5])  # First five feature names

# Pair every coefficient with the name of the encoded column it belongs to
feat_imp = pd.Series(coefficients, index=feature_names)
feat_imp.head()

# Print the fitted equation, one term per line
print(f"price = {intercept.round(2)}")
for f, c in feat_imp.items():
    print(f"+ ({round(c, 2)} * {f})")

# Horizontal bar chart of the 15 coefficients with the largest absolute value
feat_imp.sort_values(key=abs).tail(15).plot(kind="barh")
plt.xlabel("importance [USD]")
plt.ylabel("Feature")
plt.title("Feature Importance for Apartment Price")
