# Importing Libraries

In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import Pipeline

# Read Data

In [2]:
# Load the house-sales CSV from the Kaggle input directory; later cells derive `df` from `odf`.
odf = pd.read_csv(
    filepath_or_buffer='/kaggle/input/housesalesprediction/kc_house_data.csv',
)

# Correlation Map

In [22]:
# Correlate only the numeric columns. The raw frame is read without date
# parsing, so it still carries a non-numeric `date` column; pandas >= 2.0
# raises on such columns in DataFrame.corr() instead of silently skipping them.
cor_map = odf.select_dtypes(include='number').corr()
plt.figure(figsize=(18,18))
sns.heatmap(cor_map, annot=True)

# Data Preprocessing
1. Removing the "id" column because it is only an identifier and does not help explain the dependent variable
2. Replacing the "date" column with numerical values (year, month and day)
3. Separating the dependent variable from the dataset

In [4]:
# Work on a copy of the raw frame without the "id" column; `odf` itself is left unchanged.
df = odf.drop(columns='id')

In [5]:

# Parse the "date" column once, then swap it for three numeric columns
# (year, month, day) appended at the end of the frame.
parsed_date = pd.to_datetime(df['date'])
df = df.drop(columns='date').assign(
    year=parsed_date.dt.year,
    month=parsed_date.dt.month,
    day=parsed_date.dt.day,
)

In [6]:
# `test` holds the dependent variable ("price"); despite its name it is the
# target vector, not a test split. `df` keeps only the predictor columns.
test = df.loc[:, 'price']
df = df.drop(columns=['price'])

# Splitting Data

In [15]:
# Hold out 20% of the rows for evaluation; the fixed random_state makes the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    df,
    test,
    test_size=0.2,
    random_state=42,
)

# Training And Testing

# 1. Linear Regression

In [17]:
# Ordinary least-squares baseline fitted on the training split.
# The fit call stays the cell's last expression so the fitted estimator is displayed.
model = LinearRegression()
model.fit(X=x_train, y=y_train)

In [18]:
# A regressor's score() returns the coefficient of determination (R^2), not an
# accuracy percentage: it can be negative and is not a share of correct
# predictions, so report it under its real name and on its native scale.
print("Training R^2: {:.4f}".format(model.score(x_train, y_train)))
print("Test R^2: {:.4f}".format(model.score(x_test, y_test)))

# 2. Polynomial Regression

In [25]:
# Degree-2 polynomial expansion of the predictors followed by a linear fit,
# chained so the same expansion is applied at fit and at score time.
# NOTE(review): features are expanded unscaled; consider whether scaling
# before the expansion is needed for numerical stability.
steps = [("poly", PolynomialFeatures(degree=2)), ("model", LinearRegression())]
pipe = Pipeline(steps=steps)

# Keep the fit call as the last expression so the fitted pipeline is displayed.
pipe.fit(X=x_train, y=y_train)

In [26]:
# Pipeline.score() delegates to the final regressor, so this is the coefficient
# of determination (R^2), not an accuracy percentage; label it accordingly.
print("Training R^2: {:.4f}".format(pipe.score(x_train, y_train)))
print("Test R^2: {:.4f}".format(pipe.score(x_test, y_test)))