In [1]:
import pandas as pd

## [Problem 1]: Feature selection for practice

In [2]:
data = pd.read_csv("train.csv")
data.loc[:,["GrLivArea","YearBuilt"]]
# data

Unnamed: 0,GrLivArea,YearBuilt
0,1710,2003
1,1262,1976
2,1786,2001
3,1717,1915
4,2198,2000
...,...,...
1455,1647,1999
1456,2073,1978
1457,2340,1941
1458,1078,1950


In [3]:
data.loc[:,"SalePrice"]

0       208500
1       181500
2       223500
3       140000
4       250000
         ...  
1455    175000
1456    210000
1457    266500
1458    142125
1459    147500
Name: SalePrice, Length: 1460, dtype: int64

## [Problem 2]: Estimation and evaluation by linear regression

In [5]:
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
# import seaborn as sns
# import matplotlib.pyplot as plt
import plotly.express as px

In [6]:
my_X = data.loc[:,["GrLivArea","YearBuilt"]]
my_Y =data.loc[:,"SalePrice"]
X_train, X_test, y_train, y_test = train_test_split( np.array(my_X), np.array(my_Y), test_size=0.25, random_state=42)

In [7]:
reg = LinearRegression().fit(X_train[:,0].reshape(-1, 1), y_train)

In [8]:
y_predict = reg.predict(X_test[:,0].reshape(-1, 1))

In [9]:
mean_squared_error(y_test,y_predict)

3118447791.9869394

In [29]:
# X_test[:,0|
import plotly.graph_objects as go

In [31]:
fig = go.Figure()
fig.add_trace(go.Scatter(
    x=X_test[:,0],
    y=y_test,
    mode="markers",
    name="True Price"
))
fig.add_trace(go.Scatter(
    x=X_test[:,0],
    y=y_predict,
    mode="lines",
    name="Predict",
))

## [Problem 3]: Comparison of methods

In [32]:
reg = LinearRegression().fit(X_train[:,0].reshape(-1, 1), y_train)
y_predict = reg.predict(X_test[:,0].reshape(-1, 1))
print("MSE:",mean_squared_error(y_test,y_predict))
ln1 = mean_squared_error(y_test,y_predict)
# plt.scatter(X_test[:,0], y_test, c="Red",label="True Price")
# plt.plot(X_test[:,0], y_predict, c="Blue",label="Predict")
# plt.xlabel("Area")
# plt.ylabel("Price")
# plt.figlegend()
# plt.show()

MSE: 3118447791.9869394


In [33]:
fig = go.Figure()
fig.add_trace(go.Scatter(
    x=X_test[:,0],
    y=y_test,
    mode="markers",
    name="True Price"
))
fig.add_trace(go.Scatter(
    x=X_test[:,0],
    y=y_predict,
    mode="lines",
    name="Predict",
))

In [34]:
from sklearn.svm import SVR
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

In [35]:
regr = make_pipeline(StandardScaler(), SVR(C=1.0, epsilon=0.2))
regr.fit(X_train[:,0].reshape(-1, 1), y_train)
y_predict_svm = regr.predict(X_test[:,0].reshape(-1, 1))
print("MSE:",mean_squared_error(y_test,y_predict_svm))
svm1 = mean_squared_error(y_test,y_predict_svm)

MSE: 7171137498.473831


In [36]:
fig = go.Figure()
fig.add_trace(go.Scatter(
    x=X_test[:,0],
    y=y_test,
    mode="markers",
    name="True Price"
))
fig.add_trace(go.Scatter(
    x=X_test[:,0],
    y=y_predict_svm,
    mode="lines",
    name="Predict",
))

In [37]:
from sklearn.tree import DecisionTreeRegressor

In [38]:
regressor = DecisionTreeRegressor(random_state=0)
regressor.fit(X_train[:,0].reshape(-1, 1), y_train)
y_predict_dt = regressor.predict(X_test[:,0].reshape(-1, 1))
print("MSE:",mean_squared_error(y_test,y_predict_dt))
dt1 = mean_squared_error(y_test,y_predict_dt)

MSE: 4595868138.839861


In [40]:
fig = go.Figure()
fig.add_trace(go.Scatter(
    x=X_test[:,0],
    y=y_test,
    mode="markers",
    name="True Price"
))
fig.add_trace(go.Scatter(
    x=X_test[:,0],
    y=y_predict_dt,
    mode="markers",
    name="Predict",
))

In [41]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.datasets import make_regression

In [42]:
regr = RandomForestRegressor(max_depth=2, random_state=0)
regr.fit(X_train[:,0].reshape(-1, 1), y_train)
y_predict_rf = regr.predict(X_test[:,0].reshape(-1, 1))
print("MSE:",mean_squared_error(y_test,y_predict_rf))
rf1 = mean_squared_error(y_test,y_predict_rf)

MSE: 3419859358.226372


In [43]:
fig = go.Figure()
fig.add_trace(go.Scatter(
    x=X_test[:,0],
    y=y_test,
    mode="markers",
    name="True Price"
))
fig.add_trace(go.Scatter(
    x=X_test[:,0],
    y=y_predict_rf,
    mode="markers",
    name="Predict",
))

## Using YearBuilt field

In [61]:
my_X = data.loc[:,["GrLivArea","YearBuilt"]]
my_Y =data.loc[:,"SalePrice"]
X_train, X_test, y_train, y_test = train_test_split( np.array(my_X), np.array(my_Y), test_size=0.25, random_state=42)

In [70]:
reg = LinearRegression().fit(X_train[:,1].reshape(-1, 1), y_train)
y_predict = reg.predict(X_test[:,1].reshape(-1, 1))
lr2 = mean_squared_error(y_test,y_predict)
print("MSE:",lr2)

MSE: 5037617329.528072


In [71]:
fig = go.Figure()
fig.add_trace(go.Scatter(
    x=X_test[:,0],
    y=y_test,
    mode="markers",
    name="True Price"
))
fig.add_trace(go.Scatter(
    x=X_test[:,0],
    y=y_predict,
    mode="lines",
    name="Predict",
))
fig.update_layout(

    xaxis_title="Area",
    yaxis_title="Price",
    legend_title="Notation",

)

In [72]:
from sklearn.svm import SVR
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

In [73]:
regr = make_pipeline(StandardScaler(), SVR(C=1.0, epsilon=0.2))
regr.fit(X_train[:,1].reshape(-1, 1), y_train)
y_predict_svm = regr.predict(X_test[:,1].reshape(-1, 1))
print("MSE:",mean_squared_error(y_test,y_predict_svm))
svm2 = mean_squared_error(y_test,y_predict_svm)

MSE: 7166030188.017618


In [74]:
fig = go.Figure()
fig.add_trace(go.Scatter(
    x=X_test[:,0],
    y=y_test,
    mode="markers",
    name="True Price"
))
fig.add_trace(go.Scatter(
    x=X_test[:,0],
    y=y_predict_svm,
    mode="lines",
    name="Predict",
))
fig.update_layout(

    xaxis_title="Area",
    yaxis_title="Price",
    legend_title="Notation",

)

In [75]:
from sklearn.tree import DecisionTreeRegressor
regressor = DecisionTreeRegressor(random_state=0)
regressor.fit(X_train, y_train)
y_predict_dt = regressor.predict(X_test)
print("MSE:",mean_squared_error(y_test,y_predict_dt))
dt2 = mean_squared_error(y_test,y_predict_dt)


MSE: 2321866987.737291


In [77]:
fig = go.Figure()
fig.add_trace(go.Scatter(
    x=X_test[:,0],
    y=y_test,
    mode="markers",
    name="True Price"
))
fig.add_trace(go.Scatter(
    x=X_test[:,0],
    y=y_predict_dt,
    mode="markers",
    name="Predict",
))
fig.update_layout(

    xaxis_title="Area",
    yaxis_title="Price",
    legend_title="Notation",

)

In [81]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.datasets import make_regression
regr = RandomForestRegressor(max_depth=2, random_state=0)
regr.fit(X_train, y_train)
y_predict_rf = regr.predict(X_test)
print("MSE:",mean_squared_error(y_test,y_predict_rf))
rf2 = mean_squared_error(y_test,y_predict_rf)

MSE: 2806352490.9609694


In [82]:
fig = go.Figure()
fig.add_trace(go.Scatter(
    x=X_test[:,0],
    y=y_test,
    mode="markers",
    name="True Price"
))
fig.add_trace(go.Scatter(
    x=X_test[:,0],
    y=y_predict_rf,
    mode="markers",
    name="Predict",
))
fig.update_layout(

    xaxis_title="Area",
    yaxis_title="Price",
    legend_title="Notation",

)

In [83]:
field1 = {'Linear Regression': ln1,
                   'SVM': svm1,
                   'Decision Tree': dt1,
                   'Random Forest': rf1,
                   }
GrLivArea = pd.Series(field1)
field2 = {'Linear Regression': lr2,
                   'SVM': svm2,
                   'Decision Tree': dt2,
                   'Random Forest': rf2,
                   }
YearBuilt = pd.Series(field2)
compare = pd.DataFrame({'GrLivArea': GrLivArea,
                       'YearBuilt': YearBuilt})
compare

Unnamed: 0,GrLivArea,YearBuilt
Linear Regression,3118448000.0,5037617000.0
SVM,7171137000.0,7166030000.0
Decision Tree,4595868000.0,2321867000.0
Random Forest,3419859000.0,2806352000.0


## [Problem 4]: (Advance assignment) Learning using other features

In [84]:
X = data.loc[:,["LotArea"]]
Y =data.loc[:,"SalePrice"]

In [85]:
X

Unnamed: 0,LotArea
0,8450
1,9600
2,11250
3,9550
4,14260
...,...
1455,7917
1456,13175
1457,9042
1458,9717


In [86]:
Y

0       208500
1       181500
2       223500
3       140000
4       250000
         ...  
1455    175000
1456    210000
1457    266500
1458    142125
1459    147500
Name: SalePrice, Length: 1460, dtype: int64

In [87]:
fig = px.violin(X, x ="LotArea")
fig.show()

In [88]:
fig = px.box(X, x ="LotArea")
fig.show()

In [89]:

X_train, X_test, y_train, y_test = train_test_split( np.array(X), np.array(Y), test_size=0.25, random_state=42)
reg = LinearRegression().fit(X_train, y_train)
y_predict = reg.predict(X_test)

In [91]:
fig = go.Figure()
fig.add_trace(go.Scatter(
    x=X_test[:,0],
    y=y_test,
    mode="markers",
    name="True Price"
))
fig.add_trace(go.Scatter(
    x=X_test[:,0],
    y=y_predict,
    mode="lines",
    name="Predict",
))
fig.update_layout(

    xaxis_title="Area",
    yaxis_title="Price",
    legend_title="Notation",

)