### **Multivariate Linear Regression Task**
- 구매자의 연수입(Annual Salary)과 순자산(자산 - 부채, Net Worth), 신용카드 부채(Credit Card Debt)를 통해 다변량 회귀 분석 진행

In [None]:
import pandas as pd

c_df = pd.read_csv('./datasets/car_purchasing.csv')

c_df

In [2]:
c_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 500 entries, 0 to 499
Data columns (total 9 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   Customer Name        500 non-null    object 
 1   Customer e-mail      500 non-null    object 
 2   Country              500 non-null    object 
 3   Gender               500 non-null    int64  
 4   Age                  500 non-null    int64  
 5   Annual Salary        500 non-null    float64
 6   Credit Card Debt     500 non-null    float64
 7   Net Worth            500 non-null    float64
 8   Car Purchase Amount  500 non-null    float64
dtypes: float64(4), int64(2), object(3)
memory usage: 35.3+ KB


In [3]:
columns = ['Annual Salary', 'Net Worth', 'Credit Card Debt', 'Car Purchase Amount']

pre_c_df = c_df[columns]
pre_c_df

Unnamed: 0,Annual Salary,Net Worth,Credit Card Debt,Car Purchase Amount
0,62812.09301,238961.2505,11609.380910,35321.45877
1,66646.89292,530973.9078,9572.957136,45115.52566
2,53798.55112,638467.1773,11160.355060,42925.70921
3,79370.03798,548599.0524,14426.164850,67422.36313
4,59729.15130,560304.0671,5358.712177,55915.46248
...,...,...,...,...
495,71942.40291,541670.1016,6995.902524,48901.44342
496,56039.49793,360419.0988,12301.456790,31491.41457
497,68888.77805,764531.3203,10611.606860,64147.28888
498,49811.99062,337826.6382,14013.034510,45442.15353


In [4]:
# 컬럼 이름 변경
pre_c_df.rename(columns={
    'Annual Salary': 'Salary',
    'Net Worth': 'Worth',
    'Credit Card Debt': 'Card',
    'Car Purchase Amount': 'Amount'
}, inplace=True)

pre_c_df

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  pre_c_df.rename(columns={


Unnamed: 0,Salary,Worth,Card,Amount
0,62812.09301,238961.2505,11609.380910,35321.45877
1,66646.89292,530973.9078,9572.957136,45115.52566
2,53798.55112,638467.1773,11160.355060,42925.70921
3,79370.03798,548599.0524,14426.164850,67422.36313
4,59729.15130,560304.0671,5358.712177,55915.46248
...,...,...,...,...
495,71942.40291,541670.1016,6995.902524,48901.44342
496,56039.49793,360419.0988,12301.456790,31491.41457
497,68888.77805,764531.3203,10611.606860,64147.28888
498,49811.99062,337826.6382,14013.034510,45442.15353


In [None]:
pre_c_df.hist()

In [None]:
pre_c_df.corr()['Amount'].sort_values(ascending=False)[1:]

In [None]:
import matplotlib.pyplot as plt

fig, axes = plt.subplots(1, 3, figsize=(15, 5))

axes[0].scatter(pre_c_df['Salary'], pre_c_df['Amount'])
axes[1].scatter(pre_c_df['Worth'], pre_c_df['Amount'])
axes[2].scatter(pre_c_df['Card'], pre_c_df['Amount'])
plt.show()

In [None]:
import torch
from torch.optim import SGD
from sklearn.model_selection import train_test_split

torch.manual_seed(124)

features, targets = pre_c_df.iloc[:, :-1], pre_c_df.iloc[:, -1]

X_train, X_test, y_train, y_test = \
train_test_split(features, targets, test_size=0.2, random_state=124)

X_train1 = torch.FloatTensor(X_train.Salary.values).view(-1, 1)
X_train2 = torch.FloatTensor(X_train.Worth.values).view(-1, 1)
X_train3 = torch.FloatTensor(X_train.Card.values).view(-1, 1)
y_train = torch.FloatTensor(y_train.values).view(-1, 1)

X_test1 = torch.FloatTensor(X_test.Salary.values).view(-1, 1)
X_test2 = torch.FloatTensor(X_test.Worth.values).view(-1, 1)
X_test3 = torch.FloatTensor(X_test.Card.values).view(-1, 1)
y_test = torch.FloatTensor(y_test.values).view(-1, 1)

W1 = torch.zeros(1, requires_grad=True)
W2 = torch.zeros(1, requires_grad=True)
W3 = torch.zeros(1, requires_grad=True)
b = torch.zeros(1, requires_grad=True)

optimizer = SGD([W1, W2, W3, b], lr=1e-12)

epochs = 1000000

for epoch in range(1, epochs + 1):
    H = W1 * X_train1 + W2 * X_train2 + W3 * X_train3 + b
    loss = torch.mean((y_train - H) ** 2)
    
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    if epoch % 10000 == 0:
        print('{:4d}/{}: W1: {:.4f}, W2: {:.4f}, W3: {:.4f}, b: {:.5f}, loss: {:.4f}'\
              .format(epoch, epochs, W1.item(), W2.item(), W3.item(), b.item(), loss.item()))

10000/1000000: W1: 0.5078, W2: 0.0283, W3: 0.0480, b: 0.00000, loss: 45532488.0000
20000/1000000: W1: 0.5101, W2: 0.0283, W3: 0.0312, b: 0.00000, loss: 45503432.0000
30000/1000000: W1: 0.5119, W2: 0.0283, W3: 0.0186, b: -0.00000, loss: 45487012.0000
40000/1000000: W1: 0.5132, W2: 0.0283, W3: 0.0091, b: -0.00000, loss: 45477736.0000
50000/1000000: W1: 0.5142, W2: 0.0283, W3: 0.0019, b: -0.00001, loss: 45472476.0000
60000/1000000: W1: 0.5149, W2: 0.0284, W3: -0.0035, b: -0.00001, loss: 45469516.0000
70000/1000000: W1: 0.5155, W2: 0.0284, W3: -0.0075, b: -0.00001, loss: 45467836.0000
80000/1000000: W1: 0.5160, W2: 0.0284, W3: -0.0106, b: -0.00001, loss: 45466884.0000
90000/1000000: W1: 0.5163, W2: 0.0284, W3: -0.0128, b: -0.00001, loss: 45466344.0000
100000/1000000: W1: 0.5165, W2: 0.0284, W3: -0.0146, b: -0.00002, loss: 45466040.0000
110000/1000000: W1: 0.5167, W2: 0.0284, W3: -0.0158, b: -0.00002, loss: 45465872.0000
120000/1000000: W1: 0.5168, W2: 0.0284, W3: -0.0168, b: -0.00002, loss

In [21]:
b.item()

5.196813617658336e-06

In [None]:
# H = 0.0525 * X_test1 + 0.2234 * X_test2 + 0.0228 * X_test3 + 0.0106
# loss = torch.mean((y_test - H) ** 2)
# print(loss.item())

In [None]:
import torch
from torch.optim import SGD
from sklearn.model_selection import train_test_split

torch.manual_seed(124)

features, targets = pre_a_df.iloc[:, :-1], pre_a_df.iloc[:, -1]

X_train, X_test, y_train, y_test = \
train_test_split(features, targets, test_size=0.2, random_state=124)

X_train = torch.FloatTensor(X_train.values)
y_train = torch.FloatTensor(y_train.values).view(-1, 1)

X_test = torch.FloatTensor(X_test.values)
y_test = torch.FloatTensor(y_test.values).view(-1, 1)

W = torch.zeros((3, 1), requires_grad=True)
b = torch.zeros(1, requires_grad=True)

optimizer = SGD([W, b], lr=1e-5)

epochs = 1000

for epoch in range(1, epochs + 1):
    H = X_train.matmul(W) + b
    loss = torch.mean((y_train - H) ** 2)
    
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    if epoch % 100 == 0:
        print('{:4d}/{}: W1: {:.4f}, W2: {:.4f}, W3: {:.4f}, b: {:.4f}, loss: {:.4f}'\
              .format(epoch, epochs, W[0].item(), W[1].item(), W[2].item(), b.item(), loss.item()))

In [None]:
# H = 0.0525 * X_test1 + 0.2234 * X_test2 + 0.0228 * X_test3 + 0.0106
# loss = torch.mean((y_test - H) ** 2)
# print(loss.item())

In [None]:
import torch
from torch.nn import Linear
from torch.nn.functional import mse_loss
from torch.optim import SGD
from sklearn.model_selection import train_test_split

torch.manual_seed(124)

features, targets = pre_a_df.iloc[:, :-1], pre_a_df.iloc[:, -1]

X_train, X_test, y_train, y_test = \
train_test_split(features, targets, test_size=0.2, random_state=124)

X_train = torch.FloatTensor(X_train.values)
y_train = torch.FloatTensor(y_train.values).view(-1, 1)

X_test = torch.FloatTensor(X_test.values)
y_test = torch.FloatTensor(y_test.values).view(-1, 1)

l_r = Linear(3, 1)

optimizer = SGD(l_r.parameters(), lr=1e-5)

epochs = 1000

for epoch in range(1, epochs + 1):
    H = l_r(X_train)
    loss = mse_loss(y_train, H)
    
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    if epoch % 100 == 0:
        print('{:4d}/{}: '\
              .format(epoch, epochs, W[0].item(), W[1].item(), W[2].item(), b.item(), loss.item()), end='')
        for i, w in enumerate(list(l_r.parameters())[0][0]):
            print('W{}: {:.4f}, '\
                  .format(i + 1, w.item()), end='')
        print('b: {:.4f}, loss: {:.4f}'.format(list(l_r.parameters())[1].item(), loss.item()))

In [None]:
# H = 0.0527 * X_test1 + 0.2228 * X_test2 + 0.0236 * X_test3 + -0.0221
# loss = torch.mean((y_test - H) ** 2)
# print(loss.item())

In [None]:
from torch.nn import Module, Linear

class LinearRegressionModel(Module):
    def __init__(self):
        super().__init__()
        self.linear = Linear(3, 1)

    def forward(self, x):
        return self.linear(x)

In [None]:
import torch
from torch.nn.functional import mse_loss
from torch.optim import SGD
from sklearn.model_selection import train_test_split

torch.manual_seed(124)

features, targets = pre_a_df.iloc[:, :-1], pre_a_df.iloc[:, -1]

X_train, X_test, y_train, y_test = \
train_test_split(features, targets, test_size=0.2, random_state=124)

X_train = torch.FloatTensor(X_train.values)
y_train = torch.FloatTensor(y_train.values).view(-1, 1)

X_test = torch.FloatTensor(X_test.values)
y_test = torch.FloatTensor(y_test.values).view(-1, 1)

l_r = LinearRegressionModel()

optimizer = SGD(l_r.parameters(), lr=1e-5)

epochs = 1000

for epoch in range(1, epochs + 1):
    H = l_r(X_train)
    loss = mse_loss(y_train, H)
    
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    if epoch % 100 == 0:
        print('{:4d}/{}: '\
              .format(epoch, epochs, W[0].item(), W[1].item(), W[2].item(), b.item(), loss.item()), end='')
        for i, w in enumerate(list(l_r.parameters())[0][0]):
            print('W{}: {:.4f}, '\
                  .format(i + 1, w.item()), end='')
        print('b: {:.4f}, loss: {:.4f}'.format(list(l_r.parameters())[1].item(), loss.item()))

In [None]:
# H = 0.0527 * X_test1 + 0.2228 * X_test2 + 0.0236 * X_test3 + -0.0221
# loss = torch.mean((y_test - H) ** 2)
# print(loss.item())

In [5]:
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

features, targets = pre_c_df.iloc[:, :-1], pre_c_df.iloc[:, -1]

X_train, X_test, y_train, y_test = \
train_test_split(features, targets, test_size=0.2, random_state=124)

l_r = LinearRegression()
l_r.fit(X_train, y_train)
print('W: {:.4f}, b: {:.4f}'.format(l_r.coef_[0], l_r.intercept_))

W: 0.5646, b: -4533.4234


In [6]:
from sklearn.metrics import mean_squared_error

prediction = l_r.predict(X_test)
print('MSE loss: {:.4f}, RMSE loss: {:.4f}'\
      .format(mean_squared_error(y_test, prediction), 
      np.sqrt(mean_squared_error(y_test, prediction))))

MSE loss: 44871459.9118, RMSE loss: 6698.6163


In [None]:
import matplotlib.pyplot as plt

plt.rcParams['font.family'] ='Malgun Gothic'
plt.rcParams['axes.unicode_minus'] = False

plt.scatter(X_train, y_train)
plt.plot(X_train, 0.5646 * X_train + -4533.4234, color="red")
plt.grid(visible=True, linestyle='--')
plt.show()