In [1]:
import warnings
# Suppress every warning category globally.
# NOTE(review): this also hides deprecation and input-validation warnings emitted
# by the libraries used in the cells below — consider narrowing the filter.
warnings.filterwarnings(action='ignore') 

import csv
import datetime

import folium
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy as sp
import seaborn as sns
import statsmodels.formula.api as smf
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
# Set the default matplotlib font family.
# NOTE(review): presumably chosen so that Korean text renders in figures — confirm
# the font is installed on the machine that runs this notebook.
plt.rc('font',family='D2CodingLigature Nerd Font')
# plt.rcParams['axes.unicode_minus']=False  # rendering of the '-' (minus) sign

|속성이름|유형|설명|
|:---|:---|:---|
|mpg|연속형|연비(Miles Per Gallon) - 예측하려는 목표 변수 (Target)|
|cylinders|이산형|엔진의 실린더 수|
|displacement|연속형|배기량 (cubic inches)|
|horsepower|연속형|마력 (Horsepower)|
|weight|연속형|차량 무게 (pounds)|
|acceleration|연속형|가속 성능 (0에서 60mph까지 도달하는 데 걸리는 시간, 초)|
|model_year|이산형|모델 연도 (70부터 82까지)|
|origin|이산형|제조국가 (1: 미국, 2: 유럽, 3: 일본)|
|car_name|문자열|자동차 모델 이름 (고유값)|

In [8]:
# Load the auto-mpg dataset from a path relative to this notebook.
# NOTE(review): `horsepower` is parsed as object dtype (see the info() output in
# the next cell), presumably because of non-numeric placeholder values — confirm
# and consider passing `na_values=` here instead of discarding the column later.
data_df = pd.read_csv('../../data/auto-mpg.csv')
# Last expression: rich display of the loaded frame.
data_df

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model_year,origin,car_name
0,18.0,8,307.0,130,3504,12.0,70,1,chevrolet chevelle malibu
1,15.0,8,350.0,165,3693,11.5,70,1,buick skylark 320
2,18.0,8,318.0,150,3436,11.0,70,1,plymouth satellite
3,16.0,8,304.0,150,3433,12.0,70,1,amc rebel sst
4,17.0,8,302.0,140,3449,10.5,70,1,ford torino
...,...,...,...,...,...,...,...,...,...
393,27.0,4,140.0,86,2790,15.6,82,1,ford mustang gl
394,44.0,4,97.0,52,2130,24.6,82,2,vw pickup
395,32.0,4,135.0,84,2295,11.6,82,1,dodge rampage
396,28.0,4,120.0,79,2625,18.6,82,1,ford ranger


In [9]:
# Check column dtypes and non-null counts of the freshly loaded frame.
data_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 398 entries, 0 to 397
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   mpg           398 non-null    float64
 1   cylinders     398 non-null    int64  
 2   displacement  398 non-null    float64
 3   horsepower    398 non-null    object 
 4   weight        398 non-null    int64  
 5   acceleration  398 non-null    float64
 6   model_year    398 non-null    int64  
 7   origin        398 non-null    int64  
 8   car_name      398 non-null    object 
dtypes: float64(3), int64(4), object(2)
memory usage: 28.1+ KB


In [10]:
# Keep only the numeric columns that the regression models below use.
# `car_name` and `horsepower` are object-dtype columns (see info() above) and
# `origin` is a categorical code, so none of them is fed to the linear models.
# `errors='ignore'` makes this cell idempotent: because it rebinds `data_df`
# from itself, re-running it after the columns are gone used to raise KeyError.
data_df = data_df.drop(columns=['car_name', 'origin', 'horsepower'], errors='ignore')

In [11]:
# Re-check the schema after the column drop: only numeric columns should remain.
data_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 398 entries, 0 to 397
Data columns (total 6 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   mpg           398 non-null    float64
 1   cylinders     398 non-null    int64  
 2   displacement  398 non-null    float64
 3   weight        398 non-null    int64  
 4   acceleration  398 non-null    float64
 5   model_year    398 non-null    int64  
dtypes: float64(3), int64(3)
memory usage: 18.8 KB


In [None]:
# Multiple linear regression with the statsmodels formula API:
# mpg is regressed on the five remaining numeric columns of data_df.
model_mlm = smf.ols(
    'mpg ~ cylinders + displacement + weight + acceleration + model_year',
    data_df,
)
result_mlm = model_mlm.fit()

# Last expression: display the fitted model's summary tables.
result_mlm.summary()

0,1,2,3
Dep. Variable:,mpg,R-squared:,0.809
Model:,OLS,Adj. R-squared:,0.806
Method:,Least Squares,F-statistic:,331.4
Date:,"Tue, 09 Dec 2025",Prob (F-statistic):,2.5299999999999998e-138
Time:,20:48:19,Log-Likelihood:,-1053.5
No. Observations:,398,AIC:,2119.0
Df Residuals:,392,BIC:,2143.0
Df Model:,5,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,-14.5697,4.138,-3.521,0.000,-22.705,-6.434
cylinders,-0.2586,0.329,-0.787,0.432,-0.905,0.387
displacement,0.0073,0.007,1.017,0.310,-0.007,0.021
weight,-0.0069,0.001,-11.614,0.000,-0.008,-0.006
acceleration,0.0803,0.078,1.025,0.306,-0.074,0.234
model_year,0.7553,0.051,14.875,0.000,0.655,0.855

0,1,2,3
Omnibus:,37.177,Durbin-Watson:,1.215
Prob(Omnibus):,0.0,Jarque-Bera (JB):,57.506
Skew:,0.625,Prob(JB):,3.26e-13
Kurtosis:,4.381,Cond. No.,74300.0


## LinearRegression

In [14]:
# Fit scikit-learn's LinearRegression on the full dataset with the same five
# predictors as the OLS formula above, then print the intercept and coefficients.
x = data_df.loc[:, ["cylinders", "displacement", "weight", "acceleration", "model_year"]]
y = data_df.loc[:, "mpg"]
model = LinearRegression().fit(x, y)  # fit() returns the fitted estimator itself
print("Intercept (절편):", model.intercept_)
print("Coefficients (회귀 계수):", model.coef_)

Intercept (절편): -14.569690476020249
Coefficients (회귀 계수): [-0.25858516  0.00726771 -0.00692571  0.08034746  0.75530084]


#### 데이터를 분할하여 분석

In [None]:
# Feature matrix (independent variables): every column except the target.
x = data_df.drop(columns='mpg')
# Target vector (dependent variable).
y = data_df['mpg']

In [15]:
# Hold out 30% of the rows for evaluation and train on the remaining 70%.
# The fixed seed keeps the split identical across kernel restarts.
# (`train_test_split` is imported with the other dependencies at the top.)
TEST_SIZE = 0.3
RANDOM_STATE = 0

x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=TEST_SIZE, random_state=RANDOM_STATE
)

In [16]:
# Inspect the training feature matrix produced by the split.
x_train

Unnamed: 0,cylinders,displacement,weight,acceleration,model_year
230,8,350.0,4165,11.4,77
357,4,119.0,2615,14.8,81
140,8,304.0,4257,15.5,74
22,4,104.0,2375,17.5,70
250,8,318.0,3735,13.2,78
...,...,...,...,...,...
323,4,156.0,2800,14.4,80
192,6,250.0,3353,14.5,76
117,4,68.0,1867,19.5,73
47,6,250.0,3282,15.0,71


In [17]:
# Linear regression: create the model and train it on the training split.
# fit() returns the estimator itself, so construction and training are chained.
lr = LinearRegression().fit(x_train, y_train)

# Last expression: display the fitted estimator.
lr

0,1,2
,fit_intercept,True
,copy_X,True
,tol,1e-06
,n_jobs,
,positive,False


In [18]:
# Linear regression: predict on the evaluation data and keep the result in y_predict.
y_predict = lr.predict(x_test)
# Last expression: display the predicted values.
y_predict

array([12.68813886, 24.89235692, 11.77966226, 20.713484  , 17.59974974,
       28.70463932, 31.8246109 , 24.79459783, 13.3797074 , 28.03809202,
       33.95480562, 32.517138  , 21.28643713, 26.32547953, 16.29305741,
       32.16905355, 30.04731897, 29.90617451, 17.78110613, 30.69577551,
       15.33574367, 25.60299563, 25.4723533 , 20.04730443, 30.70244174,
       26.79576507, 32.20306918, 31.95533215, 30.42830733, 18.19432511,
       21.19981862, 29.62938625, 20.86352443, 30.82503708, 24.35511863,
       23.81034331, 21.45816755, 16.77101118, 31.95362794,  8.06590693,
        9.60178767, 13.8699291 , 27.26165312, 29.00102026, 29.52364892,
       22.58571769, 23.20233694, 13.80533693, 23.86039376, 28.12535283,
       31.40954528, 26.70297991, 15.34381889, 25.01897062, 14.71481708,
        9.53883961, 19.64272577, 26.82491432, 31.30917854, 14.99268593,
       20.64360774, 25.57610711, 22.45200894, 19.72033566, 10.7918702 ,
       11.42513414,  9.7023088 , 19.68724093, 24.98227534,  9.68

In [19]:
# Actual mpg values of the held-out rows, for comparison with y_predict above.
y_test

65     14.0
132    25.0
74     13.0
78     21.0
37     18.0
       ... 
236    25.5
352    29.9
92     13.0
221    17.5
322    46.6
Name: mpg, Length: 120, dtype: float64

In [20]:
# Score the hold-out predictions first (R², coefficient of determination) ...
r2 = r2_score(y_test, y_predict)

# ... then report the fitted parameters followed by the score.
print("회귀계수 (coef):", lr.coef_)
print("절편 (intercept):", lr.intercept_)
print(f"결정계수 (R²): {r2:.4f}")

회귀계수 (coef): [-0.13707609  0.00748253 -0.00688522  0.19807649  0.7577852 ]
절편 (intercept): -17.54805722395163
결정계수 (R²): 0.8079


#### 회귀분석을 응용하여 예측

In [22]:
# Interactive demo: read one car's specification from the user and predict its
# mpg with the model trained on the training split (`lr`).
# NOTE: this cell blocks on input(), so it is not reproducible under
# "Restart & Run All"; example values: 8, 300, 3500, 12, 72.
print("연비를 예측하고 싶은 차의 정보를 입력해주세요.")

cylinders_1 = int(input("cylinders : "))
# displacement is a float column in the training data, so decimals such as
# "97.5" must be accepted (int() would raise ValueError on them).
displacement_1 = float(input("displacement : "))
weight_1 = int(input("weight : "))
acceleration_1 = float(input("acceleration : "))
model_year_1 = int(input("model_year : "))

# Build a one-row frame with the same column names the model was fitted on.
# Predicting from a bare nested list relies on positional order only and
# triggers scikit-learn's feature-name check, which the global warnings
# filter would hide.
new_car = pd.DataFrame(
    [[cylinders_1, displacement_1, weight_1, acceleration_1, model_year_1]],
    columns=["cylinders", "displacement", "weight", "acceleration", "model_year"],
)
mpg_predict = lr.predict(new_car)

print(f"이 자동차의 예상 연비(mpg)는 {mpg_predict[0]:.2f} 입니다.")

연비를 예측하고 싶은 차의 정보를 입력해주세요.
이 자동차의 예상 연비(mpg)는 16.44 입니다.
