# ・米国の中古車の価格について  

### メーカー(make)・色(color)と中古車価格の関係性を分析していきます。
参考:このデータセットには、各車のメーカー、モデル、年式、走行距離、価格、色、州、および日付がランダムに含まれており、
現在の日付から 1 年後までの期間について予測データが生成されています(出典:https://www.kaggle.com/datasets/at3191/us-car-prices?resource=download)

In [1]:
# 必要なライブラリのインポート
import pandas as pd
import matplotlib as mlp
import matplotlib.pyplot as plt
import numpy as np
import scipy as sp
import seaborn as sns
import statsmodels.formula.api as smf
import statsmodels.api as sm

# IPython magic: draw matplotlib figures inline in the notebook output.
%matplotlib inline
# Switch plots to seaborn's default theme.
sns.set()

## 1.データの読み込み、表示

In [2]:
# Load the used-car price table from CSV and preview its first rows.
csv_path = 'dataset/car_prices.csv'
data = pd.read_csv(csv_path)
data.head()

Unnamed: 0,make,model,year,mileage,price,color,state,date
0,Chevrolet,Cruze,2011,99157,37488,white,TX,2023-09-26 23:02:01.647970
1,Toyota,Cruze,2017,19882,34176,red,IL,2023-08-08 23:02:01.648413
2,Chevrolet,Altima,2011,77718,40297,blue,CA,2023-02-25 23:02:01.648422
3,Nissan,Fusion,2012,83613,43492,silver,TX,2023-04-25 23:02:01.648425
4,Ford,Cruze,2016,95349,26655,red,NY,2023-12-14 23:02:01.648426


## 2.**make、color、全体**の平均価格

In [17]:
# Mean price for each manufacturer (make).
def _mean_price_of_make(name):
    """Return the mean of `price` over the rows whose `make` equals `name`."""
    return np.mean(data.loc[data['make'] == name, 'price'])


me_make_T = _mean_price_of_make('Toyota')
me_make_N = _mean_price_of_make('Nissan')
me_make_C = _mean_price_of_make('Chevrolet')
me_make_F = _mean_price_of_make('Ford')
me_make_H = _mean_price_of_make('Honda')

# Report the five means in the same order they were computed.
for label, mean_price in (
    ('make T:', me_make_T),
    ('make N:', me_make_N),
    ('make C:', me_make_C),
    ('make F:', me_make_F),
    ('make H:', me_make_H),
):
    print(label, mean_price)

make T: 29944.18446601942
make N: 29580.183035714286
make C: 28676.253588516745
make F: 29929.14438502674
make H: 28687.77011494253


In [18]:
# Mean price for each body color.
def _mean_price_of_color(name):
    """Return the mean of `price` over the rows whose `color` equals `name`."""
    return np.mean(data.loc[data['color'] == name, 'price'])


me_color_w = _mean_price_of_color('white')
me_color_r = _mean_price_of_color('red')
me_color_bu = _mean_price_of_color('blue')
me_color_s = _mean_price_of_color('silver')
me_color_bl = _mean_price_of_color('black')

# Report the five means in the same order they were computed.
for label, mean_price in (
    ('color w:', me_color_w),
    ('color r:', me_color_r),
    ('color bu:', me_color_bu),
    ('color s:', me_color_s),
    ('color bl:', me_color_bl),
):
    print(label, mean_price)

color w: 29080.9587628866
color r: 30145.058536585366
color bu: 29007.671875
color s: 29693.963470319635
color bl: 28854.35789473684


In [19]:
# Grand mean: average price over every row in the dataset.
mu_all = data['price'].mean()
mu_all

29376.222

## 3.**make、color**のデータ数の確認

In [20]:
# Count the price observations available for each make.
make_column = data['make']
price_column = data['price']

# Price series restricted to one make at a time.
data_T = price_column[make_column == 'Toyota']
data_N = price_column[make_column == 'Nissan']
data_C = price_column[make_column == 'Chevrolet']
data_F = price_column[make_column == 'Ford']
data_H = price_column[make_column == 'Honda']

for label, subset in (
    ('data_T:', data_T),
    ('data_N:', data_N),
    ('data_C:', data_C),
    ('data_F:', data_F),
    ('data_H:', data_H),
):
    print(label, subset.count())

data_T: 2060
data_N: 2240
data_C: 2090
data_F: 1870
data_H: 1740


In [7]:
# Count the price observations available for each color.
color_column = data['color']
price_column = data['price']

# Price series restricted to one color at a time.
data_w = price_column[color_column == 'white']
data_r = price_column[color_column == 'red']
data_bu = price_column[color_column == 'blue']
data_s = price_column[color_column == 'silver']
data_bl = price_column[color_column == 'black']

for label, subset in (
    ('data_w:', data_w),
    ('data_r:', data_r),
    ('data_bu:', data_bu),
    ('data_s:', data_s),
    ('data_bl:', data_bl),
):
    print(label, subset.count())

data_w: 1940
data_r: 2050
data_bu: 1920
data_s: 2190
data_bl: 1900


## 4.**全体、make、color、誤差**の平方和を求める

In [8]:
# Total sum of squares: squared deviation of every price from the grand mean.
price_deviation = data['price'] - mu_all
squares_all2 = np.sum(price_deviation ** 2)
squares_all2

1359504242087.1602

In [9]:
# Between-group sum of squares for make.
# Every row is assigned the mean price of its own make, then the squared
# deviations of those group means from the grand mean are summed. Group sizes
# come from the data itself instead of hand-copied counts, so the result stays
# correct if the dataset changes.
make = data.groupby('make')['price'].transform('mean').to_numpy()

squares_make = np.sum((make - mu_all) ** 2)
squares_make

3178112567.6416283

In [10]:
# Between-group sum of squares for color.
# Every row is assigned the mean price of its own color, then the squared
# deviations of those group means from the grand mean are summed. Group sizes
# come from the data itself instead of hand-copied counts, so the result stays
# correct if the dataset changes.
color = data.groupby('color')['price'].transform('mean').to_numpy()

squares_color = np.sum((color - mu_all) ** 2)
squares_color

2380248400.492907

In [11]:
# Residual (error) sum of squares: what remains of the total sum of squares
# after removing the make and color sums of squares.
# NOTE(review): this subtraction is exact only when make and color are
# balanced (orthogonal). The group counts here are unequal, so the value
# differs slightly from the model-based residual that statsmodels reports in
# the two-way ANOVA section -- confirm this approximation is intended.
squares_resid2 = squares_all2 - squares_make - squares_color
squares_resid2

1353945881119.0256

## 5.各種の**自由度**を求める

In [12]:
# Degrees of freedom, derived from the data rather than typed in by hand so
# they stay correct if the dataset changes.
# Each factor contributes (number of levels - 1).
df_make = data['make'].nunique() - 1
df_color = data['color'].nunique() - 1
# Residual: number of price observations, minus one for the grand mean and
# minus the factor degrees of freedom (additive model, no interaction term).
df_resid = data['price'].count() - 1 - df_make - df_color

## 6.**make、color、誤差**の分散を求める

In [13]:
# Mean square for make: its sum of squares divided by its degrees of freedom.
variance_make = squares_make / df_make
variance_make

794528141.9104071

In [14]:
# Mean square for color: its sum of squares divided by its degrees of freedom.
variance_color = squares_color / df_color
variance_color

595062100.1232268

In [15]:
# Residual mean square: residual sum of squares over residual degrees of freedom.
variance_resid2 = squares_resid2 / df_resid
variance_resid2

135516553.00961122

## 7.**make、color**のF比を求める

In [16]:
# F ratio for make: mean square of make relative to the residual mean square.
f_ratio_make = variance_make / variance_resid2
f_ratio_make

5.862960090595404

In [17]:
# F ratio for color: mean square of color relative to the residual mean square.
f_ratio_color = variance_color / variance_resid2
f_ratio_color

4.391065791652945

## 8.**make、color**のp値を求める

In [18]:
# p-value for make: upper-tail probability of the F distribution at the
# observed F ratio. The survival function `sf` evaluates that tail directly,
# which avoids the precision loss of `1 - cdf` when the p-value is very small.
sp.stats.f.sf(f_ratio_make, dfn=df_make, dfd=df_resid)

0.0001040437771613334

In [19]:
# p-value for color: upper-tail probability of the F distribution at the
# observed F ratio. The survival function `sf` evaluates that tail directly,
# which avoids the precision loss of `1 - cdf` when the p-value is very small.
sp.stats.f.sf(f_ratio_color, dfn=df_color, dfd=df_resid)

0.0015103098675539606

## 9.二元配置分散分析の実施

In [20]:
# Fit an additive linear model of price on make and color, then show its
# Type II ANOVA table.
anova_formula = 'price ~ make + color'
fitted_ols = smf.ols(anova_formula, data=data)
car_model_1 = fitted_ols.fit()
sm.stats.anova_lm(car_model_1, typ=2)

Unnamed: 0,sum_sq,df,F,PR(>F)
make,3599297000.0,4.0,6.642025,2.5e-05
color,2801433000.0,4.0,5.169672,0.00037
Residual,1353525000000.0,9991.0,,


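# 10.分散分析表
## 手計算(4〜8章)と statsmodels(9章)の結果には差が出ています。これは、手計算では各要因の平方和をもう一方の要因を考慮せずに求めているのに対し、statsmodels では Type II 平方和を用いており、グループごとのデータ数が不揃いなためです。
## それでも make、color 共に p 値は 0.05 よりも小さくなっているため、**中古車の価格は make、color によって有意に異なる**と判断できました。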