In [1]:
import os
import joblib
import numpy as np
import pandas as pd

In [2]:
from scipy import stats
import pingouin as pg

In [3]:
%precision 3
pd.set_option('display.float_format', lambda x: f'{x:.3f}')

In [4]:
# Display the kernel's current working directory.
os.getcwd()

'C:\\Users\\hdsce\\Documents\\PythonDataAnalysis2\\live'

In [5]:
# Move into the sibling data directory so later cells can use bare file names.
# The guard keeps the cell idempotent: re-running it while the kernel is already
# inside a directory named 'data' no longer climbs to '../data' a second time.
if os.path.basename(os.getcwd()) != 'data':
    os.chdir('../data')

In [6]:
# List the entries of the current working directory.
os.listdir()

['Used_Cars_Price.csv', 'Used_Cars_Price.xlsx', 'Used_Cars_Price.z']

In [7]:
# Load the object stored in 'Used_Cars_Price.z' (path is relative to the current working directory).
# NOTE(review): joblib.load unpickles the file, which can execute arbitrary code — only load trusted files.
df = joblib.load(filename = 'Used_Cars_Price.z')

In [8]:
# Print column names, non-null counts and dtypes of the loaded frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1268 entries, 0 to 1267
Data columns (total 10 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   Price      1268 non-null   int64 
 1   Age        1268 non-null   int64 
 2   KM         1268 non-null   int64 
 3   FuelType   1268 non-null   object
 4   HP         1268 non-null   int64 
 5   MetColor   1268 non-null   object
 6   Automatic  1268 non-null   object
 7   CC         1268 non-null   int64 
 8   Doors      1268 non-null   int64 
 9   Weight     1268 non-null   int64 
dtypes: int64(7), object(3)
memory usage: 99.2+ KB


In [9]:
# Covariance between Age and Price.
df['Age'].cov(df['Price'])

-22157.693

In [11]:
# Pairwise covariance matrix, restricted to the numeric columns.
df.cov(numeric_only = True)

Unnamed: 0,Price,Age,KM,HP,CC,Doors,Weight
Price,4117236.37,-22157.693,-37441074.774,5899.686,18808.748,317.011,15155.222
Age,-22157.693,187.942,171187.124,-7.99,-213.607,-1.181,-98.226
KM,-37441074.774,171187.124,1285819949.253,-155996.12,2557934.806,595.873,359258.458
HP,5899.686,-7.99,-155996.12,171.608,-46.686,1.497,-39.091
CC,18808.748,-213.607,2557934.806,-46.686,33729.999,23.854,5001.293
Doors,317.011,-1.181,595.873,1.497,23.854,0.9,13.211
Weight,15155.222,-98.226,359258.458,-39.091,5001.293,13.211,1145.079


In [12]:
# Correlation coefficient between Age and Price.
df['Age'].corr(df['Price'])

-0.797

In [13]:
# Pairwise correlation matrix, restricted to the numeric columns.
df.corr(numeric_only = True)

Unnamed: 0,Price,Age,KM,HP,CC,Doors,Weight
Price,1.0,-0.797,-0.515,0.222,0.05,0.165,0.221
Age,-0.797,1.0,0.348,-0.044,-0.085,-0.091,-0.212
KM,-0.515,0.348,1.0,-0.332,0.388,0.018,0.296
HP,0.222,-0.044,-0.332,1.0,-0.019,0.12,-0.088
CC,0.05,-0.085,0.388,-0.019,1.0,0.137,0.805
Doors,0.165,-0.091,0.018,0.12,0.137,1.0,0.412
Weight,0.221,-0.212,0.296,-0.088,0.805,0.412,1.0


In [14]:
# Correlation test between Age and Price; the result table includes r, the 95% CI and the p-value.
pg.corr(x = df['Age'], y = df['Price'])

Unnamed: 0,n,r,CI95%,p-val,BF10,power
pearson,1268,-0.797,"[-0.82, -0.78]",0.0,6.1689999999999996e+274,1.0


In [15]:
# Keep only the 'p-val' column of the correlation test result.
pg.corr(x = df['Age'], y = df['Price'])['p-val']

pearson   0.000
Name: p-val, dtype: float64

In [16]:
def corr(x):
    """Return the p-value of the correlation test between ``x`` and Price.

    Parameters
    ----------
    x : column of values to correlate with the notebook-level ``df['Price']``.

    Returns
    -------
    The 'p-val' column of the ``pg.corr`` result table.
    """
    # A named function replaces the lambda bound to a name (PEP 8), so the
    # helper carries a docstring and a real name in tracebacks.
    return pg.corr(x = x, y = df['Price'])['p-val']

In [17]:
# Apply the corr helper to the Age column.
corr(x = df['Age'])

pearson   0.000
Name: p-val, dtype: float64

In [18]:
# Show the dtype of every column.
df.dtypes

Price         int64
Age           int64
KM            int64
FuelType     object
HP            int64
MetColor     object
Automatic    object
CC            int64
Doors         int64
Weight        int64
dtype: object

In [20]:
# Boolean mask over the columns: True for numeric columns.
# Comparing dtypes to the literal 'int64' would silently drop other numeric
# dtypes (e.g. int32 or float64); select_dtypes('number') covers them all and
# yields the same mask when every numeric column is int64.
locs = pd.Series(
    df.columns.isin(df.select_dtypes(include = 'number').columns),
    index = df.columns
)

In [23]:
# Apply corr to every column selected by the boolean mask locs.
# Price itself is among the selected columns, so its entry is a trivial self-correlation.
pvals = df.loc[:, locs].apply(func = corr)

In [24]:
# Flag columns whose p-value is at least 0.05 (True = not significant at the 5% level).
pvals >= 0.05

Unnamed: 0,Price,Age,KM,HP,CC,Doors,Weight
pearson,False,False,False,False,True,False,False


In [25]:
def coef(x):
    """Return the correlation coefficient between ``x`` and Price.

    Parameters
    ----------
    x : column of values to correlate with the notebook-level ``df['Price']``.

    Returns
    -------
    The 'r' column of the ``pg.corr`` result table.
    """
    # A named function replaces the lambda bound to a name (PEP 8), so the
    # helper carries a docstring and a real name in tracebacks.
    return pg.corr(x = x, y = df['Price'])['r']

In [26]:
# Correlation coefficient of each selected column with Price.
df.loc[:, locs].apply(func = coef)

Unnamed: 0,Price,Age,KM,HP,CC,Doors,Weight
pearson,1.0,-0.797,-0.515,0.222,0.05,0.165,0.221


In [27]:
# Per-group normality test of Price, grouped by MetColor.
pg.normality(data = df, dv = 'Price', group = 'MetColor')

Unnamed: 0_level_0,W,pval,normal
MetColor,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,0.975,0.0,False
0,0.988,0.001,False


In [28]:
# Test whether Price has equal variance across the MetColor groups.
pg.homoscedasticity(data = df, dv = 'Price', group = 'MetColor')

Unnamed: 0,W,pval,equal_var
levene,5.761,0.017,False


In [None]:
# Price values of the rows where MetColor equals the string '0'.
df.loc[df['MetColor'].eq('0'), 'Price']

In [29]:
# Split Price into two samples by MetColor ('0' -> sp1, '1' -> sp2).
sp1 = df.loc[df['MetColor'].eq('0'), 'Price']
sp2 = df.loc[df['MetColor'].eq('1'), 'Price']

In [30]:
# Print the mean of each sample, one per line.
print(sp1.mean(), sp2.mean(), sep = '\n')

9466.672897196262
9814.132142857143


In [43]:
# Number of rows in each MetColor category.
df['MetColor'].value_counts()

MetColor
1    840
0    428
Name: count, dtype: int64

In [41]:
# Mean Price per MetColor group, then the difference between consecutive group means.
df.groupby(by = 'MetColor')['Price'].mean().diff()

MetColor
0       NaN
1   347.459
Name: Price, dtype: float64

In [32]:
# Independent two-sample t-test with the unequal-variance (Welch) correction turned off.
pg.ttest(x = sp1, y = sp2, correction = False)

Unnamed: 0,T,dof,alternative,p-val,CI95%,cohen-d,BF10,power
T-test,-2.892,1266,two-sided,0.004,"[-583.18, -111.73]",0.172,4.077,0.824


In [33]:
# Same two-sample t-test with the unequal-variance (Welch) correction turned on.
pg.ttest(x = sp1, y = sp2, correction = True)

Unnamed: 0,T,dof,alternative,p-val,CI95%,cohen-d,BF10,power
T-test,-2.984,935.893,two-sided,0.003,"[-576.0, -118.92]",0.172,5.313,0.824


In [34]:
# Mann-Whitney U test comparing the two samples.
pg.mwu(x = sp1, y = sp2)

Unnamed: 0,U-val,alternative,p-val,RBC,CLES
MWU,166376.0,two-sided,0.03,0.074,0.463


In [35]:
# Check the normality assumption of Price within each Automatic category.
pg.normality(data = df, dv = 'Price', group = 'Automatic')

Unnamed: 0_level_0,W,pval,normal
Automatic,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,0.981,0.0,False
1,0.948,0.006,False


In [36]:
# Check the equal-variance assumption of Price across the Automatic categories.
pg.homoscedasticity(data = df, dv = 'Price', group = 'Automatic')

Unnamed: 0,W,pval,equal_var
levene,1.093,0.296,True


In [37]:
# Split Price into two samples by Automatic ('0' -> sp1, '1' -> sp2).
# This rebinds the sp1/sp2 names used for the MetColor comparison above.
sp1 = df.loc[df['Automatic'].eq('0'), 'Price']
sp2 = df.loc[df['Automatic'].eq('1'), 'Price']

In [44]:
# Number of rows in each Automatic category.
df['Automatic'].value_counts()

Automatic
0    1198
1      70
Name: count, dtype: int64

In [42]:
# Mean Price per Automatic group, then the difference between consecutive group means.
df.groupby(by = 'Automatic')['Price'].mean().diff()

Automatic
0       NaN
1   367.282
Name: Price, dtype: float64

In [38]:
# Run the independent-samples t-test according to whether the equal-variance assumption holds
# (the unequal-variance correction is turned off here).
pg.ttest(x = sp1, y = sp2, correction = False)

Unnamed: 0,T,dof,alternative,p-val,CI95%,cohen-d,BF10,power
T-test,-1.473,1266,two-sided,0.141,"[-856.55, 121.99]",0.181,0.377,0.313


In [39]:
# Also run the Mann-Whitney U test.
pg.mwu(x = sp1, y = sp2)

Unnamed: 0,U-val,alternative,p-val,RBC,CLES
MWU,37216.5,two-sided,0.113,0.112,0.444


In [45]:
# Download the t-test sample data from a shortened URL (requires network access).
df1 = pd.read_csv('https://bit.ly/sample_ttest')

In [46]:
# Print column names, non-null counts and dtypes of the sample frame.
df1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30 entries, 0 to 29
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   id      30 non-null     int64  
 1   before  30 non-null     float64
 2   after   30 non-null     float64
dtypes: float64(2), int64(1)
memory usage: 852.0 bytes


In [47]:
# Preview the first rows.
df1.head()

Unnamed: 0,id,before,after
0,1,88.41,89.048
1,2,84.74,83.796
2,3,106.925,105.783
3,4,93.277,89.966
4,5,104.576,103.411


In [48]:
# Summary statistics of every numeric column.
df1.describe()

Unnamed: 0,id,before,after
count,30.0,30.0,30.0
mean,15.5,86.331,85.325
std,8.803,9.918,9.779
min,1.0,67.504,65.463
25%,8.25,80.124,79.998
50%,15.5,86.618,86.149
75%,22.75,91.69,90.26
max,30.0,106.925,105.783


In [49]:
# Shapiro normality test on the 'before' column; returns the statistic and the p-value.
stats.shapiro(df1['before'])

ShapiroResult(statistic=0.9858832359313965, pvalue=0.9512473940849304)

In [50]:
# Second element of the Shapiro result, i.e. the p-value.
stats.shapiro(df1['before'])[1]

0.951

In [51]:
# One-sample t-test of 'before' against the constant 90 (two-sided).
pg.ttest(x = df1['before'], y = 90)

Unnamed: 0,T,dof,alternative,p-val,CI95%,cohen-d,BF10,power
T-test,-2.026,29,two-sided,0.052,"[82.63, 90.03]",0.37,1.159,0.5


In [52]:
# One-sided version: the alternative hypothesis is that the mean of 'before' is less than 90.
pg.ttest(x = df1['before'], y = 90, alternative = 'less')

Unnamed: 0,T,dof,alternative,p-val,CI95%,cohen-d,BF10,power
T-test,-2.026,29,less,0.026,"[-inf, 89.41]",0.37,2.318,0.631


In [53]:
# Run the Shapiro test on every column; row 0 holds the statistic and row 1 the p-value.
df1.apply(func = stats.shapiro, axis = 0)

Unnamed: 0,id,before,after
0,0.957,0.986,0.985
1,0.266,0.951,0.94


In [54]:
# Shapiro p-value of every column, read from the result's named field.
df1.apply(func = lambda col: stats.shapiro(col).pvalue, axis = 0)

id       0.266
before   0.951
after    0.940
dtype: float64

In [58]:
# Reshape to long format: one row per (id, variable) pair with the measurement in 'value'.
df2 = pd.melt(frame = df1, id_vars = 'id')
df2.head(n = 5)

Unnamed: 0,id,variable,value
0,1,before,88.41
1,2,before,84.74
2,3,before,106.925
3,4,before,93.277
4,5,before,104.576


In [59]:
# Per-group normality test of 'value', grouped by 'variable' (before / after).
pg.normality(data = df2, dv = 'value', group = 'variable')

Unnamed: 0_level_0,W,pval,normal
variable,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
before,0.986,0.951,True
after,0.985,0.94,True


In [61]:
# Paired t-test between 'before' and 'after' (two-sided).
pg.ttest(x = df1['before'], y = df1['after'], paired = True)

Unnamed: 0,T,dof,alternative,p-val,CI95%,cohen-d,BF10,power
T-test,5.537,29,two-sided,0.0,"[0.63, 1.38]",0.102,3529.131,0.084


In [62]:
# One-sided paired t-test: the alternative hypothesis is that 'before' is greater than 'after'.
pg.ttest(x = df1['before'], y = df1['after'], paired = True, alternative = 'greater')

Unnamed: 0,T,dof,alternative,p-val,CI95%,cohen-d,BF10,power
T-test,5.537,29,greater,0.0,"[0.7, inf]",0.102,7058.261,0.136


In [63]:
# Per-group normality test of Price, grouped by FuelType.
pg.normality(data = df, dv = 'Price', group = 'FuelType')

Unnamed: 0_level_0,W,pval,normal
FuelType,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Diesel,0.956,0.0,False
Petrol,0.978,0.0,False
CNG,0.952,0.496,True


In [64]:
# Test whether Price has equal variance across the FuelType groups.
pg.homoscedasticity(data = df, dv = 'Price', group = 'FuelType')

Unnamed: 0,W,pval,equal_var
levene,6.498,0.002,False


In [65]:
# One-way ANOVA of Price with FuelType as the between-subject factor.
pg.anova(data = df, dv = 'Price', between = 'FuelType')

Unnamed: 0,Source,ddof1,ddof2,F,p-unc,np2
0,FuelType,2,1265,3.983,0.019,0.006


In [66]:
# Welch ANOVA of Price across FuelType groups.
pg.welch_anova(data = df, dv = 'Price', between = 'FuelType')

Unnamed: 0,Source,ddof1,ddof2,F,p-unc,np2
0,FuelType,2,38.942,2.612,0.086,0.006


In [67]:
# Kruskal-Wallis H test of Price across FuelType groups.
pg.kruskal(data = df, dv = 'Price', between = 'FuelType')

Unnamed: 0,Source,ddof1,H,p-unc
Kruskal,FuelType,2,10.475,0.005


In [68]:
import scikit_posthocs as sp

In [69]:
# Tukey post-hoc comparison of Price between FuelType groups; the table holds pairwise p-values.
# Here `sp` is the scikit_posthocs module, unrelated to the sp1/sp2 samples defined earlier.
sp.posthoc_tukey(a = df, val_col = 'Price', group_col = 'FuelType')

Unnamed: 0,Diesel,Petrol,CNG
Diesel,1.0,0.016,0.9
Petrol,0.016,1.0,0.759
CNG,0.9,0.759,1.0


In [70]:
# Scheffe post-hoc comparison of Price between FuelType groups.
sp.posthoc_scheffe(a = df, val_col = 'Price', group_col = 'FuelType')

Unnamed: 0,Diesel,Petrol,CNG
Diesel,1.0,0.022,0.934
Petrol,0.022,1.0,0.798
CNG,0.934,0.798,1.0


In [77]:
# df3: the CNG rows only. df4: the full frame followed by four extra copies of the CNG rows.
df3 = df.loc[df['FuelType'].eq('CNG')]
df4 = pd.concat(objs = [df] + [df3] * 4)

In [78]:
# Repeat the equal-variance test on df4, the frame with the CNG rows replicated.
pg.homoscedasticity(data = df4, dv = 'Price', group = 'FuelType')

Unnamed: 0,W,pval,equal_var
levene,7.641,0.001,False


In [79]:
# Repeat the Welch ANOVA on df4, the frame with the CNG rows replicated.
pg.welch_anova(data = df4, dv = 'Price', between = 'FuelType')

Unnamed: 0,Source,ddof1,ddof2,F,p-unc,np2
0,FuelType,2,148.336,3.138,0.046,0.007


In [81]:
# Download the cross-tabulation sample data from a shortened URL (requires network access).
# NOTE: this rebinds df2, which previously held the melted before/after frame.
df2 = pd.read_csv('https://bit.ly/sample_cross')

In [82]:
# Print column names, non-null counts and dtypes.
df2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Coupon    1000 non-null   object
 1   Purchase  1000 non-null   object
dtypes: object(2)
memory usage: 15.8+ KB


In [83]:
# Preview the first rows.
df2.head()

Unnamed: 0,Coupon,Purchase
0,발송함,구매안함
1,발송안함,구매안함
2,발송함,구매안함
3,발송안함,구매안함
4,발송함,구매안함


In [84]:
# Summary of the two object columns: count, number of unique values, most frequent value and its frequency.
df2.describe()

Unnamed: 0,Coupon,Purchase
count,1000,1000
unique,2,2
top,발송함,구매안함
freq,500,900


In [89]:
# Cross-tabulate Coupon (rows) against Purchase (columns).
# normalize = 'index' turns each row into proportions that sum to 1;
# margins adds an overall row labelled with margins_name.
pd.crosstab(
    index = df2['Coupon'], 
    columns = df2['Purchase'], 
    margins = True, 
    margins_name = '합계', 
    normalize = 'index'
)

Purchase,구매안함,구매함
Coupon,Unnamed: 1_level_1,Unnamed: 2_level_1
발송안함,0.98,0.02
발송함,0.82,0.18
합계,0.9,0.1


In [91]:
# Chi-squared independence test between Coupon and Purchase; the result is indexed below.
test = pg.chi2_independence(data = df2, x = 'Coupon', y = 'Purchase')

In [92]:
# Third element of the result: the table of test statistics (one row per test variant).
test[2]

Unnamed: 0,test,lambda,chi2,dof,pval,cramer,power
0,pearson,1.0,69.344,1.0,0.0,0.263,1.0
1,cressie-read,0.667,71.114,1.0,0.0,0.267,1.0
2,log-likelihood,0.0,78.386,1.0,0.0,0.28,1.0
3,freeman-tukey,-0.5,88.485,1.0,0.0,0.297,1.0
4,mod-log-likelihood,-1.0,104.805,1.0,0.0,0.324,1.0
5,neyman,-2.0,173.016,1.0,0.0,0.416,1.0


In [93]:
from statsmodels.stats.proportion import proportions_ztest

In [94]:
# One-sample proportion z-test: 30 successes out of 100 against a null proportion of 0.2.
# NOTE(review): the displayed z = 2.182 matches a variance based on the sample proportion (0.3),
# not the null value; presumably prop_var = 0.2 gives the null-variance form — confirm against the statsmodels docs.
proportions_ztest(count = 30, nobs = 100, value = 0.2)

(2.182, 0.029)

In [95]:
# Two-sample proportion z-test: 100/300 versus 150/500, null hypothesis of zero difference.
proportions_ztest(count = [100, 150], nobs = [300, 500], value = 0.0)

(0.985, 0.325)

In [96]:
# Prepare the modelling frame: drop the CC and Automatic columns, remove the
# CNG rows, and renumber the index from 0.
# errors = 'ignore' makes the cell safe to re-run: once the columns are gone a
# plain drop would raise KeyError on the second execution.
df = (
    df
    .drop(columns = ['CC', 'Automatic'], errors = 'ignore')
    .loc[lambda frame: frame['FuelType'].ne('CNG')]
    .reset_index(drop = True)
)

In [99]:
# Persist the preprocessed frame with joblib; the call returns the list of written file names.
joblib.dump(value = df, filename = 'Used_Cars_Price_Prep.z')

['Used_Cars_Price_Prep.z']

In [100]:
# Also export the preprocessed frame as CSV, without writing the index column.
df.to_csv('Used_Cars_Price_Prep.csv', index = False)