In [2]:
import os
import joblib
import numpy as np
import pandas as pd
from scipy import stats
from statsmodels import robust

In [3]:
df = pd.read_csv('https://bit.ly/Used_Cars_Price')

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1275 entries, 0 to 1274
Data columns (total 10 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   Price      1275 non-null   int64 
 1   Age        1275 non-null   int64 
 2   KM         1275 non-null   int64 
 3   FuelType   1275 non-null   object
 4   HP         1275 non-null   int64 
 5   MetColor   1275 non-null   int64 
 6   Automatic  1275 non-null   int64 
 7   CC         1275 non-null   int64 
 8   Doors      1275 non-null   int64 
 9   Weight     1275 non-null   int64 
dtypes: int64(9), object(1)
memory usage: 99.7+ KB


In [5]:
df.head()

Unnamed: 0,Price,Age,KM,FuelType,HP,MetColor,Automatic,CC,Doors,Weight
0,13500,23,46986,Diesel,90,1,0,2000,3,1165
1,13750,23,72937,Diesel,90,1,0,2000,3,1165
2,13950,24,41711,Diesel,90,1,0,2000,3,1165
3,14950,26,48000,Diesel,90,0,0,2000,3,1165
4,13750,30,38500,Diesel,90,0,0,2000,3,1170


In [6]:
df.columns

Index(['Price', 'Age', 'KM', 'FuelType', 'HP', 'MetColor', 'Automatic', 'CC',
       'Doors', 'Weight'],
      dtype='object')

In [7]:
cols = ['MetColor', 'Automatic']

In [9]:
df[cols] = df[cols].astype(str)

In [10]:
df.dtypes

Price         int64
Age           int64
KM            int64
FuelType     object
HP            int64
MetColor     object
Automatic    object
CC            int64
Doors         int64
Weight        int64
dtype: object

In [11]:
df['Price'].mean()

9690.232941176471

In [12]:
stats.trim_mean(df['Price'], 0.1)

9584.380019588638

In [13]:
stats.trim_mean(df['Price'], 0.5)

9450.0

In [14]:
df['Price'].median()

9450.0

In [15]:
df['FuelType'].mode()

0    Petrol
Name: FuelType, dtype: object

In [16]:
df['FuelType'].value_counts()

Petrol    1129
Diesel     129
CNG         17
Name: FuelType, dtype: int64

In [17]:
df['FuelType'].value_counts().sort_index()

CNG         17
Diesel     129
Petrol    1129
Name: FuelType, dtype: int64

In [18]:
df['FuelType'].value_counts(normalize = True)

Petrol    0.885490
Diesel    0.101176
CNG       0.013333
Name: FuelType, dtype: float64

In [19]:
np.arange(start = 0, stop = 1, step = 0.1)

array([0. , 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9])

In [20]:
np.arange(0, 1.1, 0.1)

array([0. , 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1. ])

In [21]:
np.linspace(start = 0, stop = 1, num = 10+1)

array([0. , 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1. ])

In [22]:
np.linspace(start = 80, stop = 75, num = 30+1)

array([80.        , 79.83333333, 79.66666667, 79.5       , 79.33333333,
       79.16666667, 79.        , 78.83333333, 78.66666667, 78.5       ,
       78.33333333, 78.16666667, 78.        , 77.83333333, 77.66666667,
       77.5       , 77.33333333, 77.16666667, 77.        , 76.83333333,
       76.66666667, 76.5       , 76.33333333, 76.16666667, 76.        ,
       75.83333333, 75.66666667, 75.5       , 75.33333333, 75.16666667,
       75.        ])

In [23]:
df['Price'].quantile(np.linspace(0, 1, 100+1))

0.00     4350.0
0.01     5833.3
0.02     5950.0
0.03     6496.1
0.04     6650.0
         ...   
0.96    13510.0
0.97    13950.0
0.98    14179.6
0.99    14950.0
1.00    15950.0
Name: Price, Length: 101, dtype: float64

In [24]:
df['Price'].quantile(np.linspace(0, 1, 10+1))

0.0     4350.0
0.1     7250.0
0.2     7950.0
0.3     8500.0
0.4     8950.0
0.5     9450.0
0.6     9950.0
0.7    10500.0
0.8    11456.0
0.9    12500.0
1.0    15950.0
Name: Price, dtype: float64

In [25]:
df['Price'].quantile(np.linspace(0, 1, 4+1))

0.00     4350.0
0.25     8250.0
0.50     9450.0
0.75    10950.0
1.00    15950.0
Name: Price, dtype: float64

In [26]:
df['Price'].max() - df['Price'].min()

11600

In [27]:
df['Price'].quantile([0, 1])

0.0     4350.0
1.0    15950.0
Name: Price, dtype: float64

In [28]:
df['Price'].quantile([0, 1]).diff()

0.0        NaN
1.0    11600.0
Name: Price, dtype: float64

In [29]:
df['Price'].quantile([0, 1]).diff().iloc[-1]

11600.0

In [30]:
df['Price'].quantile([0.25, 0.75]).diff().iloc[-1]

2700.0

In [31]:
df['Price'].var()

4120265.326386555

In [32]:
df['Price'].var() ** (1/2)

2029.8436704304484

In [33]:
df['Price'].std()

2029.8436704304484

In [34]:
robust.mad(df['Price'])

2223.903327758403

In [36]:
df.describe().round(2)

Unnamed: 0,Price,Age,KM,HP,CC,Doors,Weight
count,1275.0,1275.0,1275.0,1275.0,1275.0,1275.0,1275.0
mean,9690.23,60.6,73692.41,100.4,1559.51,4.0,1062.69
std,2029.84,13.69,35949.78,13.11,184.69,0.95,39.79
min,4350.0,19.0,1.0,69.0,1300.0,2.0,1000.0
25%,8250.0,51.0,48948.5,86.0,1400.0,3.0,1035.0
50%,9450.0,63.0,67451.0,110.0,1600.0,4.0,1060.0
75%,10950.0,71.5,90325.0,110.0,1600.0,5.0,1075.0
max,15950.0,80.0,243000.0,116.0,2000.0,5.0,1615.0


In [37]:
df.describe().apply(func = lambda x: x.round(2).astype(int), axis = 0)

Unnamed: 0,Price,Age,KM,HP,CC,Doors,Weight
count,1275,1275,1275,1275,1275,1275,1275
mean,9690,60,73692,100,1559,4,1062
std,2029,13,35949,13,184,0,39
min,4350,19,1,69,1300,2,1000
25%,8250,51,48948,86,1400,3,1035
50%,9450,63,67451,110,1600,4,1060
75%,10950,71,90325,110,1600,5,1075
max,15950,80,243000,116,2000,5,1615


In [38]:
df.describe().astype(int)

Unnamed: 0,Price,Age,KM,HP,CC,Doors,Weight
count,1275,1275,1275,1275,1275,1275,1275
mean,9690,60,73692,100,1559,3,1062
std,2029,13,35949,13,184,0,39
min,4350,19,1,69,1300,2,1000
25%,8250,51,48948,86,1400,3,1035
50%,9450,63,67451,110,1600,4,1060
75%,10950,71,90325,110,1600,5,1075
max,15950,80,243000,116,2000,5,1615


In [40]:
imsi = df.sort_values(by = ['KM']).head()
imsi

Unnamed: 0,Price,Age,KM,FuelType,HP,MetColor,Automatic,CC,Doors,Weight
433,7500,50,1,Diesel,90,1,0,2000,3,1260
1267,6950,76,1,Petrol,110,0,0,1600,5,1114
432,11250,47,5309,Petrol,110,1,1,1600,3,1070
18,14900,30,7000,Petrol,97,1,0,1400,5,1100
7,12950,29,9750,Petrol,97,1,0,1400,3,1100


In [42]:
imsi.iloc[0]

Price          7500
Age              50
KM                1
FuelType     Diesel
HP               90
MetColor          1
Automatic         0
CC             2000
Doors             3
Weight         1260
Name: 433, dtype: object

In [43]:
imsi.loc[433]

Price          7500
Age              50
KM                1
FuelType     Diesel
HP               90
MetColor          1
Automatic         0
CC             2000
Doors             3
Weight         1260
Name: 433, dtype: object

In [44]:
imsi.loc[imsi['KM'].eq(1)]

Unnamed: 0,Price,Age,KM,FuelType,HP,MetColor,Automatic,CC,Doors,Weight
433,7500,50,1,Diesel,90,1,0,2000,3,1260
1267,6950,76,1,Petrol,110,0,0,1600,5,1114


In [45]:
imsi.iloc[imsi['KM'].eq(1)]

NotImplementedError: iLocation based boolean indexing on an integer type is not available

In [49]:
imsi = imsi.reset_index(drop = True)
imsi

Unnamed: 0,Price,Age,KM,FuelType,HP,MetColor,Automatic,CC,Doors,Weight
0,7500,50,1,Diesel,90,1,0,2000,3,1260
1,6950,76,1,Petrol,110,0,0,1600,5,1114
2,11250,47,5309,Petrol,110,1,1,1600,3,1070
3,14900,30,7000,Petrol,97,1,0,1400,5,1100
4,12950,29,9750,Petrol,97,1,0,1400,3,1100


In [50]:
df = df[df['KM'].gt(1)].reset_index(drop = True)

In [51]:
df.shape[0]

1273

In [52]:
os.getcwd()

'C:\\Users\\hdsce\\Documents\\PythonAdvanced-main\\live'

In [53]:
os.chdir(path = '../data')

In [54]:
os.listdir()

['Seafood_Trade_Prep.z', 'Used_Cars_Price.csv', 'Used_Cars_Price.xlsx']

In [58]:
%time df.to_excel('Used_Cars_Price.xlsx', index = None)

CPU times: total: 344 ms
Wall time: 508 ms


In [59]:
%time df.to_csv('Used_Cars_Price.csv', index = None)

CPU times: total: 0 ns
Wall time: 12.7 ms


In [60]:
%time joblib.dump(value = df, filename = 'Used_Cars_Price.z')

CPU times: total: 0 ns
Wall time: 5.28 ms


['Used_Cars_Price.z']

In [61]:
import seaborn as sns
import matplotlib.pyplot as plt

In [62]:
plt.rc(group = 'font', family = 'Gowun Dodum', size = 10)
plt.rc(group = 'figure', figsize = (4, 4), dpi = 100)
plt.rc(group = 'axes', unicode_minus = False)
plt.rc(group = 'legend', frameon = True, fc = '1', ec = '0')