In [2]:
import statistics
import numpy as np
import pandas as pd 
import seaborn as sns 
import scipy.stats as stats  
import matplotlib.pyplot as plt 

In [3]:
# create path variable for imported csv file
carmax_data_import_path = "Carmax - Main (Modified).csv"
# read in csv file as carmax_data_modified using import path variable
carmax_data_modified = pd.read_csv(carmax_data_import_path, index_col=0)
# create backup of carmax_data_modified as carmax_df
df = carmax_data_modified.copy()
df

Unnamed: 0,insert_num,vehicle_make,vehicle_model,vehicle_year,purchase_price,trade_in,financing,customer_age,customer_income,customer_gender,previous_purchase,distance_to_dealer,post_purchase_satisfaction,warranty,subsequent_purchases
0,81690,DODGE,CHARGER,2012,17501,True,True,25,50001,U,False,8.0,0.0,False,1
1,109994,FORD,F150,2007,17501,False,False,55,10000,F,True,19.0,0.0,False,0
2,11399,BMW,328,2010,27501,True,True,45,70001,F,True,21.0,0.0,False,0
3,214156,LEXUS,GS 300,2003,12501,False,True,25,30001,M,False,8.0,0.0,False,0
4,36685,CHEVROLET,CRUZE,2012,17501,True,True,35,130001,M,True,5.0,0.0,True,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
355869,195314,JEEP,LIBERTY,2010,17501,False,True,25,30001,F,False,13.0,0.0,False,0
355870,58219,CHEVROLET,SONIC,2012,12501,True,False,75,0,U,False,2.0,0.0,False,0
355871,207386,KIA,SORENTO,2012,17501,True,True,65,30001,F,True,9.0,0.0,False,1
355872,252391,NISSAN,350Z,2003,12501,False,True,25,50001,M,False,5.0,0.0,False,0


# Measures of Center

In [4]:
mean = np.mean(df['purchase_price']) # find the mean
print("Mean: {}".format(mean)) 

Mean: 19241.313979104965


In [5]:
trim = stats.trim_mean(df['purchase_price'], 0.1) # find the trimmed mean
print("Trimmed Mean: {}".format(trim))

Trimmed Mean: 18248.084650509307


In [6]:
# bin categories
bins = pd.cut(df['purchase_price'], 10).value_counts()
print(bins) 

(9250.1, 18500.2]     219382
(18500.2, 27750.3]     98671
(27750.3, 37000.4]     15265
(37000.4, 46250.5]     10332
(-92.501, 9250.1]       9113
(46250.5, 55500.6]      2255
(55500.6, 64750.7]       581
(64750.7, 74000.8]       200
(74000.8, 83250.9]        54
(83250.9, 92501.0]        21
Name: purchase_price, dtype: int64


In [8]:
med = np.median(df['purchase_price']) # find the median
print("Median: {}".format(med))  

Median: 17501.0


In [9]:
mode = stats.mode(df['purchase_price'])# find the mode
print("Mode: {}".format(mode)) 

Mode: ModeResult(mode=array([17501], dtype=int64), count=array([118204]))


In [10]:
max = df['purchase_price'].max() # find the max value
print("Maximum: {}".format(max)) 

Maximum: 92501


In [11]:
min = df['purchase_price'].min() # find the min value
print("Minimum: {}".format(min)) 

Minimum: 0


# Measures of Variation
## The variance is the mean squared deviation of each data point from the mean.

In [12]:
# method 1
samp_var = np.var(df['purchase_price'], ddof=1) # find the sample variance
print("Sample Variance: {}".format(samp_var))

# method 1
pop_var = np.var(df['purchase_price']) # find the population variance
print("Population Variance: {}".format(pop_var)) 

Sample Variance: 55138380.41320919
Population Variance: 55138225.47528057


In [13]:
# method 2
diff = df['purchase_price'] - np.mean(df['purchase_price']) # subtract each data point from the mean
sq_diff = diff ** 2 # square the value of each difference
sum_sq_diff = np.sum(sq_diff) # sum the squared values of each difference
samp_var = sum_sq_diff / (355874 - 1) # divide the sum of the squared values by n-1
print("Sample Variance: {}".format(samp_var)) # print variance 

# method 2
diff = df['purchase_price'] - np.mean(df['purchase_price']) # subtract each data point from the mean
sq_diff = diff ** 2 # square the value of each difference
sum_sq_diff = np.sum(sq_diff) # sum the squared values of each difference
pop_var = sum_sq_diff / (355874) # divide the sum of the squared values by n-1
print("Population Variance: {}".format(pop_var)) # print variance 

Sample Variance: 55138380.41312469
Population Variance: 55138225.475196056


## The standard deviation is the square root of the mean squared deviation of each data point from the mean.

In [14]:
sample_sd = np.sqrt(np.var(df['purchase_price'], ddof=1)) # find the sample sd 
print("Sample Standard Deviation: {}".format(sample_sd))  

pop_sd = np.std(df['purchase_price'])
print("Population Standard Deviation: {}".format(pop_sd)) # find the population sd

Sample Standard Deviation: 7425.522231682375
Population Standard Deviation: 7425.511798878281


## The Mean Absolute Deviation is the mean deviation of each data point from the mean.

In [15]:
mad = df['purchase_price'] - np.mean(df['purchase_price']) # find the mean absolute deviation
print("Mean Absolute Deviation: {}".format(np.mean(np.abs(mad)))) 

Mean Absolute Deviation: 5590.155761067952


# Measures of Spread

# Z-scores
## This can be found by taking the difference of each actual value to the sample mean, then dividing those values by the sample standard deviation. 
z = (actual - mean) / sd 

In [16]:
# method 1
z = stats.zscore(df['purchase_price'], ddof=1) # find the z-score
print("Z-Scores: {}".format(z)) 

Z-Scores: [-0.23436924 -0.23436924  1.11233739 ... -0.23436924 -0.90772255
 -0.23436924]


In [17]:
# method 2
print("List of Z-Scores:")
diff = df['purchase_price'] - np.mean(df['purchase_price']) # subtract each data point from the mean
z = diff/df['purchase_price'].std() # divide the value of each difference by sd
print(z) 

List of Z-Scores:
0        -0.234369
1        -0.234369
2         1.112337
3        -0.907723
4        -0.234369
            ...   
355869   -0.234369
355870   -0.907723
355871   -0.234369
355872   -0.907723
355873   -0.234369
Name: purchase_price, Length: 355874, dtype: float64


In [18]:
# bin categories and frequency distribution 
print("Frequency Distribution:")
bins = pd.cut(df['purchase_price'], 10) 
print(bins.value_counts()) 

Frequency Distribution:
(9250.1, 18500.2]     219382
(18500.2, 27750.3]     98671
(27750.3, 37000.4]     15265
(37000.4, 46250.5]     10332
(-92.501, 9250.1]       9113
(46250.5, 55500.6]      2255
(55500.6, 64750.7]       581
(64750.7, 74000.8]       200
(74000.8, 83250.9]        54
(83250.9, 92501.0]        21
Name: purchase_price, dtype: int64


## Quantiles, Quartiles, Deciles

In [19]:
quan = np.quantile(df['purchase_price'], 0.5) # find the 50th percentile
print("Quantile: {}".format(quan)) 
quan_2 = np.quantile(df['purchase_price'], np.linspace(0.5, 0.5, 1)) # find 50th percentile 
print("Quantile: {}".format(quan_2))

quar = np.quantile(df['purchase_price'], [0.25, 0.5, 0.75, 1]) # find q1-q4
print("Quartiles: {}".format(quar))
quar_2 = np.quantile(df['purchase_price'], np.linspace(0.25, 1, 4)) # find q1-q10
print("Quartiles: {}".format(quar_2))

deci = np.quantile(df['purchase_price'], [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1]) # find q1-q10
print("Deciles: {}".format(deci))
deci_2 = np.quantile(df['purchase_price'], np.linspace(0.1, 1, 10)) # find q1-q10
print("Deciles: {}".format(deci_2))

Quantile: 17501.0
Quantile: [17501.]
Quartiles: [12501. 17501. 22501. 92501.]
Quartiles: [12501. 17501. 22501. 92501.]
Deciles: [12501. 12501. 12501. 17501. 17501. 17501. 22501. 22501. 27501. 92501.]
Deciles: [12501. 12501. 12501. 17501. 17501. 17501. 22501. 22501. 27501. 92501.]


## Interquartile Range (IQR)

In [20]:
# method 1
iqr = np.quantile(df['purchase_price'], 0.75) - np.quantile(df['purchase_price'], 0.25) # find the IQR
print("IQR: {}".format(iqr))

IQR: 10000.0


In [21]:
# method 2
iqr = stats.iqr(df['purchase_price']) # find the IQR
print("IQR: {}".format(iqr))

IQR: 10000.0


## Outliers

In [22]:
## find any outliers
iqr = stats.iqr(df['purchase_price'])
print("IQR: {}".format(iqr))
lower = np.quantile(df['purchase_price'], 0.25) - (1.5 * iqr)
print("Lower Threshold: {}".format(lower))
upper = np.quantile(df['purchase_price'], 0.75) + (1.5 * iqr) 
print("Upper Threshold: {}".format(upper)) 

IQR: 10000.0
Lower Threshold: -2499.0
Upper Threshold: 37501.0


# Summary Statistics

In [23]:
summary = df['purchase_price'].describe()
print(summary) 

count    355874.000000
mean      19241.313979
std        7425.522232
min           0.000000
25%       12501.000000
50%       17501.000000
75%       22501.000000
max       92501.000000
Name: purchase_price, dtype: float64


# Random Sampling

In [24]:
# method 1
sample = df['customer_age'].sample() # generating a random sample from data set
print("Random Sample Value: {}".format(sample)) 

Random Sample Value: 177238    35
Name: customer_age, dtype: int64


In [25]:
# method 2
np.random.seed(sample) 
print("Random Sample Value: {}".format(sample.sample())) # generating a random sample from data set

Random Sample Value: 177238    35
Name: customer_age, dtype: int64


In [26]:
samps_wout_rplcmnt = df['customer_age'].sample(2) # sampling without replacement
print(samps_wout_rplcmnt)

39386     55
230737    25
Name: customer_age, dtype: int64


In [27]:
samps_wth_rplcmnt = df['customer_age'].sample(5, replace=True)  # sampling with replacement 
print(samps_wth_rplcmnt) 

71140     55
79236     25
123125    35
187767    35
201379    35
Name: customer_age, dtype: int64
