In [3]:
import pandas as pd
import numpy as np


In [9]:
# Load the arrests dataset into the notebook-level frame `df` that the cells below read.
# NOTE(review): this is an absolute, machine-specific Windows path, so the cell fails on
# any other machine; consider a path relative to a configurable data directory.
df = pd.read_csv(r"D:\tutorials\PROGRAMMING\Statistics For Machine Learning\USArrests.csv")
df

Unnamed: 0.1,Unnamed: 0,Murder,Assault,UrbanPop,Fraud
0,Alabama,13.2,236,58,21.2
1,Alaska,10.0,263,48,44.5
2,Arizona,8.1,294,80,31.0
3,Arkansas,8.8,190,50,19.5
4,California,9.0,276,91,40.6
5,Colorado,7.9,204,78,38.7
6,Connecticut,3.3,110,77,11.1
7,Delaware,5.9,238,72,15.8
8,Florida,15.4,335,80,31.9
9,Georgia,17.4,211,60,25.8


In [8]:
from scipy.stats import iqr, skew, kurtosis

In [10]:
# Interquartile range (Q3 - Q1) of each selected column; axis=0 reduces down the rows.
iqr(df[['Assault', 'Fraud']], axis = 0)

array([140. ,  11.1])

In [13]:
kurtosis(df[['Assault', 'Fraud']], axis = 0, fisher = False) # Pearson kurtosis: a normal distribution scores 3

array([1.93097995, 3.20189779])

In [14]:
kurtosis(df[['Assault', 'Fraud']], axis = 0, fisher = True) # Fisher (excess) kurtosis: Pearson's value minus 3, so a normal distribution scores 0

array([-1.06902005,  0.20189779])

In [15]:
skew(df[['Assault', 'Fraud']], axis = 0)  # skewness greater than 0, so both columns are right-skewed (positively skewed: longer tail on the high side)

array([0.22731787, 0.77696132])

In [16]:
# Determine the z-score of every Assault value: (x - mean) / standard deviation.
# scipy's zscore defaults to ddof=0, i.e. it divides by the population standard deviation.

from scipy.stats import zscore
zscore(df['Assault'])

0     0.790787
1     1.118060
2     1.493817
3     0.233212
4     1.275635
5     0.402909
6    -0.736484
7     0.815030
8     1.990786
9     0.487757
10   -1.512241
11   -0.615272
12    0.948363
13   -0.700121
14   -1.391029
15   -0.675878
16   -0.748605
17    0.948363
18   -1.063757
19    1.566544
20   -0.263757
21    1.021090
22   -1.197090
23    1.069575
24    0.087757
25   -0.748605
26   -0.833454
27    0.984726
28   -1.378908
29   -0.142545
30    1.384726
31    1.008969
32    2.015028
33   -1.524362
34   -0.615272
35   -0.239515
36   -0.142545
37   -0.784969
38    0.039273
39    1.311999
40   -1.027393
41    0.208970
42    0.366545
43   -0.615272
44   -1.487999
45   -0.178909
46   -0.312242
47   -1.087999
48   -1.427393
49   -0.118303
Name: Assault, dtype: float64

In [27]:
# Based on the IQR, determine the outliers for a column (e.g. UrbanPop)
# Create a custom function that returns the rows holding outlier values

def outlier_treat(column, data=None, k=1.5):
    """Return the rows whose value in ``column`` is an outlier under the IQR rule.

    A value is flagged when it lies strictly below ``q1 - k * IQR`` or strictly
    above ``q3 + k * IQR``, where ``q1``/``q3`` are the 25th/75th percentiles of
    the column. Despite the name, nothing is modified or "treated": the function
    only prints the quartiles and fences and returns the offending rows.

    Parameters
    ----------
    column : str
        Name of the numeric column to inspect.
    data : pandas.DataFrame, optional
        Frame to inspect. Defaults to the notebook-level ``df`` so that existing
        calls such as ``outlier_treat('Assault')`` keep working.
    k : float, optional
        Fence multiplier applied to the IQR (1.5 by default).

    Returns
    -------
    pandas.DataFrame
        The subset of ``data`` whose ``column`` value falls outside the fences;
        empty when there are no outliers.
    """
    if data is None:
        # Backward-compatible fallback to the notebook-global frame.
        data = df
    values = data[column]
    q3 = np.quantile(values, 0.75)
    q1 = np.quantile(values, 0.25)
    IQR = q3 - q1
    lower_limit = q1 - (k * IQR)
    upper_limit = q3 + (k * IQR)
    print(f'q3: {q3}, q1: {q1}, IQR: {IQR}')
    print(f'Lower limit: {lower_limit}, Upper Limit: {upper_limit}')
    # Keep only the records below the lower fence or above the upper fence.
    return data[(values < lower_limit) | (values > upper_limit)]

In [28]:
# IQR-rule check on Assault: prints the quartiles and fences, displays the flagged rows.
outlier_treat('Assault')

q3: 249.0, q1: 109.0, IQR: 140.0
Lower limit: -101.0, Upper Limit: 459.0


Unnamed: 0.1,Unnamed: 0,Murder,Assault,UrbanPop,Fraud


In [29]:
# IQR-rule check on UrbanPop: prints the quartiles and fences, displays the flagged rows.
# NOTE(review): a flagged value orders of magnitude above the rest of the column is
# presumably a data-entry error rather than a genuine observation — verify against the source.
outlier_treat('UrbanPop')

q3: 77.75, q1: 54.5, IQR: 23.25
Lower limit: 19.625, Upper Limit: 112.625


Unnamed: 0.1,Unnamed: 0,Murder,Assault,UrbanPop,Fraud
20,Massachusetts,4.4,149,30000,16.3


In [30]:
# IQR-rule check on Murder: prints the quartiles and fences, displays the flagged rows.
outlier_treat('Murder')

q3: 11.25, q1: 4.075, IQR: 7.175
Lower limit: -6.687499999999999, Upper Limit: 22.0125


Unnamed: 0.1,Unnamed: 0,Murder,Assault,UrbanPop,Fraud


In [31]:
# IQR-rule check on Fraud: prints the quartiles and fences, displays the flagged rows.
outlier_treat('Fraud')

q3: 26.175, q1: 15.075, IQR: 11.100000000000001
Lower limit: -1.5750000000000028, Upper Limit: 42.825


Unnamed: 0.1,Unnamed: 0,Murder,Assault,UrbanPop,Fraud
1,Alaska,10.0,263,48,44.5
27,Nevada,12.2,252,81,46.0


In [33]:
# Find outliers based on the z-score.
# The absolute value is taken so values far above and far below the mean are treated alike.
# NOTE: a single extreme value inflates both the mean and the standard deviation, which
# pushes the |z| of every other row towards a similar small number.
z  = np.abs(zscore(df['UrbanPop']))
z

0     0.144561
1     0.146947
2     0.139311
3     0.146470
4     0.136687
5     0.139788
6     0.140027
7     0.141220
8     0.139311
9     0.144083
10    0.138595
11    0.145515
12    0.138595
13    0.142890
14    0.144799
15    0.142652
16    0.145992
17    0.142652
18    0.146231
19    0.142413
20    6.999961
21    0.140743
22    0.142652
23    0.147901
24    0.141697
25    0.145754
26    0.143606
27    0.139073
28    0.145038
29    0.137164
30    0.141697
31    0.137880
32    0.147663
33    0.147901
34    0.140504
35    0.142175
36    0.142413
37    0.141220
38    0.137641
39    0.146947
40    0.147663
41    0.144322
42    0.139311
43    0.139311
44    0.150765
45    0.143368
46    0.140982
47    0.149094
48    0.142652
49    0.144083
Name: UrbanPop, dtype: float64

In [34]:
# Positions of the rows whose |z| exceeds 2 (np.where returns a tuple of index arrays).
# NOTE(review): the cut-off of 2 is unexplained; 3 is the more common convention — confirm the intent.
np.where(z > 2)

(array([20], dtype=int64),)

In [36]:
# Inspect the flagged record by position; 20 is hard-coded from the np.where result.
df.iloc[20]

Unnamed: 0    Massachusetts
Murder                  4.4
Assault                 149
UrbanPop              30000
Fraud                  16.3
Name: 20, dtype: object

In [41]:
# Get the range (maximum - minimum) of each column; np.ptp stands for "peak to peak".
print(f"Range for UrbanPop is: {np.ptp(df['UrbanPop'])}")
print(f"Range for Assault is: {np.ptp(df['Assault'])}")

Range for UrbanPop is: 29968
Range for Assault is: 292


In [44]:
# Sample standard deviation of each column: pandas' std defaults to ddof=1, whereas
# scipy.stats.zscore defaults to ddof=0, so the two use slightly different denominators.
df[['Assault', 'Fraud', 'UrbanPop']].std()

Assault       83.337661
Fraud          9.366385
UrbanPop    4233.451910
dtype: float64