-
Notifications
You must be signed in to change notification settings - Fork 1
/
functions.py
32 lines (28 loc) · 1.21 KB
/
functions.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
# 🔄 Functions
def open_data(data):
    """Print a quick overview of a DataFrame and return a small row sample.

    Prints, in order: the shape, a blank line, the per-column dtypes, a
    blank line, and a heading announcing the sample.

    Args:
        data: pandas DataFrame to inspect.

    Returns:
        A DataFrame holding up to 5 randomly chosen rows of ``data`` (all of
        its rows, in random order, when it has fewer than 5).
    """
    print(f"Data shape is {data.shape}.")
    print()
    print(data.dtypes)
    print()
    print("Data row sample and full columns:")
    # DataFrame.sample draws without replacement and raises ValueError when
    # asked for more rows than exist, so cap the request at the row count.
    return data.sample(min(5, len(data)))
def explore_data(data):
    """Report duplicate rows and per-column NaN / single-space counts.

    Prints the number of fully duplicated rows, then returns a summary
    table with one row per column of ``data``.

    Args:
        data: pandas DataFrame to inspect.

    Returns:
        A DataFrame indexed by the column labels of ``data`` with two
        columns: ``"NaN"`` (count of missing values) and ``"EmptySpaces"``
        (count of cells equal to exactly one space character, ``' '``).
    """
    # Kept as a function-scope import: this module has no top-level pandas
    # import, so the name must be brought into scope here.
    import pandas as pd

    duplicate_rows = data.duplicated().sum()
    nan_values = data.isna().sum()
    # Only cells that are exactly ' ' are counted; empty strings or longer
    # runs of whitespace are not matched by this comparison.
    empty_spaces = data.eq(' ').sum()

    exploration = pd.DataFrame({"NaN": nan_values, "EmptySpaces": empty_spaces})
    # Reuse the count computed above instead of scanning for duplicates again.
    print(f"There are {duplicate_rows} duplicate rows. Also;")
    return exploration
def snake_columns(data):
    """Rename the columns of ``data`` in place to lower-case with underscores.

    Each string column label is lower-cased and every space in it is
    replaced by an underscore. Labels that are not strings (for example
    integer headers) are left untouched instead of raising
    ``AttributeError`` on ``.lower()``.

    Args:
        data: pandas DataFrame whose ``columns`` attribute is reassigned.

    Returns:
        An empty (zero-row) sample of ``data`` carrying the renamed columns.
    """
    data.columns = [
        column.lower().replace(' ', '_') if isinstance(column, str) else column
        for column in data.columns
    ]
    # sample(0) yields a zero-row frame: it shows the new header only.
    return data.sample(0)
def outlier_slayer(data, factor=1.5):
    """Drop rows whose numeric values fall outside the IQR fences.

    For every numeric column, the first and third quartiles (Q1, Q3) are
    computed and only rows with a value inside
    ``[Q1 - factor * IQR, Q3 + factor * IQR]`` are kept, where
    ``IQR = Q3 - Q1``. Columns are processed one after another, so the
    quartiles of later columns are computed on the rows that survived the
    earlier ones. Rows holding NaN in a numeric column are dropped as well,
    because both comparisons evaluate to False for NaN.

    Args:
        data: pandas DataFrame to filter. It is not modified; a filtered
            frame is returned.
        factor: multiplier applied to the IQR to build the fences.
            Defaults to 1.5, the previously hard-coded value.

    Returns:
        A DataFrame containing only the rows that pass the filter for every
        numeric column.
    """
    # The "number" selector picks all numeric dtypes without needing numpy
    # in scope (this module never imports it, so `np.number` was a NameError).
    numeric_columns = data.select_dtypes(include="number").columns
    for column in numeric_columns:
        q1 = data[column].quantile(0.25)
        q3 = data[column].quantile(0.75)
        iqr = q3 - q1
        lower_bound = q1 - factor * iqr
        upper_bound = q3 + factor * iqr
        data = data[(data[column] >= lower_bound) & (data[column] <= upper_bound)]
    return data