# Week 2 Statistical Foundations in Data Processing

## Descriptive Statistics

#### Mean

In [13]:
import numpy as np
import pandas as pd

# Load the Iris dataset

url = 'iris.csv'  # Change this to the path of your downloaded file
column_names = ['sepal_length', 'sepal_width', 'petal_length', 'petal_width', 'class']
iris_data = pd.read_csv(url, header=None, names=column_names)
df = pd.DataFrame(data=iris_data, columns=column_names)

# Calculate the mean for each feature
mean_sepal_length = np.mean(iris_data['sepal_length'])
mean_sepal_width = np.mean(iris_data['sepal_width'])
mean_petal_length = np.mean(iris_data['petal_length'])
mean_petal_width = np.mean(iris_data['petal_width'])

print(f"Mean Sepal Length: {mean_sepal_length:.2f}")
print(f"Mean Sepal Width: {mean_sepal_width:.2f}")
print(f"Mean Petal Length: {mean_petal_length:.2f}")
print(f"Mean Petal Width: {mean_petal_width:.2f}")


Mean Sepal Length: 5.84
Mean Sepal Width: 3.05
Mean Petal Length: 3.76
Mean Petal Width: 1.20


In [14]:
# Calculate the median for each feature
median_sepal_length = np.median(iris_data['sepal_length'])
median_sepal_width = np.median(iris_data['sepal_width'])
median_petal_length = np.median(iris_data['petal_length'])
median_petal_width = np.median(iris_data['petal_width'])

print(f"Median Sepal Length: {median_sepal_length:.2f}")
print(f"Median Sepal Width: {median_sepal_width:.2f}")
print(f"Median Petal Length: {median_petal_length:.2f}")
print(f"Median Petal Width: {median_petal_width:.2f}")


Median Sepal Length: 5.80
Median Sepal Width: 3.00
Median Petal Length: 4.35
Median Petal Width: 1.30


In [15]:
# Determine the mode for each feature
mode_sepal_length = iris_data['sepal_length'].mode()
mode_sepal_width = iris_data['sepal_width'].mode()
mode_petal_length = iris_data['petal_length'].mode()
mode_petal_width = iris_data['petal_width'].mode()

print(f"Mode Sepal Length: {mode_sepal_length.values}")
print(f"Mode Sepal Width: {mode_sepal_width.values}")
print(f"Mode Petal Length: {mode_petal_length.values}")
print(f"Mode Petal Width: {mode_petal_width.values}")


Mode Sepal Length: [5.]
Mode Sepal Width: [3.]
Mode Petal Length: [1.5]
Mode Petal Width: [0.2]


In [16]:
# Calculate the range for each feature
range_sepal_length = np.max(iris_data['sepal_length']) - np.min(iris_data['sepal_length'])
range_sepal_width = np.max(iris_data['sepal_width']) - np.min(iris_data['sepal_width'])
range_petal_length = np.max(iris_data['petal_length']) - np.min(iris_data['petal_length'])
range_petal_width = np.max(iris_data['petal_width']) - np.min(iris_data['petal_width'])

print(f"Range Sepal Length: {range_sepal_length:.2f}")
print(f"Range Sepal Width: {range_sepal_width:.2f}")
print(f"Range Petal Length: {range_petal_length:.2f}")
print(f"Range Petal Width: {range_petal_width:.2f}")


Range Sepal Length: 3.60
Range Sepal Width: 2.40
Range Petal Length: 5.90
Range Petal Width: 2.40


In [17]:
# Calculate the variance for each feature
variance_sepal_length = np.var(iris_data['sepal_length'])
variance_sepal_width = np.var(iris_data['sepal_width'])
variance_petal_length = np.var(iris_data['petal_length'])
variance_petal_width = np.var(iris_data['petal_width'])

print(f"Variance Sepal Length: {variance_sepal_length:.2f}")
print(f"Variance Sepal Width: {variance_sepal_width:.2f}")
print(f"Variance Petal Length: {variance_petal_length:.2f}")
print(f"Variance Petal Width: {variance_petal_width:.2f}")


Variance Sepal Length: 0.68
Variance Sepal Width: 0.19
Variance Petal Length: 3.09
Variance Petal Width: 0.58


In [18]:
# Calculate the standard deviation for each feature
std_sepal_length = np.std(iris_data['sepal_length'])
std_sepal_width = np.std(iris_data['sepal_width'])
std_petal_length = np.std(iris_data['petal_length'])
std_petal_width = np.std(iris_data['petal_width'])

print(f"Standard Deviation Sepal Length: {std_sepal_length:.2f}")
print(f"Standard Deviation Sepal Width: {std_sepal_width:.2f}")
print(f"Standard Deviation Petal Length: {std_petal_length:.2f}")
print(f"Standard Deviation Petal Width: {std_petal_width:.2f}")


Standard Deviation Sepal Length: 0.83
Standard Deviation Sepal Width: 0.43
Standard Deviation Petal Length: 1.76
Standard Deviation Petal Width: 0.76


### Day 2 Data Preprocessing



In [27]:
import pandas as pd

# Correctly load the dataset
column_names = ['sepal_length', 'sepal_width', 'petal_length', 'petal_width', 'class']
iris_data = pd.read_csv('iris_preprocess.csv', header=None, names=column_names)

iris_data.info()




<class 'pandas.core.frame.DataFrame'>
RangeIndex: 153 entries, 0 to 152
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   sepal_length  153 non-null    float64
 1   sepal_width   153 non-null    float64
 2   petal_length  153 non-null    object 
 3   petal_width   152 non-null    float64
 4   class         150 non-null    object 
dtypes: float64(3), object(2)
memory usage: 6.1+ KB


In [30]:
# Remove instances with missing values
iris_data_cleaned = iris_data.dropna()

iris_data_cleaned.info()

<class 'pandas.core.frame.DataFrame'>
Index: 150 entries, 0 to 149
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   sepal_length  150 non-null    float64
 1   sepal_width   150 non-null    float64
 2   petal_length  150 non-null    object 
 3   petal_width   150 non-null    float64
 4   class         150 non-null    object 
dtypes: float64(3), object(2)
memory usage: 7.0+ KB


In [32]:
# IMPUTATION: Fill missing values with the mean of each feature 

left off here...

data_imputed = iris_data.fillna(data.mean())

TypeError: Could not convert ['Iris-setosaIris-setosaIris-setosaIris-setosaIris-setosaIris-setosaIris-setosaIris-setosaIris-setosaIris-setosaIris-setosaIris-setosaIris-setosaIris-setosaIris-setosaIris-setosaIris-setosaIris-setosaIris-setosaIris-setosaIris-setosaIris-setosaIris-setosaIris-setosaIris-setosaIris-setosaIris-setosaIris-setosaIris-setosaIris-setosaIris-setosaIris-setosaIris-setosaIris-setosaIris-setosaIris-setosaIris-setosaIris-setosaIris-setosaIris-setosaIris-setosaIris-setosaIris-setosaIris-setosaIris-setosaIris-setosaIris-setosaIris-setosaIris-setosaIris-versicolorIris-versicolorIris-versicolorIris-versicolorIris-versicolorIris-versicolorIris-versicolorIris-versicolorIris-versicolorIris-versicolorIris-versicolorIris-versicolorIris-versicolorIris-versicolorIris-versicolorIris-versicolorIris-versicolorIris-versicolorIris-versicolorIris-versicolorIris-versicolorIris-versicolorIris-versicolorIris-versicolorIris-versicolorIris-versicolorIris-versicolorIris-versicolorIris-versicolorIris-versicolorIris-versicolorIris-versicolorIris-versicolorIris-versicolorIris-versicolorIris-versicolorIris-versicolorIris-versicolorIris-versicolorIris-versicolorIris-versicolorIris-versicolorIris-versicolorIris-versicolorIris-versicolorIris-versicolorIris-versicolorIris-versicolorIris-versicolorIris-versicolorIris-virginicaIris-virginicaIris-virginicaIris-virginicaIris-virginicaIris-virginicaIris-virginicaIris-virginicaIris-virginicaIris-virginicaIris-virginicaIris-virginicaIris-virginicaIris-virginicaIris-virginicaIris-virginicaIris-virginicaIris-virginicaIris-virginicaIris-virginicaIris-virginicaIris-virginicaIris-virginicaIris-virginicaIris-virginicaIris-virginicaIris-virginicaIris-virginicaIris-virginicaIris-virginicaIris-virginicaIris-virginicaIris-virginicaIris-virginicaIris-virginicaIris-virginicaIris-virginicaIris-virginicaIris-virginicaIris-virginicaIris-virginicaIris-virginicaIris-virginicaIris-virginicaIris-virginicaIris-virginicaIris-virginicaIris-virginicaIris-virginicaIris-virginica'] to numeric