# Week 2 Statistical Foundations in Data Processing

## Descriptive Statistics

#### Mean

In [2]:
import numpy as np
import pandas as pd

# Load the Iris dataset

url = 'iris.csv'  # Change this to the path of your downloaded file
column_names = ['sepal_length', 'sepal_width', 'petal_length', 'petal_width', 'class']
iris_data = pd.read_csv(url, header=None, names=column_names)
df = pd.DataFrame(data=iris_data, columns=column_names)

# Calculate the mean for each feature
mean_sepal_length = np.mean(iris_data['sepal_length'])
mean_sepal_width = np.mean(iris_data['sepal_width'])
mean_petal_length = np.mean(iris_data['petal_length'])
mean_petal_width = np.mean(iris_data['petal_width'])

print(f"Mean Sepal Length: {mean_sepal_length:.2f}")
print(f"Mean Sepal Width: {mean_sepal_width:.2f}")
print(f"Mean Petal Length: {mean_petal_length:.2f}")
print(f"Mean Petal Width: {mean_petal_width:.2f}")


Mean Sepal Length: 5.84
Mean Sepal Width: 3.05
Mean Petal Length: 3.76
Mean Petal Width: 1.20


In [3]:
# Calculate the median for each feature
median_sepal_length = np.median(iris_data['sepal_length'])
median_sepal_width = np.median(iris_data['sepal_width'])
median_petal_length = np.median(iris_data['petal_length'])
median_petal_width = np.median(iris_data['petal_width'])

print(f"Median Sepal Length: {median_sepal_length:.2f}")
print(f"Median Sepal Width: {median_sepal_width:.2f}")
print(f"Median Petal Length: {median_petal_length:.2f}")
print(f"Median Petal Width: {median_petal_width:.2f}")


Median Sepal Length: 5.80
Median Sepal Width: 3.00
Median Petal Length: 4.35
Median Petal Width: 1.30


In [4]:
# Determine the mode for each feature
mode_sepal_length = iris_data['sepal_length'].mode()
mode_sepal_width = iris_data['sepal_width'].mode()
mode_petal_length = iris_data['petal_length'].mode()
mode_petal_width = iris_data['petal_width'].mode()

print(f"Mode Sepal Length: {mode_sepal_length.values}")
print(f"Mode Sepal Width: {mode_sepal_width.values}")
print(f"Mode Petal Length: {mode_petal_length.values}")
print(f"Mode Petal Width: {mode_petal_width.values}")


Mode Sepal Length: [5.]
Mode Sepal Width: [3.]
Mode Petal Length: [1.5]
Mode Petal Width: [0.2]


In [5]:
# Calculate the range for each feature
range_sepal_length = np.max(iris_data['sepal_length']) - np.min(iris_data['sepal_length'])
range_sepal_width = np.max(iris_data['sepal_width']) - np.min(iris_data['sepal_width'])
range_petal_length = np.max(iris_data['petal_length']) - np.min(iris_data['petal_length'])
range_petal_width = np.max(iris_data['petal_width']) - np.min(iris_data['petal_width'])

print(f"Range Sepal Length: {range_sepal_length:.2f}")
print(f"Range Sepal Width: {range_sepal_width:.2f}")
print(f"Range Petal Length: {range_petal_length:.2f}")
print(f"Range Petal Width: {range_petal_width:.2f}")


Range Sepal Length: 3.60
Range Sepal Width: 2.40
Range Petal Length: 5.90
Range Petal Width: 2.40


In [6]:
# Calculate the variance for each feature
variance_sepal_length = np.var(iris_data['sepal_length'])
variance_sepal_width = np.var(iris_data['sepal_width'])
variance_petal_length = np.var(iris_data['petal_length'])
variance_petal_width = np.var(iris_data['petal_width'])

print(f"Variance Sepal Length: {variance_sepal_length:.2f}")
print(f"Variance Sepal Width: {variance_sepal_width:.2f}")
print(f"Variance Petal Length: {variance_petal_length:.2f}")
print(f"Variance Petal Width: {variance_petal_width:.2f}")


Variance Sepal Length: 0.68
Variance Sepal Width: 0.19
Variance Petal Length: 3.09
Variance Petal Width: 0.58


In [7]:
# Calculate the standard deviation for each feature
std_sepal_length = np.std(iris_data['sepal_length'])
std_sepal_width = np.std(iris_data['sepal_width'])
std_petal_length = np.std(iris_data['petal_length'])
std_petal_width = np.std(iris_data['petal_width'])

print(f"Standard Deviation Sepal Length: {std_sepal_length:.2f}")
print(f"Standard Deviation Sepal Width: {std_sepal_width:.2f}")
print(f"Standard Deviation Petal Length: {std_petal_length:.2f}")
print(f"Standard Deviation Petal Width: {std_petal_width:.2f}")


Standard Deviation Sepal Length: 0.83
Standard Deviation Sepal Width: 0.43
Standard Deviation Petal Length: 1.76
Standard Deviation Petal Width: 0.76


### Day 2 Data Preprocessing



In [17]:
import pandas as pd

# Correctly load the dataset
column_names = ['sepal_length', 'sepal_width', 'petal_length', 'petal_width', 'class']
iris_data = pd.read_csv('iris_preprocess.csv', names=column_names)

iris_data.info()




<class 'pandas.core.frame.DataFrame'>
RangeIndex: 153 entries, 0 to 152
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   sepal_length  153 non-null    float64
 1   sepal_width   153 non-null    float64
 2   petal_length  153 non-null    object 
 3   petal_width   152 non-null    float64
 4   class         150 non-null    object 
dtypes: float64(3), object(2)
memory usage: 6.1+ KB


In [18]:
# Remove instances with missing values
iris_data_cleaned = iris_data.dropna()

iris_data_cleaned.info()

<class 'pandas.core.frame.DataFrame'>
Index: 150 entries, 0 to 149
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   sepal_length  150 non-null    float64
 1   sepal_width   150 non-null    float64
 2   petal_length  150 non-null    object 
 3   petal_width   150 non-null    float64
 4   class         150 non-null    object 
dtypes: float64(3), object(2)
memory usage: 7.0+ KB


In [26]:
# IMPUTATION: Fill missing values with the mean of each feature 

# Ensure to adjust this line according to whether your CSV file has headers or not
column_names = ['sepal_length', 'sepal_width', 'petal_length', 'petal_width', 'class']
iris_data = pd.read_csv('iris_preprocess.csv', names=column_names)  # Remove 'header=None' if headers are in the first row of your CSV

# Compute means for numeric columns only
mean_values = iris_data.select_dtypes(include=[float, int]).mean()

# Apply imputation
data_imputed = iris_data.fillna(mean_values)
print(data_imputed)

     sepal_length  sepal_width    petal_length  petal_width           class
0             5.1          3.5             1.4     0.200000     Iris-setosa
1             4.9          3.0             1.4     0.200000     Iris-setosa
2             4.7          3.2             1.3     0.200000     Iris-setosa
3             4.6          3.1             1.5     0.200000     Iris-setosa
4             5.0          3.6             1.4     0.200000     Iris-setosa
..            ...          ...             ...          ...             ...
148           6.2          3.4             5.4     2.300000  Iris-virginica
149           5.9          3.0             5.1     1.800000  Iris-virginica
150           6.5          2.0  Iris-virginica     1.230263             NaN
151           6.5          3.2             5.1     2.000000             NaN
152           6.5          3.2             5.1     5.200000             NaN

[153 rows x 5 columns]


In [24]:
# Fill missing values with the mode of each feature
data_imputed = iris_data.fillna(iris_data.mode().iloc[0])
print(data_imputed)

     sepal_length  sepal_width    petal_length  petal_width           class
0             5.1          3.5             1.4          0.2     Iris-setosa
1             4.9          3.0             1.4          0.2     Iris-setosa
2             4.7          3.2             1.3          0.2     Iris-setosa
3             4.6          3.1             1.5          0.2     Iris-setosa
4             5.0          3.6             1.4          0.2     Iris-setosa
..            ...          ...             ...          ...             ...
148           6.2          3.4             5.4          2.3  Iris-virginica
149           5.9          3.0             5.1          1.8  Iris-virginica
150           6.5          2.0  Iris-virginica          0.2     Iris-setosa
151           6.5          3.2             5.1          2.0     Iris-setosa
152           6.5          3.2             5.1          5.2     Iris-setosa

[153 rows x 5 columns]


### Data Normalization

Data normalization is a technique used to scale the features of a dataset to a specific range, typically between 0 and 1. Normalization helps to ensure that all features contribute equally to the learning process and can improve the convergence of certain machine learning algorithms. The most common normalization technique is Min-Max scaling, which scales the features to a fixed range.Python code to perform Min-Max scaling using scikit-learn:



In [33]:
from sklearn.preprocessing import MinMaxScaler 
# Select only the numeric columns for scaling
numeric_columns = iris_data.select_dtypes(include=['float64', 'int64'])

# Create a MinMaxScaler object
scaler = MinMaxScaler()

# Fit and transform the numeric data
normalized_array = scaler.fit_transform(numeric_columns)

# Create a DataFrame from the normalized array and add the class column back
data_normalized = pd.DataFrame(normalized_array, columns=numeric_columns.columns)
data_normalized['class'] = iris_data['class']

In [32]:
print(data_normalized)

     sepal_length  sepal_width  petal_width           class
0        0.222222     0.625000     0.019608     Iris-setosa
1        0.166667     0.416667     0.019608     Iris-setosa
2        0.111111     0.500000     0.019608     Iris-setosa
3        0.083333     0.458333     0.019608     Iris-setosa
4        0.194444     0.666667     0.019608     Iris-setosa
..            ...          ...          ...             ...
148      0.527778     0.583333     0.431373  Iris-virginica
149      0.444444     0.416667     0.333333  Iris-virginica
150      0.611111     0.000000          NaN             NaN
151      0.611111     0.500000     0.372549             NaN
152      0.611111     0.500000     1.000000             NaN

[153 rows x 4 columns]


### Data Standardization

Data standardization is another technique used to transform the features of a dataset to have zero mean and unit variance. Standardization is useful when the features have different scales or when the algorithm assumes that the data follows a Gaussian distribution. Python code to perform standardization using scikit-learn:


In [37]:
from sklearn.preprocessing import StandardScaler 

# Select only the numeric columns for scaling
numeric_columns = iris_data.select_dtypes(include=['float64', 'int64'])

# Create a StandardScaler object 
scaler = StandardScaler() 


# Fit and transform the data 
data_standardized = scaler.fit_transform(numeric_columns)

In [38]:
print(data_standardized)

[[-0.91970918  1.03298043 -1.2498566 ]
 [-1.1629512  -0.11228048 -1.2498566 ]
 [-1.40619321  0.34582388 -1.2498566 ]
 [-1.52781422  0.1167717  -1.2498566 ]
 [-1.04133019  1.26203261 -1.2498566 ]
 [-0.55484616  1.94918916 -1.00722799]
 [-1.52781422  0.80392825 -1.1285423 ]
 [-1.04133019  0.80392825 -1.2498566 ]
 [-1.77105623 -0.34133266 -1.2498566 ]
 [-1.1629512   0.1167717  -1.37117091]
 [-0.55484616  1.4910848  -1.2498566 ]
 [-1.2845722   0.80392825 -1.2498566 ]
 [-1.2845722  -0.11228048 -1.37117091]
 [-1.89267724 -0.11228048 -1.37117091]
 [-0.06836213  2.17824134 -1.2498566 ]
 [-0.18998314  3.09445008 -1.00722799]
 [-0.55484616  1.94918916 -1.00722799]
 [-0.91970918  1.03298043 -1.1285423 ]
 [-0.18998314  1.72013698 -1.1285423 ]
 [-0.91970918  1.72013698 -1.1285423 ]
 [-0.55484616  0.80392825 -1.2498566 ]
 [-0.91970918  1.4910848  -1.00722799]
 [-1.52781422  1.26203261 -1.2498566 ]
 [-0.91970918  0.57487607 -0.88591368]
 [-1.2845722   0.80392825 -1.2498566 ]
 [-1.04133019 -0.11228048

### Applying Data Preprocessing

In [44]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler 

# Read the CSV file. Adjust the header parameter as needed based on the file structure.
# If the file contains headers, you should use header=0 or omit it, otherwise use header=None and provide column names.
column_names = ['sepal_length', 'sepal_width', 'petal_length', 'petal_width', 'species']
iris_data = pd.read_csv('iris_preprocess.csv', header=None, names=column_names)

# Separate features and target
X = iris_data.drop('species', axis=1)
y = iris_data['species']

# Ensure all features are numeric (not strictly necessary here, but good practice for other data)
X = X.apply(pd.to_numeric, errors='coerce')  # Coerce errors will turn any non-numeric values to NaN, which you might handle by imputing

# Create a MinMaxScaler object
scaler = MinMaxScaler()

# Normalize the features
X_normalized = scaler.fit_transform(X)

# Optionally, convert the normalized features back to a DataFrame for better usability
X_normalized_df = pd.DataFrame(X_normalized, columns=X.columns)


In [48]:
import pandas as pd
from sklearn.preprocessing import StandardScaler

# Load the dataset, make sure to correctly handle the header if it exists.
column_names = ['sepal_length', 'sepal_width', 'petal_length', 'petal_width', 'species']
iris_data = pd.read_csv('iris_preprocess.csv', header=None, names=column_names)

# Print the data types to confirm the structure
print("Data types in the DataFrame:")
print(iris_data.dtypes)

# Separate features and target
X = iris_data.drop('species', axis=1)  # Dropping the 'species' column to keep only numeric features
y = iris_data['species']  # This is the categorical target variable

# Verify the content of X
print("First few rows of features (X):")
print(X.head())

# Create a StandardScaler object
scaler = StandardScaler()

# Standardize the features (ensure X contains only numeric columns)
try:
    X_standardized = scaler.fit_transform(X)
    # Optionally, convert the standardized features back to a DataFrame for better usability
    X_standardized_df = pd.DataFrame(X_standardized, columns=X.columns)
    print("Standardized features:")
    print(X_standardized_df.head())
except Exception as e:
    print("Error during scaling:", e)


Data types in the DataFrame:
sepal_length    float64
sepal_width     float64
petal_length     object
petal_width     float64
species          object
dtype: object
First few rows of features (X):
   sepal_length  sepal_width petal_length  petal_width
0           5.1          3.5          1.4          0.2
1           4.9          3.0          1.4          0.2
2           4.7          3.2          1.3          0.2
3           4.6          3.1          1.5          0.2
4           5.0          3.6          1.4          0.2
Error during scaling: could not convert string to float: 'Iris-virginica'
