Module 1: Python Basics and Libraries (NumPy, Pandas)
Task 1:
Complete basic exercises in Python and work with NumPy and Pandas to load, manipulate,
and analyze a sample dataset.
Assignment:
1. Download the Iris dataset and load it into a Pandas DataFrame.
2. Perform the following tasks:
○ Clean the data by removing any rows with missing values.
○ Generate basic statistical summaries (mean, median, standard deviation) of
the numeric columns.
○ Perform data transformation using NumPy (e.g., standardize one of the
columns).

Deliverables:

● A Python script that:
○ Loads the dataset.
○ Cleans the data.
○ Generates statistical summaries.
○ Transforms data using NumPy.

In [1]:
#Import the Libraries
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

In [2]:
# Load the raw Iris measurements. The file has no header row, so pandas
# labels the columns with the integers 0-4; comma is the default delimiter.
iris_path = 'resources/iris/iris.data'
df = pd.read_csv(iris_path, header=None)

df

Unnamed: 0,0,1,2,3,4
0,5.1,3.5,1.4,0.2,Iris-setosa
1,4.9,3.0,1.4,0.2,Iris-setosa
2,4.7,3.2,1.3,0.2,Iris-setosa
3,4.6,3.1,1.5,0.2,Iris-setosa
4,5.0,3.6,1.4,0.2,Iris-setosa
...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,Iris-virginica
146,6.3,2.5,5.0,1.9,Iris-virginica
147,6.5,3.0,5.2,2.0,Iris-virginica
148,6.2,3.4,5.4,2.3,Iris-virginica


In [3]:
# Attach descriptive column labels (taken from the iris.names file).
column_names = [
    'sepal length in cm',
    'sepal width in cm',
    'petal length in cm',
    'petal width in cm',
    'class',
]
df.columns = column_names

print(df)

     sepal length in cm  sepal width in cm  petal length in cm  \
0                   5.1                3.5                 1.4   
1                   4.9                3.0                 1.4   
2                   4.7                3.2                 1.3   
3                   4.6                3.1                 1.5   
4                   5.0                3.6                 1.4   
..                  ...                ...                 ...   
145                 6.7                3.0                 5.2   
146                 6.3                2.5                 5.0   
147                 6.5                3.0                 5.2   
148                 6.2                3.4                 5.4   
149                 5.9                3.0                 5.1   

     petal width in cm           class  
0                  0.2     Iris-setosa  
1                  0.2     Iris-setosa  
2                  0.2     Iris-setosa  
3                  0.2     Iris-setosa  
4 

In [4]:
# A bare `df` as the last line of a cell is rendered as a table by the
# notebook, which is easier to read than the plain-text print(df) output.
df

Unnamed: 0,sepal length in cm,sepal width in cm,petal length in cm,petal width in cm,class
0,5.1,3.5,1.4,0.2,Iris-setosa
1,4.9,3.0,1.4,0.2,Iris-setosa
2,4.7,3.2,1.3,0.2,Iris-setosa
3,4.6,3.1,1.5,0.2,Iris-setosa
4,5.0,3.6,1.4,0.2,Iris-setosa
...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,Iris-virginica
146,6.3,2.5,5.0,1.9,Iris-virginica
147,6.5,3.0,5.2,2.0,Iris-virginica
148,6.2,3.4,5.4,2.3,Iris-virginica


In [5]:
# Preview the first five rows of the DataFrame.
df.head()

Unnamed: 0,sepal length in cm,sepal width in cm,petal length in cm,petal width in cm,class
0,5.1,3.5,1.4,0.2,Iris-setosa
1,4.9,3.0,1.4,0.2,Iris-setosa
2,4.7,3.2,1.3,0.2,Iris-setosa
3,4.6,3.1,1.5,0.2,Iris-setosa
4,5.0,3.6,1.4,0.2,Iris-setosa


In [6]:
# List the column labels currently set on the DataFrame.
df.columns

Index(['sepal length in cm', 'sepal width in cm', 'petal length in cm',
       'petal width in cm', 'class'],
      dtype='object')

In [7]:
# Dimensions of the DataFrame as a (rows, columns) tuple.
df.shape

(150, 5)

In [8]:
# Clean: drop every row that contains a missing value, as the assignment
# requires, then confirm the result by counting the remaining nulls per
# column (every count should be 0).
df = df.dropna()
df.isnull().sum()

sepal length in cm    0
sepal width in cm     0
petal length in cm    0
petal width in cm     0
class                 0
dtype: int64

In [9]:
# Drop exact duplicate rows and report the shape before and after.
# The original row labels are kept (the index is not reset).
shape_before = df.shape
df = df.drop_duplicates()
print(shape_before)
print(df.shape)

(150, 5)
(147, 5)


In [10]:
# Show the current column labels again so the exact names can be copied
# and pasted into the rename mapping.
df.columns

Index(['sepal length in cm', 'sepal width in cm', 'petal length in cm',
       'petal width in cm', 'class'],
      dtype='object')

In [11]:
# Shorten the four measurement labels to snake_case; 'class' is not in the
# mapping and therefore keeps its name.
short_names = {
    "sepal length in cm": "sepal_length",
    "sepal width in cm": "sepal_width",
    "petal length in cm": "petal_length",
    "petal width in cm": "petal_width",
}
df = df.rename(columns=short_names)
print(df)

     sepal_length  sepal_width  petal_length  petal_width           class
0             5.1          3.5           1.4          0.2     Iris-setosa
1             4.9          3.0           1.4          0.2     Iris-setosa
2             4.7          3.2           1.3          0.2     Iris-setosa
3             4.6          3.1           1.5          0.2     Iris-setosa
4             5.0          3.6           1.4          0.2     Iris-setosa
..            ...          ...           ...          ...             ...
145           6.7          3.0           5.2          2.3  Iris-virginica
146           6.3          2.5           5.0          1.9  Iris-virginica
147           6.5          3.0           5.2          2.0  Iris-virginica
148           6.2          3.4           5.4          2.3  Iris-virginica
149           5.9          3.0           5.1          1.8  Iris-virginica

[147 rows x 5 columns]


In [12]:
# Display the DataFrame as a rendered table to check the renamed columns.
df

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,class
0,5.1,3.5,1.4,0.2,Iris-setosa
1,4.9,3.0,1.4,0.2,Iris-setosa
2,4.7,3.2,1.3,0.2,Iris-setosa
3,4.6,3.1,1.5,0.2,Iris-setosa
4,5.0,3.6,1.4,0.2,Iris-setosa
...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,Iris-virginica
146,6.3,2.5,5.0,1.9,Iris-virginica
147,6.5,3.0,5.2,2.0,Iris-virginica
148,6.2,3.4,5.4,2.3,Iris-virginica


In [13]:
# Summary statistics for the numeric columns: count, mean, standard
# deviation, min, quartiles (the 50% row is the median) and max.
df.describe() 

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width
count,147.0,147.0,147.0,147.0
mean,5.856463,3.055782,3.780272,1.208844
std,0.8291,0.437009,1.759111,0.757874
min,4.3,2.0,1.0,0.1
25%,5.1,2.8,1.6,0.3
50%,5.8,3.0,4.4,1.3
75%,6.4,3.3,5.1,1.8
max,7.9,4.4,6.9,2.5


**Transforms data using NumPy**

Counting how often each unique value occurs using “value_counts()”.
The value_counts() function counts the number of times each distinct value occurs in a column.

In [14]:

# Standardize sepal_length with NumPy (z-score): subtract the mean and
# divide by the standard deviation, giving values with mean 0 and standard
# deviation 1. The result is kept in its own array so that df itself is
# left unchanged.
sepal_length = df["sepal_length"].to_numpy()
sepal_length_standardized = (sepal_length - np.mean(sepal_length)) / np.std(sepal_length)
# Sanity-check the transformation.
assert np.isclose(sepal_length_standardized.mean(), 0.0)
assert np.isclose(sepal_length_standardized.std(), 1.0)

# Count how many rows belong to each value of the "class" column.
# value_counts() lists the most frequent class first (descending order).
df["class"].value_counts()


class
Iris-versicolor    50
Iris-virginica     49
Iris-setosa        48
Name: count, dtype: int64

Filtering: displaying specific rows using the “iloc” and “loc” indexers.

The “loc” indexer selects rows by their index label (or by a boolean condition on the rows).
The “iloc” indexer selects rows by their integer position and returns the complete row at that position.
 

In [15]:
# Positional lookup: iloc[5] returns the row at integer position 5.
# It is printed explicitly because a notebook cell only auto-displays its
# final expression; a bare df.iloc[5] here would be silently discarded.
print(df.iloc[5])

# Boolean lookup with loc: keep only the rows whose class is "Iris-setosa".
filter_setosa = df.loc[df["class"] == "Iris-setosa"]
filter_setosa

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,class
0,5.1,3.5,1.4,0.2,Iris-setosa
1,4.9,3.0,1.4,0.2,Iris-setosa
2,4.7,3.2,1.3,0.2,Iris-setosa
3,4.6,3.1,1.5,0.2,Iris-setosa
4,5.0,3.6,1.4,0.2,Iris-setosa
5,5.4,3.9,1.7,0.4,Iris-setosa
6,4.6,3.4,1.4,0.3,Iris-setosa
7,5.0,3.4,1.5,0.2,Iris-setosa
8,4.4,2.9,1.4,0.2,Iris-setosa
9,4.9,3.1,1.5,0.1,Iris-setosa


In [16]:
# Summary statistics restricted to the rows selected into filter_setosa.
filter_setosa.describe()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width
count,48.0,48.0,48.0,48.0
mean,5.010417,3.43125,1.4625,0.25
std,0.359219,0.383243,0.177002,0.105185
min,4.3,2.3,1.0,0.1
25%,4.8,3.2,1.4,0.2
50%,5.0,3.4,1.5,0.2
75%,5.2,3.7,1.6,0.3
max,5.8,4.4,1.9,0.6


In [17]:
# Select the rows whose class is "Iris-virginica" via a boolean mask.
is_virginica = df["class"] == "Iris-virginica"
filter_virginica = df.loc[is_virginica]

# Summary statistics for the virginica rows only.
filter_virginica.describe()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width
count,49.0,49.0,49.0,49.0
mean,6.604082,2.979592,5.561224,2.028571
std,0.632113,0.32338,0.553706,0.276887
min,4.9,2.2,4.5,1.4
25%,6.3,2.8,5.1,1.8
50%,6.5,3.0,5.6,2.0
75%,6.9,3.2,5.9,2.3
max,7.9,3.8,6.9,2.5


In [18]:
# Select the rows whose class is "Iris-versicolor" via a boolean mask.
is_versicolor = df["class"] == "Iris-versicolor"
filter_versicolor = df.loc[is_versicolor]

# Summary statistics for the versicolor rows only.
filter_versicolor.describe()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width
count,50.0,50.0,50.0,50.0
mean,5.936,2.77,4.26,1.326
std,0.516171,0.313798,0.469911,0.197753
min,4.9,2.0,3.0,1.0
25%,5.6,2.525,4.0,1.2
50%,5.9,2.8,4.35,1.3
75%,6.3,3.0,4.6,1.5
max,7.0,3.4,5.1,1.8


In [19]:
type(df.columns) # show the type of the object that holds the column labels

pandas.core.indexes.base.Index

**Saving the data**

In [21]:
# Save the cleaned DataFrame to a CSV file, omitting the index column.
# The output folder is created first because to_csv raises OSError when
# the target directory does not exist (e.g. on a fresh checkout).
output_path = Path('clean data/iris_plant_dataset.csv')
output_path.parent.mkdir(parents=True, exist_ok=True)
df.to_csv(output_path, index=False)