- Import Libraries

In [1]:
import numpy as np
import pandas as pd

- Read csv file

In [2]:
df = pd.read_csv('data/data.csv')

### 1. Show data


In [3]:
df

Unnamed: 0,sepal.length,sepal.width,petal.length,petal.width,variety
0,5.1,3.5,1.4,0.2,Setosa
1,4.9,3.0,,0.2,Setosa
2,4.7,3.2,1.3,0.2,Setosa
3,4.6,3.1,1.5,,Setosa
4,5.0,3.6,1.4,0.2,Setosa
...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,Virginica
146,6.3,,5.0,1.9,Virginica
147,,,5.2,2.0,Virginica
148,,3.4,,2.3,Virginica


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   sepal.length  135 non-null    float64
 1   sepal.width   135 non-null    float64
 2   petal.length  135 non-null    float64
 3   petal.width   135 non-null    float64
 4   variety       135 non-null    object 
dtypes: float64(4), object(1)
memory usage: 6.0+ KB


In [5]:
df.isna().sum()

sepal.length    15
sepal.width     15
petal.length    15
petal.width     15
variety         15
dtype: int64

### 2. Handling missing data


In [6]:
df1 = df.copy()

- Use the attribute mean to fill in the missing continuous value

In [7]:
df1.fillna({'sepal.length': round(df['sepal.length'].mean())}, inplace=True)
df1.fillna({'sepal.width': round(df['sepal.width'].mean())}, inplace=True)
df1.fillna({'petal.length': round(df['petal.length'].mean())}, inplace=True)
df1.fillna({'petal.width': round(df['petal.width'].mean())}, inplace=True)

- Use the most occurring attribute to fill in the missing discrete value

In [8]:
df1.fillna({'variety':df['variety'].mode()[0]}, inplace=True)

In [9]:
df1.isna().sum()

sepal.length    0
sepal.width     0
petal.length    0
petal.width     0
variety         0
dtype: int64

In [10]:
df1

Unnamed: 0,sepal.length,sepal.width,petal.length,petal.width,variety
0,5.1,3.5,1.4,0.2,Setosa
1,4.9,3.0,4.0,0.2,Setosa
2,4.7,3.2,1.3,0.2,Setosa
3,4.6,3.1,1.5,1.0,Setosa
4,5.0,3.6,1.4,0.2,Setosa
...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,Virginica
146,6.3,3.0,5.0,1.9,Virginica
147,6.0,3.0,5.2,2.0,Virginica
148,6.0,3.4,4.0,2.3,Virginica


### 3. Show rows with sepal.length > 5 and sepal.width > 3

In [11]:
df1[(df1['sepal.length']>5) & (df1['sepal.width']>3)]

Unnamed: 0,sepal.length,sepal.width,petal.length,petal.width,variety
0,5.1,3.5,1.4,0.2,Setosa
5,5.4,3.9,1.7,0.4,Setosa
6,6.0,3.4,1.4,0.3,Setosa
7,6.0,3.4,1.5,0.2,Setosa
10,5.4,3.7,1.5,0.2,Setosa
15,5.7,4.4,1.5,1.0,Setosa
16,5.4,3.9,1.3,0.4,Setosa
17,5.1,3.5,1.4,0.3,Virginica
18,5.7,3.8,1.7,1.0,Setosa
19,5.1,3.8,1.5,0.3,Setosa


### 4. Data Transformation: Min-Max normalization

In [12]:
df2 = df1.copy()

In [13]:
sepal_length_min = df2['sepal.length'].min()
sepal_length_max = df2['sepal.length'].max()

sepal_width_min = df2['sepal.width'].min()
sepal_width_max = df2['sepal.width'].max()

petal_length_min = df2['petal.length'].min()
petal_length_max = df2['petal.length'].max()

petal_width_min = df2['petal.width'].min()
petal_width_max = df2['petal.width'].max()

In [14]:
df2['sepal.length'] = (df2['sepal.length'] - sepal_length_min)/(sepal_length_max - sepal_length_min)
df2['sepal.width'] = (df2['sepal.width'] - sepal_width_min)/(sepal_width_max - sepal_width_min)

df2['petal.length'] = (df2['petal.length'] - petal_length_min)/(petal_length_max - petal_length_min)
df2['petal.width'] = (df2['petal.width'] - petal_width_min)/(petal_width_max - petal_width_min)

In [15]:
df2

Unnamed: 0,sepal.length,sepal.width,petal.length,petal.width,variety
0,0.222222,0.625000,0.067797,0.041667,Setosa
1,0.166667,0.416667,0.508475,0.041667,Setosa
2,0.111111,0.500000,0.050847,0.041667,Setosa
3,0.083333,0.458333,0.084746,0.375000,Setosa
4,0.194444,0.666667,0.067797,0.041667,Setosa
...,...,...,...,...,...
145,0.666667,0.416667,0.711864,0.916667,Virginica
146,0.555556,0.416667,0.677966,0.750000,Virginica
147,0.472222,0.416667,0.711864,0.791667,Virginica
148,0.472222,0.583333,0.508475,0.916667,Virginica


### 5. Make 'variety' attribute continuous

In [16]:
from sklearn import preprocessing

 - Label Encoding

In [17]:
df3 = df2.copy()

df3['variety_id'] = preprocessing.LabelEncoder().fit_transform(df3['variety'])

for x in df3['variety_id'].unique():
    print(x, df3['variety'][df3['variety_id']==x].unique()[0], end='\n')

0 Setosa
2 Virginica
1 Versicolor


In [18]:
df3.sample(5)

Unnamed: 0,sepal.length,sepal.width,petal.length,petal.width,variety,variety_id
92,0.416667,0.25,0.508475,0.458333,Versicolor,1
148,0.472222,0.583333,0.508475,0.916667,Virginica,2
74,0.583333,0.375,0.559322,0.5,Versicolor,1
76,0.694444,0.333333,0.644068,0.541667,Versicolor,1
110,0.611111,0.5,0.694915,0.791667,Virginica,2


### 6. Linear Regression to predict variety

In [19]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

In [20]:
df4 = df3.copy()

In [21]:
df4.describe()

Unnamed: 0,sepal.length,sepal.width,petal.length,petal.width,variety_id
count,150.0,150.0,150.0,150.0,150.0
mean,0.43537,0.437222,0.46565,0.446111,1.106667
std,0.220878,0.174638,0.285184,0.302185,0.836633
min,0.0,0.0,0.0,0.0,0.0
25%,0.25,0.333333,0.101695,0.09375,0.0
50%,0.458333,0.416667,0.508475,0.479167,1.0
75%,0.576389,0.541667,0.677966,0.708333,2.0
max,1.0,1.0,1.0,1.0,2.0


In [22]:
X = df4.values[:, :-2]
y = df4.values[:, -1]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [23]:
model = LinearRegression()
model.fit(X_train, y_train)

In [24]:
print("Coefficients:", model.coef_)
print("Intercept:", model.intercept_)

Coefficients: [ 0.26032119 -0.20670873  1.19789931  1.05961248]
Intercept: 0.06655172506189833


### 7. Get random sample with 50% repetition of data

In [25]:
sampled_df = df4.sample(frac=0.5, replace=True, random_state=42)
sampled_df

Unnamed: 0,sepal.length,sepal.width,petal.length,petal.width,variety,variety_id
102,0.777778,0.416667,0.830508,0.833333,Virginica,2
92,0.416667,0.250000,0.508475,0.458333,Versicolor,1
14,0.416667,0.416667,0.033898,0.041667,Setosa,0
106,0.166667,0.208333,0.593220,0.666667,Virginica,2
71,0.500000,0.333333,0.508475,0.500000,Virginica,2
...,...,...,...,...,...,...
23,0.222222,0.541667,0.118644,0.166667,Setosa,0
123,0.555556,0.291667,0.661017,0.708333,Virginica,2
40,0.194444,0.625000,0.050847,0.083333,Setosa,0
14,0.416667,0.416667,0.033898,0.041667,Setosa,0


### 8. Make 'sepal.length' attribute discrete

In [26]:
df4['sepal.length.discrete'] = pd.cut(df4['sepal.length'], bins=3, labels=['Short', 'Medium', 'Long'])
df4

Unnamed: 0,sepal.length,sepal.width,petal.length,petal.width,variety,variety_id,sepal.length.discrete
0,0.222222,0.625000,0.067797,0.041667,Setosa,0,Short
1,0.166667,0.416667,0.508475,0.041667,Setosa,0,Short
2,0.111111,0.500000,0.050847,0.041667,Setosa,0,Short
3,0.083333,0.458333,0.084746,0.375000,Setosa,0,Short
4,0.194444,0.666667,0.067797,0.041667,Setosa,0,Short
...,...,...,...,...,...,...,...
145,0.666667,0.416667,0.711864,0.916667,Virginica,2,Medium
146,0.555556,0.416667,0.677966,0.750000,Virginica,2,Medium
147,0.472222,0.416667,0.711864,0.791667,Virginica,2,Medium
148,0.472222,0.583333,0.508475,0.916667,Virginica,2,Medium


### 9. Data clustering and outlier removal

In [27]:
from sklearn.cluster import KMeans

In [35]:
kmeans = KMeans(n_clusters=3, random_state=42)

In [39]:
df5 = df4.copy()
df5['cluster'] = kmeans.fit_predict(df4.values[:, :4])
distances = kmeans.transform(df4.values[:, :4])
df5['distance_to_center'] = distances.min(axis=1)
df5.sample(5)

Unnamed: 0,sepal.length,sepal.width,petal.length,petal.width,variety,variety_id,sepal.length.discrete,cluster,distance_to_center
133,0.472222,0.333333,0.694915,0.375,Virginica,2,Medium,0,0.278664
106,0.166667,0.208333,0.59322,0.666667,Virginica,2,Short,0,0.429099
86,0.666667,0.458333,0.627119,0.583333,Versicolor,1,Medium,0,0.149744
128,0.583333,0.333333,0.779661,0.833333,Virginica,2,Medium,0,0.241131
116,0.611111,0.416667,0.762712,0.708333,Virginica,2,Medium,0,0.149162


In [40]:
threshold = df5['distance_to_center'].quantile(0.95)
df_without_outliers = df5[df5['distance_to_center'] < threshold]
df_without_outliers

Unnamed: 0,sepal.length,sepal.width,petal.length,petal.width,variety,variety_id,sepal.length.discrete,cluster,distance_to_center
0,0.222222,0.625000,0.067797,0.041667,Setosa,0,Short,2,0.091862
1,0.166667,0.416667,0.508475,0.041667,Setosa,0,Short,1,0.352835
2,0.111111,0.500000,0.050847,0.041667,Setosa,0,Short,1,0.215371
3,0.083333,0.458333,0.084746,0.375000,Setosa,0,Short,1,0.273417
4,0.194444,0.666667,0.067797,0.041667,Setosa,0,Short,2,0.105704
...,...,...,...,...,...,...,...,...,...
145,0.666667,0.416667,0.711864,0.916667,Virginica,2,Medium,0,0.311809
146,0.555556,0.416667,0.677966,0.750000,Virginica,2,Medium,0,0.125606
147,0.472222,0.416667,0.711864,0.791667,Virginica,2,Medium,0,0.192447
148,0.472222,0.583333,0.508475,0.916667,Virginica,2,Medium,0,0.388895
