# Task 01

## Data Cleaning and PreProcessing

### Data Gathering

### Data Cleaning

In [1]:
# Importing libraries for Cleaning
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder

In [2]:
# Load the raw letter-count table from disk and preview its first rows
dataset_path = "genome_sequences.csv"
df = pd.read_csv(dataset_path)
df.head()

Unnamed: 0,A,B,C,D,E,F,G,H,I,J,...,R,S,T,U,V,W,X,Y,Z,Label
0,3,0,0,0,1,3,1,0,2,0,...,1,1,4,0,5,0,0,0,0,Envelope
1,3,0,0,0,1,3,1,0,2,0,...,1,1,4,0,5,0,0,0,0,Envelope
2,3,0,0,0,1,3,1,0,2,0,...,1,1,4,0,5,0,0,0,0,Envelope
3,3,0,0,0,1,3,1,0,2,0,...,1,1,4,0,5,0,0,0,0,Envelope
4,3,0,0,0,1,3,1,0,2,0,...,1,1,4,0,5,0,0,0,0,Envelope


The preview shows that the dataset has 27 columns. The first 26 columns hold the number of occurrences of each letter from A to Z, and the Label column gives the class of each sequence (for example, Envelope).

In [4]:
# Letter columns to discard before the analysis.
# NOTE(review): the rationale is not recorded here — presumably these letters
# are not meaningful for this dataset; confirm and document the reason.
column_to_drop = ['B', 'J', 'O', 'U', 'Z']

# drop() returns a new frame, so the raw `df` stays untouched
df1 = df.drop(column_to_drop, axis='columns')
df1.head()

Unnamed: 0,A,C,D,E,F,G,H,I,K,L,...,P,Q,R,S,T,V,W,X,Y,Label
0,3,0,0,1,3,1,0,2,0,9,...,0,0,1,1,4,5,0,0,0,Envelope
1,3,0,0,1,3,1,0,2,0,9,...,0,0,1,1,4,5,0,0,0,Envelope
2,3,0,0,1,3,1,0,2,0,9,...,0,0,1,1,4,5,0,0,0,Envelope
3,3,0,0,1,3,1,0,2,0,9,...,0,0,1,1,4,5,0,0,0,Envelope
4,3,0,0,1,3,1,0,2,0,9,...,0,0,1,1,4,5,0,0,0,Envelope


## Outliers

Outliers can severely distort an analysis, so we remove them to keep the data clean.

### Using plotly 

In [8]:
import pandas as pd
import plotly.express as px

# Keep only the numeric columns; the text Label column is excluded by dtype
numeric_df = df1.select_dtypes(include='number')

# Plotly Express wants tidy data: one row per (letter, occurrence count) pair
long_df = numeric_df.melt(var_name='Letters', value_name='Occurrences')

# One box per letter
fig = px.box(
    long_df,
    x='Letters',
    y='Occurrences',
    title='Box Plot of Letter Occurrences in Genome Sequences',
)

# Axis titles, slanted tick labels, fixed canvas size and extra bottom margin
layout_options = {
    'xaxis_title': 'Letters',
    'yaxis_title': 'Occurrences',
    'xaxis_tickangle': 45,
    'width': 900,
    'height': 500,
    'margin': {'t': 50, 'b': 150},
}
fig.update_layout(**layout_options)

fig.show()


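The dataset appears to contain many outliers. We use the interquartile range (IQR) rule to remove them: a row is dropped when any of its numeric values lies more than 1.5 × IQR below the first quartile or above the third quartile of its column.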

In [10]:
numeric_cols = df1.select_dtypes(include='number').columns

# IQR-based filtering.
# Every column's fences are derived from the same, not-yet-filtered data.
# Filtering one column at a time would recompute later columns' quartiles on
# rows already thinned by earlier columns, making the result depend on column
# order and tightening the fences as the loop progresses.
quartiles = df1[numeric_cols].quantile([0.25, 0.75])
Q1 = quartiles.loc[0.25]
Q3 = quartiles.loc[0.75]
IQR = Q3 - Q1
# 1.5 is the conventional Tukey fence multiplier
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

# Keep a row only if every numeric value lies inside its own column's fences.
# Comparisons against NaN evaluate to False, so rows with missing numeric
# values are dropped, exactly as the per-column comparison did.
numeric_values = df1[numeric_cols]
within_bounds = (
    numeric_values.ge(lower_bound, axis='columns')
    & numeric_values.le(upper_bound, axis='columns')
).all(axis='columns')
df1 = df1[within_bounds]

### Box Plot Using Plotly

In [42]:
import pandas as pd
import plotly.express as px

# Numeric columns of the current df1 only (the text Label column is excluded by dtype)
numeric_df = df1.select_dtypes(include='number')

# Reshape wide -> long so each row is a single (letter, occurrence count) pair
long_df = numeric_df.melt(var_name='Letters', value_name='Occurrences')

# Interactive box plot with one box per letter; update_layout returns the same
# figure, so building and styling can be chained into one expression
fig = px.box(
    long_df,
    x='Letters',
    y='Occurrences',
    title='Box Plot of Letter Occurrences in Genome Sequences',
).update_layout(
    xaxis_title='Letters',
    yaxis_title='Occurrences',
    xaxis_tickangle=45,
    width=1200,
    height=600,
    margin={'t': 60, 'b': 120},
)

# Render the figure
fig.show()


The outliers have now been removed. However, column 'X' has lost all of its values in the process, so we drop it because it no longer provides any information.

In [14]:
# Remove the single 'X' column and preview the remaining columns
df1 = df1.drop('X', axis='columns')
df1.head()

Unnamed: 0,A,C,D,E,F,G,H,I,K,L,...,N,P,Q,R,S,T,V,W,Y,Label
0,3,0,0,1,3,1,0,2,0,9,...,1,0,0,1,1,4,5,0,0,Envelope
1,3,0,0,1,3,1,0,2,0,9,...,1,0,0,1,1,4,5,0,0,Envelope
2,3,0,0,1,3,1,0,2,0,9,...,1,0,0,1,1,4,5,0,0,Envelope
3,3,0,0,1,3,1,0,2,0,9,...,1,0,0,1,1,4,5,0,0,Envelope
4,3,0,0,1,3,1,0,2,0,9,...,1,0,0,1,1,4,5,0,0,Envelope


### Label Encoding

Next, we turn to the labels, which are categorical (text) values. We convert them to integers using label encoding.

In [15]:
# Fit the encoder on the text labels and store the resulting integer codes
# in a new column next to the original 'Label' column
label_encoder = LabelEncoder()
encoded_labels = label_encoder.fit_transform(df1['Label'])
df1['label_encoded'] = encoded_labels

# Optional: show which integer code each class name was assigned
class_names = label_encoder.classes_
class_codes = label_encoder.transform(class_names)
label_mapping = {name: code for name, code in zip(class_names, class_codes)}
print("Label Mapping:", label_mapping)



Label Mapping: {'Envelope': np.int64(0), 'Membrane': np.int64(1), 'Nucleocapsid': np.int64(2), 'Spike': np.int64(3)}


Our dataset is now clean. After dropping the original text Label column, we can save it to a new CSV file.

In [16]:
# The text 'Label' column is no longer needed once 'label_encoded' holds its integer codes
df1 = df1.drop('Label', axis='columns')
df1.head()

Unnamed: 0,A,C,D,E,F,G,H,I,K,L,...,N,P,Q,R,S,T,V,W,Y,label_encoded
0,3,0,0,1,3,1,0,2,0,9,...,1,0,0,1,1,4,5,0,0,0
1,3,0,0,1,3,1,0,2,0,9,...,1,0,0,1,1,4,5,0,0,0
2,3,0,0,1,3,1,0,2,0,9,...,1,0,0,1,1,4,5,0,0,0
3,3,0,0,1,3,1,0,2,0,9,...,1,0,0,1,1,4,5,0,0,0
4,3,0,0,1,3,1,0,2,0,9,...,1,0,0,1,1,4,5,0,0,0


In [17]:
# Write the cleaned table to disk; index=False keeps the row index out of the file
cleaned_path = "genome_sequences_cleaned.csv"
df1.to_csv(cleaned_path, index=False)