# Exploratory Data Analysis (EDA)

## Data Description

### Import Required Libraries

In [2]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import seaborn as sns
from scipy.stats import gaussian_kde

### Dataset Description

In [3]:
# Load 'genome_sequences.csv' (path is relative to the working directory)
# into `df`, the DataFrame that every later cell in this notebook reads.
df = pd.read_csv('genome_sequences.csv')

In [4]:
# Preview the first five rows to sanity-check the columns and values
df.head(n=5)

Unnamed: 0,A,B,C,D,E,F,G,H,I,J,...,S,T,U,V,W,X,Y,Z,Label,Sequence_Length
0,3,0,0,0,1,3,1,0,2,0,...,1,4,0,5,0,0,0,0,Envelope,31
1,3,0,0,0,1,3,1,0,2,0,...,1,4,0,5,0,0,0,0,Envelope,31
2,3,0,0,0,1,3,1,0,2,0,...,1,4,0,5,0,0,0,0,Envelope,31
3,3,0,0,0,1,3,1,0,2,0,...,1,4,0,5,0,0,0,0,Envelope,31
4,3,0,0,0,1,3,1,0,2,0,...,1,4,0,5,0,0,0,0,Envelope,31


In [5]:
# Report the dataset dimensions: df.shape is (number of rows, number of columns)
print(f"Rows: {df.shape[0]}, Columns: {df.shape[1]}")

Rows: 39833, Columns: 28


In [6]:
# Print the index range plus each column's non-null count and dtype, to spot
# missing values and columns that were parsed with an unexpected type.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 39833 entries, 0 to 39832
Data columns (total 28 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   A                39833 non-null  int64 
 1   B                39833 non-null  int64 
 2   C                39833 non-null  int64 
 3   D                39833 non-null  int64 
 4   E                39833 non-null  int64 
 5   F                39833 non-null  int64 
 6   G                39833 non-null  int64 
 7   H                39833 non-null  int64 
 8   I                39833 non-null  int64 
 9   J                39833 non-null  int64 
 10  K                39833 non-null  int64 
 11  L                39833 non-null  int64 
 12  M                39833 non-null  int64 
 13  N                39833 non-null  int64 
 14  O                39833 non-null  int64 
 15  P                39833 non-null  int64 
 16  Q                39833 non-null  int64 
 17  R                39833 non-null

In [7]:
# Show every column name as a plain Python list
list(df.columns)

['A',
 'B',
 'C',
 'D',
 'E',
 'F',
 'G',
 'H',
 'I',
 'J',
 'K',
 'L',
 'M',
 'N',
 'O',
 'P',
 'Q',
 'R',
 'S',
 'T',
 'U',
 'V',
 'W',
 'X',
 'Y',
 'Z',
 'Label',
 'Sequence_Length']

In [8]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric
# columns; the object-dtype 'Label' column is summarised in the next cell.
df.describe()

Unnamed: 0,A,B,C,D,E,F,G,H,I,J,...,R,S,T,U,V,W,X,Y,Z,Sequence_Length
count,39833.0,39833.0,39833.0,39833.0,39833.0,39833.0,39833.0,39833.0,39833.0,39833.0,...,39833.0,39833.0,39833.0,39833.0,39833.0,39833.0,39833.0,39833.0,39833.0,39833.0
mean,36.957221,0.000251,8.577511,21.756057,19.61788,21.796074,35.0658,7.320789,24.186704,0.000126,...,22.347174,34.352999,32.700449,2.5e-05,30.611001,5.93538,3.015188,17.268973,0.000201,466.320237
std,52.649482,0.015843,23.480743,39.530526,38.791188,37.603654,47.934697,15.066657,38.849058,0.011203,...,28.159703,50.961392,56.636904,0.00501,62.692793,10.002605,45.555648,33.627932,0.014171,761.15275
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6.0
25%,12.0,0.0,0.0,5.0,6.0,6.0,12.0,2.0,7.0,0.0,...,6.0,10.0,10.0,0.0,8.0,2.0,0.0,4.0,0.0,195.0
50%,26.0,0.0,3.0,11.0,11.0,13.0,25.0,4.0,14.0,0.0,...,20.0,22.0,19.0,0.0,13.0,5.0,0.0,9.0,0.0,268.0
75%,40.0,0.0,7.0,24.0,16.0,17.0,42.0,6.0,21.0,0.0,...,29.0,35.0,32.0,0.0,25.0,7.0,0.0,14.0,0.0,416.0
max,778.0,1.0,229.0,465.0,972.0,352.0,413.0,242.0,391.0,1.0,...,452.0,738.0,646.0,1.0,602.0,131.0,2270.0,336.0,1.0,8799.0


In [9]:
# Summary (count, unique, top, freq) of the object-dtype columns.
# 'Label', the target, is the only such column in this dataset.
df.describe(include='object')

Unnamed: 0,Label
count,39833
unique,4
top,Membrane
freq,10018


In [10]:
# Number of missing (NaN/None) entries in each column
df.isna().sum()

A                  0
B                  0
C                  0
D                  0
E                  0
F                  0
G                  0
H                  0
I                  0
J                  0
K                  0
L                  0
M                  0
N                  0
O                  0
P                  0
Q                  0
R                  0
S                  0
T                  0
U                  0
V                  0
W                  0
X                  0
Y                  0
Z                  0
Label              0
Sequence_Length    0
dtype: int64

In [11]:
# Number of distinct values in each column. A column with a single distinct
# value (O in the output below) is constant across all rows.
df.nunique()

A                   266
B                     2
C                   119
D                   193
E                   218
F                   167
G                   254
H                   100
I                   232
J                     2
K                   229
L                   285
M                   130
N                   188
O                     1
P                   183
Q                   148
R                   193
S                   264
T                   260
U                     2
V                   274
W                    87
X                   206
Y                   142
Z                     2
Label                 4
Sequence_Length    1111
dtype: int64

In [13]:
# Print the class frequencies of every object-dtype (categorical) column
categorical_cols = df.select_dtypes(include='object').columns

for col in categorical_cols:
    counts = df[col].value_counts()
    print(f"\nValue Counts for: {col}", counts, sep="\n")


Value Counts for: Label
Label
Membrane        10018
Nucleocapsid    10018
Spike           10018
Envelope         9779
Name: count, dtype: int64


## Visualizations

### 1. Bar Plot of Total Letter Frequencies Across Dataset

In [14]:
# Keep only the letter-count columns. 'Label' is the target and
# 'Sequence_Length' is not a letter; its total is far larger than any
# single letter's, so leaving it in would flatten every real bar.
non_letter_columns = ['Label', 'Sequence_Length']
letter_columns = [column for column in df.columns if column not in non_letter_columns]

# Total count of each letter over all rows, largest first
letter_frequencies = df[letter_columns].sum().sort_values(ascending=False)

# One bar per letter, in descending order of total count
fig = px.bar(
    x=letter_frequencies.index,
    y=letter_frequencies.values,
    labels={'x': 'Letter', 'y': 'Total Count'},
    title='Total Frequency of Each Letter (A-Z)',
    color=letter_frequencies.index,
    color_discrete_sequence=px.colors.sequential.Viridis
)

# Display the figure
fig.show()

### 2. Boxplot of Each Letter Frequency

In [24]:
# `go` (plotly.graph_objects) is already imported in the notebook's import cell.

# Keep only the letter-count columns: 'Label' is the target and
# 'Sequence_Length' is on a much larger scale than any letter count, so
# plotting it here would compress every letter's box against the axis.
letter_columns = df.columns[~df.columns.isin(['Label', 'Sequence_Length'])]

# Initialize the figure
fig = go.Figure()

# One box trace per letter; only outlier points are drawn individually
for col in letter_columns:
    fig.add_trace(
        go.Box(
            y=df[col],
            name=col,
            boxpoints='outliers'
        )
    )

# Customize layout
fig.update_layout(
    title_text='Boxplot of Letter Frequencies (A-Z)',
    xaxis=dict(title='Letter', tickangle=-90),
    yaxis=dict(title='Frequency'),
    width=900,
    height=450
)

# Display the plot
fig.show()


### 3. Correlation Heatmap Between Letters

In [16]:
# `px` (plotly.express) is already imported in the notebook's import cell.

# Correlate letter-count columns only: 'Label' is the target and
# 'Sequence_Length' is not a letter.
candidate_columns = df.columns[~df.columns.isin(['Label', 'Sequence_Length'])]

# A constant column has zero variance, so its Pearson correlation with every
# other column is undefined (NaN) and would show up as an empty row/column.
# Drop such columns before computing the matrix.
letter_columns = [col for col in candidate_columns if df[col].nunique() > 1]

# Pairwise Pearson correlation between the remaining letter counts
corr_matrix = df[letter_columns].corr()

# Diverging colour scale pinned to [-1, 1] so that 0 maps to the neutral colour
fig = px.imshow(
    img=corr_matrix.values,
    x=letter_columns,
    y=letter_columns,
    color_continuous_scale='RdBu_r',
    labels={"color": "Correlation"},
    title="Correlation Heatmap Between Letter Frequencies",
    zmin=-1,
    zmax=1
)

# Square canvas so the matrix cells are not stretched
fig.update_layout(
    width=700,
    height=700,
    xaxis_title="Letter",
    yaxis_title="Letter"
)

# Show the heatmap
fig.show()


### 4. Countplot of Target Classes

In [17]:
import plotly.express as px

# Count the rows in each target class and draw one coloured bar per class
fig = px.histogram(
    df,
    x='Label',
    color='Label',
    color_discrete_sequence=px.colors.qualitative.Set2,
    labels={'Label': 'Target', 'count': 'Count'},
    title='Target Class Distribution',
)

# The x axis already names each class, so the legend is hidden
fig.update_layout(
    showlegend=False,
    xaxis_title='Target',
    yaxis_title='Count',
    width=600,
    height=400,
)

# Render the chart
fig.show()


### 5. Violin Plot: Frequency of Letter 'A' by Target Class

In [18]:
# Violin of the letter 'A' count for each target class, with an embedded
# box plot and every observation drawn as a point beside the violin.
fig = px.violin(
    data_frame=df,
    x='Label',
    y='A',
    color='Label',
    box=True,
    points='all',
    color_discrete_sequence=px.colors.qualitative.Pastel,
    title="Distribution of 'A' by Target Class",
)

# Axis titles and canvas size
fig.update_layout(
    xaxis=dict(title="Label"),
    yaxis=dict(title="A"),
    width=800,
    height=500,
)

fig.show()


### 6. Pairplot of Top 4 Frequent Letters Colored by Target

In [19]:
# Letter-count columns only: 'Label' is the target and 'Sequence_Length' is
# not a letter, so it must not compete for a "most frequent letter" slot.
letter_columns = [col for col in df.columns if col not in ('Label', 'Sequence_Length')]
total_counts = df[letter_columns].sum().sort_values(ascending=False)

# The four letters with the largest total count across the dataset
top_letters = total_counts.head(4).index.tolist()

# Scatter-plot matrix (pairplot) of those four letters, coloured by target class
fig = px.scatter_matrix(
    df,
    dimensions=top_letters,
    color='Label',
    opacity=0.4,
    title='Pairplot of Top 4 Most Frequent Letters by Target Class'
)

# The diagonal would plot each letter against itself, so hide it; small
# markers keep the many overlapping rows readable.
fig.update_traces(diagonal_visible=False, marker=dict(size=3))
fig.update_layout(width=900, height=900)
fig.show()

### 7. KDE Plot of Letter 'E' Across Target Classes

In [20]:
# One filled KDE curve per target class for the count of letter 'E'.
# `gaussian_kde` is imported in the notebook's import cell.
fig = go.Figure()

for label in df['Label'].unique():
    class_values = df.loc[df['Label'] == label, 'E']

    # gaussian_kde needs at least two distinct values; with constant data the
    # covariance is singular and the estimator raises, so skip that class
    # instead of aborting the whole figure.
    if class_values.nunique() < 2:
        print(f"Skipping KDE for {label}: 'E' has fewer than two distinct values")
        continue

    kde = gaussian_kde(class_values)
    # Evaluate the density on an even grid over this class's observed range
    x_range = np.linspace(class_values.min(), class_values.max(), 200)
    fig.add_trace(go.Scatter(
        x=x_range,
        y=kde(x_range),
        mode='lines',
        fill='tozeroy',
        name=f'Target {label}'
    ))

fig.update_layout(
    title="KDE Plot for Letter 'E' by Target",
    xaxis_title='Frequency of E',
    yaxis_title='Density',
    width=800,
    height=500
)

fig.show()

### 8. Heatmap of Mean Letter Frequencies Per Class

In [21]:
# Define the letter columns here rather than relying on whatever an earlier
# cell left in `letter_columns`. 'Sequence_Length' is excluded because its
# mean is an order of magnitude above any letter's and would saturate the
# colour scale, hiding the differences between letters.
letter_columns = [col for col in df.columns if col not in ('Label', 'Sequence_Length')]

# Mean count of each letter within each target class (rows: class, columns: letter)
mean_by_class = df.groupby('Label')[letter_columns].mean()

# Annotated heatmap; each cell shows the mean to one decimal place
fig = px.imshow(
    mean_by_class,
    color_continuous_scale='YlGnBu',
    text_auto='.1f',
    labels=dict(x='Letter', y='Target', color='Mean Frequency'),
    title='Mean Letter Frequency by Target Class',
    x=mean_by_class.columns,
    y=mean_by_class.index
)

fig.update_layout(
    width=900,
    height=600
)

fig.show()

### 9. Histogram of Letter 'T' with KDE Overlay

In [22]:
# Histogram of the letter 'T' counts with a KDE curve drawn on the same
# count axis. `go`, `np` and `gaussian_kde` come from the notebook's import cell.

# Extract the 'T' column data
data = df.loc[:, 'T']

# Fix the bin width explicitly. `nbinsx` is only an upper bound that Plotly
# feeds into its own bin-size heuristic, so the drawn bins need not be
# (max - min) / 30 wide. The KDE below is scaled by this width, so the
# histogram must use exactly the same value for the two to line up.
n_bins = 30
bin_width = (data.max() - data.min()) / n_bins

# Create histogram trace (raw counts per bin)
hist = go.Histogram(
    x=data,
    xbins=dict(size=bin_width),
    name='Histogram',
    marker=dict(color='coral'),
    opacity=0.7,
    histnorm=''
)

# Perform KDE and convert density to expected counts per bin:
# count ≈ density * number of observations * bin width
kde = gaussian_kde(dataset=data)
x_range = np.linspace(start=data.min(), stop=data.max(), num=200)
kde_vals = kde.evaluate(x_range) * len(data) * bin_width

# Create KDE trace
kde_line = go.Scatter(
    x=x_range,
    y=kde_vals,
    mode='lines',
    name='KDE',
    line=dict(color='darkred')
)

# Combine both traces in a figure
fig = go.Figure()
fig.add_trace(hist)
fig.add_trace(kde_line)

# Update layout
fig.update_layout(
    title="Histogram of Letter 'T'",
    xaxis_title="Frequency of T",
    yaxis_title="Count",
    bargap=0.2,
    template='plotly_white',
    width=800,
    height=500
)

# Display the figure
fig.show()
