# Exploratory Data Analysis (EDA)

## Data Description

### Import Required Libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from scipy.stats import gaussian_kde

### Dataset Description

In [2]:
# Read the CSV file from the working directory into the notebook-wide DataFrame
csv_path = 'genome_sequences.csv'
df = pd.read_csv(csv_path)

In [3]:
# Preview the first 5 rows
df.head(n=5)

Unnamed: 0,A,B,C,D,E,F,G,H,I,J,...,R,S,T,U,V,W,X,Y,Z,Label
0,3,0,0,0,1,3,1,0,2,0,...,1,1,4,0,5,0,0,0,0,Envelope
1,3,0,0,0,1,3,1,0,2,0,...,1,1,4,0,5,0,0,0,0,Envelope
2,3,0,0,0,1,3,1,0,2,0,...,1,1,4,0,5,0,0,0,0,Envelope
3,3,0,0,0,1,3,1,0,2,0,...,1,1,4,0,5,0,0,0,0,Envelope
4,3,0,0,0,1,3,1,0,2,0,...,1,1,4,0,5,0,0,0,0,Envelope


In [4]:
# Report the dataset dimensions (rows x columns)
n_rows, n_cols = df.shape
print(f"Rows: {n_rows}, Columns: {n_cols}")

Rows: 4020, Columns: 27


In [5]:
# Print a concise summary of the frame: index range, and for every column
# its non-null count and dtype (a quick check for missing values and types)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4020 entries, 0 to 4019
Data columns (total 27 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   A       4020 non-null   int64 
 1   B       4020 non-null   int64 
 2   C       4020 non-null   int64 
 3   D       4020 non-null   int64 
 4   E       4020 non-null   int64 
 5   F       4020 non-null   int64 
 6   G       4020 non-null   int64 
 7   H       4020 non-null   int64 
 8   I       4020 non-null   int64 
 9   J       4020 non-null   int64 
 10  K       4020 non-null   int64 
 11  L       4020 non-null   int64 
 12  M       4020 non-null   int64 
 13  N       4020 non-null   int64 
 14  O       4020 non-null   int64 
 15  P       4020 non-null   int64 
 16  Q       4020 non-null   int64 
 17  R       4020 non-null   int64 
 18  S       4020 non-null   int64 
 19  T       4020 non-null   int64 
 20  U       4020 non-null   int64 
 21  V       4020 non-null   int64 
 22  W       4020 non-null   

In [6]:
# Column names as a plain Python list
list(df.columns)

['A',
 'B',
 'C',
 'D',
 'E',
 'F',
 'G',
 'H',
 'I',
 'J',
 'K',
 'L',
 'M',
 'N',
 'O',
 'P',
 'Q',
 'R',
 'S',
 'T',
 'U',
 'V',
 'W',
 'X',
 'Y',
 'Z',
 'Label']

In [7]:
# Descriptive statistics (count, mean, std, min, quartiles, max) for the numeric columns;
# the object-dtype 'Label' column is left out by describe()'s default behaviour
df.describe()

Unnamed: 0,A,B,C,D,E,F,G,H,I,J,...,Q,R,S,T,U,V,W,X,Y,Z
count,4020.0,4020.0,4020.0,4020.0,4020.0,4020.0,4020.0,4020.0,4020.0,4020.0,...,4020.0,4020.0,4020.0,4020.0,4020.0,4020.0,4020.0,4020.0,4020.0,4020.0
mean,38.308458,0.0,10.302985,25.533831,19.18607,27.981841,40.056468,8.512687,28.735572,0.0,...,28.290299,24.236318,42.600746,39.254726,0.0,33.449502,6.180597,0.036567,20.989552,0.0
std,40.851248,0.0,19.534525,32.612858,29.012472,35.826667,40.553769,12.224596,33.498266,0.0,...,28.313182,20.061522,45.158426,48.005088,0.0,52.571327,6.12426,1.014651,28.552608,0.0
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,18.0,0.0,0.0,5.0,7.0,11.0,14.0,4.0,14.0,0.0,...,3.0,14.0,15.0,13.0,0.0,8.0,5.0,0.0,9.0,0.0
50%,37.0,0.0,3.0,24.0,11.0,13.0,42.0,4.0,14.0,0.0,...,34.0,29.0,35.0,32.0,0.0,13.0,5.0,0.0,11.0,0.0
75%,37.0,0.0,8.0,24.0,11.0,14.0,42.0,7.0,20.0,0.0,...,35.0,29.0,36.0,32.0,0.0,20.0,7.0,0.0,15.0,0.0
max,488.0,0.0,227.0,389.0,340.0,351.0,412.0,145.0,345.0,0.0,...,239.0,244.0,457.0,527.0,0.0,599.0,78.0,54.0,336.0,0.0


In [8]:
# Summary (count, unique, top, freq) of the object-dtype columns, i.e. the target 'Label'
df.describe(include=['object'])

Unnamed: 0,Label
count,4020
unique,4
top,Nucleocapsid
freq,1300


In [9]:
# Number of missing (NaN) entries in each column
df.isna().sum()

A        0
B        0
C        0
D        0
E        0
F        0
G        0
H        0
I        0
J        0
K        0
L        0
M        0
N        0
O        0
P        0
Q        0
R        0
S        0
T        0
U        0
V        0
W        0
X        0
Y        0
Z        0
Label    0
dtype: int64

In [10]:
# Number of distinct values per column; a result of 1 means the column is constant
# across the whole dataset and carries no information
df.nunique()

A        43
B         1
C        24
D        35
E        32
F        41
G        50
H        30
I        42
J         1
K        40
L        56
M        20
N        38
O         1
P        44
Q        38
R        35
S        54
T        40
U         1
V        41
W        16
X         9
Y        32
Z         1
Label     4
dtype: int64

In [11]:
# Count rows that are exact copies of an earlier row
# (duplicated() marks every repeat after the first occurrence)
df.duplicated().sum()

np.int64(3677)

In [12]:
# For each categorical (object-dtype) column, print how often every value occurs
categorical_cols = df.select_dtypes(include='object').columns

for col_name in categorical_cols:
    counts = df[col_name].value_counts()
    print(f"\nValue Counts for: {col_name}")
    print(counts)


Value Counts for: Label
Label
Nucleocapsid    1300
Membrane        1022
Spike           1018
Envelope         680
Name: count, dtype: int64


## Visualizations

### 1. Bar Plot of Total Letter Frequencies Across Dataset

In [15]:
# Every column except the target is a letter column
letter_columns = df.columns[df.columns != 'Label'].tolist()

# Sum each letter over all rows and rank the letters from most to least frequent
total_counts = df[letter_columns].sum().sort_values(ascending=False)

bar_kwargs = dict(
    x=total_counts.index,
    y=total_counts.values,
    color=total_counts.index,
    color_discrete_sequence=px.colors.sequential.Viridis,
    labels={'x': 'Letter', 'y': 'Total Count'},
    title='Total Frequency of Each Letter (A-Z)',
)
fig = px.bar(**bar_kwargs)
fig.show()

### 2. Boxplot of Each Letter Frequency

In [16]:
letter_columns = df.columns[df.columns != 'Label'].tolist()

# One box trace per letter column; only outlier points are drawn individually
box_traces = [
    go.Box(y=df[letter], name=letter, boxpoints='outliers')
    for letter in letter_columns
]

fig = go.Figure(data=box_traces)

# Fixed pixel size, with letter names rotated vertically on the x axis
fig.update_layout(
    title='Boxplot of Letter Frequencies (A-Z)',
    xaxis_title='Letter',
    yaxis_title='Frequency',
    xaxis_tickangle=-90,
    width=900,
    height=450,
)

fig.show()

### 3. Correlation Heatmap Between Letters

In [17]:
letter_columns = [col for col in df.columns if col != 'Label']

# Pearson correlation is undefined (NaN) for a zero-variance column, so letters that
# never vary would appear as blank rows/columns in the heatmap. Exclude them up front
# and say so, rather than silently plotting NaNs.
varying_letters = [col for col in letter_columns if df[col].nunique() > 1]
constant_letters = [col for col in letter_columns if col not in varying_letters]
if constant_letters:
    print(f"Excluded constant columns from correlation: {constant_letters}")

corr_matrix = df[varying_letters].corr()

fig = px.imshow(
    corr_matrix,
    color_continuous_scale='RdBu_r',  # diverging scale, similar to 'coolwarm'
    title='Correlation Heatmap Between Letter Frequencies',
    labels=dict(x="Letter", y="Letter", color="Correlation"),
    x=corr_matrix.columns,
    y=corr_matrix.index,
    zmin=-1,  # pin the colour range to the full correlation interval
    zmax=1
)

fig.update_layout(
    width=700,
    height=700
)

fig.show()

### 4. Countplot of Target Classes

In [18]:
# Count of rows per target class, one coloured bar per class
count_kwargs = dict(
    x='Label',
    color='Label',
    color_discrete_sequence=px.colors.qualitative.Set2,
    title='Target Class Distribution',
    labels={'Label': 'Target', 'count': 'Count'},
)
fig = px.histogram(df, **count_kwargs)

# The x axis already names each class, so the legend is hidden
fig.update_layout(
    xaxis_title='Target',
    yaxis_title='Count',
    showlegend=False,
    width=600,
    height=400,
)

fig.show()

### 5. Violin Plot: Frequency of Letter 'A' by Target Class

In [39]:
# Distribution of the 'A' column within each target class:
# violin outline + embedded box plot + every individual observation
violin_kwargs = dict(
    x='Label',
    y='A',
    color='Label',
    box=True,
    points='all',
    color_discrete_sequence=px.colors.qualitative.Pastel,  # soft palette, close to seaborn 'muted'
    title="Distribution of 'A' by Target Class",
)
fig = px.violin(df, **violin_kwargs)

fig.update_layout(
    xaxis_title="Label",
    yaxis_title="A",
    width=800,
    height=500,
)

fig.show()


### 6. Pairplot of Top 4 Frequent Letters Colored by Target

In [37]:
# Scatter-matrix ("pairplot") of the four letters with the largest total counts,
# coloured by target class. Uses `total_counts` computed in the bar-plot cell
# (section 1), so that cell must have been run first.
top_letters = total_counts.head(4).index.tolist()
labels = df['Label'].unique()
colors = ['red', 'green', 'blue', 'orange', 'purple']  # cycled if there are more classes than colours

n = len(top_letters)

fig = make_subplots(rows=n, cols=n,
                    shared_xaxes=False, shared_yaxes=False,
                    horizontal_spacing=0.02, vertical_spacing=0.02)

# Grid row i plots top_letters[i] on the y axis; grid column j plots top_letters[j] on the x axis.
for i, row_letter in enumerate(top_letters):
    for j, col_letter in enumerate(top_letters):
        for k, label in enumerate(labels):
            subset = df[df['Label'] == label]
            color = colors[k % len(colors)]  # wrap around instead of raising IndexError
            if i == j:
                # Diagonal: per-class histogram of the letter itself
                trace = go.Histogram(
                    x=subset[row_letter],
                    name=str(label),
                    marker_color=color,
                    opacity=0.6,
                    showlegend=(i == 0),  # legend entries only once (top-left panel)
                    nbinsx=20,
                )
            else:
                # Off-diagonal: scatter of the column letter (x) against the row letter (y)
                trace = go.Scatter(
                    x=subset[col_letter],
                    y=subset[row_letter],
                    mode='markers',
                    marker=dict(color=color, size=5),
                    name=str(label),
                    showlegend=False,
                )
            fig.add_trace(trace, row=i+1, col=j+1)

        # Axis titles and tick labels only on the outer edge of the grid:
        # x on the bottom row, y on the leftmost column.
        fig.update_xaxes(title_text=col_letter if i == n - 1 else '', row=i+1, col=j+1,
                         showticklabels=(i == n - 1))
        fig.update_yaxes(title_text=row_letter if j == 0 else '', row=i+1, col=j+1,
                         showticklabels=(j == 0))

fig.update_layout(
    height=900,
    width=900,
    # Semi-transparent histograms are meant to be layered; Plotly's default
    # bar mode ('group') would draw the classes side by side instead.
    barmode='overlay',
    title_text='Custom Pairplot with Histogram Diagonals',
)

fig.show()


### 7. KDE Plot of Letter 'E' Across Target Classes

In [23]:
# Gaussian kernel density estimate of the 'E' column, one filled curve per target class
fig = go.Figure()

for label in df['Label'].unique():
    data = df.loc[df['Label'] == label, 'E']

    # gaussian_kde needs at least two distinct values: with constant data the
    # covariance is singular and the constructor raises. Skip such classes explicitly.
    if data.nunique() < 2:
        print(f"Skipping KDE for {label}: 'E' has no variance in this class")
        continue

    kde = gaussian_kde(data)
    # Evaluate the density on an even grid spanning this class's observed range
    x_range = np.linspace(data.min(), data.max(), 200)
    fig.add_trace(go.Scatter(
        x=x_range,
        y=kde(x_range),
        mode='lines',
        fill='tozeroy',
        name=f'Target {label}'
    ))

fig.update_layout(
    title="KDE Plot for Letter 'E' by Target",
    xaxis_title='Frequency of E',
    yaxis_title='Density',
    width=800,
    height=500
)

fig.show()

### 8. Heatmap of Mean Letter Frequencies Per Class

In [26]:
# Average of every letter column within each target class
# (rows: classes, columns: letters). `letter_columns` comes from the earlier plotting cells.
mean_by_class = df.groupby('Label')[letter_columns].mean()

heatmap_kwargs = dict(
    x=mean_by_class.columns,
    y=mean_by_class.index,
    labels=dict(x='Letter', y='Target', color='Mean Frequency'),
    color_continuous_scale='YlGnBu',
    text_auto='.1f',  # annotate each cell with its value to one decimal place
    title='Mean Letter Frequency by Target Class',
)
fig = px.imshow(mean_by_class, **heatmap_kwargs)

fig.update_layout(width=900, height=600)

fig.show()

### 9. Histogram of Letter 'T'

In [38]:
# Data for letter 'T'
data = df['T']

# Pin the bin width explicitly. `nbinsx` is only an upper bound on the number of bins
# (Plotly picks its own "nice" width), so a KDE scaled by range/30 would not line up
# with the bars actually drawn. With `xbins.size` fixed, the scale factor is exact.
N_BINS = 30
t_min, t_max = float(data.min()), float(data.max())
bin_width = (t_max - t_min) / N_BINS

# Histogram (raw counts)
hist = go.Histogram(
    x=data,
    # End one bin past the maximum so the largest value falls inside a bin
    # rather than on the open right edge of the last one.
    xbins=dict(start=t_min, end=t_max + bin_width, size=bin_width),
    marker_color='coral',
    name='Histogram',
    opacity=0.7,
    histnorm=''  # raw counts
)

# KDE calculation: density * number of samples * bin width = expected count per bin
kde = gaussian_kde(data)
x_range = np.linspace(t_min, t_max, 200)
kde_vals = kde(x_range) * len(data) * bin_width

kde_line = go.Scatter(
    x=x_range,
    y=kde_vals,
    mode='lines',
    line=dict(color='darkred'),
    name='KDE'
)

fig = go.Figure(data=[hist, kde_line])

fig.update_layout(
    title="Histogram of Letter 'T'",
    xaxis_title="Frequency of T",
    yaxis_title="Count",
    bargap=0.2,
    template='plotly_white',
    width=800,
    height=500
)

fig.show()
