# Exploratory Data Analysis (EDA)

In [None]:
import pandas as pd
import numpy as np

# Load the crop dataset; the path is relative to the notebook's working directory.
data = pd.read_csv(filepath_or_buffer='crop_dataset.csv')

# Preview the top of the table. As the final expression of the cell, the
# DataFrame returned by head() is what the notebook renders.
print("First 5 rows of the dataset:")
data.head(n=5)

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric
# columns. Only the final expression of a notebook cell is rendered, so this
# cell holds a single expression; a bare `data.isnull().sum()` placed before it
# was computed and then silently discarded without ever being shown.
data.describe()

In [None]:
# Report the number of null entries in every column. The resulting Series is
# the cell's last expression, so the notebook renders it below the header.
print("\nMissing values per column:")
data.isna().sum()

In [None]:
# Count rows that are exact repeats of an earlier row: every column is
# compared, and the first occurrence of each row is not counted.
duplicates = data.duplicated(subset=None, keep='first').sum()
print(f"\nNumber of duplicate rows: {duplicates}")

In [None]:
# Number of rows per distinct value of the 'label' column, most frequent first.
data.loc[:, 'label'].value_counts()

In [None]:
import pandas as pd
import plotly.graph_objects as go
from plotly.subplots import make_subplots

# Draw one histogram per feature in a grid of subplots.
#
# `columns`, `titles` and `colors` are parallel lists: entry i of each one
# describes the same histogram (dataframe column, subplot title, bar colour).
columns = ['N', 'P', 'K', 'T2M_MAX-W', 'QV2M-W', 'Ph', 'PRECTOTCORR-W']
titles = [
    'Ratio of Nitrogen',
    'Ratio of Phosphorous',
    'Ratio of Potassium',
    'Distribution of Temperature (Winter)',
    'Distribution of Humidity (Winter)',
    'Distribution of pH',
    'Distribution of Rainfall (Winter)',
]
colors = [
    'royalblue',
    'orange',
    'green',
    'red',
    'purple',
    'cyan',
    'magenta',
]

# Derive the grid from the number of features instead of hard-coding it, so
# adding or removing a feature cannot place a trace in a row that does not
# exist. With the seven features above this yields the same 4 x 2 grid.
n_cols = 2
n_rows = (len(columns) + n_cols - 1) // n_cols  # ceiling division
row_height = 250  # pixels per grid row; 4 rows -> 1000 px total

fig = make_subplots(rows=n_rows, cols=n_cols, subplot_titles=titles)

for i, (col, title, color) in enumerate(zip(columns, titles, colors)):
    # Fill the grid left-to-right, top-to-bottom; plotly positions are 1-based.
    row = i // n_cols + 1
    col_idx = i % n_cols + 1

    histogram = go.Histogram(
        x=data[col],
        name=title,
        opacity=0.75,
        marker=dict(color=color),
        nbinsx=30,
    )

    fig.add_trace(histogram, row=row, col=col_idx)

fig.update_layout(
    title_text='Crop Dataset Distributions',
    height=row_height * n_rows,
    showlegend=False,
    barmode='overlay',
)

# Restrict hover labels to the bin position and its count.
fig.update_traces(hoverinfo='x+y')

fig.show()
