# EDA ANALYSIS

In [None]:
import pandas as pd
import numpy as np
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import plotly.express as px
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go





# Read the crop dataset into the notebook-wide `data` frame that the
# exploratory cells below operate on.
data = pd.read_csv(filepath_or_buffer='crop_dataset.csv')

# Preview the first rows; as the final expression of the cell, the head is
# what the notebook renders.
print("First 5 rows of the dataset:")
data.head(n=5)

In [None]:
# A notebook cell only renders its final expression, so a bare
# `data.isnull().sum()` here would be computed and silently discarded.
# Print it explicitly so the per-column missing-value counts are visible.
print(data.isnull().sum())

# Descriptive statistics; rendered as the cell output.
data.describe()

In [None]:
# Per-column count of missing entries (`isna` is the pandas alias of `isnull`).
print("\nMissing values per column:")
data.isna().sum()

In [None]:
# Count rows that repeat an earlier row across all columns; the first
# occurrence of each row is not counted as a duplicate.
duplicates = data.duplicated(keep='first').sum()
print(f"\nNumber of duplicate rows: {duplicates}")

In [None]:
# Number of samples per crop label.
data.loc[:, 'label'].value_counts()

In [None]:
# Histogram of each selected feature, one per subplot, on a 4x2 grid
# (seven features, so the last grid cell stays empty).
columns = ['N', 'P', 'K', 'T2M_MAX-W', 'QV2M-W', 'Ph', 'PRECTOTCORR-W']
titles = [
    'Ratio of Nitrogen',
    'Ratio of Phosphorous',
    'Ratio of Potassium',
    'Distribution of Temperature (Winter)',
    'Distribution of Humidity (Winter)',
    'Distribution of pH',
    'Distribution of Rainfall (Winter)',
]
colors = ['royalblue', 'orange', 'green', 'red', 'purple', 'cyan', 'magenta']

fig = make_subplots(rows=4, cols=2, subplot_titles=titles)

for position, (feature, feature_title, bar_color) in enumerate(zip(columns, titles, colors)):
    # Fill the grid left-to-right, top-to-bottom; plotly positions are 1-based.
    grid_row, grid_col = divmod(position, 2)
    fig.add_trace(
        go.Histogram(
            x=data[feature],
            name=feature_title,
            opacity=0.75,
            marker={'color': bar_color},
            nbinsx=30,
        ),
        row=grid_row + 1,
        col=grid_col + 1,
    )

fig.update_layout(
    title_text='Crop Dataset Distributions',
    height=1000,
    showlegend=False,
    barmode='overlay',
)

# Restrict hover text to the bin position and its count.
fig.update_traces(hoverinfo='x+y')

fig.show()


In [None]:
# Per-crop bar charts: for each parameter, plot the mean value per crop and
# annotate (a) the crop with the highest mean for that parameter and (b) the
# crop with the highest overall average across all plotted parameters.
df = pd.read_csv("crop_dataset.csv")

parameters = {
    'N': 'Average Nitrogen Content Ratio',
    'P': 'Average Phosphorus Content',
    'K': 'Average Potassium Content',
    'PRECTOTCORR-W': 'Total Rainfall (Winter)',
    'T2M_MIN-W': 'Minimum Temperature (Winter)',
    'T2M_MAX-W': 'Maximum Temperature (Winter)',
    'QV2M-W': 'Humidity (Winter)'
}

has_label = 'label' in df.columns

# Only aggregate over parameters that actually exist in the file; indexing the
# groupby with a missing column would raise KeyError and defeat the per-column
# guard in the loop below.
available_params = [name for name in parameters if name in df.columns]

# The "resilient" crop does not depend on the parameter being plotted, so it
# is computed once here instead of on every loop iteration.
resilient_crop = None
if has_label and available_params:
    averages = df.groupby('label')[available_params].mean().reset_index()
    averages['Overall_Average'] = averages[available_params].mean(axis=1)
    resilient_crop = averages.loc[averages['Overall_Average'].idxmax()]

for param, title in parameters.items():
    if not has_label or param not in df.columns:
        print(f"The required column '{param}' or 'label' is not present in the dataset.")
        continue

    # Mean of the current parameter for every crop.
    parameter_data = df.groupby('label')[param].mean().reset_index()

    max_crop = parameter_data.loc[parameter_data[param].idxmax()]

    fig = px.bar(parameter_data,
                 x='label',
                 y=param,
                 title=title,
                 labels={param: title},
                 color=param,
                 color_continuous_scale=px.colors.sequential.Viridis)

    # Keep the axis anchored at 0 for non-negative data, but extend it
    # downwards when a crop has a negative mean so its bar stays visible.
    y_lower = min(0, parameter_data[param].min())
    y_upper = parameter_data[param].max() + 0.05

    fig.update_layout(
        height=600,
        xaxis_title='Crops',
        yaxis_title=title,
        yaxis=dict(range=[y_lower, y_upper]),
        xaxis_tickangle=-45
    )

    # Crop with the highest mean for this parameter.
    fig.add_annotation(
        x=max_crop['label'],
        y=max_crop[param],
        text=f"Highest: {max_crop['label']}",
        showarrow=True,
        arrowhead=2,
        ax=20,
        ay=-40,
        font=dict(size=12, color="green"),
        bgcolor="lightgreen",
        bordercolor="green",
        borderwidth=1,
        borderpad=4,
        opacity=0.8
    )

    # Crop with the highest overall average, shown at its value for this
    # parameter.
    fig.add_annotation(
        x=resilient_crop['label'],
        y=resilient_crop[param],
        text=f"Resilient: {resilient_crop['label']}",
        showarrow=True,
        arrowhead=2,
        ax=-20,
        ay=40,
        font=dict(size=12, color="blue"),
        bgcolor="lightblue",
        bordercolor="blue",
        borderwidth=1,
        borderpad=4,
        opacity=0.8
    )

    fig.show()


In [None]:
# Soil-related views: sample counts per soil colour, pH per crop, and
# nutrient distributions per crop.
df = pd.read_csv("crop_dataset.csv")

# Number of rows recorded for each soil colour.
soil_color_counts = df['Soilcolor'].value_counts().reset_index()
soil_color_counts.columns = ['Soil Color', 'Count']

fig1 = px.bar(soil_color_counts, x='Soil Color', y='Count',
              title='Crop Distribution by Soil Color',
              labels={'Count': 'Number of Crops'},
              color='Count',
              color_continuous_scale=px.colors.sequential.Viridis)

fig1.show()

fig2 = px.scatter(df, x='Ph', y='label', color='label',
                  title='pH Levels by Crop',
                  labels={'Ph': 'Soil pH', 'label': 'Crops'},
                  hover_data=['Ph'])

# pH is plotted on the x-axis (the y-axis holds crop names), so the reference
# bounds must be vertical lines at x=6.0 and x=7.5. Horizontal lines at those
# values would be drawn against the categorical crop axis instead.
fig2.add_vline(x=6.0, line_dash="dash", line_color="red", annotation_text="Optimal pH Range", annotation_position="top right")
fig2.add_vline(x=7.5, line_dash="dash", line_color="red")

fig2.show()

# One box plot per nutrient, grouped by crop.
nutrients = ['N', 'P', 'K', 'Zn', 'S']
for nutrient in nutrients:
    fig3 = px.box(df, x='label', y=nutrient,
                  title=f'{nutrient} Levels by Crop',
                  labels={'label': 'Crops', nutrient: f'{nutrient} Level'},
                  color='label')

    fig3.show()

In [None]:
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go

# Climate-related views per crop: humidity, max/min winter temperature, and
# average winter rainfall.
df = pd.read_csv("crop_dataset.csv")

fig1 = px.scatter(df, x='QV2M-W', y='label', color='label',
                  title='Humidity Levels by Crop',
                  labels={'QV2M-W': 'Humidity (Winter)', 'label': 'Crops'},
                  hover_data=['QV2M-W'])

fig1.show()

fig2 = px.scatter(df, x='T2M_MAX-W', y='label', color='label',
                  title='Maximum Temperature by Crop',
                  labels={'T2M_MAX-W': 'Max Temperature (Winter)', 'label': 'Crops'},
                  hover_data=['T2M_MAX-W'])

# Temperature is on the x-axis (the y-axis holds crop names), so the reference
# bounds are vertical lines; horizontal lines would be placed against the
# categorical crop axis rather than at the temperature values.
fig2.add_vline(x=15, line_dash="dash", line_color="red", annotation_text="Optimal Temp Range", annotation_position="top right")
fig2.add_vline(x=30, line_dash="dash", line_color="red")

fig2.show()

fig3 = px.scatter(df, x='T2M_MIN-W', y='label', color='label',
                  title='Minimum Temperature by Crop',
                  labels={'T2M_MIN-W': 'Min Temperature (Winter)', 'label': 'Crops'},
                  hover_data=['T2M_MIN-W'])

# Same axis orientation as above: thresholds apply to the x (temperature) axis.
fig3.add_vline(x=5, line_dash="dash", line_color="red", annotation_text="Optimal Temp Range", annotation_position="top right")
fig3.add_vline(x=20, line_dash="dash", line_color="red")

fig3.show()

rainfall_data = df.groupby('label')['PRECTOTCORR-W'].mean().reset_index()  # Average rainfall per crop

fig4 = px.bar(rainfall_data, x='label', y='PRECTOTCORR-W',
               title='Average Total Rainfall by Crop',
               labels={'PRECTOTCORR-W': 'Average Rainfall (Winter)'},
               color='PRECTOTCORR-W',
               color_continuous_scale=px.colors.sequential.Viridis)

fig4.show()