In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# 1. Data Loading and Initial Inspection
df = pd.read_csv('climate_action_data.csv')

print("Initial Dataset Shape:", df.shape)
print("\nInitial Data Info:")
print(df.info())
print("\nFirst few rows:")
print(df.head())

Initial Dataset Shape: (913, 10)

Initial Data Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 913 entries, 0 to 912
Data columns (total 10 columns):
 #   Column                         Non-Null Count  Dtype 
---  ------                         --------------  ----- 
 0   Sensor_ID                      913 non-null    object
 1   Date                           909 non-null    object
 2   Soil_Moisture(%)               913 non-null    object
 3   Soil_pH                        913 non-null    object
 4   Temperature(C)                 913 non-null    object
 5   Humidity(%)                    913 non-null    object
 6   Crop_Type                      909 non-null    object
 7   Fertilizer_Recommended(kg/ha)  913 non-null    object
 8   Irrigation_Recommended(mm)     913 non-null    object
 9   Drone_Image_ID                 913 non-null    object
dtypes: object(10)
memory usage: 71.5+ KB
None

First few rows:
  Sensor_ID        Date Soil_Moisture(%) Soil_pH Temperature(C) Humidi

In [2]:
# 2. Data Cleaning

# Replace 'error' with NaN
df = df.replace('error', np.nan)

# Convert columns to appropriate data types
numeric_columns = ['Soil_Moisture(%)', 'Soil_pH', 'Temperature(C)', 'Humidity(%)', 
                  'Fertilizer_Recommended(kg/ha)', 'Irrigation_Recommended(mm)']
for col in numeric_columns:
    df[col] = pd.to_numeric(df[col], errors='coerce')

# Remove duplicates
df = df.drop_duplicates()

# Handle missing values
# Fill numeric columns with median values
for col in numeric_columns:
    df[col] = df[col].fillna(df[col].median())

print("Dataset shape after cleaning:", df.shape)
print("\nMissing values after cleaning:")
print(df.isnull().sum())

Dataset shape after cleaning: (802, 10)

Missing values after cleaning:
Sensor_ID                        0
Date                             4
Soil_Moisture(%)                 0
Soil_pH                          0
Temperature(C)                   0
Humidity(%)                      0
Crop_Type                        4
Fertilizer_Recommended(kg/ha)    0
Irrigation_Recommended(mm)       0
Drone_Image_ID                   0
dtype: int64


In [3]:
# 3. Exploratory Data Analysis

# Descriptive statistics
print("Descriptive Statistics:")
print(df.describe())

# Create histograms for numeric variables
plt.figure(figsize=(15, 10))
df[numeric_columns].hist(bins=30, figsize=(15, 10))
plt.tight_layout()
plt.savefig('numeric_distributions.png')
plt.close()

# Correlation heatmap
plt.figure(figsize=(10, 8))
correlation_matrix = df[numeric_columns].corr()
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', center=0)
plt.title('Correlation Heatmap')
plt.tight_layout()
plt.savefig('correlation_heatmap.png')
plt.close()

Descriptive Statistics:
       Soil_Moisture(%)     Soil_pH  Temperature(C)  Humidity(%)  \
count        802.000000  802.000000      802.000000   802.000000   
mean          44.767756    6.248865       26.461471    60.034539   
std           20.460731    1.021573        4.863302    17.416023   
min           10.350000    4.520000       18.000000    30.000000   
25%           26.947500    5.330000       22.300000    46.025000   
50%           45.410000    6.260000       26.600000    59.600000   
75%           62.555000    7.080000       30.600000    75.275000   
max           79.980000    8.000000       35.000000    89.900000   

       Fertilizer_Recommended(kg/ha)  Irrigation_Recommended(mm)  
count                     802.000000                  802.000000  
mean                       87.456983                   17.392269  
std                        34.765654                    7.140127  
min                        30.000000                    5.000000  
25%                        5

<Figure size 1500x1000 with 0 Axes>

In [4]:
# 4. Analysis of Key Metrics

# Influence on fertilizer recommendations
correlations = df[numeric_columns].corr()['Fertilizer_Recommended(kg/ha)'].sort_values(ascending=False)
print("\nCorrelations with Fertilizer Recommendations:")
print(correlations)

# Average soil moisture by crop type
avg_moisture = df.groupby('Crop_Type')['Soil_Moisture(%)'].mean().sort_values(ascending=False)
print("\nAverage Soil Moisture by Crop Type:")
print(avg_moisture)

# Irrigation analysis for high temperatures
high_temp_irrigation = df[df['Temperature(C)'] > 30].groupby('Crop_Type')['Irrigation_Recommended(mm)'].mean()
print("\nAverage Irrigation for Crops in High Temperature (>30°C):")
print(high_temp_irrigation)


Correlations with Fertilizer Recommendations:
Fertilizer_Recommended(kg/ha)    1.000000
Soil_pH                          0.083672
Humidity(%)                      0.015307
Soil_Moisture(%)                -0.002767
Temperature(C)                  -0.007215
Irrigation_Recommended(mm)      -0.014122
Name: Fertilizer_Recommended(kg/ha), dtype: float64

Average Soil Moisture by Crop Type:
Crop_Type
Wheat       47.185349
Maize       44.955032
Beans       44.395855
Tomatoes    43.819286
Lettuce     42.986358
Name: Soil_Moisture(%), dtype: float64

Average Irrigation for Crops in High Temperature (>30°C):
Crop_Type
Beans       17.857692
Lettuce     16.297297
Maize       16.625455
Tomatoes    18.919565
Wheat       18.811429
Name: Irrigation_Recommended(mm), dtype: float64


In [5]:
# 5. Export cleaned dataset and generate insights report
df.to_csv('cleaned_precision_agriculture_data.csv', index=False)

# Generate insights report
insights = """
AgriSmart AI Data Analysis Insights:

1. Data Quality:
   - Initial dataset: 913 records
   - After cleaning: {cleaned_records} records
   - Removed duplicates and handled missing values

2. Key Findings:
   - Most influential factors for fertilizer recommendations:
     {fertilizer_correlations}
   
   - Crop type with highest soil moisture:
     {top_moisture_crop}
   
   - Irrigation recommendations for high temperatures:
     {high_temp_recommendations}

3. Recommendations:
   a) Soil Management:
      - Focus on maintaining optimal soil moisture levels
      - Monitor pH levels regularly
   
   b) Irrigation Strategy:
      - Adjust irrigation for crops in high-temperature conditions
      - Implement precision irrigation based on crop-specific needs
   
   c) Fertilization Optimization:
      - Consider soil moisture and pH when planning fertilization
      - Customize fertilizer applications by crop type

4. Next Steps:
   - Implement automated monitoring system
   - Develop crop-specific irrigation schedules
   - Create early warning system for extreme conditions
""".format(
    cleaned_records=len(df),
    fertilizer_correlations=correlations.head(3).to_string(),
    top_moisture_crop=avg_moisture.head(1).to_string(),
    high_temp_recommendations=high_temp_irrigation.to_string()
)

# Save insights to file
with open('agricultural_insights.txt', 'w') as f:
    f.write(insights)

print("Analysis complete. Results saved to files:")
print("1. cleaned_precision_agriculture_data.csv")
print("2. numeric_distributions.png")
print("3. correlation_heatmap.png")
print("4. agricultural_insights.txt")

Analysis complete. Results saved to files:
1. cleaned_precision_agriculture_data.csv
2. numeric_distributions.png
3. correlation_heatmap.png
4. agricultural_insights.txt
