In [2]:
# Import libraries
import os

import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

## Data Understanding

### Train data

In [8]:
class DataUnderstanding:
    """Load a CSV dataset and print quick exploratory summaries of it.

    The reporting methods print nothing until ``load_data`` (or a direct
    assignment) has populated ``self.data``.
    """

    def __init__(self, dataset_path):
        """Store the dataset location; nothing is read until ``load_data``.

        Args:
            dataset_path: Location of the CSV file, passed as-is to
                ``pd.read_csv``.
        """
        self.dataset_path = dataset_path
        self.data = None

    def load_data(self):
        """Read the CSV at ``self.dataset_path`` into ``self.data``."""
        self.data = pd.read_csv(self.dataset_path)

    def display_basic_info(self):
        """Print the column/dtype overview followed by the first rows."""
        if self.data is not None:
            print("Dataset Info:")
            # DataFrame.info() writes its report itself and returns None, so
            # wrapping it in print() appended a stray "None" line to the output.
            self.data.info()
            print("\nSample Data:")
            print(self.data.head())

    def check_missing_values(self):
        """Print the null count of every column that has at least one null."""
        if self.data is not None:
            missing_values = self.data.isnull().sum()
            print("\nMissing Values:")
            # Keep only the columns that actually contain nulls
            print(missing_values[missing_values > 0])

    def generate_summary_statistics(self):
        """Print ``DataFrame.describe()`` for the loaded data."""
        if self.data is not None:
            numeric_summary = self.data.describe()
            print("\nSummary Statistics for Numeric Variables:")
            print(numeric_summary)


if __name__ == '__main__':
    # Directory holding the competition CSVs. The original machine-specific
    # location stays the default so existing runs are unaffected; set the
    # CROP_YIELD_DATA_DIR environment variable to run on another machine.
    data_dir = os.environ.get(
        'CROP_YIELD_DATA_DIR',
        r'E:\Projects\Digital-Green-Crop-Yield-Estimate-Challenge\Data',
    )
    dataset_path = os.path.join(data_dir, 'Train.csv')

    # Load the training data and print its structure, missing values and
    # summary statistics
    data_understanding = DataUnderstanding(dataset_path)
    data_understanding.load_data()
    data_understanding.display_basic_info()
    data_understanding.check_missing_values()
    data_understanding.generate_summary_statistics()

Dataset Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3870 entries, 0 to 3869
Data columns (total 44 columns):
 #   Column                              Non-Null Count  Dtype  
---  ------                              --------------  -----  
 0   ID                                  3870 non-null   object 
 1   District                            3870 non-null   object 
 2   Block                               3870 non-null   object 
 3   CultLand                            3870 non-null   int64  
 4   CropCultLand                        3870 non-null   int64  
 5   LandPreparationMethod               3870 non-null   object 
 6   CropTillageDate                     3870 non-null   object 
 7   CropTillageDepth                    3870 non-null   int64  
 8   CropEstMethod                       3870 non-null   object 
 9   RcNursEstDate                       3787 non-null   object 
 10  SeedingSowingTransplanting          3870 non-null   object 
 11  SeedlingsPerPit              


- The train dataset comprises 3,870 entries (rows) and 44 columns (variables).
- The data types of the columns are categorized as follows:
    - Numeric Data:
        - 14 columns are of type float64 (representing floating-point numeric values).
        - 7 columns are of type int64 (representing integer values).
    - Categorical Data:
        - 23 columns are of type object (representing text or categorical data).
- The description of each variable is given [here](https://github.com/kanevundi/Digital-Green-Crop-Yield-Estimate-Challenge/blob/master/Data/VariableDescription.csv)
- 23 columns in the dataset contain missing values. These include **RcNursEstDate** with 83 missing values, **Ganaura** with a substantial 2,417 missing values, **CropOrgFYM** with 2,674 missing values, and **TransplantingIrrigationPowerSource** with 503 missing values. Additionally, other columns such as **TransIrriCost**, **StandingWater**, **OrgFertilizers**, **PCropSolidOrgFertAppMethod**, **BasalDAP**, **BasalUrea**, **FirstTopDressFert**, **1tdUrea**, **1appDaysUrea**, **2tdUrea**, **2appDaysUrea**, and **MineralFertAppMethod.1** also contain missing data to varying degrees. Identifying and addressing missing values in these columns is essential for data quality and analysis.

- The dataset contains a number of numerical variables that provide information on crop yields and agricultural methods. For example, the "CultLand" column shows a great diversity in the areas of cultivated land, with an average area of about 28.53 acres and a substantial standard deviation of 30.45. A similar pattern can be seen in the "CropCultLand" column, which has a standard deviation of 27.99 and an average of roughly 24.73 acres. These figures point to a broad range of land sizes in the dataset that are utilized for crop cultivation. The "SeedlingsPerPit" column displays a rather high standard deviation of 7.62 and an average of 2.71, suggesting some fluctuation in the number of seedlings planted per pit. Additional numerical factors, such as "BasalDAP," "StandingWater," "TransplantingIrrigationHours," and "Yield," show comparable patterns of variability and offer insightful information about farming methods and crop results.


### Test data

In [9]:
if __name__ == '__main__':
    # Directory holding the competition CSVs. The original machine-specific
    # location stays the default so existing runs are unaffected; set the
    # CROP_YIELD_DATA_DIR environment variable to run on another machine.
    data_dir = os.environ.get(
        'CROP_YIELD_DATA_DIR',
        r'E:\Projects\Digital-Green-Crop-Yield-Estimate-Challenge\Data',
    )
    dataset_path = os.path.join(data_dir, 'Test.csv')

    # Load the test data and print its structure, missing values and
    # summary statistics
    data_understanding = DataUnderstanding(dataset_path)
    data_understanding.load_data()
    data_understanding.display_basic_info()
    data_understanding.check_missing_values()
    data_understanding.generate_summary_statistics()

Dataset Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1290 entries, 0 to 1289
Data columns (total 43 columns):
 #   Column                              Non-Null Count  Dtype  
---  ------                              --------------  -----  
 0   ID                                  1290 non-null   object 
 1   District                            1290 non-null   object 
 2   Block                               1290 non-null   object 
 3   CultLand                            1290 non-null   int64  
 4   CropCultLand                        1290 non-null   int64  
 5   LandPreparationMethod               1290 non-null   object 
 6   CropTillageDate                     1290 non-null   object 
 7   CropTillageDepth                    1290 non-null   int64  
 8   CropEstMethod                       1290 non-null   object 
 9   RcNursEstDate                       1247 non-null   object 
 10  SeedingSowingTransplanting          1290 non-null   object 
 11  SeedlingsPerPit              

- The test dataset comprises 1,290 entries (rows) and 43 columns (variables).

- The data types of the columns are categorized as follows:
    - Numeric Data:
        - 14 columns are of type float64, representing floating-point numeric values.
        - 6 columns are of type int64, representing integer values.
    - Categorical Data:
        - 23 columns are of type object, representing text or categorical data.

- 23 columns in the dataset contain missing values, including **RcNursEstDate** with 43 missing values, **Ganaura** with a substantial 795 missing values, **CropOrgFYM** with 868 missing values, and **TransplantingIrrigationPowerSource** with 331 missing values. Additionally, other columns such as **TransIrriCost**, **StandingWater**, **OrgFertilizers**, **PCropSolidOrgFertAppMethod**, **BasalDAP**, **BasalUrea**, **FirstTopDressFert**, **1tdUrea**, **1appDaysUrea**, **2tdUrea**, and **2appDaysUrea** also contain missing data to varying degrees. Addressing these missing values is essential to ensure data quality for analysis.

- The dataset contains a number of numerical variables that provide information on crop yields and agricultural methods. For example, the "CultLand" column shows a wide diversity in the areas of cultivated land, with an average of approximately 29.20 acres and a significant standard deviation of 24.65. A similar pattern can be seen in the "CropCultLand" column, which has a standard deviation of 23.29 and an average of roughly 25.54 acres, indicating a broad range of land sizes in the dataset used for crop cultivation. The "SeedlingsPerPit" column displays some variability, with an average of approximately 2.60 seedlings planted per pit and a standard deviation of 2.05, indicating fluctuations in planting practices. Additional numerical variables, such as "BasalDAP," "StandingWater," and "TransplantingIrrigationHours," exhibit similar patterns of variability and offer valuable insights into farming practices and crop outcomes.

- The variable '**Yield**' is present in the training dataset but not in the test dataset. This indicates that 'Yield' is the target or dependent variable that we want to predict.

In [14]:
# Preview the first rows of the sample submission frame.
# NOTE(review): `sample_submission` is not created in any cell shown here — confirm it is loaded earlier.
sample_submission.head()

Unnamed: 0,ID,Yield
0,ID_F9XXEXN2ADR2,0
1,ID_SO3VW2X4QO93,0
2,ID_UKUQ7JM8E894,0
3,ID_QUISMWEZR2H4,0
4,ID_25JGI455VKCZ,0


In [15]:
# Split data for training and local testing
# NOTE(review): `train` is not created in any cell shown here — confirm it is loaded earlier.

# Features: the numeric columns that remain after dropping the identifier and the target
X = train.drop(['ID', 'Yield'], axis = 1)
X = X.select_dtypes(include=np.number)
y = train.Yield

# Hold out 25% of the rows for local evaluation (fixed seed for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state = 1234)

# Instantiate model
model = RandomForestRegressor(random_state = 1234)

# Fit model (missing feature values are replaced with 0)
model.fit(X_train.fillna(0), y_train)

# Make predictions on the held-out split, using the same fill value as in training
preds = model.predict(X_test.fillna(0))

# Measure model performance as RMSE.
# `mean_squared_error(..., squared=False)` was deprecated in scikit-learn 1.4 and
# removed in 1.6; taking the square root of the MSE gives the same value on
# every scikit-learn version.
np.sqrt(mean_squared_error(y_test, preds))

393.8160470653371

In [16]:
# Score the Zindi test set, restricted to the columns of X
test_df = test.loc[:, X.columns]
preds = model.predict(test_df.fillna(0))

# Create the submission file to be uploaded to Zindi for scoring
sub = test[['ID']].assign(Yield=preds)
sub.to_csv('BenchmarkSubmission.csv', index=False)

sub.head()

Unnamed: 0,ID,Yield
0,ID_F9XXEXN2ADR2,671.4
1,ID_SO3VW2X4QO93,417.09
2,ID_UKUQ7JM8E894,493.95
3,ID_QUISMWEZR2H4,328.8
4,ID_25JGI455VKCZ,537.19
