In [2]:
# Import libraries
import os

import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

## Data Understanding

In [8]:
class DataUnderstanding:
    """Load a CSV dataset and print quick exploratory summaries of it.

    Typical use is ``load_data()`` followed by any of the reporting methods.
    Every reporting method prints to stdout and returns ``None``; if no data
    has been loaded yet (``self.data is None``) it prints nothing.
    """

    def __init__(self, dataset_path):
        """Store the CSV location; nothing is read until ``load_data()``.

        Args:
            dataset_path: Path passed as-is to ``pandas.read_csv``.
        """
        self.dataset_path = dataset_path
        self.data = None

    def load_data(self):
        """Read the CSV at ``self.dataset_path`` into ``self.data``."""
        self.data = pd.read_csv(self.dataset_path)

    def display_basic_info(self):
        """Print the DataFrame's ``info()`` report followed by its first rows."""
        if self.data is not None:
            print("Dataset Info:")
            # DataFrame.info() writes its report to stdout itself and returns
            # None, so it must not be wrapped in print() -- doing so emitted a
            # stray "None" line after the report.
            self.data.info()
            print("\nSample Data:")
            print(self.data.head())

    def check_missing_values(self):
        """Print the per-column null counts, listing only columns with nulls."""
        if self.data is not None:
            missing_values = self.data.isnull().sum()
            print("\nMissing Values:")
            # Keep only the columns that actually have missing entries.
            print(missing_values[missing_values > 0])

    def generate_summary_statistics(self):
        """Print ``DataFrame.describe()`` for the loaded data."""
        if self.data is not None:
            numeric_summary = self.data.describe()
            print("\nSummary Statistics for Numeric Variables:")
            print(numeric_summary)


if __name__ == '__main__':
    # The default is the original author's local copy of Train.csv. Set the
    # CROP_YIELD_TRAIN_CSV environment variable to point at the file on any
    # other machine instead of editing this cell.
    default_train_csv = r'E:\Projects\Digital-Green-Crop-Yield-Estimate-Challenge\Data\Train.csv'
    dataset_path = os.environ.get('CROP_YIELD_TRAIN_CSV', default_train_csv)

    # Load the CSV, then print the info/head, missing-value and describe() reports.
    data_understanding = DataUnderstanding(dataset_path)
    data_understanding.load_data()
    data_understanding.display_basic_info()
    data_understanding.check_missing_values()
    data_understanding.generate_summary_statistics()

Dataset Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3870 entries, 0 to 3869
Data columns (total 44 columns):
 #   Column                              Non-Null Count  Dtype  
---  ------                              --------------  -----  
 0   ID                                  3870 non-null   object 
 1   District                            3870 non-null   object 
 2   Block                               3870 non-null   object 
 3   CultLand                            3870 non-null   int64  
 4   CropCultLand                        3870 non-null   int64  
 5   LandPreparationMethod               3870 non-null   object 
 6   CropTillageDate                     3870 non-null   object 
 7   CropTillageDepth                    3870 non-null   int64  
 8   CropEstMethod                       3870 non-null   object 
 9   RcNursEstDate                       3787 non-null   object 
 10  SeedingSowingTransplanting          3870 non-null   object 
 11  SeedlingsPerPit              

In [14]:
# Preview the first rows of the sample submission.
# NOTE(review): `sample_submission` is not defined in the visible cells --
# presumably loaded in an earlier cell; confirm it exists on a fresh kernel run.
sample_submission.head()

Unnamed: 0,ID,Yield
0,ID_F9XXEXN2ADR2,0
1,ID_SO3VW2X4QO93,0
2,ID_UKUQ7JM8E894,0
3,ID_QUISMWEZR2H4,0
4,ID_25JGI455VKCZ,0


In [15]:
# Split data for training and local testing.
# Features are the numeric columns only; the ID and the Yield target are
# dropped before the dtype filter so neither can end up in X.
X = train.drop(['ID', 'Yield'], axis = 1)
X = X.select_dtypes(include=np.number)
y = train.Yield

# Hold out 25% of the rows for local evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state = 1234)

# Instantiate model (seeded so repeated runs build the same forest)
model = RandomForestRegressor(random_state = 1234)

# Fit model; missing feature values are replaced with 0 before fitting
model.fit(X_train.fillna(0), y_train)

# Make predictions on the held-out rows, using the same 0-fill as in training
preds = model.predict(X_test.fillna(0))

# Measure model performance as RMSE.
# The `squared=False` argument of mean_squared_error was deprecated in
# scikit-learn 1.4 and removed in 1.6; taking the square root of the MSE gives
# the same value and works on every version.
np.sqrt(mean_squared_error(y_test, preds))

393.8160470653371

In [16]:
# Score the Zindi test set, restricted to the feature columns the model was
# fitted on, with missing values filled with 0
test_df = test[X.columns]
test_features_filled = test_df.fillna(0)
preds = model.predict(test_features_filled)

# Build the submission table (one predicted Yield per test ID) and write it
# to disk for upload to Zindi; the row index is left out of the CSV
submission_columns = {'ID': test.ID, 'Yield': preds}
sub = pd.DataFrame(submission_columns)
sub.to_csv('BenchmarkSubmission.csv', index = False)

# Show the first rows as a quick sanity check
sub.head()

Unnamed: 0,ID,Yield
0,ID_F9XXEXN2ADR2,671.4
1,ID_SO3VW2X4QO93,417.09
2,ID_UKUQ7JM8E894,493.95
3,ID_QUISMWEZR2H4,328.8
4,ID_25JGI455VKCZ,537.19
