In [11]:
import pandas as pd 

def import_data(path):
    """
    Load a CSV file into a pandas DataFrame.

    Parameters
    ----------
    path : str or path-like
        Location of the CSV file; handed straight to ``pd.read_csv``.

    Returns
    -------
    pandas.DataFrame
        The parsed contents of the file, using ``pd.read_csv`` defaults.
    """
    return pd.read_csv(path)

# Load the CSV at a hard-coded absolute Windows path into the module-level `df`.
# NOTE(review): the path is machine-specific — adjust it before running elsewhere.
df = import_data(r"C:\Users\user\Desktop\Crop Yield App\corn_data.csv")


def config_columns(data):
    """
    Return a copy of ``data`` with normalised column labels.

    Each column label has its leading/trailing whitespace stripped and is
    converted to lower case.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame whose column labels are strings.

    Returns
    -------
    pandas.DataFrame
        A new frame with the cleaned labels. The frame passed in is left
        unchanged.
    """
    # Copy first so the renaming never leaks back into the caller's frame.
    data = data.copy()
    data.columns = data.columns.str.strip().str.lower()
    return data


def inspect_data(data):
    """
    Print a quick overview of a DataFrame.

    Four sections are written to standard output, in this order: the shape,
    the ``DataFrame.info()`` report (columns, non-null counts, dtypes), the
    number of missing values per column, and the number of duplicated rows.

    Parameters
    ----------
    data : pandas.DataFrame
        The frame to inspect. It is only read, never modified.

    Returns
    -------
    None
    """
    print("The shape of the dataset")
    print(data.shape)

    print()

    print("The structure and datatype of the dataset")
    # DataFrame.info() writes its report to stdout itself and returns None,
    # so wrapping it in print() would emit a stray "None" line.
    data.info()

    print()

    print("The missing values in the dataset")
    print(data.isna().sum())

    print()

    print("The duplicated rows in the dataset:", data.duplicated().sum())
    
    



def clean_data(data):
    """
    Return a copy of ``data`` with missing values imputed.

    Text-like columns (object or string dtype) have their missing entries
    replaced by the literal ``'unknown'``. Every other column has its missing
    entries replaced by that column's mean.

    Parameters
    ----------
    data : pandas.DataFrame
        The frame to clean. It is not modified.

    Returns
    -------
    pandas.DataFrame
        A new frame with the imputed values.
    """
    data = data.copy()
    for column in data.columns:
        series = data[column]
        dtype = series.dtype
        is_text = pd.api.types.is_object_dtype(dtype) or pd.api.types.is_string_dtype(dtype)
        fill_value = 'unknown' if is_text else series.mean()
        # Assign the result back to the frame: an in-place fillna on
        # ``data[column]`` operates on a temporary object and does not
        # reliably update ``data`` (it is a no-op under Copy-on-Write).
        data[column] = series.fillna(fill_value)
    return data
 

def get_counts(data):
    """
    Print the frequency table of every column.

    For each column, in column order, the result of ``value_counts()`` is
    printed to standard output. Nothing is returned.
    """
    snapshot = data.copy()
    for column_name in snapshot:
        print(snapshot[column_name].value_counts())




def split_data(data, target="Yield"):
    """
    Split a frame into independent variables and the dependent variable.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing the feature columns and the target column.
    target : str, default "Yield"
        Label of the dependent variable. An exact match is preferred; if
        there is none, the first column whose label matches after stripping
        whitespace and ignoring case is used, so a frame whose headers were
        lower-cased (``yield``) is still split correctly.

    Returns
    -------
    tuple
        ``(x, y)`` where ``x`` is ``data`` without the target column and
        ``y`` is the target column.

    Raises
    ------
    KeyError
        If no column matches ``target``.
    """
    if target in data.columns:
        column = target
    else:
        wanted = str(target).strip().lower()
        matches = [c for c in data.columns if str(c).strip().lower() == wanted]
        if not matches:
            raise KeyError(f"{target!r} not found in columns")
        column = matches[0]

    x = data.drop(columns=[column])
    y = data[column]
    return x, y



def train_set(a, b, test_size=0.25, random_state=42):
    """
    Split features and target into training and test subsets.

    Parameters
    ----------
    a : array-like or pandas.DataFrame
        The independent variables.
    b : array-like or pandas.Series
        The dependent variable, aligned with ``a``.
    test_size : float or int, default 0.25
        Forwarded to ``train_test_split``.
    random_state : int or None, default 42
        Forwarded to ``train_test_split``.

    Returns
    -------
    tuple
        ``(x_train, x_test, y_train, y_test)``.
    """
    from sklearn.model_selection import train_test_split

    x_train, x_test, y_train, y_test = train_test_split(
        a, b, test_size=test_size, random_state=random_state
    )
    return x_train, x_test, y_train, y_test




def pipe(x):
    """
    Build the preprocessing transformer for the feature frame ``x``.

    Numeric columns are standardised with ``StandardScaler``; object and
    categorical columns are encoded with ``OrdinalEncoder``, configured so
    that categories unseen during fitting are encoded as ``-1``. Columns in
    neither group are dropped.

    Parameters
    ----------
    x : pandas.DataFrame
        Feature frame; only its column dtypes are inspected here.

    Returns
    -------
    sklearn.compose.ColumnTransformer
        The unfitted transformer.
    """
    from sklearn.compose import ColumnTransformer
    from sklearn.pipeline import Pipeline
    from sklearn.preprocessing import OrdinalEncoder, StandardScaler

    # 'number' selects every numeric dtype. The previous ['int', 'float']
    # selector only matched some integer/float widths, and anything it missed
    # was silently discarded by remainder='drop'.
    numeric_columns = list(x.select_dtypes(include=['number']).columns)
    categorical_columns = list(x.select_dtypes(include=['object', 'category']).columns)

    num_pipe = Pipeline([
        ('scaler', StandardScaler()),
    ])

    # The step keeps its original name 'scaler' so existing parameter paths
    # (e.g. 'pipe2__scaler__...') stay valid, even though it is an encoder.
    cat_pipe = Pipeline([
        ('scaler', OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)),
    ])

    transformer = ColumnTransformer(
        transformers=[
            ('pipe1', num_pipe, numeric_columns),
            ('pipe2', cat_pipe, categorical_columns),
        ],
        remainder='drop',
        n_jobs=-1,
    )
    return transformer




def models():
    """
    Fit a set of regressors and print their test-set metrics.

    Each regressor is chained after the module-level ``processor`` in a
    pipeline, fitted on ``x_train``/``y_train`` and used to predict
    ``x_test``. The R2 score, MAE and MAPE against ``y_test`` are printed for
    each one. ``processor``, ``x_train``, ``x_test``, ``y_train`` and
    ``y_test`` are read from module scope; nothing is returned.
    """
    from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
    from xgboost import XGBRegressor
    from sklearn.linear_model import LinearRegression
    from sklearn.tree import DecisionTreeRegressor
    from sklearn.pipeline import make_pipeline
    from sklearn.metrics import mean_absolute_error, mean_absolute_percentage_error, r2_score

    candidates = {
        'RF': RandomForestRegressor(),
        'XGB': XGBRegressor(),
        'LR': LinearRegression(),
        'GR': GradientBoostingRegressor(),
        'TREE': DecisionTreeRegressor(),
    }

    for name, regressor in candidates.items():
        estimator = make_pipeline(processor, regressor)
        fitted = estimator.fit(x_train, y_train)
        y_predict = fitted.predict(x_test)

        r2 = r2_score(y_test, y_predict)
        print(f"The R2_Score of the {name} model is {r2:.2f}")
        mae = mean_absolute_error(y_test, y_predict)
        print(f"The MAE of the {name} model is: {mae:.2f}")
        mape = mean_absolute_percentage_error(y_test, y_predict)
        print(f"The MAPE of the {name} model is: {mape:.2f}")

# Run the workflow end to end: normalise headers, inspect, clean, explore,
# split, build the preprocessor, then fit and score the models.
df = config_columns(df)
inspect_data(df)
# clean_df was used below without being assigned in this cell; build it from
# the freshly configured frame so the cell does not depend on stale state.
clean_df = clean_data(df)
get_counts(clean_df)
x, y = split_data(clean_df)
x_train, x_test, y_train, y_test = train_set(x, y)
# models() reads processor and the train/test splits from module scope.
processor = pipe(x)
models()

The shape of the dataset
(422, 22)

The structure and datatype of the dataset
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 422 entries, 0 to 421
Data columns (total 22 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   county                422 non-null    object 
 1   farmer                422 non-null    object 
 2   education             396 non-null    object 
 3   gender                422 non-null    object 
 4   age bracket           422 non-null    object 
 5   household size        422 non-null    int64  
 6   crop                  422 non-null    object 
 7   acreage               351 non-null    float64
 8   fertilizer amount     422 non-null    int64  
 9   laborers              422 non-null    int64  
 10  yield                 422 non-null    int64  
 11  power source          422 non-null    object 
 12  water source          422 non-null    object 
 13  main credit source    422 non-null    object 
 