## Import all required libraries

In [2]:
import numpy as np
import pandas as pd
from helper_functions import getColumnsWithMissingData
from helper_functions import getLowerAndUpperBounds
from sklearn.preprocessing import LabelEncoder

## Handle Missing Data

In [3]:
# load data
data_ = pd.read_csv('DataScientist_01_Assessment.csv')

# find missing data
columns_with_missing_data = getColumnsWithMissingData(data=data_)
print(columns_with_missing_data)

Index(['AgricultureLand', 'business_number', 'Loan_from',
       'organic_pesticide_expenditure', 'food_banana_wilt_diseases'],
      dtype='object')


In [4]:
# drop loan from and organic_pesticide_expenditure 
# because the missingness is way to large
data_.drop(columns=['Loan_from', 'organic_pesticide_expenditure'], 
          inplace=True)

In [5]:
# Fill AgricultureLand with mean
agriculture_land = data_['AgricultureLand']
agriculture_land.fillna(value=agriculture_land.mean(), 
                        inplace=True)
data_['AgricultureLand'] = agriculture_land

In [6]:
# Fill business number with mode
data_ = data_[data_.business_number != '`']
data_['business_number'].value_counts()
business_number = data_['business_number']
business_number.fillna(value='0',
                       inplace=True)
data_['business_number'] = business_number

In [7]:
# drop food_banana_wilt_diseases
data_.drop(labels='food_banana_wilt_diseases', 
          axis=1,
          inplace=True)

# save a copy
data_.to_csv('data_without_missing_values.csv', index=False)


## Handle Outliers

In [8]:
data2_ = data_.copy(deep=True)

agriculture_land_bounds = getLowerAndUpperBounds(data=data2_,
                                                 column='AgricultureLand')
print(agriculture_land_bounds)

data2_ = data2_[data2_['AgricultureLand'] < agriculture_land_bounds[1]]

data2_.to_csv('data_without_outliers.csv', index=False)

(-0.9999999999999998, 3.8)


## Convert Categorical to numeric

In [9]:
data3_ = pd.read_csv('data_without_outliers.csv')

# Assuming df is your DataFrame
label_encoder = LabelEncoder()

# List of categorical columns to encode
categorical_columns = data3_.select_dtypes(include=['object']).columns

# Apply Label Encoding
all_classes_ = []
for column in categorical_columns:
    data3_[column] = label_encoder.fit_transform(data3_[column])
    all_classes_.append(label_encoder.classes_)

data3_.to_csv(path_or_buf='categorized_data.csv', index=False)

In [51]:
data4_ = data3_.copy(deep=True)

# Define a function to categorize the values
def categorize_status(value):
    if value >= 2.15:
        return "On Track"
    elif value >= 1.77:
        return "At Risk"
    elif value >= 1.25:
        return "Struggling"
    else:
        return "Severely Struggling"

# Apply the function to create the new variable
data4_['ProgressStatus'] = data4_['HHIncome+Consumption+Residues/Day'].apply(categorize_status)

In [52]:
data4_.to_csv('data_with_progress.csv', index=False)

## Handle Imbalance

In [53]:
from imblearn.under_sampling import RandomUnderSampler

# Assuming 'data4_' is your original DataFrame
X_imbalanced = data4_.drop('ProgressStatus', axis=1)
y_imbalanced = data4_['ProgressStatus']

# Apply Random Under-Sampling
undersample = RandomUnderSampler(sampling_strategy='auto', random_state=42)
X_resampled, y_resampled = undersample.fit_resample(X_imbalanced, y_imbalanced)

# Create a new DataFrame with resampled data
resampled_data_ = pd.DataFrame(X_resampled, columns=X_imbalanced.columns)
resampled_data_['ProgressStatus'] = y_resampled

data4_.to_csv('resampled_data.csv', index=False)


## Split the data

In [54]:
data5_ = resampled_data_

from sklearn.model_selection import train_test_split

# Define features and target
X = data4_.drop('ProgressStatus', axis=1)
y = data4_['ProgressStatus']

# into training and testing sets 
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# into training and validation sets 
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.25, random_state=42)

# Print shapes to confirm
print(f"Training set shape: {X_train.shape}")
print(f"Validation set shape: {X_val.shape}")
print(f"Testing set shape: {X_test.shape}")


Training set shape: (2211, 72)
Validation set shape: (738, 72)
Testing set shape: (738, 72)


In [86]:
data5_.to_csv('cleaned_data.csv', index=False)