In [1]:
import numpy as np
import pandas as pd
from helper_functions import getColumnsWithMissingData
from helper_functions import getLowerAndUpperBounds
from sklearn.preprocessing import LabelEncoder

In [68]:
# load data
data_ = pd.read_csv('DataScientist_01_Assessment.csv')

# Fill business number with mode
data_ = data_[data_.business_number != '`']
business_number = data_['business_number']
business_number.value_counts().index

Index(['0', '1', '2'], dtype='object', name='business_number')

## Handle Missing Data

In [69]:
# load data
data_ = pd.read_csv('DataScientist_01_Assessment.csv')

# data info
# print(data_.info())

# find missing data
columns_with_missing_data = getColumnsWithMissingData(data=data_)
print(columns_with_missing_data)

# drop loan from and organic_pesticide_expenditure 
# because the missingness is way to large
data_.drop(columns=['Loan_from', 'organic_pesticide_expenditure'], 
          inplace=True)

# Fill AgricultureLand with mean
agriculture_land = data_['AgricultureLand']
agriculture_land.fillna(value=agriculture_land.mean(), 
                        inplace=True)
data_['AgricultureLand'] = agriculture_land

# Fill business number with mode
data_ = data_[data_.business_number != '`']
data_['business_number'].value_counts()
business_number = data_['business_number']
business_number.fillna(value='0',
                       inplace=True)
data_['business_number'] = business_number

# drop food_banana_wilt_diseases
data_.drop(labels='food_banana_wilt_diseases', 
          axis=1,
          inplace=True)

# save a copy
data_.to_csv('data_without_missing_values.csv', index=False)


Index(['AgricultureLand', 'business_number', 'Loan_from',
       'organic_pesticide_expenditure', 'food_banana_wilt_diseases'],
      dtype='object')


## Handle Outliers

In [49]:
data2_ = data_.copy(deep=True)
print(data2_)

agriculture_land_bounds = getLowerAndUpperBounds(data=data2_,
                                                 column='AgricultureLand')
print(agriculture_land_bounds)

data2_ = data2_[data2_['AgricultureLand'] < agriculture_land_bounds[1]]

data2_.to_csv('data_without_outliers.csv', index=False)

     District    Cluster             Village        HouseHoldID  \
0     Kanungu  Rutendere            Kangyeyo  KAN-KAN-FER-K1768   
1     Kanungu    Kihanda           Omurwambu  KAN-OMU-KAN-K4711   
2     Kanungu   Bujengwe              Byumba  KAN-BYU-ANN-K4055   
3     Kanungu   Bujengwe              Byumba  KAN-BYU-YUS-K4026   
4     Kanungu   Bujengwe              Kazahi  KAN-KAZ-FAU-K2057   
...       ...        ...                 ...                ...   
3892  Rubanda   Nyamweru  Mushongati_Nangara     RUB-MUS-M-2805   
3893  Rubanda   Nyamweru             Mirindi    RUB-MIR-M-23177   
3894  Rubanda    Mugyera           Muruhinga    RUB-MUR-F-34300   
3895  Rubanda   Nyamweru            Kakarisa     RUB-KAK-M-2217   
3896  Rubanda    Mugyera              Butusi     RUB-BUT-M-6739   

      HouseholdSize  TimeToOPD  TimeToWater  AgricultureLand  \
0                 3         60         35.0         0.250000   
1                 3        120         10.0         1.000000   
2  

## Convert Categorical to numeric

In [50]:
data3_ = pd.read_csv('data_without_outliers.csv')

# Assuming df is your DataFrame
label_encoder = LabelEncoder()

# List of categorical columns to encode
categorical_columns = data3_.select_dtypes(include=['object']).columns
print(categorical_columns)

# Apply Label Encoding
all_classes_ = []
for column in categorical_columns:
    data3_[column] = label_encoder.fit_transform(data3_[column])
    all_classes_.append(label_encoder.classes_)

print(all_classes_)

data3_.to_csv(path_or_buf='categorized_data.csv', index=False)

Index(['District', 'Cluster', 'Village', 'HouseHoldID'], dtype='object')
[array(['Kanungu', 'Mitooma', 'Rubanda', 'Rukungiri'], dtype=object), array(['Bujengwe', 'Ibarya', 'Kacence', 'Kagati', 'Kahoko', 'Kihanda',
       'Kitahurira', 'Kitugunda', 'Kiyanga', 'Mugyera', 'Murama', 'Ngoma',
       'Nyamweru', 'Rutendere', 'Rwanja East', 'Rwoburunga'], dtype=object), array(['Bugarura', 'Buhumuriro', 'Bujerengye', 'Bukiriro_1', 'Bukiriro_2',
       'Burambo', 'Buranda', 'Burera', 'Bushure', 'Butoboore', 'Butusi',
       'Byumba', 'Ibarya', 'Iterero', 'Kababagi', 'Kabaranga', 'Kabingo',
       'Kabirizi', 'Kacence', 'Kacerere', 'Kafunjo', 'Kagaana',
       'Kagati_Kiyanga', 'Kagorogoro', 'Kagyeyo', 'Kahama', 'Kahara',
       'Kairabwa', 'Kakarisa', 'Kakimba', 'Kakyeza', 'Kamabare',
       'Kamuhozi', 'Kanganga', 'Kangyeyo', 'Kanoni', 'Kanyabutaye',
       'Kanyankwanzi', 'Karokarungi', 'Karondo', 'Karukare', 'Karukonjo',
       'Kashambya', 'Kashambya_Mugyera', 'Kashasha_Ikumba', 'Kataburaza

In [51]:
data4_ = data3_.copy(deep=True)

# Define a function to categorize the values
def categorize_status(value):
    if value >= 2.15:
        return "On Track"
    elif value >= 1.77:
        return "At Risk"
    elif value >= 1.25:
        return "Struggling"
    else:
        return "Severely Struggling"

# Apply the function to create the new variable
data4_['ProgressStatus'] = data4_['HHIncome+Consumption+Residues/Day'].apply(categorize_status)


In [52]:
data4_.to_csv('data_with_progress.csv', index=False)

## Handle Imbalance

In [53]:
from imblearn.under_sampling import RandomUnderSampler

# Assuming 'data4_' is your original DataFrame
X_imbalanced = data4_.drop('ProgressStatus', axis=1)
y_imbalanced = data4_['ProgressStatus']

# Apply Random Under-Sampling
undersample = RandomUnderSampler(sampling_strategy='auto', random_state=42)
X_resampled, y_resampled = undersample.fit_resample(X_imbalanced, y_imbalanced)

# Create a new DataFrame with resampled data
resampled_data_ = pd.DataFrame(X_resampled, columns=X_imbalanced.columns)
resampled_data_['ProgressStatus'] = y_resampled

data4_.to_csv('resampled_data.csv', index=False)


## Split the data

In [54]:
data5_ = resampled_data_

from sklearn.model_selection import train_test_split

# Assuming 'data4_' is your DataFrame with 'ProgressStatus' column

# Define features and target
X = data4_.drop('ProgressStatus', axis=1)
y = data4_['ProgressStatus']

# into training and testing sets 
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# into training and validation sets 
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.25, random_state=42)

# Print shapes to confirm
print(f"Training set shape: {X_train.shape}")
print(f"Validation set shape: {X_val.shape}")
print(f"Testing set shape: {X_test.shape}")


Training set shape: (2211, 72)
Validation set shape: (738, 72)
Testing set shape: (738, 72)


In [86]:
data5_.to_csv('cleaned_data.csv', index=False)