In [None]:
# import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import missingno as msno
import warnings
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import make_pipeline,Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import LabelEncoder, StandardScaler, OneHotEncoder
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
import warnings


In [None]:
# Silence every warning category for the rest of the session.
# NOTE(review): a blanket 'ignore' also hides deprecation and convergence
# warnings raised by later cells — consider narrowing the filter.
warnings.filterwarnings('ignore')

In [None]:
# read data
# The path is relative, so the CSV is looked up in the current working directory.
data = pd.read_csv(r'children anemia.csv')

## WRANGLING

In [None]:
# Preview the first rows of the raw frame.
data.head()

In [None]:
# Column dtypes and non-null counts of the raw frame.
data.info()

In [None]:
# Fill gaps in 'Anemia level' with the value recorded in 'Anemia level.1'.
# The result is assigned back to the column: calling fillna(inplace=True) on a
# selected column works on an intermediate object and is not guaranteed to
# update `data` (it has no effect under pandas Copy-on-Write).
data['Anemia level'] = data['Anemia level'].fillna(data['Anemia level.1'])

In [None]:
# Missing-value matrix plot of the raw frame (missingno).
msno.matrix(data)

In [None]:
# Share of missing values per column, largest first.
# The heading announces a percentage, so the per-column fraction is scaled to
# 0-100 before rounding to two decimals.
print('Percentage of missing data')
(data.isna().mean() * 100).round(2).sort_values(ascending=False)

In [None]:
def sep_num(num):
    """
    Convert a time entry to whole minutes.

    Parameters
    ----------
    num : str, number, or missing value
        * text starting with "immed" (any case)      -> 1
        * a number, or text that parses as a number  -> int(value) // 60
        * "Hours: N"                                 -> N * 60
        * "Day...: N"                                -> N * 24 * 60
        * NaN / None / pd.NA                         -> returned unchanged

    Returns
    -------
    int
        The converted value, or
    the original missing marker
        when `num` is missing, or
    np.nan
        when the entry cannot be interpreted (instead of an implicit None
        or an uncaught exception).
    """
    if not isinstance(num, str):
        # Missing markers pass through untouched so the column keeps its gaps.
        # pd.isna is used because an identity test against np.nan misses NaN
        # objects that are not the np.nan singleton.
        if pd.isna(num):
            return num
        try:
            return int(float(num)) // 60
        except (TypeError, ValueError, OverflowError):
            return np.nan

    text = num.strip()
    if text.lower().startswith('immed'):
        return 1

    try:
        return int(float(text)) // 60
    except (ValueError, OverflowError):
        pass  # not plain numeric text; try the "<unit>: <amount>" form below

    parts = text.split(':')
    if len(parts) > 1:
        unit = parts[0].strip().lower()
        try:
            amount = int(parts[1])
        except ValueError:
            return np.nan
        if unit.startswith('hours'):
            return amount * 60
        if unit.startswith('day'):
            return amount * 24 * 60
    return np.nan

In [None]:
# Convert each entry of the timing column with sep_num.
data['When child put to breast'] = data['When child put to breast'].map(sep_num)

In [None]:
# Number of columns per dtype.
data.dtypes.value_counts()

In [None]:
# Encode the Yes/No answer columns as booleans.
# Each result is assigned back to its column: Series.replace(..., inplace=True)
# on a selected column is not guaranteed to write through to `data`.
yes_no_map = {'No': False, 'Yes': True}
yes_no_columns = [
    'Smokes cigarettes',
    'Currently residing with husband/partner',
    'Have mosquito bed net for sleeping (from household questionnaire)',
]
for yes_no_col in yes_no_columns:
    data[yes_no_col] = data[yes_no_col].replace(yes_no_map)

In [None]:
# Preview the frame after the conversions above.
data.head()

In [None]:
# Width of each age band in the first column: the value is split on '-' and the
# first part is subtracted from the second.
# NOTE(review): `age_group` is not referenced by any later cell visible here, and a
# missing value in that column would raise IndexError ('nan' has no '-' to split on)
# — confirm whether this cell is still needed.
age_group = data.iloc[:,0].apply(lambda x: int(str(x).split('-')[1]) - int(str(x).split('-')[0]))

In [None]:
# display the number of unique elements in each non-numeric column of the dataframe
# (exclude='number' keeps only the non-numeric columns), smallest count first
data.select_dtypes(exclude='number').nunique().sort_values()


In [None]:
# Shorten the verbose column names in place (old name -> new name).
data.rename(
    columns={
        'Type of place of residence': 'residence type',
        'Highest educational level': 'education level',
        'Smokes cigarettes': 'smokes',
        'Current marital status': 'marital status',
        'Taking iron pills, sprinkles or syrup': 'takes supplements',
        'Have mosquito bed net for sleeping (from household questionnaire)': 'sleeps under mosquito net',
        'Anemia level': 'Anemia',
    },
    inplace=True,
)

In [None]:
# drop duplicate rows
# data_non is the working frame used by all following cells; `data` is left as is.
data_non = data.drop_duplicates()

In [None]:
# Missing-value count per column, largest first.
data_non.isna().sum().sort_values(ascending=False)

In [None]:
# fill all null values in the numerical data type category
# Each numeric column is filled with its own mean, rounded to one decimal place.
data_non[data_non.select_dtypes('number').columns] = data_non.select_dtypes('number').apply(lambda x: x.fillna(round(np.mean(x),1)), axis=0)

In [None]:
# Remaining missing values per column after the numeric fill.
data_non.isnull().sum()

In [None]:
# Drop every row that still contains at least one missing value.
data_non = data_non.dropna(axis=0)

In [None]:
# reset dataframe index
# drop=True discards the old index instead of keeping it as a column.
data_non.reset_index(drop = True, inplace= True)

In [None]:
# forward and backward fill of other null values (at most 3 consecutive gaps each way)
# NOTE(review): rows containing nulls were already removed by the dropna cell above,
# so when the cells run in order this is expected to be a no-op — confirm intent.
data_non = data_non.ffill(limit=3).bfill(limit=3)

In [None]:
# check if any value is missing (total count across all columns; 0 means none)
data_non.isnull().sum().sum()

In [None]:
# count the distinct values of every column in the dataframe
print('Number of unique values in each column:\n')
for col in data_non.columns:
    # :<10 left-aligns the count in a 10-character field
    print(f'{col} = {data_non[col].nunique():<10}')

In [None]:
# drop column 'When child put to breast' because it is not labelled properly,
# and 'Anemia level.1', whose values were already used to fill 'Anemia level' earlier
data_non.drop(['When child put to breast','Anemia level.1'],axis= 1,inplace=True)

In [None]:
# rename and get the average of the age groups
# Each 'lo-hi' label is replaced by (lo + hi) / 2, truncated to an integer by astype(int).
# NOTE(review): not idempotent — after the rename the source column no longer exists,
# so re-running this cell raises KeyError.
data_non['Age in 5-year groups'] = data_non["Age in 5-year groups"].apply(lambda x:sum(map(int,x.split('-')))/2).astype(int)
data_non.rename(columns= {'Age in 5-year groups':'Age average'}, inplace = True)

In [None]:
# Binary target: False for 'Not anemic', True for every other anemia level.
# A vectorised comparison replaces the row-wise lambda; the resulting values are the same.
data_non['Anemia'] = data_non['Anemia'] != 'Not anemic'

In [None]:
# Contingency table: rows are ages at first birth, columns are the Anemia values.
age_anemia_counts = pd.crosstab(index=data_non['Age of respondent at 1st birth'], columns=data_non['Anemia'])
age_anemia_counts

In [None]:
# Grouped bar chart of anemia status for each age at first birth.
# The figure and axes are created with plt.subplots and handed to pandas via ax=;
# a bare plt.Figure(...) object is not the figure DataFrame.plot draws on, so its
# size would have no effect on the chart.
fig, ax = plt.subplots(figsize=(12, 6))
age_anemia_counts.plot(kind='bar', stacked=False, ax=ax)
ax.set_xlabel('Age at 1st birth')
ax.set_ylabel('Count')
ax.set_title('Anemia by Age at 1st birth')
ax.tick_params(axis='x', rotation=45)
plt.show()

In [None]:
# Preview the last rows of the cleaned frame.
data_non.tail()

In [None]:
# Preview the residence type encoded as Urban=1 / Rural=0 (display only; the
# result is not stored, so data_non is unchanged).
# The column is addressed by its renamed label 'residence type'; the long
# original label no longer exists and would raise KeyError.
data_non['residence type'].replace({'Urban':1,'Rural':0})

In [None]:
# Gap between the respondent's average band age and her age at first birth.
age_gap = data_non['Age average'] - data_non['Age of respondent at 1st birth']
# A zero gap is replaced by 1, then the sign is discarded.
diff_age = np.abs(np.where(age_gap == 0, 1, age_gap))
# Gap divided by the births in the last five years, rounded up to a whole number.
data_non['avg_child_year'] = np.ceil(diff_age / data_non['Births in last five years']).astype(int)

sns.scatterplot(
    data_non,
    x='Births in last five years',
    y='avg_child_year',
    hue='Anemia',
)

In [None]:
# `wealth_avg_birth` is not created by any earlier cell, so it is built here to keep
# the notebook runnable from a fresh kernel.
# NOTE(review): the column choice (wealth index + avg_child_year) is inferred from the
# variable name — confirm it matches the original intent.
wealth_avg_birth = data_non[['Wealth index combined', 'avg_child_year']]
# Mean avg_child_year per wealth index.
wealth_avg_birth.groupby('Wealth index combined').mean()

In [None]:
# Count of rows per (wealth index, Anemia) pair, drawn as grouped bars.
wealth_n_condition = data_non[['Wealth index combined','Anemia']]
ax = wealth_n_condition.groupby(['Wealth index combined','Anemia']).size().unstack().plot(kind='bar',)
# Anchor the legend outside the right edge of the axes (x = 1.4 in axes coordinates).
ax.legend(loc=4, bbox_to_anchor=(1.4,.0))

In [None]:
# avg_child_year against wealth index, coloured by the Anemia flag.
sns.scatterplot(data_non,x='Wealth index combined',y='avg_child_year', hue='Anemia')

In [None]:
# Group avg_child_year by wealth index for the per-group box plots.
# The groups are built straight from data_non so this cell does not depend on a
# variable that no earlier cell defines.
# NOTE(review): the column choice is inferred from the original variable name
# (`wealth_avg_birth`) — confirm it matches the intended frame.
wealth_groups = data_non[['Wealth index combined', 'avg_child_year']].groupby('Wealth index combined')

In [None]:
# One box plot per wealth group on a 3x2 grid.
# NOTE(review): the grid holds six panels and the last one is switched off, so this
# assumes at most five groups — more than six groups would raise IndexError.
fig, axes = plt.subplots(3,2,figsize=(10,12))
fig.set_alpha(0.5)
axes = axes.flatten()
for index,key in enumerate(list(wealth_groups.groups.keys())):
    wealth_groups.get_group(key).plot(kind='box',ax=axes[index],label = key)
    axes[index].set_title(key)
# Hide the last panel.
axes[-1].axis('off')
plt.tight_layout()

In [None]:
# reset dataframe index
# NOTE(review): the index was already reset after the dropna step and no visible cell
# removes rows in between, so this looks redundant — confirm.
data_non.reset_index(drop = True, inplace= True)

In [None]:
# selected histogram plots
# Four 5-bin histograms on a 2x2 grid; each call targets one flattened axes slot.
fig,axes = plt.subplots(2,2,figsize=(10,6))
axes = axes.flatten()

data_non['Hemoglobin level adjusted for altitude and smoking (g/dl - 1 decimal)'].plot.hist(bins = 5,ax = axes[0], title = f'Hemoglobin level adjusted for \n altitude and smoking')
data_non['Hemoglobin level adjusted for altitude (g/dl - 1 decimal)'].plot.hist(bins = 5, ax = axes[1], title = 'Hemoglobin level adjusted for altitude')
data_non['Age average'].plot.hist(bins=5, ax = axes[2], title = 'Age average')
data_non['Age of respondent at 1st birth'].plot.hist(bins = 5, ax = axes[3],title = "Age of respondent at first birth")
plt.tight_layout()

In [None]:
# check Anemia values proportion
# The target column is addressed by its renamed label 'Anemia'; 'Anemia level' no
# longer exists in data_non and would raise KeyError.
data_non['Anemia'].value_counts(normalize=True)
# Proportion of target is not balanced

In [None]:
# Count plot for every column, on a grid sized to the number of columns.
# Only the panels that receive no plot are hidden, so no column is drawn on a
# switched-off axes and extra columns are not silently skipped.
n_grid_cols = 4
n_grid_rows = max(1, int(np.ceil(len(data_non.columns) / n_grid_cols)))
# 3.75 inches per row keeps the proportions of a 20x15 figure with four rows.
fig, axes = plt.subplots(n_grid_rows, n_grid_cols, figsize=(20, 3.75 * n_grid_rows), squeeze=False)
axes = axes.flatten()
for col, ax in zip(data_non.columns, axes):
    sns.countplot(data=data_non, x=col, ax=ax)
for unused_ax in axes[len(data_non.columns):]:
    unused_ax.axis('off')
plt.tight_layout()

In [None]:
# Hemoglobin level (altitude and smoking adjusted) per average age, coloured by the Anemia flag.
sns.catplot(data=data_non,x='Age average',y='Hemoglobin level adjusted for altitude and smoking (g/dl - 1 decimal)',hue='Anemia')

In [None]:
# Hemoglobin level adjusted for altitude and smoking (g/dl - 1 decimal) is a strong factor,
# as it shows whether a person has anemia or not.

In [None]:
# The two hemoglobin measurements against each other, coloured by the Anemia flag.
sns.scatterplot(data=data_non,x='Hemoglobin level adjusted for altitude and smoking (g/dl - 1 decimal)',y='Hemoglobin level adjusted for altitude (g/dl - 1 decimal)',hue='Anemia')

In [None]:
# check Anemia values proportion (relative frequency of each target value)
data_non['Anemia'].value_counts(normalize=True)

In [None]:
# Column dtypes of the cleaned frame.
data_non.dtypes

In [None]:
# Preview the first rows of the cleaned frame.
data_non.head()

In [None]:
# Numeric features are listed explicitly; every other column except the target is
# treated as categorical.
# The target is excluded by its current label 'Anemia' (the old label 'Anemia level'
# no longer exists, so filtering on it would leave the target among the features).
num_cols = ['Births in last five years','Hemoglobin level adjusted for altitude (g/dl - 1 decimal)','Hemoglobin level adjusted for altitude and smoking (g/dl - 1 decimal)','Age of respondent at 1st birth']
cat_cols = [col for col in data_non.columns if col not in num_cols and col != 'Anemia']
cat_cols

## MODEL

In [None]:
# Numeric features are listed explicitly; every other column except the target
# ('Anemia') and 'Anemia level.1' is treated as categorical.
# NOTE(review): 'Age average' and 'avg_child_year' are integer columns but are not in
# num_cols, so they end up in cat_cols — confirm that is intended.
num_cols = ['Births in last five years','Hemoglobin level adjusted for altitude (g/dl - 1 decimal)','Hemoglobin level adjusted for altitude and smoking (g/dl - 1 decimal)','Age of respondent at 1st birth']
cat_cols = [col for col in data_non.columns if col not in num_cols and col not in ['Anemia','Anemia level.1']]
cat_cols

In [None]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import make_pipeline,Pipeline
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import LabelEncoder, StandardScaler, OneHotEncoder
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression

In [None]:
# Label encoder for the target.
le = LabelEncoder()
# NOTE(review): `ss` is not referenced by any later cell visible here; the pipeline
# builds its own StandardScaler.
ss = StandardScaler()

In [None]:
# Missing-value count per column before modelling.
data_non.isna().sum()

In [None]:
# variables for data modelling
# errors='ignore' lets the drop succeed whether or not 'Anemia level.1' is still
# present: an earlier cell already removes that column, and dropping a missing
# label would otherwise raise KeyError.
X = data_non.drop(columns=['Anemia', 'Anemia level.1'], errors='ignore')
# Encode the target as integer class labels; le.classes_ keeps the original values.
y = le.fit_transform(data_non['Anemia'])


In [None]:
# 80/20 split, stratified on the target so both parts keep the class proportions;
# random_state fixes the shuffle.
X_train,X_test, y_train, y_test = train_test_split(X,y, test_size=0.2, shuffle=True, random_state = 12, stratify=y)

In [None]:
# One-hot encode the categorical columns; handle_unknown='ignore' encodes categories
# unseen during fit as all zeros instead of raising.
cat_transformer = make_pipeline(OneHotEncoder(handle_unknown='ignore'))
# Standardise the numeric columns.
num_transformer = make_pipeline(StandardScaler())
# Route each column list to its transformer; columns in neither list are dropped
# (ColumnTransformer's default remainder).
preprocessor = ColumnTransformer([('categories',cat_transformer, cat_cols),('numerical',num_transformer,num_cols)])

In [None]:
# Candidate classifiers, keyed by a short name.
# - the random forest is seeded so its score is reproducible on re-run
# - logistic regression gets a higher iteration cap; warnings are silenced in this
#   notebook, so a convergence failure at the default cap would go unnoticed
models = {
    'rf': RandomForestClassifier(random_state=12),
    'log_regression': LogisticRegression(max_iter=1000),
    'knn': KNeighborsClassifier(),
}

In [None]:
# Fit one preprocessing + estimator pipeline per candidate and keep the fitted
# pipelines by name.
# NOTE(review): every pipeline is handed the same `preprocessor` object rather than
# a copy — presumably harmless because all fits use X_train, but confirm.
model_res = {}
for name,model in models.items():
    pipe = Pipeline([('preprocessor', preprocessor),(name,model)]) 
    model_res[name] = pipe.fit(X_train,y_train)
    # Prints the estimator's repr, not its short name.
    print(f"{model} done !")


In [None]:
# Test-set score of each fitted pipeline (name padded to 20 characters).
for name,model in model_res.items():
    print(f'{name:20s}: {model.score(X_test,y_test)}')

In [None]:
# Test-set predictions of the baseline random-forest pipeline.
y_pred = model_res['rf'].predict(X_test)

In [None]:
# Confusion matrix of the baseline random forest.
# NOTE(review): `cm_rf` is not used by any later cell visible here.
cm_rf = confusion_matrix(y_test,y_pred)

In [None]:
from sklearn.tree import DecisionTreeClassifier

In [None]:
# Hyper-parameter grid for the random forest; the 'estimator__' prefix routes each
# key to the pipeline step named 'estimator'.
# The grid has 3*3*3*3*2 = 162 combinations, each fitted on 5 folds (810 fits), so
# this cell is expensive to re-run.
param_grid = {
    'estimator__n_estimators': [100, 200, 300],  # Number of trees in the forest
    'estimator__max_depth': [10, 20, 30],  # Maximum depth of the tree
    'estimator__min_samples_split': [2, 5, 10],  # Minimum number of samples required to split an internal node
    'estimator__min_samples_leaf': [1, 2, 4],  # Minimum number of samples required to be at a leaf node
    'estimator__bootstrap': [True, False]  # Method for sampling data points
}
grid_pipe = Pipeline([('preprocessor',preprocessor),('estimator',RandomForestClassifier(random_state=11))])
# error_score='raise' makes a failing fit raise instead of being scored as NaN.
rf_grid = GridSearchCV(grid_pipe, param_grid=param_grid,n_jobs=-1,cv=5, error_score='raise')
rf_grid.fit(X_train,y_train)

In [None]:
# Test-set score of the best pipeline found by the grid search.
rf_grid.score(X_test,y_test)

In [None]:
# Parameters of the tuned forest: index 1 selects the second pipeline step (the
# 'estimator' RandomForestClassifier) and get_params() returns its parameters.
best_rf_grid = rf_grid.best_estimator_[1].get_params()

In [None]:
# Rebuild a preprocessing + random-forest pipeline from the tuned parameters and fit
# it on the training split.
# NOTE(review): rf_grid.best_estimator_ is presumably already refit on X_train
# (GridSearchCV's default refit) — confirm whether this second fit is needed.
rf_pipe = make_pipeline(preprocessor,RandomForestClassifier(**best_rf_grid))
rf_pipe.fit(X_train,y_train)

In [None]:
# Show the winning hyper-parameter combination.
# It is bound to its own name instead of overwriting `best_rf_grid`: that name holds
# the forest's constructor kwargs used to build rf_pipe, while best_params_ keys carry
# the 'estimator__' prefix and cannot be passed to RandomForestClassifier(**...), so
# reusing the name would break a re-run of the rf_pipe cell.
best_rf_params = rf_grid.best_params_
best_rf_params

In [None]:
# Test-set predictions of the tuned pipeline (overwrites the baseline y_pred).
y_pred = rf_pipe.predict(X_test)

In [None]:
# Confusion matrix of the tuned pipeline: rows are true labels, columns are predictions.
cm = confusion_matrix(y_test,y_pred)

In [None]:
# Annotated heatmap of the confusion matrix; tick labels are the original target
# values stored in le.classes_, and fmt='d' prints the counts as integers.
sns.heatmap(cm, annot = True,fmt = 'd',xticklabels= le.classes_, yticklabels= le.classes_, cmap=sns.color_palette("plasma"),linecolor='green',linewidths=.5)
plt.title('Confusion Matrix of Children Anemic Level Model')
plt.ylabel('True Label')
plt.xlabel('Prediction Label')
plt.show()

In [None]:
# Per-class precision, recall and F1 of the tuned pipeline on the test split.
print(classification_report(y_test,y_pred))

In [None]:
# Output feature names without the '<transformer>__' prefix added by the
# ColumnTransformer. The split is limited to the first '__' so a column name that
# itself contains '__' is kept whole instead of being cut short.
feature_names = [x.split('__', 1)[1] for x in preprocessor.get_feature_names_out()]

In [None]:
# Second step of rf_pipe, i.e. the fitted RandomForestClassifier.
rf_model_step = rf_pipe.steps[1][1]
# Pair each output feature with its importance, most important first.
feat_imp = pd.DataFrame({'feature':feature_names, 'importances':rf_model_step.feature_importances_}).sort_values(by='importances', ascending=False)
feat_imp

In [None]:
# Horizontal bar chart of the ten most important features.
sns.barplot(data = feat_imp.iloc[:10],y= 'feature', x='importances').set_title('Top10 Feature Importances')