## AZUBI CAPSTONE PROJECT

#### IMPORT LIBRARY PACKAGES

In [1]:
#Data Handling
import numpy as np
import pandas as pd

#Visualizations
import seaborn as sns
import plotly.express as px
import matplotlib.pyplot as plt

from termcolor import colored

# Feature Processing (Scikit-learn processing, etc. )
from sklearn.preprocessing import LabelEncoder

# Evaluation Metrics
from sklearn.model_selection import train_test_split
from catboost import CatBoostClassifier
from sklearn.metrics import f1_score, recall_score, precision_score, fbeta_score, accuracy_score, roc_auc_score
from sklearn.metrics import confusion_matrix
from sklearn.feature_selection import RFE

#Other packages
import warnings
warnings.filterwarnings('always') 

#### Load the Data.

For this project, there are two datasets, namely:
* <b> Train data: </b> This is the data we will use to train the model. Since we are solving a classification problem, the train dataset has a column holding the target labels. 
* <b> Test data: </b> This is the data on which predictions will be made, using the model trained on the train dataset. 

In [2]:
# Folder that holds the CSV files. Change this single constant when the
# notebook is run from another machine or folder.
DATA_DIR = "C:/Users/KWABENABOATENG/Desktop/AZUBI AFRICA/AZUBI CAPSTONE/AZUBI-CAPSTONE-PROJECT/DATASETS"

# Train.csv carries the target column; Test.csv is the data to predict on.
train_data = pd.read_csv(f"{DATA_DIR}/Train.csv")
test_data = pd.read_csv(f"{DATA_DIR}/Test.csv")

## EXPLORATORY DATA ANALYSIS

### EXPLORE THE TRAIN DATASET

In [3]:
#Overview of the train dataset.

train_data.head()

Unnamed: 0,ID,age,gender,education,class,education_institute,marital_status,race,is_hispanic,employment_commitment,...,country_of_birth_mother,migration_code_change_in_msa,migration_prev_sunbelt,migration_code_move_within_reg,migration_code_change_in_reg,residence_1_year_ago,old_residence_reg,old_residence_state,importance_of_record,income_above_limit
0,ID_TZ0000,79,Female,High school graduate,,,Widowed,White,All other,Not in labor force,...,US,?,?,?,?,,,,1779.74,Below limit
1,ID_TZ0001,65,Female,High school graduate,,,Widowed,White,All other,Children or Armed Forces,...,US,unchanged,,unchanged,unchanged,Same,,,2366.75,Below limit
2,ID_TZ0002,21,Male,12th grade no diploma,Federal government,,Never married,Black,All other,Children or Armed Forces,...,US,unchanged,,unchanged,unchanged,Same,,,1693.42,Below limit
3,ID_TZ0003,2,Female,Children,,,Never married,Asian or Pacific Islander,All other,Children or Armed Forces,...,India,unchanged,,unchanged,unchanged,Same,,,1380.27,Below limit
4,ID_TZ0004,70,Male,High school graduate,,,Married-civilian spouse present,White,All other,Not in labor force,...,US,?,?,?,?,,,,1580.79,Below limit


In [14]:
#Check the shape of the train dataset.

print(f"The shape of the train dataset is: {train_data.shape}")

The shape of the train dataset is: (209499, 14)


In [15]:
#Summarise the train dataset: column names, non-null counts and dtypes.

train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 209499 entries, 0 to 209498
Data columns (total 14 columns):
 #   Column                 Non-Null Count   Dtype 
---  ------                 --------------   ----- 
 0   age                    209499 non-null  int64 
 1   gender                 209499 non-null  object
 2   education              209499 non-null  object
 3   class                  104254 non-null  object
 4   marital_status         209499 non-null  object
 5   race                   209499 non-null  object
 6   hourly_wage            209499 non-null  int64 
 7   working_week_per_year  209499 non-null  int64 
 8   occupation             103805 non-null  object
 9   gains                  209499 non-null  int64 
 10  losses                 209499 non-null  int64 
 11  citizenship            209499 non-null  object
 12  country_of_birth       209499 non-null  object
 13  income_above_limit     209499 non-null  object
dtypes: int64(5), object(9)
memory usage: 22.4+ MB


In [17]:
train_data.isnull().sum()

age                           0
gender                        0
education                     0
class                    105245
marital_status                0
race                          0
hourly_wage                   0
working_week_per_year         0
occupation               105694
gains                         0
losses                        0
citizenship                   0
country_of_birth              0
income_above_limit            0
dtype: int64

#### INSIGHTS

The records above describe the train dataset after some columns were dropped to simplify the work. <br>
Two of the remaining columns have missing values:<br>
* The `class` column has a total of <b> 105245 </b> missing values.
* The `occupation` column (originally `occupation_code_main`) has a total of <b> 105694 </b> missing values.

In [16]:
train_data.describe()

Unnamed: 0,age,hourly_wage,working_week_per_year,gains,losses
count,209499.0,209499.0,209499.0,209499.0,209499.0
mean,34.518728,55.433487,23.15885,435.926887,36.881737
std,22.306738,276.757327,24.397963,4696.3595,270.383302
min,0.0,0.0,0.0,0.0,0.0
25%,15.0,0.0,0.0,0.0,0.0
50%,33.0,0.0,8.0,0.0,0.0
75%,50.0,0.0,52.0,0.0,0.0
max,90.0,9999.0,52.0,99999.0,4608.0


### EXPLORE THE TEST DATASET.

In [18]:
#Overview of the test dataset.

test_data.head()

Unnamed: 0,age,gender,education,class,marital_status,race,hourly_wage,working_week_per_year,occupation,gains,losses,citizenship,country_of_birth
0,54,Male,High school graduate,Private,Married-civilian spouse present,White,600,46,Transportation and material moving,0,0,Native,US
1,53,Male,5th or 6th grade,Private,Married-civilian spouse present,White,0,52,Machine operators assmblrs & inspctrs,0,0,Foreign born- Not a citizen of U S,El-Salvador
2,42,Male,Bachelors degree(BA AB BS),Private,Married-civilian spouse present,White,0,44,Professional specialty,15024,0,Native,US
3,16,Female,9th grade,,Never married,White,0,8,,0,0,Native,US
4,16,Male,9th grade,,Never married,White,0,0,,0,0,Native,US


In [19]:
#Check the shape of the test dataset.

print(f"The shape of the test dataset is: {test_data.shape}")

The shape of the train dataset is: (89786, 13)


In [20]:
#Summarise the test dataset: column names, non-null counts and dtypes.

test_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 89786 entries, 0 to 89785
Data columns (total 13 columns):
 #   Column                 Non-Null Count  Dtype 
---  ------                 --------------  ----- 
 0   age                    89786 non-null  int64 
 1   gender                 89786 non-null  object
 2   education              89786 non-null  object
 3   class                  44707 non-null  object
 4   marital_status         89786 non-null  object
 5   race                   89786 non-null  object
 6   hourly_wage            89786 non-null  int64 
 7   working_week_per_year  89786 non-null  int64 
 8   occupation             44513 non-null  object
 9   gains                  89786 non-null  int64 
 10  losses                 89786 non-null  int64 
 11  citizenship            89786 non-null  object
 12  country_of_birth       89786 non-null  object
dtypes: int64(5), object(8)
memory usage: 8.9+ MB


In [21]:
test_data.isnull().sum()

age                          0
gender                       0
education                    0
class                    45079
marital_status               0
race                         0
hourly_wage                  0
working_week_per_year        0
occupation               45273
gains                        0
losses                       0
citizenship                  0
country_of_birth             0
dtype: int64

#### INSIGHTS FOR THE TEST DATASET

The records above describe the test dataset after some columns were dropped to simplify the work. <br>
Two of the remaining columns have missing values:<br>
* The `class` column has a total of <b> 45079 </b> missing values.
* The `occupation` column (originally `occupation_code_main`) has a total of <b> 45273 </b> missing values.

In [22]:
test_data.describe()

Unnamed: 0,age,hourly_wage,working_week_per_year,gains,losses
count,89786.0,89786.0,89786.0,89786.0,89786.0
mean,34.586294,54.338627,23.223932,421.977925,36.772648
std,22.345868,265.197545,24.418207,4610.515599,268.401257
min,0.0,0.0,0.0,0.0,0.0
25%,15.0,0.0,0.0,0.0,0.0
50%,33.0,0.0,8.0,0.0,0.0
75%,50.0,0.0,52.0,0.0,0.0
max,90.0,9400.0,52.0,99999.0,4608.0


## FEATURE ENGINEERING

In [23]:
#Create a copy of the original train dataset

data = train_data.copy() 

In [24]:
#Create a copy of the test dataset under its own name, so that it does not
#overwrite `data`, which holds the copy of the train dataset.

test_data_copy = test_data.copy()

In [5]:
#Overview of all the columns in the train dataset.

train_data.columns

Index(['ID', 'age', 'gender', 'education', 'class', 'education_institute',
       'marital_status', 'race', 'is_hispanic', 'employment_commitment',
       'unemployment_reason', 'employment_stat', 'wage_per_hour',
       'is_labor_union', 'working_week_per_year', 'industry_code',
       'industry_code_main', 'occupation_code', 'occupation_code_main',
       'total_employed', 'household_stat', 'household_summary',
       'under_18_family', 'veterans_admin_questionnaire', 'vet_benefit',
       'tax_status', 'gains', 'losses', 'stocks_status', 'citizenship',
       'mig_year', 'country_of_birth_own', 'country_of_birth_father',
       'country_of_birth_mother', 'migration_code_change_in_msa',
       'migration_prev_sunbelt', 'migration_code_move_within_reg',
       'migration_code_change_in_reg', 'residence_1_year_ago',
       'old_residence_reg', 'old_residence_state', 'importance_of_record',
       'income_above_limit'],
      dtype='object')

In [6]:
#Overview of all the columns in the test dataset.

test_data.columns

Index(['ID', 'age', 'gender', 'education', 'class', 'education_institute',
       'marital_status', 'race', 'is_hispanic', 'employment_commitment',
       'unemployment_reason', 'employment_stat', 'wage_per_hour',
       'is_labor_union', 'working_week_per_year', 'industry_code',
       'industry_code_main', 'occupation_code', 'occupation_code_main',
       'total_employed', 'household_stat', 'household_summary',
       'under_18_family', 'veterans_admin_questionnaire', 'vet_benefit',
       'tax_status', 'gains', 'losses', 'stocks_status', 'citizenship',
       'mig_year', 'country_of_birth_own', 'country_of_birth_father',
       'country_of_birth_mother', 'migration_code_change_in_msa',
       'migration_prev_sunbelt', 'migration_code_move_within_reg',
       'migration_code_change_in_reg', 'residence_1_year_ago',
       'old_residence_reg', 'old_residence_state', 'importance_of_record'],
      dtype='object')

In [None]:
#Create a copy of the test dataset under its own name, so that it does not
#overwrite `data`, which holds the copy of the train dataset.

test_data_copy = test_data.copy()

### DROP SOME COLUMNS

#### For the purposes of this project, we will drop some columns that are not relevant for the machine learning model.

In [8]:
#Columns that are not used for modelling; they are removed from the train dataset in place.

train_columns_to_drop = [
    'ID',
    'education_institute',
    'is_hispanic',
    'employment_commitment',
    'unemployment_reason',
    'employment_stat',
    'is_labor_union',
    'industry_code',
    'industry_code_main',
    'occupation_code',
    'total_employed',
    'household_stat',
    'household_summary',
    'under_18_family',
    'veterans_admin_questionnaire',
    'vet_benefit',
    'tax_status',
    'stocks_status',
    'mig_year',
    'country_of_birth_father',
    'country_of_birth_mother',
    'migration_code_change_in_msa',
    'migration_prev_sunbelt',
    'migration_code_move_within_reg',
    'migration_code_change_in_reg',
    'residence_1_year_ago',
    'old_residence_reg',
    'old_residence_state',
    'importance_of_record',
]

train_data.drop(columns=train_columns_to_drop, inplace=True)

In [9]:
#Columns that are not used for modelling; they are removed from the test dataset in place.

test_columns_to_drop = [
    'ID',
    'education_institute',
    'is_hispanic',
    'employment_commitment',
    'unemployment_reason',
    'employment_stat',
    'is_labor_union',
    'industry_code',
    'industry_code_main',
    'occupation_code',
    'total_employed',
    'household_stat',
    'household_summary',
    'under_18_family',
    'veterans_admin_questionnaire',
    'vet_benefit',
    'tax_status',
    'stocks_status',
    'mig_year',
    'country_of_birth_father',
    'country_of_birth_mother',
    'migration_code_change_in_msa',
    'migration_prev_sunbelt',
    'migration_code_move_within_reg',
    'migration_code_change_in_reg',
    'residence_1_year_ago',
    'old_residence_reg',
    'old_residence_state',
    'importance_of_record',
]

test_data.drop(columns=test_columns_to_drop, inplace=True)

#### RENAME SOME COLUMNS

In [10]:
# Mapping of old column name -> new column name, applied in place to the train dataset.

new_columns = {
    'wage_per_hour': 'hourly_wage',
    'occupation_code_main': 'occupation',
    'country_of_birth_own': 'country_of_birth',
}

train_data.rename(columns=new_columns, inplace=True)

In [11]:
# Mapping of old column name -> new column name, applied in place to the test dataset.

new_columns = {
    'wage_per_hour': 'hourly_wage',
    'occupation_code_main': 'occupation',
    'country_of_birth_own': 'country_of_birth',
}

test_data.rename(columns=new_columns, inplace=True)

In [12]:
#View the train dataset after some columns have been dropped and renamed.

print(train_data.columns)

Index(['age', 'gender', 'education', 'class', 'marital_status', 'race',
       'hourly_wage', 'working_week_per_year', 'occupation', 'gains', 'losses',
       'citizenship', 'country_of_birth', 'income_above_limit'],
      dtype='object')


In [13]:
#View the test dataset after some columns have been dropped and renamed.

print(test_data.columns)

Index(['age', 'gender', 'education', 'class', 'marital_status', 'race',
       'hourly_wage', 'working_week_per_year', 'occupation', 'gains', 'losses',
       'citizenship', 'country_of_birth'],
      dtype='object')


In [None]:
#Check the dataframe after some columns have been dropped and renamed.

train_data.head()

In [None]:
#Encode the target column income_above_limit as integers.
#LabelEncoder numbers the classes in sorted order, so inspect le.classes_ to see
#which integer stands for which income label.

le = LabelEncoder()
label = le.fit_transform(train_data['income_above_limit'])

#Swap the text column for its encoded version; the encoded column is appended as the last column.
del train_data["income_above_limit"]
train_data["income_above_limit"] = label

In [None]:
train_data.head()

In [None]:
train_data.hist(figsize=(12,9), layout=(3,3))

In [None]:

train_data.plot(kind = 'box', figsize=(10,5), layout=(3,2), );

In [None]:
#Correlation heatmap of the numeric columns only: the text columns cannot be
#correlated, and recent pandas versions raise an error instead of skipping them.
sns.heatmap(train_data.select_dtypes(include="number").corr(), annot=True);

#### CHECK FOR MISSING VALUES

In [None]:
#Check the remaining columns if there are any missing values.

train_data.isnull().sum()

In [None]:
train_data['gender'].unique()

In [None]:
train_data['gender'].value_counts()

In [None]:
train_data['education'].unique()

In [None]:
train_data['education'].value_counts()

In [None]:
#Renaming value in the Education column

education_renames = {
    'High school graduate': 'High school',
    'Doctorate degree(PhD EdD)': 'Doctorate',
    'Bachelors degree(BA AB BS)': 'Undergraduate',
}

#Literal (regex=False) substring replacement through the .str accessor. Unlike
#str(x) inside apply, it is vectorized and keeps missing values as NaN instead
#of turning them into the text 'nan'.
for old_label, new_label in education_renames.items():
    train_data['education'] = train_data['education'].str.replace(old_label, new_label, regex=False)

In [None]:
train_data['education'].unique()

In [None]:
train_data['class'].unique()

In [None]:
train_data['class'].value_counts()

In [None]:
train_data['marital_status'].unique()

In [None]:
train_data['marital_status'].value_counts()

In [None]:
train_data['race'].unique()

In [None]:
train_data['race'].value_counts()

In [None]:
train_data['occupation'].unique()

In [None]:
train_data['occupation'].value_counts()

In [None]:
#Renaming value in the occupation column

#NOTE(review): every search text below starts with a space. Confirm that the raw
#values do too, otherwise none of these replacements will match.
occupation_renames = {
    ' Adm support including clerical': 'Administration',
    ' Executive admin and managerial': 'Executive and Managerial',
    ' Transportation and material moving': 'Transport',
    ' Technicians and related support': 'Technical',
    ' Farming forestry and fishing': 'Agriculture',
}

#Literal (regex=False) substring replacement through the .str accessor. The
#occupation column has missing values; str(x) inside apply would turn them into
#the text 'nan' and hide them from isnull(), whereas .str.replace keeps them as NaN.
for old_label, new_label in occupation_renames.items():
    train_data['occupation'] = train_data['occupation'].str.replace(old_label, new_label, regex=False)

In [None]:
train_data['occupation'].unique()

In [None]:
def check_unique_values_and_value_counts(train_data, columns_to_check):
    """
    Print the unique values and the value counts of selected columns.

    For every requested column a small report is printed: the column name,
    its unique values, its value counts and a 40-dash separator line.

    Parameters:
    - train_data (pd.DataFrame): The dataset to analyze.
    - columns_to_check (list): Names of the columns to report on.

    Returns:
    - None
    """
    separator = "-" * 40

    for name in columns_to_check:
        series = train_data[name]
        distinct = series.unique()
        counts = series.value_counts()

        print(f"Column: {name}")
        print(f"Unique values: {distinct}")
        print("Value counts:")
        print(counts)
        print(separator)

In [None]:
# Keep only the numerical columns; this frame is what gets plotted below
numerical_columns = train_data.select_dtypes(include=['int', 'float'])

plt.figure(figsize=(10, 6))  # Adjust the figure size as needed

# One box per numerical column (wide-form input to Seaborn)
sns.boxplot(data=numerical_columns)

plt.title("Boxplot of Multiple Columns")
plt.xlabel("Columns")
plt.ylabel("Values")

plt.tight_layout()
plt.show()


#### Categorical Values

In [None]:
def plot_numerical_countplots(train_data):
    """
    Draw one count plot per numerical (int64 / float64) column of a DataFrame.

    Parameters:
    - train_data (pd.DataFrame): The dataset whose numerical columns are plotted.

    Returns:
    - None
    """
    # Work on the DataFrame that was passed in, not on the notebook-level `data` variable.
    numerical_columns = train_data.select_dtypes(include=['int64', 'float64']).columns

    for column in numerical_columns:
        plt.figure(figsize=(8, 6))
        # Pass the column by keyword: newer Seaborn releases treat the first
        # positional argument of countplot as `data`, not as `x`.
        sns.countplot(x=train_data[column])
        plt.title(f'Countplot of {column}', fontsize=16)
        plt.xlabel(column, fontsize=14)
        plt.ylabel('Count', fontsize=14)
        plt.xticks(rotation=45)
        plt.show()

# Draw the count plots for the numerical columns of the train dataset
plot_numerical_countplots(train_data)


In [None]:
def plot_countplots_for_categorical(train_data):
    """
    Draw one count plot per categorical (object dtype) column of a DataFrame.

    Parameters:
    - train_data (pd.DataFrame): The dataset whose categorical columns are plotted.

    Returns:
    - None
    """
    #Get a list of the column names that hold categorical data
    categorical_columns = train_data.select_dtypes(include=['object']).columns.tolist()

    #Loop through the categorical columns and create one count plot for each
    for column in categorical_columns:
        #Set the figure size
        plt.figure(figsize=(8,6))
        sns.countplot(x=column, data=train_data)

        #Set the axis labels and the plot title
        plt.xlabel(column)
        plt.ylabel('Count')
        plt.title(f'Count plot of {column}')

        #Rotate x-axis labels for readability
        plt.xticks(rotation=45)

        plt.show()

In [None]:
#Call plot_countplots_for_categorical on the train dataset.
#NOTE(review): plot_countplots_for_categorical has to be defined before this cell is run.

plot_countplots_for_categorical(train_data)

In [None]:
#Explore categorical variables with count plots

sns.countplot(x='gender', data = train_data)

In [None]:
#Univariate Analysis: distribution of the age column

train_data['age'].hist()
plt.xlabel('Age')
plt.ylabel('Count')
plt.title('Distribution of age')
plt.show()

In [None]:
#Bivariate Analysis: age against the number of weeks worked per year

sns.scatterplot(x='age', y='working_week_per_year', data = train_data)

In [None]:
#categorical variables: number of records per citizenship category

sns.countplot(x='citizenship', data=train_data)

In [None]:
#Number of records per citizenship category
train_data['citizenship'].value_counts()

In [None]:
#train_data['education_institute'].fillna(0, inplace = True)
#print()