### AZUBI CAPSTONE PROJECT

#### IMPORT LIBRARY PACKAGES

In [None]:
#Data Handling
import numpy as np
import pandas as pd

#Visualizations
import seaborn as sns
import plotly.express as px
import matplotlib.pyplot as plt

from termcolor import colored

# Feature Processing (Scikit-learn processing, etc. )

# Evaluation Metrics
from sklearn.model_selection import train_test_split
from catboost import CatBoostClassifier
from sklearn.metrics import f1_score, recall_score, precision_score, fbeta_score, accuracy_score, roc_auc_score
from sklearn.metrics import confusion_matrix
from sklearn.feature_selection import RFE

#Other packages
import warnings
warnings.filterwarnings('always') 

#### Load the Data.

For this project, there are two datasets, namely:
* <b> Train data: </b> This is the data we will use to train the model. Since we are solving a classification problem, the train dataset contains a column with the target labels. 
* <b> Test data: </b> This is the data on which predictions will be made, using the model trained on the train dataset. 

In [None]:
# Load the training CSV into a DataFrame.
# NOTE(review): this is an absolute, machine-specific path; the cell only works
# where the file exists at exactly this location. Consider a path relative to
# the notebook instead.
train_data = pd.read_csv("C:/Users/KWABENABOATENG/Desktop/AZUBI AFRICA/AZUBI CAPSTONE/AZUBI-CAPSTONE-PROJECT/DATASETS/Train.csv")

### EXPLORATORY DATA ANALYSIS

In [None]:
#Overview of the train dataset.

train_data.head()

In [None]:
#Overview of all the columns in the dataset.

train_data.columns

In [None]:
#Create a copy of the original data
# The copy is taken before any columns are dropped or renamed, so `data`
# keeps every original column while `train_data` is modified in place below.

data = train_data.copy() 

#### DROP UNNEEDED COLUMNS

In [None]:
# Remove, in place, the columns that are not used in the rest of the analysis.
train_data.drop(
    labels=[
        'ID',
        'education_institute',
        'is_hispanic',
        'employment_commitment',
        'unemployment_reason',
        'employment_stat',
        'is_labor_union',
        'industry_code',
        'industry_code_main',
        'occupation_code',
        'total_employed',
        'household_stat',
        'household_summary',
        'under_18_family',
        'veterans_admin_questionnaire',
        'vet_benefit',
        'tax_status',
        'stocks_status',
        'mig_year',
        'country_of_birth_father',
        'country_of_birth_mother',
        'migration_code_change_in_msa',
        'migration_prev_sunbelt',
        'migration_code_move_within_reg',
        'migration_code_change_in_reg',
        'residence_1_year_ago',
        'old_residence_reg',
        'old_residence_state',
        'importance_of_record',
    ],
    axis='columns',
    inplace=True,
)

#### RENAME SOME COLUMNS

In [None]:
# Map each original column name to its shorter replacement.
new_columns = dict(
    wage_per_hour='hourly_wage',
    occupation_code_main='occupation',
    country_of_birth_own='country_of_birth',
)

# Apply the renames to train_data in place.
train_data.rename(new_columns, axis='columns', inplace=True)

In [None]:
#View dataset after some columns have been dropped and renamed.

print(train_data.columns)

In [None]:
#Check the dataframe after some columns have been dropped and renamed.

train_data.head()

In [None]:
#Check the shape of the dataset.
print(f"The shape of the train dataset is: {train_data.shape}")

In [None]:
#Print a concise summary of the dataset via DataFrame.info().

train_data.info()

#### CHECK FOR MISSING VALUES

In [None]:
#Check the remaining columns if there are any missing values.

train_data.isnull().sum()

#### INSIGHTS

The records above show the missing-value counts after the unneeded columns were dropped to simplify the work. <br>
Some of the remaining columns have missing values: <br>
* The <b>class</b> column, with a total of <b> 105245 </b> missing values.
* The <b>occupation</b> column (originally <code>occupation_code_main</code>), with a total of <b> 105694 </b> missing values.

In [None]:
train_data['gender'].unique()

In [None]:
train_data['gender'].value_counts()

In [None]:
train_data['education'].unique()

In [None]:
train_data['education'].value_counts()

In [None]:
#Shorten some verbose labels in the education column.
# Series.str.replace performs the same literal substring replacement as
# str(x).replace(...), but leaves missing values as NaN instead of turning
# them into the string 'nan' (which would hide them from isnull()).
education_renames = {
    'High school graduate': 'High school',
    'Doctorate degree(PhD EdD)': 'Doctorate',
    'Bachelors degree(BA AB BS)': 'Undergraduate',
}
for old_label, new_label in education_renames.items():
    train_data['education'] = train_data['education'].str.replace(old_label, new_label, regex=False)

In [None]:
train_data['education'].unique()

In [None]:
train_data['class'].unique()

In [None]:
train_data['class'].value_counts()

In [None]:
train_data['marital_status'].unique()

In [None]:
train_data['marital_status'].value_counts()

In [None]:
train_data['race'].unique()

In [None]:
train_data['race'].value_counts()

In [None]:
# 'occupation_code_main' was renamed to 'occupation' earlier in the notebook.
train_data['occupation'].unique()

In [None]:
# 'occupation_code_main' was renamed to 'occupation' earlier in the notebook.
train_data['occupation'].value_counts()

In [None]:
#Normalise labels in the occupation column.
# The column is addressed by its current name 'occupation' (renamed from
# 'occupation_code_main' earlier), and Series.str.replace is used so that
# missing values stay NaN instead of becoming the string 'nan'.
# NOTE(review): these are the education labels from the education cell and
# look unlikely to occur in occupation values - confirm the intended mapping.
occupation_renames = {
    'High school graduate': 'High school',
    'Doctorate degree(PhD EdD)': 'Doctorate',
    'Bachelors degree(BA AB BS)': 'Undergraduate',
}
for old_label, new_label in occupation_renames.items():
    train_data['occupation'] = train_data['occupation'].str.replace(old_label, new_label, regex=False)

In [None]:
def check_unique_values_and_value_counts(train_data, columns_to_check):
    """
    Print the distinct values and their frequencies for selected columns.

    Parameters:
    - train_data (pd.DataFrame): The dataset to analyze.
    - columns_to_check (list): Names of the columns to report on.

    Returns:
    - None
    """
    separator = "-" * 40

    for name in columns_to_check:
        distinct = train_data[name].unique()
        frequencies = train_data[name].value_counts()

        # One report section per column, closed by a separator line.
        print(f"Column: {name}")
        print(f"Unique values: {distinct}")
        print("Value counts:")
        print(frequencies)
        print(separator)

In [None]:
train_data['gender'].unique()

In [None]:
train_data['gender'].value_counts()

In [None]:
train_data['education'].unique()

In [None]:
train_data['education'].value_counts()

In [None]:
#Drop unnecessary columns from the dataset
# These columns are already removed from train_data by the earlier drop cell,
# so errors='ignore' skips any that are missing instead of raising KeyError.
train_data.drop(columns=['ID', 'education_institute', 'is_hispanic','employment_commitment','unemployment_reason', 'employment_stat',
       'is_labor_union', 'industry_code', 'industry_code_main', 'occupation_code','total_employed', 'household_stat', 
       'household_summary', 'under_18_family', 'veterans_admin_questionnaire', 'vet_benefit', 'tax_status', 'stocks_status',
       'mig_year', 'country_of_birth_father', 'country_of_birth_mother', 'migration_code_change_in_msa',
       'migration_prev_sunbelt', 'migration_code_move_within_reg', 'migration_code_change_in_reg', 'residence_1_year_ago',
       'old_residence_reg', 'old_residence_state', 'importance_of_record'], errors='ignore', inplace=True)

In [None]:
# Report the dimensions of the test dataset.
# NOTE(review): test_data is not created anywhere in the visible part of this
# notebook - make sure the test CSV has been loaded into test_data first.
print(f"The shape of the test dataset is: {test_data.shape}")

In [None]:
test_data.head()

In [None]:
test_data.shape

In [None]:
train_data.dtypes

In [None]:
test_data.dtypes

In [None]:
train_data.isnull().sum()

In [None]:
test_data.isnull().sum()

In [None]:
train_data.describe()

In [None]:
test_data.describe()

In [None]:
train_data.hist(figsize=(12, 10))

In [None]:
# Keep only the numerical columns. 'number' selects every numeric dtype,
# whereas the 'int' alias may resolve to a platform-dependent integer width.
numerical_columns = train_data.select_dtypes(include='number')

plt.figure(figsize=(10, 6))  # Adjust the figure size as needed

# Draw one box per numerical column; the filtered frame is plotted so that
# non-numeric columns are not handed to the boxplot.
sns.boxplot(data=numerical_columns)

plt.title("Boxplot of Multiple Columns")
plt.xlabel("Columns")
plt.ylabel("Values")

plt.tight_layout()
plt.show()


In [None]:
#Handling missing values


print(train_data.isnull().sum())

In [None]:
train_data.columns

In [None]:
def drop_columns(train_data, test_data, columns_to_drop=None):
    """
    Return copies of the train and test datasets without the given columns.

    Parameters:
    - train_data (pd.DataFrame): The training dataset.
    - test_data (pd.DataFrame): The test dataset.
    - columns_to_drop (str | list | None): Column name(s) to remove. When None,
      the notebook's standard list of unneeded columns is used.

    Returns:
    - tuple: (train_data, test_data) as new DataFrames; the inputs are not
      modified. Columns that are absent from a frame are skipped.
    """
    if columns_to_drop is None:
        columns_to_drop = [
            'ID', 'education_institute', 'is_hispanic', 'employment_commitment',
            'unemployment_reason', 'employment_stat', 'is_labor_union',
            'industry_code', 'industry_code_main', 'occupation_code',
            'total_employed', 'household_stat', 'household_summary',
            'under_18_family', 'veterans_admin_questionnaire', 'vet_benefit',
            'tax_status', 'stocks_status', 'mig_year', 'country_of_birth_father',
            'country_of_birth_mother', 'migration_code_change_in_msa',
            'migration_prev_sunbelt', 'migration_code_move_within_reg',
            'migration_code_change_in_reg', 'residence_1_year_ago',
            'old_residence_reg', 'old_residence_state', 'importance_of_record',
        ]
    elif isinstance(columns_to_drop, str):
        # A bare string is one column name, not a sequence of characters.
        columns_to_drop = [columns_to_drop]
    else:
        columns_to_drop = list(columns_to_drop)

    # errors='ignore' lets the same list be applied to frames that lack some
    # of the columns (e.g. a frame that was already partially cleaned).
    train_data = train_data.drop(columns=columns_to_drop, errors='ignore')
    test_data = test_data.drop(columns=columns_to_drop, errors='ignore')
    return train_data, test_data

#### Categorical Values

In [None]:
def plot_numerical_countplots(train_data):
    """
    Draw a separate count plot for every int64/float64 column of a dataset.

    Parameters:
    - train_data (pd.DataFrame): The dataset whose numerical columns are plotted.

    Returns:
    - None (each figure is displayed with plt.show()).
    """
    # Read from the argument rather than a notebook-level global, so the
    # function plots whatever frame the caller passes in.
    numerical_columns = train_data.select_dtypes(include=['int64', 'float64']).columns

    for column in numerical_columns:
        plt.figure(figsize=(8, 6))
        # Pass the series by keyword: the first positional slot of countplot
        # is `data`, not `x`, in current seaborn releases.
        sns.countplot(x=train_data[column])
        plt.title(f'Countplot of {column}', fontsize=16)
        plt.xlabel(column, fontsize=14)
        plt.ylabel('Count', fontsize=14)
        plt.xticks(rotation=45)
        plt.show()

# Example usage
# Draw the count plots for the training DataFrame.
plot_numerical_countplots(train_data)


In [None]:
def plot_countplots_for_categorical(train_data):
    """
    Draw a separate count plot for every object-dtype column of a dataset.

    Parameters:
    - train_data (pd.DataFrame): The dataset whose categorical columns are plotted.

    Returns:
    - None (each figure is displayed with plt.show()).
    """
    # Get a list of column names with categorical (object dtype) data
    categorical_columns = train_data.select_dtypes(include=['object']).columns.tolist()

    # Loop through the categorical columns and create one count plot each
    for column in categorical_columns:
        # Set the figure size
        plt.figure(figsize=(8, 6))
        sns.countplot(x=column, data=train_data)

        plt.xlabel(column)  # Set x-axis label
        plt.ylabel('Count')  # Set y-axis label
        plt.title(f'Count plot of {column}')  # Set plot title
        plt.xticks(rotation=45)  # Rotate x-axis labels for readability

        plt.show()

In [None]:
#Example usage with your Dataframe 'df'

plot_countplots_for_categorical(train_data)

In [None]:
#Explore categorical variables with count plots

sns.countplot(x='gender', data = train_data)

In [None]:
#Univariate Analysis
# Histogram of the hourly wage column ('wage_per_hour' renamed to
# 'hourly_wage' earlier in the notebook), with both axes labelled.

train_data['hourly_wage'].hist()
plt.xlabel('Hourly wage')
plt.ylabel('Frequency')
plt.title('Distribution of hourly wage')
plt.show()

In [None]:
#Bivariate Analysis
# NOTE(review): 'column1' and 'column2' look like template placeholders rather
# than columns of this dataset - replace them with two real numerical columns
# before running this cell.

sns.scatterplot(x='column1', y='column2', data = train_data)

In [None]:
#categorical variables
# NOTE(review): 'category_column' looks like a template placeholder rather than
# a column of this dataset - replace it with a real categorical column
# (e.g. one of those inspected earlier) before running this cell.

sns.countplot(x='category_column', data=train_data)

In [None]:
# NOTE(review): 'categorical_column' looks like a template placeholder rather
# than a column of this dataset - replace it with a real column name.
train_data['categorical_column'].value_counts()

In [None]:
# Columns are selected with [] (a DataFrame is not callable).
# 'education_institute' was dropped from train_data earlier, so it is read
# from `data`, the copy of the dataset taken before any columns were dropped.
data['education_institute'].unique()

In [None]:
#Print out unique values of every column in the dataset

def print_unique_values(train_data):
    """
    Print the unique values of every column in a dataset.

    Parameters:
    - train_data (pd.DataFrame): The dataset to inspect.

    Returns:
    - None
    """
    for column in train_data.columns:
        unique_values = train_data[column].unique()
        print(f"Unique values in {column}: {unique_values}")

In [None]:
#train_data['education_institute'].fillna(0, inplace = True)
#print()