### AZUBI CAPSTONE PROJECT

#### IMPORT LIBRARY PACKAGES

In [1]:
#Data Handling
import numpy as np
import pandas as pd

#Visualizations
import seaborn as sns
import plotly.express as px
import matplotlib.pyplot as plt

from termcolor import colored

# Feature Processing (Scikit-learn processing, etc. )

# Evaluation Metrics
from sklearn.model_selection import train_test_split
from catboost import CatBoostClassifier
from sklearn.metrics import f1_score, recall_score, precision_score, fbeta_score, accuracy_score, roc_auc_score
from sklearn.metrics import confusion_matrix
from sklearn.feature_selection import RFE

#Other packages
import warnings
warnings.filterwarnings('always') 

#### Load the Data.

For this project, there are two datasets, namely:
* <b> Train data: </b> This is the data we will use to train the model. Because this is a classification problem, the train dataset includes a column holding the target labels (`income_above_limit`).
* <b> Test data: </b> This is the data on which predictions will be made, using the model trained on the train dataset.

In [2]:
# Folder that holds the project CSV files. Edit this single constant to run
# the notebook on another machine instead of changing every read_csv call.
DATA_DIR = "C:/Users/KWABENABOATENG/Desktop/AZUBI AFRICA/AZUBI CAPSTONE/AZUBI-CAPSTONE-PROJECT/DATASETS"

# Train.csv carries the target label column; Test.csv is used for predictions.
train_data = pd.read_csv(f"{DATA_DIR}/Train.csv")
test_data = pd.read_csv(f"{DATA_DIR}/Test.csv")

#### BASIC DATA EXPLORATION

In [3]:
train_data.head()

Unnamed: 0,ID,age,gender,education,class,education_institute,marital_status,race,is_hispanic,employment_commitment,...,country_of_birth_mother,migration_code_change_in_msa,migration_prev_sunbelt,migration_code_move_within_reg,migration_code_change_in_reg,residence_1_year_ago,old_residence_reg,old_residence_state,importance_of_record,income_above_limit
0,ID_TZ0000,79,Female,High school graduate,,,Widowed,White,All other,Not in labor force,...,US,?,?,?,?,,,,1779.74,Below limit
1,ID_TZ0001,65,Female,High school graduate,,,Widowed,White,All other,Children or Armed Forces,...,US,unchanged,,unchanged,unchanged,Same,,,2366.75,Below limit
2,ID_TZ0002,21,Male,12th grade no diploma,Federal government,,Never married,Black,All other,Children or Armed Forces,...,US,unchanged,,unchanged,unchanged,Same,,,1693.42,Below limit
3,ID_TZ0003,2,Female,Children,,,Never married,Asian or Pacific Islander,All other,Children or Armed Forces,...,India,unchanged,,unchanged,unchanged,Same,,,1380.27,Below limit
4,ID_TZ0004,70,Male,High school graduate,,,Married-civilian spouse present,White,All other,Not in labor force,...,US,?,?,?,?,,,,1580.79,Below limit


In [4]:
train_data.shape

(209499, 43)

In [20]:
# Create independent copies of the loaded datasets (DataFrame.copy()), so that
# `data` / `data1` are unaffected by later changes made to train_data / test_data.
data = train_data.copy() 
data1 = test_data.copy()

In [23]:
def drop_columns_from_datasets(train_data, test_data, columns_to_drop=None):
    """
    Return copies of the train and test datasets with unwanted columns removed.

    Parameters:
    - train_data (pd.DataFrame): The training dataset.
    - test_data (pd.DataFrame): The test dataset.
    - columns_to_drop (list, optional): Column names to remove from both
      datasets. When omitted, the default list defined below is used.

    Returns:
    - tuple: (modified train dataset, modified test dataset). The input
      DataFrames are not modified.

    Raises:
    - KeyError: If a requested column is missing from either dataset.
    """
    if columns_to_drop is None:
        columns_to_drop = ['ID', 'education_institute', 'is_hispanic', 'employment_commitment',
                           'unemployment_reason', 'employment_stat', 'is_labor_union', 'industry_code',
                           'industry_code_main', 'occupation_code', 'total_employed', 'household_stat',
                           'household_summary', 'under_18_family', 'veterans_admin_questionnaire',
                           'vet_benefit', 'tax_status', 'stocks_status', 'mig_year',
                           'country_of_birth_father', 'country_of_birth_mother',
                           'migration_code_change_in_msa', 'migration_prev_sunbelt',
                           'migration_code_move_within_reg', 'migration_code_change_in_reg',
                           'residence_1_year_ago', 'old_residence_reg', 'old_residence_state',
                           'importance_of_record']
    else:
        # pandas treats a tuple as ONE label, so always hand drop() a list.
        columns_to_drop = list(columns_to_drop)

    # drop() returns a new DataFrame by default; `columns=` already selects the axis.
    modified_dataset1 = train_data.drop(columns=columns_to_drop)
    modified_dataset2 = test_data.drop(columns=columns_to_drop)

    return modified_dataset1, modified_dataset2

In [22]:
train_data.head()

Unnamed: 0,ID,age,gender,education,class,education_institute,marital_status,race,is_hispanic,employment_commitment,...,country_of_birth_mother,migration_code_change_in_msa,migration_prev_sunbelt,migration_code_move_within_reg,migration_code_change_in_reg,residence_1_year_ago,old_residence_reg,old_residence_state,importance_of_record,income_above_limit
0,ID_TZ0000,79,Female,High school graduate,,,Widowed,White,All other,Not in labor force,...,US,?,?,?,?,,,,1779.74,Below limit
1,ID_TZ0001,65,Female,High school graduate,,,Widowed,White,All other,Children or Armed Forces,...,US,unchanged,,unchanged,unchanged,Same,,,2366.75,Below limit
2,ID_TZ0002,21,Male,12th grade no diploma,Federal government,,Never married,Black,All other,Children or Armed Forces,...,US,unchanged,,unchanged,unchanged,Same,,,1693.42,Below limit
3,ID_TZ0003,2,Female,Children,,,Never married,Asian or Pacific Islander,All other,Children or Armed Forces,...,India,unchanged,,unchanged,unchanged,Same,,,1380.27,Below limit
4,ID_TZ0004,70,Male,High school graduate,,,Married-civilian spouse present,White,All other,Not in labor force,...,US,?,?,?,?,,,,1580.79,Below limit


In [26]:
print(f"The shape of the train dataset is: {train_data.shape}")
print(f"The shape of the test dataset is: {test_data.shape}")

The shape of the train dataset is: (209499, 43)
The shape of the test dataset is: (89786, 42)


In [32]:
#### Check the data types of the datasets

print(f"The data types of the train dataset is: {train_data.dtypes}")
print()
print()
print(f"The data types of the test dataset is: {test_data.dtypes}")

The data types of the train dataset is: ID                                 object
age                                 int64
gender                             object
education                          object
class                              object
education_institute                object
marital_status                     object
race                               object
is_hispanic                        object
employment_commitment              object
unemployment_reason                object
employment_stat                     int64
wage_per_hour                       int64
is_labor_union                     object
working_week_per_year               int64
industry_code                       int64
industry_code_main                 object
occupation_code                     int64
occupation_code_main               object
total_employed                      int64
household_stat                     object
household_summary                  object
under_18_family                    o

In [34]:
#Check for missing values

train_data.isnull().sum()

ID                                     0
age                                    0
gender                                 0
education                              0
class                             105245
education_institute               196197
marital_status                         0
race                                   0
is_hispanic                            0
employment_commitment                  0
unemployment_reason               202979
employment_stat                        0
wage_per_hour                          0
is_labor_union                    189420
working_week_per_year                  0
industry_code                          0
industry_code_main                     0
occupation_code                        0
occupation_code_main              105694
total_employed                         0
household_stat                         0
household_summary                      0
under_18_family                   151654
veterans_admin_questionnaire      207415
vet_benefit     

In [47]:
import pandas as pd

def check_unique_values_and_value_counts(dataset, columns_to_check):
    """
    Check unique values and value counts of specific columns in a dataset.

    Parameters:
    - dataset (pd.DataFrame): The dataset to analyze.
    - columns_to_check (list): List of column names to check.

    Returns:
    - None

    Raises:
    - KeyError: If a name in columns_to_check is not a column of dataset.
    """
    for column in columns_to_check:
        # Read both statistics from the dataset that was passed in, not from
        # a notebook-level global, so the report matches the requested frame.
        unique_values = dataset[column].unique()
        value_counts = dataset[column].value_counts()

        print(f"Column: {column}")
        print(f"Unique values: {unique_values}")
        print("Value counts:")
        print(value_counts)
        print("-" * 40)

# Example usage
# Replace with your actual dataset and columns
# check_unique_values_and_value_counts(your_dataset, ['Column1', 'Column2'])


In [37]:
train_data['gender'].unique()

array([' Female', ' Male'], dtype=object)

In [38]:
train_data['gender'].value_counts()

gender
 Female    108784
 Male      100715
Name: count, dtype: int64

In [39]:
train_data['education'].unique()

array([' High school graduate', ' 12th grade no diploma', ' Children',
       ' Bachelors degree(BA AB BS)', ' 7th and 8th grade', ' 11th grade',
       ' 9th grade', ' Masters degree(MA MS MEng MEd MSW MBA)',
       ' 10th grade', ' Associates degree-academic program',
       ' 1st 2nd 3rd or 4th grade', ' Some college but no degree',
       ' Less than 1st grade', ' Associates degree-occup /vocational',
       ' Prof school degree (MD DDS DVM LLB JD)', ' 5th or 6th grade',
       ' Doctorate degree(PhD EdD)'], dtype=object)

In [40]:
train_data['education'].value_counts()

education
 High school graduate                      50627
 Children                                  49685
 Some college but no degree                29320
 Bachelors degree(BA AB BS)                20979
 7th and 8th grade                          8438
 10th grade                                 7905
 11th grade                                 7260
 Masters degree(MA MS MEng MEd MSW MBA)     6861
 9th grade                                  6540
 Associates degree-occup /vocational        5650
 Associates degree-academic program         4494
 5th or 6th grade                           3542
 12th grade no diploma                      2282
 1st 2nd 3rd or 4th grade                   1917
 Prof school degree (MD DDS DVM LLB JD)     1852
 Doctorate degree(PhD EdD)                  1318
 Less than 1st grade                         829
Name: count, dtype: int64

In [None]:
# Drop unnecessary columns from the train dataset (modifies train_data in place)
unneeded_columns = [
    'ID', 'education_institute', 'is_hispanic', 'employment_commitment',
    'unemployment_reason', 'employment_stat', 'is_labor_union',
    'industry_code', 'industry_code_main', 'occupation_code',
    'total_employed', 'household_stat', 'household_summary',
    'under_18_family', 'veterans_admin_questionnaire', 'vet_benefit',
    'tax_status', 'stocks_status', 'mig_year',
    'country_of_birth_father', 'country_of_birth_mother',
    'migration_code_change_in_msa', 'migration_prev_sunbelt',
    'migration_code_move_within_reg', 'migration_code_change_in_reg',
    'residence_1_year_ago', 'old_residence_reg', 'old_residence_state',
    'importance_of_record',
]
train_data.drop(columns=unneeded_columns, inplace=True)

In [None]:
# Report the dimensions (rows, columns) of the test dataset.
print(f"The shape of test data is {test_data.shape}")

In [None]:
test_data.head()

In [None]:
test_data.shape

In [None]:
train_data.dtypes

In [None]:
test_data.dtypes

In [None]:
train_data.isnull().sum()

In [None]:
test_data.isnull().sum()

In [None]:
train_data.describe()

In [None]:
test_data.describe()

In [None]:
train_data.hist(figsize=(12, 10))

In [None]:
# Keep only the numerical columns; these are the ones drawn in the boxplot
numerical_columns = train_data.select_dtypes(include=['int', 'float'])

plt.figure(figsize=(10, 6))  # Adjust the figure size as needed

# Create a boxplot using Seaborn: one box per numerical column.
# Plot the filtered frame computed above rather than the full dataset.
sns.boxplot(data=numerical_columns)

plt.title("Boxplot of Multiple Columns")
plt.xlabel("Columns")
plt.ylabel("Values")

plt.tight_layout()
plt.show()


In [None]:
#Handling missing values


print(train_data.isnull().sum())

In [None]:
train_data.columns

In [None]:
def drop_columns(train_data, test_data, columns_to_drop=(
        'ID', 'education_institute', 'is_hispanic', 'employment_commitment', 'unemployment_reason', 'employment_stat',
        'is_labor_union', 'industry_code', 'industry_code_main', 'occupation_code', 'total_employed', 'household_stat',
        'household_summary', 'under_18_family', 'veterans_admin_questionnaire', 'vet_benefit', 'tax_status', 'stocks_status',
        'mig_year', 'country_of_birth_father', 'country_of_birth_mother', 'migration_code_change_in_msa',
        'migration_prev_sunbelt', 'migration_code_move_within_reg', 'migration_code_change_in_reg', 'residence_1_year_ago',
        'old_residence_reg', 'old_residence_state', 'importance_of_record')):
    """
    Remove the given columns from both the train and the test dataset.

    Parameters:
    - train_data (pd.DataFrame): The training dataset.
    - test_data (pd.DataFrame): The test dataset.
    - columns_to_drop (iterable of str): Column names to remove from both
      datasets. Defaults to the column names listed in the signature.

    Returns:
    - tuple: (train dataset without the columns, test dataset without the
      columns). The input DataFrames are not modified.

    Raises:
    - KeyError: If a requested column is missing from either dataset.
    """
    # pandas treats a tuple as ONE label, so convert to a list before dropping.
    columns_to_drop = list(columns_to_drop)

    train_data = train_data.drop(columns=columns_to_drop)
    test_data = test_data.drop(columns=columns_to_drop)
    return train_data, test_data

#### Categorical Values

In [None]:
def plot_numerical_countplots(train_data):
    """
    Draw one count plot per numerical column of the given dataset.

    Parameters:
    - train_data (pd.DataFrame): The dataset whose int64/float64 columns are plotted.

    Returns:
    - None (each figure is shown with plt.show()).
    """
    # Use the frame that was passed in rather than a notebook-level global,
    # so the function plots exactly the dataset the caller asked for.
    numerical_columns = train_data.select_dtypes(include=['int64', 'float64']).columns

    for column in numerical_columns:
        plt.figure(figsize=(8, 6))
        # Pass the values as `x=` explicitly: the first positional argument
        # of countplot is not the x variable in current seaborn versions.
        sns.countplot(x=train_data[column])
        plt.title(f'Countplot of {column}', fontsize=16)
        plt.xlabel(column, fontsize=14)
        plt.ylabel('Count', fontsize=14)
        plt.xticks(rotation=45)
        plt.show()

# Example usage
# Assuming 'df' is your DataFrame
plot_numerical_countplots(train_data)


In [None]:
def plot_countplots_for_categorical(train_data):
    """
    Draw one count plot per categorical (object dtype) column of the dataset.

    Parameters:
    - train_data (pd.DataFrame): The dataset whose object columns are plotted.

    Returns:
    - None (each figure is shown with plt.show()).
    """
    # Get a list of column names with categorical data
    categorical_columns = train_data.select_dtypes(include=['object']).columns.tolist()

    # Loop through the categorical columns and create count plots
    for column in categorical_columns:
        # Set the figure size
        plt.figure(figsize=(8, 6))
        sns.countplot(x=column, data=train_data)

        # Set x-axis label
        plt.xlabel(column)
        # Set y-axis label
        plt.ylabel('Count')

        # Set plot title
        plt.title(f'Count plot of {column}')

        # Rotate x-axis labels for readability
        plt.xticks(rotation=45)

        plt.show()

In [None]:
#Example usage with your Dataframe 'df'

plot_countplots_for_categorical(train_data)

In [None]:
#Explore categorical variables with count plots

sns.countplot(x='gender', data = train_data)

In [None]:
#Univariate Analysis
# Distribution of a single numerical column. 'age' replaces the template
# placeholder column name, which does not exist in the dataset.

train_data['age'].hist()
plt.xlabel('age')
plt.ylabel('Count')  # the template set the x label twice and never the y label
plt.title('Distribution of age')
plt.show()

In [None]:
#Bivariate Analysis
# Relationship between two numerical columns. 'age' and 'wage_per_hour'
# replace the template placeholders, which do not exist in the dataset.

sns.scatterplot(x='age', y='wage_per_hour', data = train_data)

In [None]:
#categorical variables
# Frequency of each category. 'marital_status' replaces the template
# placeholder column name, which does not exist in the dataset.

sns.countplot(x='marital_status', data=train_data)

In [None]:
# Number of rows per category. 'marital_status' replaces the template
# placeholder column name, which does not exist in the dataset.
train_data['marital_status'].value_counts()

In [None]:
# Select the column with [] (a DataFrame is not callable). Read it from
# `data`, the copy taken right after loading, because an earlier cell drops
# 'education_institute' from train_data in place.
data['education_institute'].unique()

In [None]:
#Print out unique values of every column in the dataset

def print_unique_values(train_data):
    """
    Print the unique values of every column in the dataset.

    Parameters:
    - train_data (pd.DataFrame): The dataset to inspect.

    Returns:
    - None (one line is printed per column).
    """
    for column in train_data.columns:
        unique_values = train_data[column].unique()
        print(f"Unique values in {column}: {unique_values}")

In [None]:
#train_data['education_institute'].fillna(0, inplace = True)
#print()