### AZUBI CAPSTONE PROJECT

#### IMPORT LIBRARY PACKAGES

In [1]:
#Data Handling
import numpy as np
import pandas as pd

#Visualizations
import seaborn as sns
import plotly.express as px
import matplotlib.pyplot as plt

from termcolor import colored

# Feature Processing (Scikit-learn processing, etc. )

# Evaluation Metrics
from sklearn.model_selection import train_test_split
from catboost import CatBoostClassifier
from sklearn.metrics import f1_score, recall_score, precision_score, fbeta_score, accuracy_score, roc_auc_score
from sklearn.metrics import confusion_matrix
from sklearn.feature_selection import RFE

#Other packages
import warnings
warnings.filterwarnings('always') 

#### Load the Data.

For this project, there are two datasets, namely:
* <b> Train data: </b> This is the data which we will be using to train the model. Since we are solving a classification problem, we will have a column in train dataset corresponding to the target labels. 
* <b> Test data: </b> This is the data on which the predictions will be made based on the model trained on train dataset. 

In [2]:
train_data = pd.read_csv("C:/Users/KWABENABOATENG/Desktop/AZUBI AFRICA/AZUBI CAPSTONE/AZUBI-CAPSTONE-PROJECT/DATASETS/Train.csv")

### EXPLORATORY DATA EXPLORATION

In [3]:
#Overview of the train dataset.

train_data.head()

Unnamed: 0,ID,age,gender,education,class,education_institute,marital_status,race,is_hispanic,employment_commitment,...,country_of_birth_mother,migration_code_change_in_msa,migration_prev_sunbelt,migration_code_move_within_reg,migration_code_change_in_reg,residence_1_year_ago,old_residence_reg,old_residence_state,importance_of_record,income_above_limit
0,ID_TZ0000,79,Female,High school graduate,,,Widowed,White,All other,Not in labor force,...,US,?,?,?,?,,,,1779.74,Below limit
1,ID_TZ0001,65,Female,High school graduate,,,Widowed,White,All other,Children or Armed Forces,...,US,unchanged,,unchanged,unchanged,Same,,,2366.75,Below limit
2,ID_TZ0002,21,Male,12th grade no diploma,Federal government,,Never married,Black,All other,Children or Armed Forces,...,US,unchanged,,unchanged,unchanged,Same,,,1693.42,Below limit
3,ID_TZ0003,2,Female,Children,,,Never married,Asian or Pacific Islander,All other,Children or Armed Forces,...,India,unchanged,,unchanged,unchanged,Same,,,1380.27,Below limit
4,ID_TZ0004,70,Male,High school graduate,,,Married-civilian spouse present,White,All other,Not in labor force,...,US,?,?,?,?,,,,1580.79,Below limit


In [4]:
#Overview of all the columns in the dataset.

train_data.columns

Index(['ID', 'age', 'gender', 'education', 'class', 'education_institute',
       'marital_status', 'race', 'is_hispanic', 'employment_commitment',
       'unemployment_reason', 'employment_stat', 'wage_per_hour',
       'is_labor_union', 'working_week_per_year', 'industry_code',
       'industry_code_main', 'occupation_code', 'occupation_code_main',
       'total_employed', 'household_stat', 'household_summary',
       'under_18_family', 'veterans_admin_questionnaire', 'vet_benefit',
       'tax_status', 'gains', 'losses', 'stocks_status', 'citizenship',
       'mig_year', 'country_of_birth_own', 'country_of_birth_father',
       'country_of_birth_mother', 'migration_code_change_in_msa',
       'migration_prev_sunbelt', 'migration_code_move_within_reg',
       'migration_code_change_in_reg', 'residence_1_year_ago',
       'old_residence_reg', 'old_residence_state', 'importance_of_record',
       'income_above_limit'],
      dtype='object')

In [5]:
#Create a copy of the original data

data = train_data.copy() 

#### RENAME SOME COLUMNS

In [6]:
#drop multiple columns by name
train_data.drop(columns=['ID', 'education_institute', 'is_hispanic', 'employment_commitment', 'unemployment_reason', 
                         'employment_stat','is_labor_union', 'industry_code', 'industry_code_main', 'occupation_code',
                         'total_employed', 'household_stat', 'household_summary', 'under_18_family',
                         'veterans_admin_questionnaire', 'vet_benefit', 'tax_status', 'stocks_status', 'mig_year',
                         'country_of_birth_father', 'country_of_birth_mother', 'migration_code_change_in_msa',
                         'migration_prev_sunbelt', 'migration_code_move_within_reg', 'migration_code_change_in_reg',
                         'residence_1_year_ago', 'old_residence_reg', 'old_residence_state', 'importance_of_record'], 
                         inplace=True)

#### RENAME SOME COLUMNS

In [7]:
# Define the new names for the columns to be renamed.
new_columns = {'wage_per_hour': 'hourly_wage', 'occupation_code_main': 'occupation', 
               'country_of_birth_own': 'country_of_birth'}

train_data.rename(columns=new_columns, inplace=True)

In [8]:
#View dataset after some columns have been dropped and renamed.

print(train_data.columns)

Index(['age', 'gender', 'education', 'class', 'marital_status', 'race',
       'hourly_wage', 'working_week_per_year', 'occupation', 'gains', 'losses',
       'citizenship', 'country_of_birth', 'income_above_limit'],
      dtype='object')


In [9]:
#Check the dataframe after some columns have been dropped and renamed.

train_data.head()

Unnamed: 0,age,gender,education,class,marital_status,race,hourly_wage,working_week_per_year,occupation,gains,losses,citizenship,country_of_birth,income_above_limit
0,79,Female,High school graduate,,Widowed,White,0,52,,0,0,Native,US,Below limit
1,65,Female,High school graduate,,Widowed,White,0,0,,0,0,Native,US,Below limit
2,21,Male,12th grade no diploma,Federal government,Never married,Black,500,15,Adm support including clerical,0,0,Native,US,Below limit
3,2,Female,Children,,Never married,Asian or Pacific Islander,0,0,,0,0,Native,US,Below limit
4,70,Male,High school graduate,,Married-civilian spouse present,White,0,0,,0,0,Native,US,Below limit


In [10]:
#Check the shape of the dataset.
print(f"The shape of the train dataset is: {train_data.shape}")

The shape of the train dataset is: (209499, 14)


In [11]:
#Check the shape of the dataset.

train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 209499 entries, 0 to 209498
Data columns (total 14 columns):
 #   Column                 Non-Null Count   Dtype 
---  ------                 --------------   ----- 
 0   age                    209499 non-null  int64 
 1   gender                 209499 non-null  object
 2   education              209499 non-null  object
 3   class                  104254 non-null  object
 4   marital_status         209499 non-null  object
 5   race                   209499 non-null  object
 6   hourly_wage            209499 non-null  int64 
 7   working_week_per_year  209499 non-null  int64 
 8   occupation             103805 non-null  object
 9   gains                  209499 non-null  int64 
 10  losses                 209499 non-null  int64 
 11  citizenship            209499 non-null  object
 12  country_of_birth       209499 non-null  object
 13  income_above_limit     209499 non-null  object
dtypes: int64(5), object(9)
memory usage: 22.4+ MB


#### CHECK FOR MISSING VALUES

In [12]:
#Check the remaining columns if there are any missing values.

train_data.isnull().sum()

age                           0
gender                        0
education                     0
class                    105245
marital_status                0
race                          0
hourly_wage                   0
working_week_per_year         0
occupation               105694
gains                         0
losses                        0
citizenship                   0
country_of_birth              0
income_above_limit            0
dtype: int64

#### INSIGHTS

From the records above, it clearly shows that after some columns were dropped to aid in making the work easier. <br>
Some columns had some missing values and they are :<br>
* Class columns with a total number of <b> 105245 </b> missing values.
* Occupation code main columns with a total number of <b> 105694 </b> missing values.

In [13]:
train_data['gender'].unique()

array([' Female', ' Male'], dtype=object)

In [14]:
train_data['gender'].value_counts()

gender
 Female    108784
 Male      100715
Name: count, dtype: int64

In [15]:
train_data['education'].unique()

array([' High school graduate', ' 12th grade no diploma', ' Children',
       ' Bachelors degree(BA AB BS)', ' 7th and 8th grade', ' 11th grade',
       ' 9th grade', ' Masters degree(MA MS MEng MEd MSW MBA)',
       ' 10th grade', ' Associates degree-academic program',
       ' 1st 2nd 3rd or 4th grade', ' Some college but no degree',
       ' Less than 1st grade', ' Associates degree-occup /vocational',
       ' Prof school degree (MD DDS DVM LLB JD)', ' 5th or 6th grade',
       ' Doctorate degree(PhD EdD)'], dtype=object)

In [16]:
train_data['education'].value_counts()

education
 High school graduate                      50627
 Children                                  49685
 Some college but no degree                29320
 Bachelors degree(BA AB BS)                20979
 7th and 8th grade                          8438
 10th grade                                 7905
 11th grade                                 7260
 Masters degree(MA MS MEng MEd MSW MBA)     6861
 9th grade                                  6540
 Associates degree-occup /vocational        5650
 Associates degree-academic program         4494
 5th or 6th grade                           3542
 12th grade no diploma                      2282
 1st 2nd 3rd or 4th grade                   1917
 Prof school degree (MD DDS DVM LLB JD)     1852
 Doctorate degree(PhD EdD)                  1318
 Less than 1st grade                         829
Name: count, dtype: int64

In [17]:
#Renaming value in the Education column

train_data['education'] = train_data['education'].apply(lambda x : str(x).replace('High school graduate','High school'))
train_data['education'] = train_data['education'].apply(lambda x : str(x).replace('Doctorate degree(PhD EdD)','Doctorate'))
train_data['education'] = train_data['education'].apply(lambda x : str(x).replace('Bachelors degree(BA AB BS)','Undergraduate'))

In [18]:
train_data['education'].unique()

array([' High school', ' 12th grade no diploma', ' Children',
       ' Undergraduate', ' 7th and 8th grade', ' 11th grade',
       ' 9th grade', ' Masters degree(MA MS MEng MEd MSW MBA)',
       ' 10th grade', ' Associates degree-academic program',
       ' 1st 2nd 3rd or 4th grade', ' Some college but no degree',
       ' Less than 1st grade', ' Associates degree-occup /vocational',
       ' Prof school degree (MD DDS DVM LLB JD)', ' 5th or 6th grade',
       ' Doctorate'], dtype=object)

In [19]:
train_data['class'].unique()

array([nan, ' Federal government', ' Private', ' Local government',
       ' Self-employed-incorporated', ' Self-employed-not incorporated',
       ' State government', ' Without pay', ' Never worked'], dtype=object)

In [20]:
train_data['class'].value_counts()

class
 Private                           75617
 Self-employed-not incorporated     8957
 Local government                   8093
 State government                   4480
 Self-employed-incorporated         3444
 Federal government                 3047
 Never worked                        449
 Without pay                         167
Name: count, dtype: int64

In [21]:
train_data['marital_status'].unique()

array([' Widowed', ' Never married', ' Married-civilian spouse present',
       ' Divorced', ' Married-spouse absent', ' Separated',
       ' Married-A F spouse present'], dtype=object)

In [22]:
train_data['marital_status'].value_counts()

marital_status
 Never married                      90723
 Married-civilian spouse present    88407
 Divorced                           13456
 Widowed                            11029
 Separated                           3596
 Married-spouse absent               1568
 Married-A F spouse present           720
Name: count, dtype: int64

In [23]:
train_data['race'].unique()

array([' White', ' Black', ' Asian or Pacific Islander',
       ' Amer Indian Aleut or Eskimo', ' Other'], dtype=object)

In [24]:
train_data['race'].value_counts()

race
 White                          175709
 Black                           21394
 Asian or Pacific Islander        6056
 Other                            3927
 Amer Indian Aleut or Eskimo      2413
Name: count, dtype: int64

In [25]:
train_data['occupation'].unique()

array([nan, ' Adm support including clerical', ' Other service',
       ' Executive admin and managerial', ' Sales',
       ' Machine operators assmblrs & inspctrs',
       ' Precision production craft & repair', ' Professional specialty',
       ' Handlers equip cleaners etc ',
       ' Transportation and material moving',
       ' Farming forestry and fishing', ' Private household services',
       ' Technicians and related support', ' Protective services',
       ' Armed Forces'], dtype=object)

In [26]:
train_data['occupation'].value_counts()

occupation
 Adm support including clerical           15351
 Professional specialty                   14544
 Executive admin and managerial           13107
 Other service                            12856
 Sales                                    12487
 Precision production craft & repair      11207
 Machine operators assmblrs & inspctrs     6650
 Handlers equip cleaners etc               4340
 Transportation and material moving        4244
 Farming forestry and fishing              3273
 Technicians and related support           3136
 Protective services                       1700
 Private household services                 878
 Armed Forces                                32
Name: count, dtype: int64

In [29]:
#Renaming value in the occupation column

train_data['occupation'] = train_data['occupation'].apply(lambda x : str(x).replace(' Adm support including clerical','Administration'))
train_data['occupation'] = train_data['occupation'].apply(lambda x : str(x).replace(' Executive admin and managerial','Executive and Managerial'))
train_data['occupation'] = train_data['occupation'].apply(lambda x : str(x).replace(' Transportation and material moving','Transport'))
train_data['occupation'] = train_data['occupation'].apply(lambda x : str(x).replace(' Technicians and related support','Technical'))
train_data['occupation'] = train_data['occupation'].apply(lambda x : str(x).replace(' Farming forestry and fishing','Agriculture'))

In [30]:
train_data['occupation'].unique()

array(['nan', 'Administration', ' Other service',
       'Executive and Managerial', ' Sales',
       ' Machine operators assmblrs & inspctrs',
       ' Precision production craft & repair', ' Professional specialty',
       ' Handlers equip cleaners etc ', 'Transport', 'Agriculture',
       ' Private household services', 'Technical', ' Protective services',
       ' Armed Forces'], dtype=object)

In [None]:
def check_unique_values_and_value_counts(train_data, columns_to_check):
    """
    Check unique values and value counts of specific columns in a dataset.

    Parameters:
    - dataset (pd.DataFrame): The dataset to analyze.
    - columns_to_check (list): List of column names to check.

    Returns:
    - None
    """
    for column in columns_to_check:
        unique_values = train_data[column].unique()
        value_counts = train_data[column].value_counts()
        
        print(f"Column: {column}")
        print(f"Unique values: {unique_values}")
        print("Value counts:")
        print(value_counts)
        print("-" * 40)

In [None]:
#Drop unnecesarry columns from the dataset
train_data.drop(columns= ['ID', 'education_institute', 'is_hispanic','employment_commitment','unemployment_reason', 'employment_stat',
       'is_labor_union', 'industry_code', 'industry_code_main', 'occupation_code','total_employed', 'household_stat', 
       'household_summary', 'under_18_family', 'veterans_admin_questionnaire', 'vet_benefit', 'tax_status', 'stocks_status',
       'mig_year', 'country_of_birth_father', 'country_of_birth_mother', 'migration_code_change_in_msa',
       'migration_prev_sunbelt', 'migration_code_move_within_reg', 'migration_code_change_in_reg', 'residence_1_year_ago',
       'old_residence_reg', 'old_residence_state', 'importance_of_record'], axis=1, inplace=True)

In [None]:
# Filter out only the numerical columns
numerical_columns = train_data.select_dtypes(include=['int', 'float'])

plt.figure(figsize=(10, 6))  # Adjust the figure size as needed

# Create a boxplot using Seaborn
sns.boxplot(data=train_data)

plt.title("Boxplot of Multiple Columns")
plt.xlabel("Columns")
plt.ylabel("Values")

plt.tight_layout()
plt.show()


#### Categorical Values

In [None]:
def plot_numerical_countplots(train_data):
    numerical_columns = data.select_dtypes(include=['int64', 'float64']).columns

    for column in numerical_columns:
        plt.figure(figsize=(8, 6))
        sns.countplot(data[column])
        plt.title(f'Countplot of {column}', fontsize=16)
        plt.xlabel(column, fontsize=14)
        plt.ylabel('Count', fontsize=14)
        plt.xticks(rotation=45)
        plt.show()

# Example usage
# Assuming 'df' is your DataFrame
plot_numerical_countplots(train_data)


In [None]:
#def plot_countplots_for_categorical(train_data):
    #Get a list of column name with categorical data
    #categorical_columns = train_data.select_dtypes(include=['object']).columns.tolist()
    
    #loops through categorical columns and create count plots
    #for columns in categorical_columns:
        
        #Set the figure size
       # plt.figure(figsize=(8,6))
        #sns.countplot(x=columns,data=train_data)
        
        #plt.xlabel(column)
        #Set x-axis label
       # plt.ylabel('Count')
        #Set y-axis label
        
       # plt.title(f'Count plot of {column}')
        #Set plot title
        
       # plt.xticks(rotation=45)
        #Rotate x-axis labels for readability
        
        #plt.show() 

In [None]:
#Example usage with your Dataframe 'df'

plot_countplots_for_categorical(train_data)

In [None]:
#Explore categorical variables with count plots

sns.countplot(x='gender', data = train_data)

In [None]:
#Univariate Analysis

train_data['column_name'].hist()
plt.xlabel('X-axis Label')
plt.xlabel('X-axis Label')
plt.title('Title')
plt.show()

In [None]:
#Bivariate Analysis

sns.scatterplot(x='column1', y='column2', data = train_data)

In [None]:
#categorical variables

sns.countplot(x='category_column', data=train_data)

In [None]:
train_data['categorical_column'].value_counts()

In [None]:
#train_data['education_institute'].fillna(0, inplace = True)
#print()