## AZUBI CAPSTONE PROJECT

#### IMPORT LIBRARY PACKAGES

In [None]:
#Data Handling
import numpy as np
import pandas as pd

#Visualizations
import seaborn as sns
import plotly.express as px
import matplotlib.pyplot as plt

from termcolor import colored

# Feature Processing (Scikit-learn processing, etc. )
from sklearn.preprocessing import LabelEncoder

# Evaluation Metrics
from sklearn.model_selection import train_test_split
from catboost import CatBoostClassifier
from sklearn.metrics import f1_score, recall_score, precision_score, fbeta_score, accuracy_score, roc_auc_score
from sklearn.metrics import confusion_matrix
from sklearn.feature_selection import RFE

#Other packages
import warnings
warnings.filterwarnings('always') 

#### Load the Data.

For this project, there are two datasets, namely:
* <b> Train data: </b> This is the data which we will be using to train the model. Since we are solving a classification problem, we will have a column in train dataset corresponding to the target labels. 
* <b> Test data: </b> This is the data on which the predictions will be made based on the model trained on train dataset. 

In [None]:
# Both CSV files live in the same folder. Naming the folder once avoids repeating
# the long absolute path and makes it a one-line change to run the notebook on
# another machine.
DATASETS_DIR = "C:/Users/KWABENABOATENG/Desktop/AZUBI AFRICA/AZUBI CAPSTONE/AZUBI-CAPSTONE-PROJECT/DATASETS"

train_data = pd.read_csv(f"{DATASETS_DIR}/Train.csv")
test_data = pd.read_csv(f"{DATASETS_DIR}/Test.csv")

## EXPLORATORY DATA ANALYSIS

### EXPLORE THE TRAIN DATASET

In [None]:
#Overview of the train dataset: display its first rows.

train_data.head()

In [None]:
#Check the shape of the train dataset.

print(f"The shape of the train dataset is: {train_data.shape}")

In [None]:
#Summarise the train dataset (columns, dtypes and non-null counts).

train_data.info()

In [None]:
#Count the missing values in each column of the train dataset.
train_data.isnull().sum()

#### INSIGHTS

From the records above, several columns of the train dataset contain missing values (most of these columns are dropped later on to make the work easier). <br>
The columns with missing values are:<br>
* Class columns with a total number of <b> 105245 </b> missing values.
* Education_institute columns with a total number of <b> 196197 </b> missing values.
* Unemployment_reason columns with a total number of <b> 202979 </b> missing values.
* Is_labor_union  columns with a total number of <b> 189420 </b> missing values.
* Occupation code main columns with a total number of <b> 105694 </b> missing values.
* Under_18_family  columns with a total number of <b> 151654 </b> missing values.
* veterans_admin_questionnaire columns with a total number of <b> 207415 </b> missing values.
* migration_code_change_in_msa columns with a total number of <b> 1588 </b> missing values.
* migration_prev_sunbelt columns with a total number of <b> 88452 </b> missing values.
* migration_code_move_within_reg columns with a total number of <b> 1588 </b> missing values.
* migration_code_change_in_reg columns with a total number of <b> 1588 </b> missing values.
* residence_1_year_ago columns with a total number of <b> 106284 </b> missing values.
* old_residence_reg columns with a total number of <b> 193148 </b> missing values.
* old_residence_state columns with a total number of <b> 193148 </b> missing values.

In [None]:
#Descriptive statistics of the train dataset.
train_data.describe()

### EXPLORE THE TEST DATASET.

In [None]:
#Overview of the test dataset.

test_data.head()

In [None]:
#Check the shape of the test dataset.

print(f"The shape of the train dataset is: {test_data.shape}")

In [None]:
#Summarise the test dataset (columns, dtypes and non-null counts).

test_data.info()

In [None]:
#Count the missing values in each column of the test dataset.
test_data.isnull().sum()

#### INSIGHTS FOR THE TEST DATASET

The records above show that, after the columns that are not needed were dropped to make the work easier, <br>
the following columns of the test dataset still have missing values:<br>
* Class columns with a total number of <b> 45079 </b> missing values.
* Occupation code main columns with a total number of <b> 45273 </b> missing values.

In [None]:
#Descriptive statistics of the test dataset.
test_data.describe()

## FEATURE ENGINEERING

In [None]:
#Create a copy of the original train dataset

data = train_data.copy() 

In [None]:
#Create a copy of the original test dataset

data = test_data.copy() 

In [None]:
#Overview of all the columns in the train dataset (before any column is dropped
#or renamed).

train_data.columns

In [None]:
#Overview of all the columns in the test dataset (before any column is dropped
#or renamed).

test_data.columns

### DROP SOME COLUMNS
#### For the purposes of this project, we will drop some columns that are not relevant to the machine learning model.

In [None]:
#Remove the columns listed below from the train dataset.
#The frame is modified in place, so nothing is returned.

train_data.drop(
    labels=[
        'ID', 'education_institute', 'is_hispanic', 'employment_commitment',
        'unemployment_reason', 'employment_stat', 'is_labor_union',
        'industry_code', 'industry_code_main', 'occupation_code',
        'total_employed', 'household_stat', 'household_summary',
        'under_18_family', 'veterans_admin_questionnaire', 'vet_benefit',
        'tax_status', 'stocks_status', 'mig_year',
        'country_of_birth_father', 'country_of_birth_mother',
        'migration_code_change_in_msa', 'migration_prev_sunbelt',
        'migration_code_move_within_reg', 'migration_code_change_in_reg',
        'residence_1_year_ago', 'old_residence_reg', 'old_residence_state',
        'importance_of_record',
    ],
    axis='columns',
    inplace=True,
)

In [None]:
#Remove the columns listed below from the test dataset.
#The frame is modified in place, so nothing is returned.

test_data.drop(
    labels=[
        'ID', 'education_institute', 'is_hispanic', 'employment_commitment',
        'unemployment_reason', 'employment_stat', 'is_labor_union',
        'industry_code', 'industry_code_main', 'occupation_code',
        'total_employed', 'household_stat', 'household_summary',
        'under_18_family', 'veterans_admin_questionnaire', 'vet_benefit',
        'tax_status', 'stocks_status', 'mig_year',
        'country_of_birth_father', 'country_of_birth_mother',
        'migration_code_change_in_msa', 'migration_prev_sunbelt',
        'migration_code_move_within_reg', 'migration_code_change_in_reg',
        'residence_1_year_ago', 'old_residence_reg', 'old_residence_state',
        'importance_of_record',
    ],
    axis='columns',
    inplace=True,
)

#### RENAME SOME COLUMNS

In [None]:
# Old name -> new name for the train dataset columns that get a shorter label.

new_columns = dict(
    wage_per_hour='hourly_wage',
    occupation_code_main='occupation',
    country_of_birth_own='country_of_birth',
)

# Apply the mapping to the column axis, modifying the frame in place.
train_data.rename(new_columns, axis='columns', inplace=True)

In [None]:
# Old name -> new name for the test dataset columns that get a shorter label.

new_columns = dict(
    wage_per_hour='hourly_wage',
    occupation_code_main='occupation',
    country_of_birth_own='country_of_birth',
)

# Apply the mapping to the column axis, modifying the frame in place.
test_data.rename(new_columns, axis='columns', inplace=True)

In [None]:
#View the column names of the train dataset after some columns have been dropped and renamed.

print(train_data.columns)

In [None]:
#View the column names of the test dataset after some columns have been dropped and renamed.

print(test_data.columns)

In [None]:
#Check the first rows of the train dataframe after some columns have been dropped and renamed.

train_data.head()

In [None]:
#Check the first rows of the test dataframe after some columns have been dropped and renamed.

test_data.head()

In [None]:
#Encode the following columns in the train dataset - Gender and Income above limit.
#fit_transform is called once per column, so each column is fitted on its own
#values and receives its own integer codes.

train_data[['gender', 'income_above_limit']] = train_data[['gender', 'income_above_limit']].apply(LabelEncoder().fit_transform)

In [None]:
#Encode the Gender column in the test dataset.
#The codes are written back into the existing column rather than dropping it and
#appending a new one, so `gender` keeps its position and the test columns stay
#in the same order as the train columns.

le = LabelEncoder()
label = le.fit_transform(test_data['gender'])
test_data["gender"] = label

In [None]:
#First rows of the train dataset after label encoding.
train_data.head()

In [None]:
#First rows of the test dataset after label encoding.
test_data.head()

In [None]:
#Histograms of the train dataset columns, arranged on a 3 x 3 grid.
train_data.hist(figsize=(12,9), layout=(3,3))

In [None]:
#Box plot of the train dataset; the trailing semicolon hides the returned object
#in the notebook output.
#NOTE(review): `layout` is passed without `subplots=True` - confirm it has the
#intended effect.
train_data.plot(kind = 'box', figsize=(10,5), layout=(3,2), );

In [None]:
#Correlation heatmap of the numeric columns of the train dataset.
#Only numeric columns are handed to corr(): the frame still contains text columns
#(e.g. education, occupation), and recent pandas versions raise an error instead
#of silently skipping them.
sns.heatmap(train_data.select_dtypes(include='number').corr(), annot=True);

#### CHECK FOR MISSING VALUES

In [None]:
#Check the remaining columns if there are any missing values.

train_data.isnull().sum()

In [None]:
#Distinct values of the gender column (integer codes after label encoding).
train_data['gender'].unique()

In [None]:
#Number of rows per gender code.
train_data['gender'].value_counts()

In [None]:
#Distinct values of the education column.
train_data['education'].unique()

In [None]:
#Number of rows per education level.
train_data['education'].value_counts()

In [None]:
#Shorten three of the labels used in the Education column.
#Each entry is converted to text and the replacements are applied one after the
#other, in this order.

train_data['education'] = train_data['education'].apply(
    lambda entry: str(entry)
    .replace('High school graduate', 'High school')
    .replace('Doctorate degree(PhD EdD)', 'Doctorate')
    .replace('Bachelors degree(BA AB BS)', 'Undergraduate')
)

In [None]:
#Distinct values of the education column after the relabelling.
train_data['education'].unique()

In [None]:
#Distinct values of the class column.
train_data['class'].unique()

In [None]:
#Number of rows per class value.
train_data['class'].value_counts()

In [None]:
#Distinct values of the marital_status column.
train_data['marital_status'].unique()

In [None]:
#Number of rows per marital status.
train_data['marital_status'].value_counts()

In [None]:
#Distinct values of the race column.
train_data['race'].unique()

In [None]:
#Number of rows per race value.
train_data['race'].value_counts()

In [None]:
#Distinct values of the occupation column.
train_data['occupation'].unique()

In [None]:
#Number of rows per occupation.
train_data['occupation'].value_counts()

In [None]:
#Renaming value in the occupation column

train_data['occupation'] = train_data['occupation'].apply(lambda x : str(x).replace(' Adm support including clerical','Administration'))
train_data['occupation'] = train_data['occupation'].apply(lambda x : str(x).replace(' Executive admin and managerial','Executive and Managerial'))
train_data['occupation'] = train_data['occupation'].apply(lambda x : str(x).replace(' Transportation and material moving','Transport'))
train_data['occupation'] = train_data['occupation'].apply(lambda x : str(x).replace(' Technicians and related support','Technical'))
train_data['occupation'] = train_data['occupation'].apply(lambda x : str(x).replace(' Farming forestry and fishing','Agriculture'))

In [None]:
train_data['occupation'].unique()

In [None]:
def check_unique_values_and_value_counts(train_data, columns_to_check):
    """
    Print the distinct values and their frequencies for selected columns.

    Parameters:
    - train_data (pd.DataFrame): The dataset to analyze.
    - columns_to_check (list): List of column names to check.

    Returns:
    - None
    """
    separator = "-" * 40
    for name in columns_to_check:
        selected = train_data[name]
        # Compute both summaries before printing anything for this column.
        distinct = selected.unique()
        frequencies = selected.value_counts()

        print(
            f"Column: {name}",
            f"Unique values: {distinct}",
            "Value counts:",
            frequencies,
            separator,
            sep="\n",
        )

In [None]:
# Filter out only the numerical columns
numerical_columns = train_data.select_dtypes(include=['int', 'float'])

plt.figure(figsize=(10, 6))  # Adjust the figure size as needed

# Create a boxplot using Seaborn.
# The filtered frame is plotted (not the full dataset), so the selection made
# above is what actually determines which columns appear.
sns.boxplot(data=numerical_columns)

plt.title("Boxplot of Multiple Columns")
plt.xlabel("Columns")
plt.ylabel("Values")

plt.tight_layout()
plt.show()


#### Categorical Values

In [None]:
def plot_numerical_countplots(train_data):
    """
    Draw one count plot per numerical column of a dataset.

    Parameters:
    - train_data (pd.DataFrame): The dataset whose int64/float64 columns are plotted.

    Returns:
    - None
    """
    # Work on the frame that was passed in, not on a global variable, so the
    # plots always describe the caller's data.
    numerical_columns = train_data.select_dtypes(include=['int64', 'float64']).columns

    for column in numerical_columns:
        plt.figure(figsize=(8, 6))
        # Pass the values explicitly as `x`; a positional Series would be taken
        # as the `data` argument by recent seaborn versions.
        sns.countplot(x=train_data[column])
        plt.title(f'Countplot of {column}', fontsize=16)
        plt.xlabel(column, fontsize=14)
        plt.ylabel('Count', fontsize=14)
        plt.xticks(rotation=45)
        plt.show()

# Example usage
# Draw the count plots, passing the train dataset as the argument.
plot_numerical_countplots(train_data)


In [None]:
def plot_countplots_for_categorical(train_data):
    """
    Draw one count plot per categorical (object dtype) column of a dataset.

    Parameters:
    - train_data (pd.DataFrame): The dataset whose object columns are plotted.

    Returns:
    - None
    """
    #Get a list of column names with categorical data
    categorical_columns = train_data.select_dtypes(include=['object']).columns.tolist()

    #Loop through the categorical columns and create one count plot for each
    for column in categorical_columns:
        #Set the figure size
        plt.figure(figsize=(8, 6))
        sns.countplot(x=column, data=train_data)

        #Set the axis labels and the plot title
        plt.xlabel(column)
        plt.ylabel('Count')
        plt.title(f'Count plot of {column}')

        #Rotate x-axis labels for readability
        plt.xticks(rotation=45)

        plt.show()

In [None]:
#Example usage: count plots for the train dataset.

plot_countplots_for_categorical(train_data)

In [None]:
#Explore categorical variables with count plots
#(gender was label-encoded in an earlier cell, so the bars are integer codes).

sns.countplot(x='gender', data = train_data)

In [None]:
#Univariate Analysis: distribution of the hourly wage.
#'hourly_wage' is the renamed 'wage_per_hour' column; the template placeholder
#'column_name' is not a column of the dataset.

train_data['hourly_wage'].hist()
plt.xlabel('Hourly wage')
#The y-axis gets its own label (the template set the x-axis label twice).
plt.ylabel('Frequency')
plt.title('Distribution of hourly wage')
plt.show()

In [None]:
#Bivariate Analysis
#NOTE(review): 'column1' and 'column2' look like template placeholders; no column
#with these names is created in the cells above - replace them with real column
#names before running.

sns.scatterplot(x='column1', y='column2', data = train_data)

In [None]:
#categorical variables
#NOTE(review): 'category_column' looks like a template placeholder - replace it
#with a real column name before running.

sns.countplot(x='category_column', data=train_data)

In [None]:
#NOTE(review): 'categorical_column' looks like a template placeholder - replace
#it with a real column name before running.
train_data['categorical_column'].value_counts()

In [None]:
#train_data['education_institute'].fillna(0, inplace = True)
#print()