### Step 1: Imports and Reading Data

In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error

import warnings
warnings.filterwarnings('ignore')

In [None]:
df = pd.read_csv("adult.csv")

### Step 2: Understand The Data


In [None]:
print(df.shape[0])
print(df.shape[1])



In [None]:
df.isnull().sum()


In [None]:
df.dtypes


In [None]:
df.describe()


In [None]:
for column in df.columns:
    print(f"{column} : {len(df[column].unique())}")

In [None]:
df

### Step 3: Clean And Prepare The Data

In [None]:
df = pd.concat([df.drop('occupation', axis=1), pd.get_dummies(df.occupation).add_prefix('occupation_')], axis=1)
df = pd.concat([df.drop('workclass', axis=1), pd.get_dummies(df.workclass).add_prefix('workclass_')], axis=1)
df = df.drop('education', axis=1)
df = pd.concat([df.drop('marital-status', axis=1), pd.get_dummies(df['marital-status']).add_prefix('marital-status_')], axis=1)
df = pd.concat([df.drop('relationship', axis=1), pd.get_dummies(df.relationship).add_prefix('relationship_')], axis=1)
df = pd.concat([df.drop('race', axis=1), pd.get_dummies(df.race).add_prefix('race_')], axis=1)
df = pd.concat([df.drop('native-country', axis=1), pd.get_dummies(df['native-country']).add_prefix('native-country_')], axis=1)

In [None]:
df = df.drop('fnlwgt', axis= 1)

In [None]:
df['gender'] = df['gender'].apply(lambda x: 1 if x == 'Male' else 0)
df['income'] = df['income'].apply(lambda x: 1 if x == '>50k' else 0)

In [None]:
correlations = df.corr()['income'].abs()
sortedCorrelations = correlations.sort_values()
numColsToTrop = int(0.8 * len(df.columns))
colsToDrop = sortedCorrelations.iloc[:numColsToTrop].index
dfDropped = df.drop(colsToDrop, axis=1)

In [None]:
plt.figure(figsize=(18,10))

sns.heatmap(df.corr(), annot= False, cmap='coolwarm')

### Step 4: Split The Data Into Training/Test Sets


In [None]:
trainDf, testDf = train_test_split(df, test_size= 0.2)

In [None]:
trainX = trainDf.drop('income', axis= 1)
trainY = trainDf['income']

testX = testDf.drop('income', axis= 1)
testY = testDf['income']

In [None]:
forest = RandomForestRegressor()

forest.fit(trainX, trainY)

In [None]:
forest.score(testX,testY)

### Step 5: Feature Engineering

In [None]:
importances = dict(zip(forest.feature_names_in_, forest.feature_importances_))
importances = {k: v for k, v in sorted(importances.items(), key=lambda x: x[1], reverse=True)}

In [None]:
importances

### Step 6: Hyperparameter Tuning

In [None]:
paramGrid = {
    'n_estimators': [50, 100, 250],
    'max_depth': [5, 10, 30, None],
    'min_samples_split': [2, 4],
    'max_features': ['sqrt', 'log2']
}

gridSearch = GridSearchCV(estimator= RandomForestClassifier(), param_grid= paramGrid, verbose= 10)

In [None]:
gridSearch.fit(trainX, trainY)

In [53]:
forest = gridSearch.best_estimator_

In [54]:
forest.score(testX, testY)

1.0