In [4]:
# Import required libraries and dependencies
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, when
from pathlib import Path
from sklearn.metrics import confusion_matrix, classification_report

In [5]:
# Load the data into a Pandas DataFrame
df_census_income = pd.read_csv("adult.csv")

# Display sample data
df_census_income.head(10)


Unnamed: 0,Age,Workclass,Final Weight,Education,EducationNum,Marital Status,Occupation,Relationship,Race,Gender,Capital Gain,capital loss,Hours per Week,Native Country,Income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K
5,37,Private,284582,Masters,14,Married-civ-spouse,Exec-managerial,Wife,White,Female,0,0,40,United-States,<=50K
6,49,Private,160187,9th,5,Married-spouse-absent,Other-service,Not-in-family,Black,Female,0,0,16,Jamaica,<=50K
7,52,Self-emp-not-inc,209642,HS-grad,9,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,45,United-States,>50K
8,31,Private,45781,Masters,14,Never-married,Prof-specialty,Not-in-family,White,Female,14084,0,50,United-States,>50K
9,42,Private,159449,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,5178,0,40,United-States,>50K


In [6]:
# Review values of Workclass column
df_census_income['Workclass'].value_counts()

Unnamed: 0_level_0,count
Workclass,Unnamed: 1_level_1
Private,22696
Self-emp-not-inc,2541
Local-gov,2093
?,1836
State-gov,1298
Self-emp-inc,1116
Federal-gov,960
Without-pay,14
Never-worked,7


In [7]:
# Remove ending and leading values in Workclass column
df_census_income['Workclass'] = df_census_income['Workclass'].str.strip()
df_census_income['Workclass'] = df_census_income['Workclass'].str.lower()

In [8]:
# Filter out rows where 'Workclass' is 'never-worked' or 'without-pay'
df_census_income = df_census_income[
    ~df_census_income['Workclass'].isin(['never-worked', 'without-pay'])
]


In [9]:
# Replace "?" values
df_census_income['Workclass'] = df_census_income['Workclass'].replace('?', 'Unknown')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_census_income['Workclass'] = df_census_income['Workclass'].replace('?', 'Unknown')


In [10]:
# Review values of Workclass column
df_census_income['Workclass'].value_counts()

Unnamed: 0_level_0,count
Workclass,Unnamed: 1_level_1
private,22696
self-emp-not-inc,2541
local-gov,2093
Unknown,1836
state-gov,1298
self-emp-inc,1116
federal-gov,960


In [11]:
# Remove extra columns/ Renaming education number column
df_census_income = df_census_income.drop(columns=['Education','Capital Gain','capital loss'])
df_census_income.rename(columns={"EducationNum": "Education Rank"}, inplace=True)


df_census_income.head()

Unnamed: 0,Age,Workclass,Final Weight,Education Rank,Marital Status,Occupation,Relationship,Race,Gender,Hours per Week,Native Country,Income
0,39,state-gov,77516,13,Never-married,Adm-clerical,Not-in-family,White,Male,40,United-States,<=50K
1,50,self-emp-not-inc,83311,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,13,United-States,<=50K
2,38,private,215646,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,40,United-States,<=50K
3,53,private,234721,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,40,United-States,<=50K
4,28,private,338409,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,40,Cuba,<=50K


In [12]:
# Review marital status column
df_census_income['Marital Status'].value_counts()


Unnamed: 0_level_0,count
Marital Status,Unnamed: 1_level_1
Married-civ-spouse,14967
Never-married,10674
Divorced,4442
Separated,1025
Widowed,992
Married-spouse-absent,417
Married-AF-spouse,23


In [13]:
# Replace "?" values in occupation column
df_census_income['Occupation'] = df_census_income['Occupation'].str.strip().replace('?', 'Unknown')
df_census_income['Occupation'].value_counts()


Unnamed: 0_level_0,count
Occupation,Unnamed: 1_level_1
Prof-specialty,4140
Craft-repair,4098
Exec-managerial,4066
Adm-clerical,3767
Sales,3650
Other-service,3294
Machine-op-inspct,2001
Unknown,1836
Transport-moving,1596
Handlers-cleaners,1369


In [14]:
# Ensure 'Income' column is of type string
df_census_income['Income'] = df_census_income['Income'].astype(str).str.strip()

# Replace values
df_census_income['Income'] = df_census_income['Income'].replace({
    '>50K': '1',
    '<=50K': '0'
})

# Rename column
df_census_income.rename(columns={"Income": "Income Greater than 50k"}, inplace=True)

# Verify the result
print(df_census_income['Income Greater than 50k'].value_counts())


Income Greater than 50k
0    24699
1     7841
Name: count, dtype: int64


In [15]:
# Review values of Native Country column
df_census_income['Native Country'].value_counts()

Unnamed: 0_level_0,count
Native Country,Unnamed: 1_level_1
United-States,29150
Mexico,643
?,583
Philippines,197
Germany,137
Canada,121
Puerto-Rico,114
El-Salvador,106
India,100
Cuba,95


In [16]:
# Clean up any whitespace from the column
df_census_income['Native Country'] = df_census_income['Native Country'].str.strip()

# Replace all non-'United-States' values with 'Other'
df_census_income['Native Country'] = df_census_income['Native Country'].apply(
    lambda x: 'Other' if x != 'United-States' else x
)

df_census_income['Native Country'].value_counts()

Unnamed: 0_level_0,count
Native Country,Unnamed: 1_level_1
United-States,29150
Other,3390


In [17]:
# Review dataframe
df_census_income.head()

Unnamed: 0,Age,Workclass,Final Weight,Education Rank,Marital Status,Occupation,Relationship,Race,Gender,Hours per Week,Native Country,Income Greater than 50k
0,39,state-gov,77516,13,Never-married,Adm-clerical,Not-in-family,White,Male,40,United-States,0
1,50,self-emp-not-inc,83311,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,13,United-States,0
2,38,private,215646,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,40,United-States,0
3,53,private,234721,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,40,United-States,0
4,28,private,338409,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,40,Other,0


In [18]:
# Create a Spark session
spark = SparkSession.builder \
    .appName("CensusIncomeAnalysis") \
    .getOrCreate()

In [19]:
# Convert pandas DataFrame to PySpark DataFrame
spark_df = spark.createDataFrame(df_census_income)

In [20]:
# Extract Income column as a separate DataFrame
income_df = spark_df.select("Income Greater than 50k")

# Show the first few rows
income_df.show()

+-----------------------+
|Income Greater than 50k|
+-----------------------+
|                      0|
|                      0|
|                      0|
|                      0|
|                      0|
|                      0|
|                      0|
|                      1|
|                      1|
|                      1|
|                      1|
|                      1|
|                      0|
|                      0|
|                      1|
|                      0|
|                      0|
|                      0|
|                      0|
|                      1|
+-----------------------+
only showing top 20 rows



In [21]:
# Drop the Income column from the main DataFrame
features_df = spark_df.drop("Income Greater than 50k")

# Show the first few rows
features_df.show()

+---+----------------+------------+--------------+--------------------+-----------------+--------------+-------------------+-------+--------------+--------------+
|Age|       Workclass|Final Weight|Education Rank|      Marital Status|       Occupation|  Relationship|               Race| Gender|Hours per Week|Native Country|
+---+----------------+------------+--------------+--------------------+-----------------+--------------+-------------------+-------+--------------+--------------+
| 39|       state-gov|       77516|            13|       Never-married|     Adm-clerical| Not-in-family|              White|   Male|            40| United-States|
| 50|self-emp-not-inc|       83311|            13|  Married-civ-spouse|  Exec-managerial|       Husband|              White|   Male|            13| United-States|
| 38|         private|      215646|             9|            Divorced|Handlers-cleaners| Not-in-family|              White|   Male|            40| United-States|
| 53|         private|

In [22]:
# Convert features_df and income_df from PySpark to pandas
features_pd = features_df.toPandas()
income_pd = income_df.toPandas()

income_pd.head()

Unnamed: 0,Income Greater than 50k
0,0
1,0
2,0
3,0
4,0


In [23]:
# Encode the categorical variables using get_dummies
features_pd = pd.get_dummies(features_pd)
features_pd = features_pd.astype(int)
features_pd.head(20)

Unnamed: 0,Age,Final Weight,Education Rank,Hours per Week,Workclass_Unknown,Workclass_federal-gov,Workclass_local-gov,Workclass_private,Workclass_self-emp-inc,Workclass_self-emp-not-inc,...,Relationship_ Wife,Race_ Amer-Indian-Eskimo,Race_ Asian-Pac-Islander,Race_ Black,Race_ Other,Race_ White,Gender_ Female,Gender_ Male,Native Country_Other,Native Country_United-States
0,39,77516,13,40,0,0,0,0,0,0,...,0,0,0,0,0,1,0,1,0,1
1,50,83311,13,13,0,0,0,0,0,1,...,0,0,0,0,0,1,0,1,0,1
2,38,215646,9,40,0,0,0,1,0,0,...,0,0,0,0,0,1,0,1,0,1
3,53,234721,7,40,0,0,0,1,0,0,...,0,0,0,1,0,0,0,1,0,1
4,28,338409,13,40,0,0,0,1,0,0,...,1,0,0,1,0,0,1,0,1,0
5,37,284582,14,40,0,0,0,1,0,0,...,1,0,0,0,0,1,1,0,0,1
6,49,160187,5,16,0,0,0,1,0,0,...,0,0,0,1,0,0,1,0,1,0
7,52,209642,9,45,0,0,0,0,0,1,...,0,0,0,0,0,1,0,1,0,1
8,31,45781,14,50,0,0,0,1,0,0,...,0,0,0,0,0,1,1,0,0,1
9,42,159449,13,40,0,0,0,1,0,0,...,0,0,0,0,0,1,0,1,0,1


In [24]:
# Splitting into Train and Test sets
X_train, X_test, y_train, y_test = train_test_split(features_pd, income_pd, random_state=24)

In [25]:
# Create the StandardScaler instance
scaler = StandardScaler()
# Fit the Standard Scaler with the training data
X_scaler = scaler.fit(X_train)
# Scale the training data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

# Logistical Regression Model Test 1

In [26]:
# Logistical Regression Model Test 1

In [27]:
# Logistical Regression Random State 1
classifier = LogisticRegression(solver='lbfgs', random_state=1)
classifier

In [28]:
# Train the data
classifier.fit(X_train_scaled, y_train)

  y = column_or_1d(y, warn=True)


In [29]:
# Predict outcomes for test data set
predictions = classifier.predict(X_test_scaled)

In [30]:
# Create and save the confusion matrix for the data
conf_matrix = confusion_matrix(y_test, predictions)

# Print the confusion matrix for the training data
print("Confusion Matrix:")
print(conf_matrix)

Confusion Matrix:
[[5686  444]
 [ 879 1126]]


In [31]:
# Print the classification report for the model
# Create and save the testing classification report
testing_report = classification_report(y_test, predictions)

# Print the testing classification report
print(testing_report)


              precision    recall  f1-score   support

           0       0.87      0.93      0.90      6130
           1       0.72      0.56      0.63      2005

    accuracy                           0.84      8135
   macro avg       0.79      0.74      0.76      8135
weighted avg       0.83      0.84      0.83      8135



## Results

Our first pass at a logistic regression model demonstrated an overall accuracy of 84%, with the class of people making under 50k achieving high precision and recall, while class of people making 50k or more showed lower precision and recall. The confusion matrix also reinforces this result, showing that the model performed better at predicting class 0 compared to class 1.

# Logistical Regression Model Test 2

In [32]:
# Logistical Regression Random State 2
classifier = LogisticRegression(solver='lbfgs', random_state=2)
classifier

In [33]:
# Train the data
classifier.fit(X_train_scaled, y_train)

  y = column_or_1d(y, warn=True)


In [34]:
# Create and save the confusion matrix for the data
conf_matrix = confusion_matrix(y_test, predictions)

# Print the confusion matrix for the training data
print("Confusion Matrix:")
print(conf_matrix)

Confusion Matrix:
[[5686  444]
 [ 879 1126]]


In [35]:
# Print the classification report for the model
# Create and save the testing classification report
testing_report = classification_report(y_test, predictions)

# Print the testing classification report
print(testing_report)

              precision    recall  f1-score   support

           0       0.87      0.93      0.90      6130
           1       0.72      0.56      0.63      2005

    accuracy                           0.84      8135
   macro avg       0.79      0.74      0.76      8135
weighted avg       0.83      0.84      0.83      8135



## Results
The second logistic regression model was trained with a random state set to 2 instead of 1 while all other parameters remained unchanged. This adjustment did not significantly impact the testing results as the model maintained an overall accuracy of 84%, indicating that the random state variation had minimal effect on model performance.

# Logistical Regression Model Test 3

In [36]:
# Logistical Regression Random State 7
classifier = LogisticRegression(solver='lbfgs', random_state=7)
classifier

In [37]:
# Train the data
classifier.fit(X_train_scaled, y_train)

  y = column_or_1d(y, warn=True)


In [38]:
# Create and save the confusion matrix for the data
conf_matrix = confusion_matrix(y_test, predictions)

# Print the confusion matrix for the training data
print("Confusion Matrix:")
print(conf_matrix)

Confusion Matrix:
[[5686  444]
 [ 879 1126]]


In [39]:
# Print the classification report for the model
# Create and save the testing classification report
testing_report = classification_report(y_test, predictions)

# Print the testing classification report
print(testing_report)

              precision    recall  f1-score   support

           0       0.87      0.93      0.90      6130
           1       0.72      0.56      0.63      2005

    accuracy                           0.84      8135
   macro avg       0.79      0.74      0.76      8135
weighted avg       0.83      0.84      0.83      8135



## Results

In the third logistic regression model, the random state was set to 7, but all other parameters remained consistent. The testing results showed no noticeable change, as the model still achieved an overall accuracy of 84%, indicating that this change in random state did not influence the performance of the model.

# KNN MODEL Test 1

In [40]:
# KNN Model neighbors 6
model = KNeighborsClassifier(n_neighbors=6, )

In [41]:
# Train the model
model.fit(X_train_scaled, y_train)

  return self._fit(X, y)


In [42]:
# Create predictions
y_pred = model.predict(X_test_scaled)

# Review the predictions
y_pred

array(['0', '0', '1', ..., '0', '0', '1'], dtype=object)

In [43]:
# Print confusion matrix
confusion_matrix(y_pred,y_test)

array([[5709, 1053],
       [ 421,  952]])

In [44]:
# Print classification report
print(classification_report(y_pred,y_test))

              precision    recall  f1-score   support

           0       0.93      0.84      0.89      6762
           1       0.47      0.69      0.56      1373

    accuracy                           0.82      8135
   macro avg       0.70      0.77      0.72      8135
weighted avg       0.85      0.82      0.83      8135



 ## Results

 We used the K-Nearest Neighbors (KNN) model to classify and set the parameter of n_neighbors to 6. The results show an overall accuracy of 78%, with high precision and recall for the people making under 50k, but significantly lower precision and recall for for the people making over 50k, indicating better performance in identifying the people making under 50k. The confusion matrix highlights this imbalance with more misclassifications occurring for our class 1 predictions

# KNN MODEL Test 2

In [45]:
# KNN Model neighbors 8
model = KNeighborsClassifier(n_neighbors=8, )

In [46]:
# Train the model
model.fit(X_train_scaled, y_train)

  return self._fit(X, y)


In [47]:
# Create predictions
y_pred = model.predict(X_test_scaled)

# Review the predictions
y_pred

array(['0', '0', '1', ..., '0', '0', '1'], dtype=object)

In [48]:
# Print confusion matrix
confusion_matrix(y_pred,y_test)

array([[5676,  982],
       [ 454, 1023]])

In [49]:
# Print classification report
print(classification_report(y_pred,y_test))

              precision    recall  f1-score   support

           0       0.93      0.85      0.89      6658
           1       0.51      0.69      0.59      1477

    accuracy                           0.82      8135
   macro avg       0.72      0.77      0.74      8135
weighted avg       0.85      0.82      0.83      8135



## Results


In this next model, we increased the number of neighbors  from 6 to 8. This change slightly improved the recall and F1-score for class 1 predictions while maintaining the same accuracy of 78%, suggesting that increasing the number of neighbors helped slightly reduce misclassifications for the minority class without affecting overall performance.

# KNN MODEL Test 3

In [50]:
# KNN Model neighbors 3
model = KNeighborsClassifier(n_neighbors=3, )

In [51]:
# Train the model
model.fit(X_train_scaled, y_train)

  return self._fit(X, y)


In [52]:
# Create predictions
y_pred = model.predict(X_test_scaled)

# Review the predictions
y_pred

array(['0', '0', '1', ..., '0', '0', '1'], dtype=object)

In [53]:
# Print confusion matrix
confusion_matrix(y_pred,y_test)

array([[5445,  900],
       [ 685, 1105]])

In [54]:
# Print classification report
print(classification_report(y_pred,y_test))

              precision    recall  f1-score   support

           0       0.89      0.86      0.87      6345
           1       0.55      0.62      0.58      1790

    accuracy                           0.81      8135
   macro avg       0.72      0.74      0.73      8135
weighted avg       0.81      0.81      0.81      8135



## Results


In the final KNN model test, we reduced the number of neighbors to 3. This resulted the same overall accuracy of 78, and class 1 predictions showed slight improvements in precision while recall dropped to 0.62. This may indicate a bit of a tradeoff between precision and recall for the positive class.


# Random Forest Attempt 1



In [55]:
# Random Forest Model- Estimators 1500/Random State- 68
rf_model = RandomForestClassifier(n_estimators=1500, random_state=68)

In [56]:
# Fitting the model
rf_model = rf_model.fit(X_train_scaled, y_train)

  return fit_method(estimator, *args, **kwargs)


In [57]:
# Making predictions using the testing data
predictions = rf_model.predict(X_test_scaled)

In [58]:
# Calculating the confusion matrix
cm = confusion_matrix(y_test, predictions)
cm_df = pd.DataFrame(
    cm, index=["Actual 0", "Actual 1"], columns=["Predicted 0", "Predicted 1"]
)

# Calculating the accuracy score
acc_score = accuracy_score(y_test, predictions)

In [59]:
# Displaying results
print("Confusion Matrix")
display(cm_df)
print(f"Accuracy Score : {acc_score}")
print("Classification Report")
print(classification_report(y_test, predictions))

Confusion Matrix


Unnamed: 0,Predicted 0,Predicted 1
Actual 0,5607,523
Actual 1,860,1145


Accuracy Score : 0.8299938537185003
Classification Report
              precision    recall  f1-score   support

           0       0.87      0.91      0.89      6130
           1       0.69      0.57      0.62      2005

    accuracy                           0.83      8135
   macro avg       0.78      0.74      0.76      8135
weighted avg       0.82      0.83      0.82      8135



## Results

In this Random Forest model, we wanted to try classifying the test data using 1500 estimators and a random state of 68. The model achieved one of our highest overall accuracy points of 83%, with high precision and recall for the group making under 50k, while the people making over 50k had both lower precision and recall. The confusion matrix also indicates that the model performs better yet again at predicting 0's compared to 1's.

# Random Forest Attempt 2


In [60]:
# Random Forest Model- Estimators 2000/Random State- 68
rf_model = RandomForestClassifier(n_estimators=2000, random_state=68)

In [61]:
# Fitting the model
rf_model = rf_model.fit(X_train_scaled, y_train)

  return fit_method(estimator, *args, **kwargs)


In [62]:
# Making predictions using the testing data
predictions = rf_model.predict(X_test_scaled)

In [63]:
# Calculating the confusion matrix
cm = confusion_matrix(y_test, predictions)
cm_df = pd.DataFrame(
    cm, index=["Actual 0", "Actual 1"], columns=["Predicted 0", "Predicted 1"]
)

# Calculating the accuracy score
acc_score = accuracy_score(y_test, predictions)

In [64]:
# Displaying results
print("Confusion Matrix")
display(cm_df)
print(f"Accuracy Score : {acc_score}")
print("Classification Report")
print(classification_report(y_test, predictions))

Confusion Matrix


Unnamed: 0,Predicted 0,Predicted 1
Actual 0,5599,531
Actual 1,856,1149


Accuracy Score : 0.8295021511985249
Classification Report
              precision    recall  f1-score   support

           0       0.87      0.91      0.89      6130
           1       0.68      0.57      0.62      2005

    accuracy                           0.83      8135
   macro avg       0.78      0.74      0.76      8135
weighted avg       0.82      0.83      0.82      8135



## Results


In this attempt, the number of estimators was increased to 2000 while the random state remained the same at 68. This change did not significantly affect the testing results as the model still achieved an overall accuracy of 83% with similar precision and recall for both classes, suggesting that increasing the number of estimators did not really do much to improve performance.

# Random Forest Attempt 3

In [65]:
# Randome Forest Model Test 3
rf_model = RandomForestClassifier(
    n_estimators=2000,
    max_depth=15,
    min_samples_split=10,
    min_samples_leaf=5,
    random_state=68
)

In [66]:
# Fitting the model
rf_model = rf_model.fit(X_train_scaled, y_train)

  return fit_method(estimator, *args, **kwargs)


In [67]:
# Making predictions using the testing data
predictions = rf_model.predict(X_test_scaled)

In [68]:
# Calculating the confusion matrix
cm = confusion_matrix(y_test, predictions)
cm_df = pd.DataFrame(
    cm, index=["Actual 0", "Actual 1"], columns=["Predicted 0", "Predicted 1"]
)

# Calculating the accuracy score
acc_score = accuracy_score(y_test, predictions)

In [69]:
# Displaying results
print("Confusion Matrix")
display(cm_df)
print(f"Accuracy Score : {acc_score}")
print("Classification Report")
print(classification_report(y_test, predictions))

Confusion Matrix


Unnamed: 0,Predicted 0,Predicted 1
Actual 0,5769,361
Actual 1,933,1072


Accuracy Score : 0.8409342347879533
Classification Report
              precision    recall  f1-score   support

           0       0.86      0.94      0.90      6130
           1       0.75      0.53      0.62      2005

    accuracy                           0.84      8135
   macro avg       0.80      0.74      0.76      8135
weighted avg       0.83      0.84      0.83      8135



## Results

In our final attempt, we added in some hyperparameters. We set the maximum depth to 15, minimum samples split to 10, and minimum samples leaf to 5. We kept the number of estimators at 2000 and the random state at 68 to get a better view at what our changes did. These changes resulted in a slight improvement in accuracy to 84% along with better precision and recall for the class of folks making 50k or more. We think the hyperparameters ultimately helped with better handling of the positive class. The confusion matrix also shows fewer misclassifications for class 1 compared to previous attempts.