In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib as plt

In [None]:
# Load the census-income training data.
# Column names are passed explicitly through `names`, fields are split on a
# comma followed by one whitespace character, and "?" entries are read as NaN.
df = pd.read_csv(
    "census-income.data.csv",
    index_col=None,
    names=[
        'age', 'workclass', 'fnlwgt', 'education', 'education_num',
        'marital_status', 'occupation', 'relationship', 'race', 'sex',
        'capital_gain', 'capital_loss', "hours_per_week", "native_country",
        'income',
    ],
    # Raw string: "\s" inside a plain string literal is an invalid escape
    # sequence and triggers a warning on recent Python versions.
    sep=r',\s',
    na_values=["?"],
    # A regular-expression separator requires the python parsing engine.
    engine="python",
)

In [None]:
# Preview the first 20 rows of the freshly loaded training frame
df.head(n=20)

In [None]:
# Display the label column on its own
df.loc[:, 'income']

In [None]:
# Count the missing entries in every column before any rows are dropped or encoded
missing_values = df.isna().sum()
print("missing_values:", missing_values, sep="\n")

In [None]:
# Discard every row that has at least one missing value.
# Note: `df` is rebound here, so the unfiltered frame is no longer reachable
# under this name after the cell runs.
df = df.dropna(axis=0, how="any")

In [None]:
# Re-count the missing entries per column; this runs after the incomplete rows
# were dropped and before the categorical columns are encoded
missing_values = df.isna().sum()
print("missing_values:", missing_values, sep="\n")

In [None]:
# Encode the categorical columns as integer codes.
from sklearn.preprocessing import LabelEncoder

# Columns to encode, in processing order.
# NOTE(review): 'age' is in this list too, so each age is replaced by the
# position of that value among the sorted distinct ages -- confirm that this
# is intended for a numeric column.
columns_to_encode = [
    'workclass', 'education', 'marital_status', 'occupation',
    'relationship', 'race', 'sex', 'native_country', 'age',
]

# Keep one fitted encoder per column. Re-fitting a single shared encoder
# overwrites its learned classes on every call, so the mapping for all but the
# last column would be lost and could neither be re-applied to other data nor
# inverted back to the original labels.
label_encoders = {}
for column in columns_to_encode:
    lab_enc = LabelEncoder()
    df[column] = lab_enc.fit_transform(df[column])
    label_encoders[column] = lab_enc

# `lab_enc` stays bound to the encoder fitted last (on 'age'), which matches
# the state this cell left behind previously.


In [None]:
# Class distribution of the raw label (counts below were noted from a run).
print(df['income'].value_counts())
#  # <=50K    24720
#  # >50K      7841

# Encoding for the label: '<=50K' becomes 0, '>50K' becomes 1.
label_info = {'<=50K': 0, '>50K': 1}

# Series.map silently turns every value that is not a key of the dict into
# NaN. That also happens when this cell is run a second time on the already
# encoded column, so stop with a clear error instead of corrupting the label.
unmapped_labels = list(df.loc[~df['income'].isin(list(label_info)), 'income'].unique())
if unmapped_labels:
    raise ValueError(
        f"income contains values that cannot be encoded: {unmapped_labels}"
    )

df['income'] = df['income'].map(label_info)
# # source:https://stackoverflow.com/questions/65716571/encoding-column-pandas-using-if-condition

# Class distribution after encoding.
print("After encoding", df['income'].value_counts())

In [None]:
### Imputing
# Reload the raw training data into a second frame, `df1`, using the same
# parsing options as the first load: explicit column names, a comma followed
# by one whitespace character as separator, and "?" read as NaN.
df1 = pd.read_csv(
    "census-income.data.csv",
    index_col=None,
    names=[
        'age', 'workclass', 'fnlwgt', 'education', 'education_num',
        'marital_status', 'occupation', 'relationship', 'race', 'sex',
        'capital_gain', 'capital_loss', "hours_per_week", "native_country",
        'income',
    ],
    # Raw string: "\s" inside a plain string literal is an invalid escape
    # sequence and triggers a warning on recent Python versions.
    sep=r',\s',
    na_values=["?"],
    # A regular-expression separator requires the python parsing engine.
    engine="python",
)

df1.head(30)

In [None]:
# Pick the model inputs and split df1 into a feature frame and a label series.
selected_features = [
    'age',
    'workclass',
    'education_num',
    'marital_status',
    'occupation',
    'relationship',
    'race',
    'sex',
    "hours_per_week",
    "native_country",
]

x = df1.loc[:, selected_features]  # features
y = df1['income']                  # label

# Per-column count of missing entries in the full frame (not only in x).
missing_values = df1.isna().sum()
print("missing_values:", missing_values, sep="\n")

In [None]:
# Fill every NaN with the placeholder string "missing", then turn every
# "missing" entry back into NaN.
# NOTE(review): the second call undoes the first, so missing entries end up as
# NaN again -- confirm whether this round trip is still needed.
df1.fillna(value="missing", inplace=True)
df1.replace(to_replace='missing', value=np.nan, inplace=True)
df1.head(n=30)

In [None]:
# Split df1 column-wise into a categorical part and a numeric part.
category = [
    'workclass', 'education', 'marital_status', 'occupation',
    'relationship', 'race', 'sex', 'native_country',
]
# NOTE(review): 'income' is listed with the numeric columns although df1 is
# loaded straight from the CSV and is not encoded in this section; 'fnlwgt'
# appears in neither list -- confirm both are intended.
numericall = [
    'age', 'education_num', 'capital_gain', 'capital_loss',
    'hours_per_week', 'income',
]

df1_cat = df1.loc[:, category]
df1_num = df1.loc[:, numericall]

# Per-column count of missing entries among the categorical columns.
missing_values = df1_cat.isna().sum()
print("missing_values:", missing_values, sep="\n")

In [None]:
from fancyimpute import KNN

# One-hot encode the categorical columns so the imputer receives numeric input.
# NOTE(review): pd.get_dummies is called with its default dummy_na=False, so a
# NaN category yields a row of zeros in that column group rather than NaN;
# the encoded frame may therefore contain nothing for the imputer to fill --
# confirm this is the intended behaviour.
df1_encoded = pd.get_dummies(data=df1_cat)

# Run fancyimpute's KNN with its default settings and wrap the returned
# values in a frame that carries the encoded column names.
imputer = KNN()
imputed_values = imputer.fit_transform(df1_encoded)
df1_imputed = pd.DataFrame(data=imputed_values, columns=df1_encoded.columns)

print("Imputed DataFrame:", df1_imputed, sep="\n")

In [None]:
features = [
    'age', 'workclass', 'fnlwgt', 'education', 'education_num',
    'marital_status', 'occupation', 'relationship', 'race', 'sex',
    'capital_gain', 'capital_loss', "hours_per_week", "native_country",
    'income',
]

# Split df1 into rows without any missing value and rows with at least one.
# A single mask and its negation are used so that the two frames are
# guaranteed to partition df1. The previous `notnull().any(axis=1)` test kept
# every row that had at least one non-null value, i.e. it also kept the rows
# that contain missing values.
has_missing = df1[features].isnull().any(axis=1)
df1_nmiss = df1[~has_missing]
df1_miss = df1[has_missing]

In [None]:
# Preview the first 15 rows of the df1_nmiss subset
df1_nmiss.head(n=15)

In [None]:
# Preview the first 15 rows of the df1_miss subset
df1_miss.head(n=15)

In [None]:
# Set up scikit-learn's KNN imputer for filling in missing values.
# Reference: https://scikit-learn.org/stable/modules/impute.html

from sklearn.impute import KNNImputer

# 15 neighbours, all weighted equally; distances use the NaN-aware Euclidean
# metric.
imputes = KNNImputer(
    metric='nan_euclidean',
    weights="uniform",
    n_neighbors=15,
)



In [None]:
# Fit the KNN imputer.
# KNNImputer works on numeric input only. df1 still carries the raw string
# columns at this point, and passing the whole frame fails when those strings
# are converted to float, so the fit is restricted to the numeric columns.
# NOTE(review): the categorical columns must be numerically encoded first if
# they are meant to be imputed by this estimator as well -- confirm the
# intended input.
imputes.fit(df1.select_dtypes(include="number"))

In [None]:

# Load the census-income test data with the same parsing options as the
# training data: explicit column names, a comma followed by one whitespace
# character as separator (so values carry no leading space), and "?" read as
# NaN. Parsing both files identically keeps their values comparable.
test = pd.read_csv(
    "census-income.test.csv",
    index_col=None,
    names=[
        'age', 'workclass', 'fnlwgt', 'education', 'education_num',
        'marital_status', 'occupation', 'relationship', 'race', 'sex',
        'capital_gain', 'capital_loss', "hours_per_week", "native_country",
        'income',
    ],
    sep=r',\s',
    na_values=["?"],
    # A regular-expression separator requires the python parsing engine.
    engine="python",
)

In [None]:
# Preview the first 30 rows of the test frame
test.head(n=30)

In [None]:
# Convert "?" placeholders to NaN.
# The pattern matches a cell that consists of an optional "?" surrounded by
# optional whitespace, so "?", " ?" and blank cells (including the single
# space that was handled before) all become NaN. The earlier comparison
# against ' ' never matched the "?" entries themselves.
test = test.replace(r'^\s*\??\s*$', np.nan, regex=True)

test.head(15)

In [None]:
# Count the missing entries in every column of the test frame
missing_values = test.isna().sum()
print("missing_values:", missing_values, sep="\n")

In [None]:
# List the distinct values found in the test set's age column
test['age'].unique()

In [None]:
# Drop every test row that has at least one missing value, then confirm that
# no missing entries remain
test = test.dropna(axis=0, how="any")
missing_values = test.isna().sum()
print("missing_values:", missing_values, sep="\n")

In [None]:
# Imputing missing values for the income dataset.
import pandas as pd
import numpy as np
from fancyimpute import KNN

# Read the CSV file; "?" entries are parsed as NaN and fields are split on a
# comma followed by one whitespace character.
df1 = pd.read_csv(
    "census-income.data.csv",
    index_col=None,
    names=[
        'age', 'workclass', 'fnlwgt', 'education', 'education_num',
        'marital_status', 'occupation', 'relationship', 'race',
        'sex', 'capital_gain', 'capital_loss', 'hours_per_week',
        'native_country', 'income',
    ],
    # Raw string: "\s" inside a plain string literal is an invalid escape
    # sequence and triggers a warning on recent Python versions.
    sep=r',\s',
    na_values=["?"],
    # A regular-expression separator requires the python parsing engine.
    engine="python",
)

# Replace NaNs with the placeholder string "missing".
df1 = df1.fillna("missing")

# One-hot encode the categorical columns.
# NOTE(review): because the NaNs were replaced just above, "missing" is
# encoded as an ordinary category and the encoded frame contains no NaN, so
# the imputer below has nothing to fill -- confirm the intended approach.
df1_encoded = pd.get_dummies(df1)

# Perform KNN imputation with fancyimpute's default settings.
imputer = KNN()
df1_imputed = pd.DataFrame(imputer.fit_transform(df1_encoded), columns=df1_encoded.columns)

print("Imputed DataFrame:")
print(df1_imputed)
