In [1]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')


In [2]:
data = pd.read_csv('adult.csv')

In [3]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 48842 entries, 0 to 48841
Data columns (total 15 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   age              48842 non-null  int64 
 1   workclass        48842 non-null  object
 2   fnlwgt           48842 non-null  int64 
 3   education        48842 non-null  object
 4   educational-num  48842 non-null  int64 
 5   marital-status   48842 non-null  object
 6   occupation       48842 non-null  object
 7   relationship     48842 non-null  object
 8   race             48842 non-null  object
 9   gender           48842 non-null  object
 10  capital-gain     48842 non-null  int64 
 11  capital-loss     48842 non-null  int64 
 12  hours-per-week   48842 non-null  int64 
 13  native-country   48842 non-null  object
 14  income           48842 non-null  object
dtypes: int64(6), object(9)
memory usage: 5.6+ MB


In [4]:
data.describe()

Unnamed: 0,age,fnlwgt,educational-num,capital-gain,capital-loss,hours-per-week
count,48842.0,48842.0,48842.0,48842.0,48842.0,48842.0
mean,38.643585,189664.1,10.078089,1079.067626,87.502314,40.422382
std,13.71051,105604.0,2.570973,7452.019058,403.004552,12.391444
min,17.0,12285.0,1.0,0.0,0.0,1.0
25%,28.0,117550.5,9.0,0.0,0.0,40.0
50%,37.0,178144.5,10.0,0.0,0.0,40.0
75%,48.0,237642.0,12.0,0.0,0.0,45.0
max,90.0,1490400.0,16.0,99999.0,4356.0,99.0


In [5]:
data.head()

Unnamed: 0,age,workclass,fnlwgt,education,educational-num,marital-status,occupation,relationship,race,gender,capital-gain,capital-loss,hours-per-week,native-country,income
0,25,Private,226802,11th,7,Never-married,Machine-op-inspct,Own-child,Black,Male,0,0,40,United-States,<=50K
1,38,Private,89814,HS-grad,9,Married-civ-spouse,Farming-fishing,Husband,White,Male,0,0,50,United-States,<=50K
2,28,Local-gov,336951,Assoc-acdm,12,Married-civ-spouse,Protective-serv,Husband,White,Male,0,0,40,United-States,>50K
3,44,Private,160323,Some-college,10,Married-civ-spouse,Machine-op-inspct,Husband,Black,Male,7688,0,40,United-States,>50K
4,18,?,103497,Some-college,10,Never-married,?,Own-child,White,Female,0,0,30,United-States,<=50K


In [6]:
data.tail()

Unnamed: 0,age,workclass,fnlwgt,education,educational-num,marital-status,occupation,relationship,race,gender,capital-gain,capital-loss,hours-per-week,native-country,income
48837,27,Private,257302,Assoc-acdm,12,Married-civ-spouse,Tech-support,Wife,White,Female,0,0,38,United-States,<=50K
48838,40,Private,154374,HS-grad,9,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,0,0,40,United-States,>50K
48839,58,Private,151910,HS-grad,9,Widowed,Adm-clerical,Unmarried,White,Female,0,0,40,United-States,<=50K
48840,22,Private,201490,HS-grad,9,Never-married,Adm-clerical,Own-child,White,Male,0,0,20,United-States,<=50K
48841,52,Self-emp-inc,287927,HS-grad,9,Married-civ-spouse,Exec-managerial,Wife,White,Female,15024,0,40,United-States,>50K


In [7]:
data.isnull().sum() #this shows that there are no null values in the data

age                0
workclass          0
fnlwgt             0
education          0
educational-num    0
marital-status     0
occupation         0
relationship       0
race               0
gender             0
capital-gain       0
capital-loss       0
hours-per-week     0
native-country     0
income             0
dtype: int64

In [8]:
data['occupation'].unique()

array(['Machine-op-inspct', 'Farming-fishing', 'Protective-serv', '?',
       'Other-service', 'Prof-specialty', 'Craft-repair', 'Adm-clerical',
       'Exec-managerial', 'Tech-support', 'Sales', 'Priv-house-serv',
       'Transport-moving', 'Handlers-cleaners', 'Armed-Forces'],
      dtype=object)

In [9]:
data['occupation'].value_counts()

Prof-specialty       6172
Craft-repair         6112
Exec-managerial      6086
Adm-clerical         5611
Sales                5504
Other-service        4923
Machine-op-inspct    3022
?                    2809
Transport-moving     2355
Handlers-cleaners    2072
Farming-fishing      1490
Tech-support         1446
Protective-serv       983
Priv-house-serv       242
Armed-Forces           15
Name: occupation, dtype: int64

In [10]:
# Removing duplicate rows
data.drop_duplicates(inplace=True)


In [11]:
data.replace('?', np.nan, inplace=True)

# Iterate through columns
for column in data.columns:
    if data[column].dtype == 'object':
        # For non-numeric columns, remove rows with missing values
        data = data.dropna(subset=[column])
    else:
        # For numeric columns, fill missing values with the mean
        data[column].fillna(data[column].mean(), inplace=True)
#since our data had a lot of cells labelled as '?' we replace that too since its undefined data, data completeness

In [12]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler


In [13]:
#Here we are Encoding the Categorical Variables (One-Hot Encoding)
data = pd.get_dummies(data, columns=['workclass', 'education', 'marital-status', 'occupation', 'relationship', 'race', 'gender', 'native-country'])


In [14]:
# Normalization (MinMax Scaling) for 'Age' and 'Hours per Week'
scaler = MinMaxScaler()
data[['age', 'hours-per-week']] = scaler.fit_transform(data[['age', 'hours-per-week']])


In [15]:
# Standardization (Z-score Scaling) for 'Capital Gain' and 'Capital Loss'
scaler = StandardScaler()
data[['capital-gain', 'capital-loss']] = scaler.fit_transform(data[['capital-gain', 'capital-loss']])

# Now, our data is transformed, normalized, and standardized


In [16]:
columns = data.columns

# Print all columns
for column in columns:
    print(column)

#data consitency

age
fnlwgt
educational-num
capital-gain
capital-loss
hours-per-week
income
workclass_Federal-gov
workclass_Local-gov
workclass_Private
workclass_Self-emp-inc
workclass_Self-emp-not-inc
workclass_State-gov
workclass_Without-pay
education_10th
education_11th
education_12th
education_1st-4th
education_5th-6th
education_7th-8th
education_9th
education_Assoc-acdm
education_Assoc-voc
education_Bachelors
education_Doctorate
education_HS-grad
education_Masters
education_Preschool
education_Prof-school
education_Some-college
marital-status_Divorced
marital-status_Married-AF-spouse
marital-status_Married-civ-spouse
marital-status_Married-spouse-absent
marital-status_Never-married
marital-status_Separated
marital-status_Widowed
occupation_Adm-clerical
occupation_Armed-Forces
occupation_Craft-repair
occupation_Exec-managerial
occupation_Farming-fishing
occupation_Handlers-cleaners
occupation_Machine-op-inspct
occupation_Other-service
occupation_Priv-house-serv
occupation_Prof-specialty
occupation_

In [17]:
for column in columns:
    if "race" in column:
        print(column)

race_Amer-Indian-Eskimo
race_Asian-Pac-Islander
race_Black
race_Other
race_White


In [18]:
data.head()

Unnamed: 0,age,fnlwgt,educational-num,capital-gain,capital-loss,hours-per-week,income,workclass_Federal-gov,workclass_Local-gov,workclass_Private,...,native-country_Portugal,native-country_Puerto-Rico,native-country_Scotland,native-country_South,native-country_Taiwan,native-country_Thailand,native-country_Trinadad&Tobago,native-country_United-States,native-country_Vietnam,native-country_Yugoslavia
0,0.109589,226802,7,-0.146811,-0.218899,0.397959,<=50K,0,0,1,...,0,0,0,0,0,0,0,1,0,0
1,0.287671,89814,9,-0.146811,-0.218899,0.5,<=50K,0,0,1,...,0,0,0,0,0,0,0,1,0,0
2,0.150685,336951,12,-0.146811,-0.218899,0.397959,>50K,0,1,0,...,0,0,0,0,0,0,0,1,0,0
3,0.369863,160323,10,0.876868,-0.218899,0.397959,>50K,0,0,1,...,0,0,0,0,0,0,0,1,0,0
5,0.232877,198693,6,-0.146811,-0.218899,0.295918,<=50K,0,0,1,...,0,0,0,0,0,0,0,1,0,0


In [19]:
# Checking for unrealistic values in a column, data accuracy
data = data[data['age'] >= 0]


In [20]:
data = data[data['capital-loss'] >= 0] #data accuracy


In [21]:
data = data[data['hours-per-week'] >= 0] #data accuracy


### Dimension reduction
Remove all columns where country is not United States and remove married status.
FIT and Trasnform using SKlearn


In [22]:
from sklearn.decomposition import PCA

# Select the features you want to include in PCA
# Remove all native countries except United States

features_for_pca = ["age", "capital-gain", "capital-loss", "hours-per-week", "workclass_Federal-gov", "workclass_Local-gov",
                    "workclass_Private", "workclass_Self-emp-inc", "workclass_Self-emp-not-inc", "workclass_State-gov", "workclass_Without-pay",
                    "education_10th", "education_11th", "education_12th", "education_1st-4th", "education_5th-6th", "education_7th-8th", "education_9th",
                    "education_Assoc-acdm", "education_Assoc-voc", "education_Bachelors", "education_Doctorate", "education_HS-grad", "education_Masters",
                    "education_Preschool", "education_Prof-school", "education_Some-college", "occupation_Adm-clerical", "occupation_Armed-Forces",
                    "occupation_Craft-repair", "occupation_Exec-managerial", "occupation_Farming-fishing", "occupation_Handlers-cleaners",
                    "occupation_Machine-op-inspct", "occupation_Other-service", "occupation_Priv-house-serv", "occupation_Prof-specialty",
                    "occupation_Protective-serv", "occupation_Sales", "occupation_Tech-support", "occupation_Transport-moving", "gender_Female",
                    "race_Amer-Indian-Eskimo",
                    "race_Asian-Pac-Islander",
                    "race_Black",
                    "race_Other",
                    "race_White",
                    "gender_Male","native-country_United-States"]



print(features_for_pca)

# Create a DataFrame with the selected features
pca_data = data[features_for_pca]

# Initialize PCA with the number of components you want to retain
#n_components = 84  # You can choose a different number
n_components = features_for_pca.__len__()
pca = PCA(n_components=n_components)

# Fit and transform the data
pca_result = pca.fit_transform(pca_data)


print(pca_result)

# Create a DataFrame with the PCA results
pca_df = pd.DataFrame(data=pca_result, columns=[f'{features_for_pca[i]}' for i in range(n_components)])


# Now 'pca_df' contains the reduced-dimensionality data after PCA
print(pca_df['age'])  # Print the first few rows of the PCA DataFrame

['age', 'capital-gain', 'capital-loss', 'hours-per-week', 'workclass_Federal-gov', 'workclass_Local-gov', 'workclass_Private', 'workclass_Self-emp-inc', 'workclass_Self-emp-not-inc', 'workclass_State-gov', 'workclass_Without-pay', 'education_10th', 'education_11th', 'education_12th', 'education_1st-4th', 'education_5th-6th', 'education_7th-8th', 'education_9th', 'education_Assoc-acdm', 'education_Assoc-voc', 'education_Bachelors', 'education_Doctorate', 'education_HS-grad', 'education_Masters', 'education_Preschool', 'education_Prof-school', 'education_Some-college', 'occupation_Adm-clerical', 'occupation_Armed-Forces', 'occupation_Craft-repair', 'occupation_Exec-managerial', 'occupation_Farming-fishing', 'occupation_Handlers-cleaners', 'occupation_Machine-op-inspct', 'occupation_Other-service', 'occupation_Priv-house-serv', 'occupation_Prof-specialty', 'occupation_Protective-serv', 'occupation_Sales', 'occupation_Tech-support', 'occupation_Transport-moving', 'gender_Female', 'race_Ame

In [23]:
pca_df.head()

Unnamed: 0,age,capital-gain,capital-loss,hours-per-week,workclass_Federal-gov,workclass_Local-gov,workclass_Private,workclass_Self-emp-inc,workclass_Self-emp-not-inc,workclass_State-gov,...,occupation_Tech-support,occupation_Transport-moving,gender_Female,race_Amer-Indian-Eskimo,race_Asian-Pac-Islander,race_Black,race_Other,race_White,gender_Male,native-country_United-States
0,-0.636717,0.99797,-0.33436,-0.18589,0.821504,-0.160372,0.046733,-0.269996,-0.159793,0.074109,...,-2.9e-05,-0.007739,-0.000447,-0.001312,-5.424911e-15,-9.957278000000001e-17,6.340638e-17,2.060853e-16,3.871218e-16,1.651547e-16
1,-0.013942,-0.263205,-0.327103,-0.216491,0.790468,-0.399502,0.052135,-0.093723,-0.112637,-0.152396,...,0.009106,-0.009024,0.002293,-0.001423,3.611411e-15,4.520915e-16,2.151249e-16,-1.324748e-15,-3.284952e-16,1.574775e-15
2,1.407947,0.134407,0.112831,-0.092244,-0.228435,-0.923784,-0.634545,-0.017052,0.36955,-0.021126,...,0.000823,0.000578,0.001892,-0.003664,-1.416422e-15,7.187407e-16,2.077063e-16,2.478027e-16,-5.3137160000000005e-17,6.095632e-17
3,0.184825,-0.510406,0.093973,0.543171,-0.101474,1.046079,-0.410031,-0.05536,0.263983,-0.204525,...,-0.001178,0.002624,-0.002037,0.00161,1.228659e-15,6.986948e-16,-1.304728e-15,-4.089404e-16,2.403314e-17,-1.106451e-16
4,0.123262,-0.187565,0.65898,-0.027964,-0.165276,0.312132,0.553066,-0.014785,-0.553129,-0.711639,...,0.008502,-0.000984,-0.000331,0.000174,2.786587e-15,-4.050452e-16,1.769467e-16,2.4545980000000002e-17,1.411564e-16,-2.6933240000000003e-17


In [24]:
# Save the dataset
pca_df.to_csv('ReadyForModelling.csv', index=False)