In [312]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')


In [313]:
# Load the raw dataset from 'adult.csv' (path relative to the working directory)
# into `data`, the frame every later cell works on.
data = pd.read_csv('adult.csv')

In [314]:
# Column names, dtypes, non-null counts and memory footprint of the raw frame.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 48842 entries, 0 to 48841
Data columns (total 15 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   age              48842 non-null  int64 
 1   workclass        48842 non-null  object
 2   fnlwgt           48842 non-null  int64 
 3   education        48842 non-null  object
 4   educational-num  48842 non-null  int64 
 5   marital-status   48842 non-null  object
 6   occupation       48842 non-null  object
 7   relationship     48842 non-null  object
 8   race             48842 non-null  object
 9   gender           48842 non-null  object
 10  capital-gain     48842 non-null  int64 
 11  capital-loss     48842 non-null  int64 
 12  hours-per-week   48842 non-null  int64 
 13  native-country   48842 non-null  object
 14  income           48842 non-null  object
dtypes: int64(6), object(9)
memory usage: 5.6+ MB


In [315]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
data.describe()

Unnamed: 0,age,fnlwgt,educational-num,capital-gain,capital-loss,hours-per-week
count,48842.0,48842.0,48842.0,48842.0,48842.0,48842.0
mean,38.643585,189664.1,10.078089,1079.067626,87.502314,40.422382
std,13.71051,105604.0,2.570973,7452.019058,403.004552,12.391444
min,17.0,12285.0,1.0,0.0,0.0,1.0
25%,28.0,117550.5,9.0,0.0,0.0,40.0
50%,37.0,178144.5,10.0,0.0,0.0,40.0
75%,48.0,237642.0,12.0,0.0,0.0,45.0
max,90.0,1490400.0,16.0,99999.0,4356.0,99.0


In [316]:
# Preview the first five rows of the raw data.
data.head()

Unnamed: 0,age,workclass,fnlwgt,education,educational-num,marital-status,occupation,relationship,race,gender,capital-gain,capital-loss,hours-per-week,native-country,income
0,25,Private,226802,11th,7,Never-married,Machine-op-inspct,Own-child,Black,Male,0,0,40,United-States,<=50K
1,38,Private,89814,HS-grad,9,Married-civ-spouse,Farming-fishing,Husband,White,Male,0,0,50,United-States,<=50K
2,28,Local-gov,336951,Assoc-acdm,12,Married-civ-spouse,Protective-serv,Husband,White,Male,0,0,40,United-States,>50K
3,44,Private,160323,Some-college,10,Married-civ-spouse,Machine-op-inspct,Husband,Black,Male,7688,0,40,United-States,>50K
4,18,?,103497,Some-college,10,Never-married,?,Own-child,White,Female,0,0,30,United-States,<=50K


In [317]:
# Preview the last five rows of the raw data.
data.tail()

Unnamed: 0,age,workclass,fnlwgt,education,educational-num,marital-status,occupation,relationship,race,gender,capital-gain,capital-loss,hours-per-week,native-country,income
48837,27,Private,257302,Assoc-acdm,12,Married-civ-spouse,Tech-support,Wife,White,Female,0,0,38,United-States,<=50K
48838,40,Private,154374,HS-grad,9,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,0,0,40,United-States,>50K
48839,58,Private,151910,HS-grad,9,Widowed,Adm-clerical,Unmarried,White,Female,0,0,40,United-States,<=50K
48840,22,Private,201490,HS-grad,9,Never-married,Adm-clerical,Own-child,White,Male,0,0,20,United-States,<=50K
48841,52,Self-emp-inc,287927,HS-grad,9,Married-civ-spouse,Exec-managerial,Wife,White,Female,15024,0,40,United-States,>50K


In [318]:
# Count NaN cells per column. All zeros only means there are no NaN values:
# this file marks unknown values with the string '?', which isnull() does not flag.
data.isnull().sum()

age                0
workclass          0
fnlwgt             0
education          0
educational-num    0
marital-status     0
occupation         0
relationship       0
race               0
gender             0
capital-gain       0
capital-loss       0
hours-per-week     0
native-country     0
income             0
dtype: int64

In [319]:
# Distinct occupation labels; the result includes the '?' placeholder.
data['occupation'].unique()

array(['Machine-op-inspct', 'Farming-fishing', 'Protective-serv', '?',
       'Other-service', 'Prof-specialty', 'Craft-repair', 'Adm-clerical',
       'Exec-managerial', 'Tech-support', 'Sales', 'Priv-house-serv',
       'Transport-moving', 'Handlers-cleaners', 'Armed-Forces'],
      dtype=object)

In [320]:
# Number of rows per occupation label, most frequent first ('?' is counted as a label).
data['occupation'].value_counts()

occupation
Prof-specialty       6172
Craft-repair         6112
Exec-managerial      6086
Adm-clerical         5611
Sales                5504
Other-service        4923
Machine-op-inspct    3022
?                    2809
Transport-moving     2355
Handlers-cleaners    2072
Farming-fishing      1490
Tech-support         1446
Protective-serv       983
Priv-house-serv       242
Armed-Forces           15
Name: count, dtype: int64

In [321]:
# Discard rows that exactly repeat an earlier row (the first occurrence is kept).
data = data.drop_duplicates()


In [322]:
# Data completeness: the file uses the literal string '?' for unknown values,
# so turn it into NaN first; otherwise pandas does not treat those cells as
# missing.
data = data.replace('?', np.nan)

# Split the columns by dtype: object columns hold the categorical text values,
# every other column is treated as numeric.
object_columns = [name for name in data.columns if data[name].dtype == 'object']
numeric_columns = [name for name in data.columns if name not in object_columns]

# Categorical columns: drop every row that is missing a value in any of them
# (a single pass instead of one dropna per column).
data = data.dropna(subset=object_columns)

# Numeric columns: fill gaps with the column mean, computed on the rows that
# survived the drop above. The result is assigned back on purpose:
# `data[col].fillna(..., inplace=True)` acts on a temporary column object and
# leaves `data` unchanged when pandas Copy-on-Write is active.
data = data.fillna(data[numeric_columns].mean())

In [323]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler


In [324]:
# One-hot encode the categorical predictors: every listed column is replaced by
# one indicator column per category, named '<column>_<category>'.
# 'income' is not listed, so it stays as a single text column.
columns_to_encode = [
    'workclass',
    'education',
    'marital-status',
    'occupation',
    'relationship',
    'race',
    'gender',
    'native-country',
]
data = pd.get_dummies(data, columns=columns_to_encode)


In [325]:
# Normalization (MinMax Scaling) for 'Age' and 'Hours per Week'
# scaler = MinMaxScaler()
# data[['age', 'hours-per-week']] = scaler.fit_transform(data[['age', 'hours-per-week']])


In [326]:
# Z-score standardisation of the two capital columns: the scaler is fitted on
# the rows currently in `data` and the columns are overwritten with the result.
scaler = StandardScaler()
capital_columns = ['capital-gain', 'capital-loss']
data[capital_columns] = scaler.fit_transform(data[capital_columns])

# Only these two columns are rescaled; 'age' and 'hours-per-week' keep their
# original units because the MinMax scaling step is commented out.


In [327]:
columns = data.columns

# Data consistency check: list every column name, one per line, to confirm the
# schema produced by the encoding step.
print(*columns, sep='\n')

age
fnlwgt
educational-num
capital-gain
capital-loss
hours-per-week
income
workclass_Federal-gov
workclass_Local-gov
workclass_Private
workclass_Self-emp-inc
workclass_Self-emp-not-inc
workclass_State-gov
workclass_Without-pay
education_10th
education_11th
education_12th
education_1st-4th
education_5th-6th
education_7th-8th
education_9th
education_Assoc-acdm
education_Assoc-voc
education_Bachelors
education_Doctorate
education_HS-grad
education_Masters
education_Preschool
education_Prof-school
education_Some-college
marital-status_Divorced
marital-status_Married-AF-spouse
marital-status_Married-civ-spouse
marital-status_Married-spouse-absent
marital-status_Never-married
marital-status_Separated
marital-status_Widowed
occupation_Adm-clerical
occupation_Armed-Forces
occupation_Craft-repair
occupation_Exec-managerial
occupation_Farming-fishing
occupation_Handlers-cleaners
occupation_Machine-op-inspct
occupation_Other-service
occupation_Priv-house-serv
occupation_Prof-specialty
occupation_

In [328]:
# Show only the columns whose name contains "race" (the race indicator columns).
for race_column in filter(lambda name: "race" in name, columns):
    print(race_column)

race_Amer-Indian-Eskimo
race_Asian-Pac-Islander
race_Black
race_Other
race_White


In [329]:
# Preview the frame after cleaning, one-hot encoding and capital-column scaling.
data.head()

Unnamed: 0,age,fnlwgt,educational-num,capital-gain,capital-loss,hours-per-week,income,workclass_Federal-gov,workclass_Local-gov,workclass_Private,...,native-country_Portugal,native-country_Puerto-Rico,native-country_Scotland,native-country_South,native-country_Taiwan,native-country_Thailand,native-country_Trinadad&Tobago,native-country_United-States,native-country_Vietnam,native-country_Yugoslavia
0,25,226802,7,-0.146811,-0.218899,40,<=50K,False,False,True,...,False,False,False,False,False,False,False,True,False,False
1,38,89814,9,-0.146811,-0.218899,50,<=50K,False,False,True,...,False,False,False,False,False,False,False,True,False,False
2,28,336951,12,-0.146811,-0.218899,40,>50K,False,True,False,...,False,False,False,False,False,False,False,True,False,False
3,44,160323,10,0.876868,-0.218899,40,>50K,False,False,True,...,False,False,False,False,False,False,False,True,False,False
5,34,198693,6,-0.146811,-0.218899,30,<=50K,False,False,True,...,False,False,False,False,False,False,False,True,False,False


In [330]:
# Data accuracy: keep only the rows whose age is non-negative.
age_is_valid = data['age'] >= 0
data = data[age_is_valid]


In [331]:
# data = data[data['capital-loss'] >= 0] #data accuracy


In [332]:
# Data accuracy: keep only the rows whose weekly working hours are non-negative.
hours_are_valid = data['hours-per-week'] >= 0
data = data[hours_are_valid]


In [333]:
# Drop 'fnlwgt' and 'educational-num'; no later cell in this notebook uses them.
data = data.drop(columns=['fnlwgt', 'educational-num'])

In [334]:
# Preview the frame after the range checks and the column drop.
data.head()


Unnamed: 0,age,capital-gain,capital-loss,hours-per-week,income,workclass_Federal-gov,workclass_Local-gov,workclass_Private,workclass_Self-emp-inc,workclass_Self-emp-not-inc,...,native-country_Portugal,native-country_Puerto-Rico,native-country_Scotland,native-country_South,native-country_Taiwan,native-country_Thailand,native-country_Trinadad&Tobago,native-country_United-States,native-country_Vietnam,native-country_Yugoslavia
0,25,-0.146811,-0.218899,40,<=50K,False,False,True,False,False,...,False,False,False,False,False,False,False,True,False,False
1,38,-0.146811,-0.218899,50,<=50K,False,False,True,False,False,...,False,False,False,False,False,False,False,True,False,False
2,28,-0.146811,-0.218899,40,>50K,False,True,False,False,False,...,False,False,False,False,False,False,False,True,False,False
3,44,0.876868,-0.218899,40,>50K,False,False,True,False,False,...,False,False,False,False,False,False,False,True,False,False
5,34,-0.146811,-0.218899,30,<=50K,False,False,True,False,False,...,False,False,False,False,False,False,False,True,False,False


In [335]:
# Save the cleaned, one-hot-encoded frame to CSV. index=False leaves the row
# index out of the file.
data.to_csv('ReadyForModelling_OG.csv', index=False)

### Dimension reduction
Of the native-country columns keep only United-States (drop every other country column), and drop the marital-status columns.
Then fit and transform the remaining features with scikit-learn's PCA.


In [336]:
from sklearn.decomposition import PCA

# Input columns for PCA, in the order they are handed to the model.
# Of the native-country indicators only United-States is listed, and no
# marital-status indicator is included. The list also contains no
# relationship_* column, no capital-gain / capital-loss and no income.
features_for_pca = [
    "age", "hours-per-week",
    "workclass_Federal-gov", "workclass_Local-gov", "workclass_Private",
    "workclass_Self-emp-inc", "workclass_Self-emp-not-inc",
    "workclass_State-gov", "workclass_Without-pay",
    "education_10th", "education_11th", "education_12th",
    "education_1st-4th", "education_5th-6th", "education_7th-8th",
    "education_9th", "education_Assoc-acdm", "education_Assoc-voc",
    "education_Bachelors", "education_Doctorate", "education_HS-grad",
    "education_Masters", "education_Preschool", "education_Prof-school",
    "education_Some-college",
    "occupation_Adm-clerical", "occupation_Armed-Forces",
    "occupation_Craft-repair", "occupation_Exec-managerial",
    "occupation_Farming-fishing", "occupation_Handlers-cleaners",
    "occupation_Machine-op-inspct", "occupation_Other-service",
    "occupation_Priv-house-serv", "occupation_Prof-specialty",
    "occupation_Protective-serv", "occupation_Sales",
    "occupation_Tech-support", "occupation_Transport-moving",
    "gender_Female",
    "race_Amer-Indian-Eskimo", "race_Asian-Pac-Islander", "race_Black",
    "race_Other", "race_White",
    "gender_Male",
    "native-country_United-States",
]

print(features_for_pca)

# Restrict the frame to the selected input columns.
# NOTE(review): 'age' and 'hours-per-week' go in unscaled (the MinMax step is
# commented out) next to 0/1 indicators, so they presumably dominate the
# leading components -- confirm this is intended.
pca_data = data[features_for_pca]

# Request as many components as there are input columns.
# NOTE(review): with this setting nothing is discarded, so the output has the
# same number of columns as the input; pick a smaller n_components if an
# actual reduction is wanted.
n_components = len(features_for_pca)
pca = PCA(n_components=n_components)

# Fit on the selected columns and project them onto the principal components.
pca_result = pca.fit_transform(pca_data)

print(pca_result)

# Wrap the component scores in a DataFrame (it gets a fresh 0..n-1 row index).
# NOTE(review): the columns are labelled with the input feature names by
# position, but column i holds the scores of principal component i, not the
# original feature -- e.g. the column labelled 'age' is the first component.
component_labels = features_for_pca[:n_components]
pca_df = pd.DataFrame(data=pca_result, columns=component_labels)

# Print the scores of the first component (stored under the label 'age').
print(pca_df['age'])

['age', 'hours-per-week', 'workclass_Federal-gov', 'workclass_Local-gov', 'workclass_Private', 'workclass_Self-emp-inc', 'workclass_Self-emp-not-inc', 'workclass_State-gov', 'workclass_Without-pay', 'education_10th', 'education_11th', 'education_12th', 'education_1st-4th', 'education_5th-6th', 'education_7th-8th', 'education_9th', 'education_Assoc-acdm', 'education_Assoc-voc', 'education_Bachelors', 'education_Doctorate', 'education_HS-grad', 'education_Masters', 'education_Preschool', 'education_Prof-school', 'education_Some-college', 'occupation_Adm-clerical', 'occupation_Armed-Forces', 'occupation_Craft-repair', 'occupation_Exec-managerial', 'occupation_Farming-fishing', 'occupation_Handlers-cleaners', 'occupation_Machine-op-inspct', 'occupation_Other-service', 'occupation_Priv-house-serv', 'occupation_Prof-specialty', 'occupation_Protective-serv', 'occupation_Sales', 'occupation_Tech-support', 'occupation_Transport-moving', 'gender_Female', 'race_Amer-Indian-Eskimo', 'race_Asian-Pa

In [337]:
# Preview the first five rows of the PCA output. The column labels are reused
# input feature names; the values are principal-component scores.
pca_df.head()

Unnamed: 0,age,hours-per-week,workclass_Federal-gov,workclass_Local-gov,workclass_Private,workclass_Self-emp-inc,workclass_Self-emp-not-inc,workclass_State-gov,workclass_Without-pay,education_10th,...,occupation_Tech-support,occupation_Transport-moving,gender_Female,race_Amer-Indian-Eskimo,race_Asian-Pac-Islander,race_Black,race_Other,race_White,gender_Male,native-country_United-States
0,-12.821508,4.507422,-0.371684,0.099345,-0.212233,0.220428,1.288785,-0.011322,0.030645,-0.057841,...,-0.006278,-0.003823,-0.003554,0.00068,-3e-06,-2.70515e-15,3.436386e-15,1.405627e-14,-1.753809e-14,1.598425e-14
1,3.079952,8.541925,-0.408739,0.726707,0.037753,-0.074256,-0.161339,-0.007853,0.030762,-0.032715,...,-0.018166,-0.015241,-0.008616,-0.005985,-0.000901,4.748977e-16,-3.284986e-15,3.842325e-15,-1.637015e-14,1.806738e-14
2,-10.055062,3.321547,-0.511341,-0.541845,0.758273,0.293498,-0.111608,-0.053043,-0.061973,-0.02582,...,-0.012436,-0.001908,-0.002165,-0.001191,-0.000629,-1.435506e-13,-4.981461e-16,-9.181779e-16,1.817067e-16,-1.461438e-15
3,4.620907,-3.020054,-0.241089,-0.144611,-0.784116,0.798772,1.104555,0.033704,0.058187,0.030589,...,-0.002487,-0.009599,-0.00364,-0.000142,-1.5e-05,5.994666e-13,1.728836e-15,4.878508e-16,-7.327313e-15,1.012066e-14
4,-8.519213,-8.240145,-0.551214,-0.030119,-0.244617,-0.155715,0.052097,-0.112363,-0.075561,-0.586935,...,0.005283,-0.011093,-0.004128,0.000433,-0.000167,1.096233e-12,7.518276e-15,-3.088005e-16,2.263917e-15,-4.217668e-15


In [338]:
# Save the PCA output to CSV (index=False leaves the row index out). `pca_df`
# is built from the PCA result only, so the file has no 'income' column.
pca_df.to_csv('ReadyForModelling.csv', index=False)