<a href="https://colab.research.google.com/github/imranmohd98611-ctrl/fair-salary-prediction/blob/main/fair-salary-prediction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Cell 1 — Upload archive.zip from your computer
from google.colab import files

# Opens the Colab file dialog; choose archive.zip there.
# The result is kept in `uploaded` because the unzip cell looks up the file name in it.
uploaded = files.upload()

# Show the exact names that were uploaded, in case the file is not called archive.zip.
uploaded_names = list(uploaded.keys())
print("Uploaded files:", uploaded_names)

Saving archive.zip to archive.zip
Uploaded files: ['archive.zip']


In [2]:
# Cell 2 — Unzip the uploaded file into a folder named "unzipped_data"
import os
import zipfile

# First uploaded name ending in ".zip" (case-insensitive), or None when there is none.
zip_filename = next(
    (name for name in uploaded.keys() if name.lower().endswith('.zip')),
    None,
)

# Stop early with a clear message instead of failing inside ZipFile.
if zip_filename is None:
    raise FileNotFoundError("No .zip file found in uploaded files. Make sure you uploaded archive.zip.")

# Destination folder; created if missing, reused if it already exists.
extract_dir = "unzipped_data"
os.makedirs(extract_dir, exist_ok=True)

with zipfile.ZipFile(zip_filename, 'r') as archive:
    archive.extractall(extract_dir)

print(f"Extracted '{zip_filename}' to folder: {extract_dir}")
print("Extracted items:", os.listdir(extract_dir))

Extracted 'archive.zip' to folder: unzipped_data
Extracted items: ['adult.csv']


In [4]:
# Cell 3 — List all files (recursively) inside the extracted folder
import os

# Print the extracted folder as an indented tree: one "name/" line per directory
# followed by "- file" lines for the files it contains.
#
# The loop variables are deliberately NOT named `files`: that name is bound to the
# `google.colab.files` module in the upload cell, and rebinding it to a list here
# would break any later `files.upload()` / `files.download()` call.
for root, _dirnames, filenames in os.walk(extract_dir):
    # Depth of `root` below extract_dir: "." is the top level (depth 0),
    # "a" is depth 1, "a/b" is depth 2, and so on.
    rel_root = os.path.relpath(root, extract_dir)
    level = 0 if rel_root == os.curdir else rel_root.count(os.sep) + 1
    indent = "  " * level
    # normpath drops any trailing separator so basename is never empty.
    print(f"{indent}{os.path.basename(os.path.normpath(root))}/")
    for filename in filenames:
        print(f"{indent}  - {filename}")

unzipped_data/
  - adult.csv


In [5]:
# Cell 4 — Automatically find the first CSV (or show options if multiple) and load it with pandas
import pandas as pd
import os

# Every entry of csv_files is a path RELATIVE to extract_dir, so it can always be
# joined back onto extract_dir when loading. Sorting makes "the first CSV"
# deterministic (os.listdir / os.walk return entries in arbitrary order).
csv_files = sorted(
    name for name in os.listdir(extract_dir) if name.lower().endswith('.csv')
)

if not csv_files:
    # look deeper (subfolders)
    # os.walk yields `root` already prefixed with extract_dir, so strip that prefix
    # with relpath; otherwise the join below would duplicate the folder name and
    # read_csv would fail with FileNotFoundError.
    # The loop variables avoid the name `files` so the `google.colab.files` module
    # imported in the upload cell is not overwritten.
    csv_files = sorted(
        os.path.relpath(os.path.join(root, filename), extract_dir)
        for root, _dirnames, filenames in os.walk(extract_dir)
        for filename in filenames
        if filename.lower().endswith('.csv')
    )

if not csv_files:
    raise FileNotFoundError("No CSV files found in the extracted ZIP. Check the extracted content above.")

print("CSV files found:")
for i, csv_name in enumerate(csv_files):
    print(i, csv_name)

# Auto-select the first by default
selected_csv = csv_files[0]
print(f"\nLoading: {selected_csv}")

# Entries are relative to extract_dir, so a single join gives the loadable path
# for both top-level and nested CSVs.
csv_path = os.path.join(extract_dir, selected_csv)
df = pd.read_csv(csv_path)

print("Dataset loaded. Use df.head() to preview.")

CSV files found:
0 adult.csv

Loading: adult.csv
Dataset loaded. Use df.head() to preview.


In [6]:
# Cell 5 — Preview and basic info
# Show first 10 rows
print("First 10 rows:")
display(df.head(10))

# Show shape, columns, dtypes
print("\nShape:", df.shape)
print("\nColumns and dtypes:")
print(df.dtypes)

# Quick summary (df.info() prints directly, so it is not wrapped in print)
print("\nInfo:")
df.info()

# Describe numeric columns; transposed so each column becomes one row
print("\nNumeric summary (describe):")
display(df.describe(include='number').T)

# Missing values summary
print("\nMissing values per column:")
print(df.isna().sum())

# The NaN count above does not see "?" entries, which the preview rows show being
# used as a placeholder in some text columns (e.g. workclass, occupation).
# Count them separately so the missing-value picture is not misleadingly all zeros.
# NOTE(review): only exact "?" cells are counted; confirm there are no padded
# variants such as " ?" in the raw file.
print("\n'?' placeholder values per column:")
print(df.isin(["?"]).sum())

First 10 rows:


Unnamed: 0,age,workclass,fnlwgt,education,educational-num,marital-status,occupation,relationship,race,gender,capital-gain,capital-loss,hours-per-week,native-country,income
0,25,Private,226802,11th,7,Never-married,Machine-op-inspct,Own-child,Black,Male,0,0,40,United-States,<=50K
1,38,Private,89814,HS-grad,9,Married-civ-spouse,Farming-fishing,Husband,White,Male,0,0,50,United-States,<=50K
2,28,Local-gov,336951,Assoc-acdm,12,Married-civ-spouse,Protective-serv,Husband,White,Male,0,0,40,United-States,>50K
3,44,Private,160323,Some-college,10,Married-civ-spouse,Machine-op-inspct,Husband,Black,Male,7688,0,40,United-States,>50K
4,18,?,103497,Some-college,10,Never-married,?,Own-child,White,Female,0,0,30,United-States,<=50K
5,34,Private,198693,10th,6,Never-married,Other-service,Not-in-family,White,Male,0,0,30,United-States,<=50K
6,29,?,227026,HS-grad,9,Never-married,?,Unmarried,Black,Male,0,0,40,United-States,<=50K
7,63,Self-emp-not-inc,104626,Prof-school,15,Married-civ-spouse,Prof-specialty,Husband,White,Male,3103,0,32,United-States,>50K
8,24,Private,369667,Some-college,10,Never-married,Other-service,Unmarried,White,Female,0,0,40,United-States,<=50K
9,55,Private,104996,7th-8th,4,Married-civ-spouse,Craft-repair,Husband,White,Male,0,0,10,United-States,<=50K



Shape: (48842, 15)

Columns and dtypes:
age                 int64
workclass          object
fnlwgt              int64
education          object
educational-num     int64
marital-status     object
occupation         object
relationship       object
race               object
gender             object
capital-gain        int64
capital-loss        int64
hours-per-week      int64
native-country     object
income             object
dtype: object

Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 48842 entries, 0 to 48841
Data columns (total 15 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   age              48842 non-null  int64 
 1   workclass        48842 non-null  object
 2   fnlwgt           48842 non-null  int64 
 3   education        48842 non-null  object
 4   educational-num  48842 non-null  int64 
 5   marital-status   48842 non-null  object
 6   occupation       48842 non-null  object
 7   relationship     48842 non-nu

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
age,48842.0,38.643585,13.71051,17.0,28.0,37.0,48.0,90.0
fnlwgt,48842.0,189664.134597,105604.025423,12285.0,117550.5,178144.5,237642.0,1490400.0
educational-num,48842.0,10.078089,2.570973,1.0,9.0,10.0,12.0,16.0
capital-gain,48842.0,1079.067626,7452.019058,0.0,0.0,0.0,0.0,99999.0
capital-loss,48842.0,87.502314,403.004552,0.0,0.0,0.0,0.0,4356.0
hours-per-week,48842.0,40.422382,12.391444,1.0,40.0,40.0,45.0,99.0



Missing values per column:
age                0
workclass          0
fnlwgt             0
education          0
educational-num    0
marital-status     0
occupation         0
relationship       0
race               0
gender             0
capital-gain       0
capital-loss       0
hours-per-week     0
native-country     0
income             0
dtype: int64


In [7]:
# Cell 6 — If dataset is large: show a random sample and save a small preview CSV
print("Random sample (10 rows):")
# Cap the sample size at the number of rows: sampling without replacement raises
# ValueError when n is larger than the frame. A fixed random_state makes the
# displayed sample identical on every Restart & Run All.
display(df.sample(n=min(10, len(df)), random_state=42))

# Save a small preview to the notebook filesystem
# (head(100) simply returns all rows when the frame has fewer than 100)
preview_path = "dataset_preview.csv"
df.head(100).to_csv(preview_path, index=False)
print(f"Saved preview (first 100 rows) to: {preview_path}")

Random sample (10 rows):


Unnamed: 0,age,workclass,fnlwgt,education,educational-num,marital-status,occupation,relationship,race,gender,capital-gain,capital-loss,hours-per-week,native-country,income
38089,22,Private,208946,Some-college,10,Never-married,Machine-op-inspct,Not-in-family,White,Male,0,0,40,United-States,<=50K
13409,52,Self-emp-inc,230919,Bachelors,13,Married-civ-spouse,Sales,Husband,White,Male,0,0,60,United-States,>50K
17846,47,Private,111797,Some-college,10,Never-married,Other-service,Not-in-family,Black,Female,0,0,35,Outlying-US(Guam-USVI-etc),<=50K
31418,30,Private,38848,HS-grad,9,Married-civ-spouse,Transport-moving,Husband,White,Male,0,0,40,United-States,<=50K
20717,39,Private,185099,HS-grad,9,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,40,United-States,<=50K
2581,35,Local-gov,160728,Masters,14,Married-civ-spouse,Prof-specialty,Husband,White,Male,0,0,45,United-States,>50K
39589,39,Private,284166,Bachelors,13,Married-civ-spouse,Prof-specialty,Husband,White,Male,0,1902,50,United-States,>50K
27911,22,Federal-gov,65547,Some-college,10,Never-married,Exec-managerial,Not-in-family,Black,Male,0,0,40,United-States,<=50K
37012,32,Private,317378,Bachelors,13,Never-married,Exec-managerial,Own-child,White,Female,10520,0,40,United-States,>50K
26499,52,Self-emp-not-inc,135339,Some-college,10,Married-civ-spouse,Craft-repair,Husband,Asian-Pac-Islander,Male,0,0,40,?,>50K


Saved preview (first 100 rows) to: dataset_preview.csv



Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `y` variable to `hue` and set `legend=False` for the same effect.




Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `y` variable to `hue` and set `legend=False` for the same effect.




Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `y` variable to `hue` and set `legend=False` for the same effect.




Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `y` variable to `hue` and set `legend=False` for the same effect.

