In [14]:
import pandas as pd

# Load dataset
df = pd.read_csv('job_placement_data.csv')

# 1. Rename columns
df.rename(columns={
    'college_name': 'college',
    'years_of_experience': 'experience'
}, inplace=True)

# 2. Remove duplicates
df.drop_duplicates(inplace=True)

# 3. Standardize text fields
df['gender'] = df['gender'].str.strip().str.lower().replace({'m': 'male', 'f': 'female'})

# 4. Clean column names (lowercase + underscores)
df.columns = df.columns.str.strip().str.lower().str.replace(" ", "_")

# 5. Fill missing 'age' values using grouped mode by 'experience'
df['age'] = df.groupby('experience')['age'].transform(
    lambda x: x.fillna(x.mode()[0] if not x.mode().empty else x.median())
)

# 6. Final fallback for any remaining NaNs in age
df['age'].fillna(df['age'].median(), inplace=True)

# 7. Fix data types
df['age'] = df['age'].astype('int')
df['salary'] = df['salary'].astype('int')
df['gpa'] = df['gpa'].astype('float')
df['experience'] = df['experience'].astype('float')

# 8. Save cleaned dataset
df.to_csv('job_placement_data_cleaned.csv', index=False)

print("✅ Data cleaning complete. File saved as 'job_placement_data_cleaned.csv'")


✅ Data cleaning complete. File saved as 'job_placement_data_cleaned.csv'


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['age'].fillna(df['age'].median(), inplace=True)


In [15]:
d = pd.read_csv('job_placement_data.csv')

d.head()

Unnamed: 0,id,name,gender,age,degree,stream,college_name,placement_status,salary,gpa,years_of_experience,skills
0,1,John Doe,Male,25.0,Bachelor's,Computer Science,Harvard University,Placed,60000,3.7,2.0,"Machine Learning, AI, Deep Learning"
1,2,Jane Smith,Female,24.0,Bachelor's,Electrical Engineering,Massachusetts Institute of Technology,Placed,65000,3.6,1.0,"Networking, Cyber Security, Linux"
2,3,Michael Johnson,Male,26.0,Bachelor's,Mechanical Engineering,Stanford University,Placed,58000,3.8,3.0,"Networking, Cyber Security, Linux"
3,4,Emily Davis,Female,23.0,Bachelor's,Information Technology,Yale University,Not Placed,0,3.5,2.0,"Python, SQL, Data Analysis"
4,5,David Brown,Male,24.0,Bachelor's,Computer Science,Princeton University,Placed,62000,3.9,2.0,"Python, SQL, Data Analysis"


In [16]:
df.head()

Unnamed: 0,id,name,gender,age,degree,stream,college,placement_status,salary,gpa,experience,skills
0,1,John Doe,male,25,Bachelor's,Computer Science,Harvard University,Placed,60000,3.7,2.0,"Machine Learning, AI, Deep Learning"
1,2,Jane Smith,female,24,Bachelor's,Electrical Engineering,Massachusetts Institute of Technology,Placed,65000,3.6,1.0,"Networking, Cyber Security, Linux"
2,3,Michael Johnson,male,26,Bachelor's,Mechanical Engineering,Stanford University,Placed,58000,3.8,3.0,"Networking, Cyber Security, Linux"
3,4,Emily Davis,female,23,Bachelor's,Information Technology,Yale University,Not Placed,0,3.5,2.0,"Python, SQL, Data Analysis"
4,5,David Brown,male,24,Bachelor's,Computer Science,Princeton University,Placed,62000,3.9,2.0,"Python, SQL, Data Analysis"


In [17]:
df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 700 entries, 0 to 699
Data columns (total 12 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   id                700 non-null    int64  
 1   name              700 non-null    object 
 2   gender            700 non-null    object 
 3   age               700 non-null    int64  
 4   degree            700 non-null    object 
 5   stream            700 non-null    object 
 6   college           700 non-null    object 
 7   placement_status  700 non-null    object 
 8   salary            700 non-null    int64  
 9   gpa               700 non-null    float64
 10  experience        699 non-null    float64
 11  skills            700 non-null    object 
dtypes: float64(2), int64(3), object(7)
memory usage: 65.8+ KB


In [18]:
d.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 700 entries, 0 to 699
Data columns (total 12 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   id                   700 non-null    int64  
 1   name                 700 non-null    object 
 2   gender               700 non-null    object 
 3   age                  630 non-null    float64
 4   degree               700 non-null    object 
 5   stream               700 non-null    object 
 6   college_name         700 non-null    object 
 7   placement_status     700 non-null    object 
 8   salary               700 non-null    int64  
 9   gpa                  700 non-null    float64
 10  years_of_experience  699 non-null    float64
 11  skills               700 non-null    object 
dtypes: float64(3), int64(2), object(7)
memory usage: 65.8+ KB
