In [2]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
from scipy import stats

In [3]:
# Load the raw dataset; the path is relative to the notebook's working directory
df  = pd.read_csv("datasets/adult.csv")

In [None]:
df.head()

In [None]:
df.shape

In [None]:
# Collect every column label of the frame into a plain Python list and display it
col_list = df.columns.tolist()
col_list

In [None]:
# finding the datatype of each column
(df.dtypes)

In [None]:
# Split the columns by dtype: object-typed columns are treated as categorical,
# every other dtype is treated as numerical.
is_object = {name: df[name].dtype == 'object' for name in df.columns}
categorical_columns = [name for name, flag in is_object.items() if flag]
numerical_columns = [name for name, flag in is_object.items() if not flag]

In [None]:
categorical_columns

In [None]:
numerical_columns

In [None]:
# Report how many columns were classified as categorical (object dtype)
print(f"The number of categorical columns in the dataset is {len(categorical_columns)}")

In [None]:
print(f"The number of numerical columns in the dataset is {len(numerical_columns)}")

In [None]:
df[categorical_columns].head()

In [None]:
df['education'].unique()

In [None]:
# The raw string values carry stray surrounding whitespace;
# trim it from every categorical column in a single pass.
df[categorical_columns] = df[categorical_columns].apply(lambda values: values.str.strip())

In [None]:
# Sanity check: stripping again must be a no-op for every categorical column
for name in categorical_columns:
    unchanged = df[name].str.strip().eq(df[name])
    assert unchanged.all(), f"Column '{name}' has extra spaces."

print("All categorical columns have no extra spaces.")

In [None]:
# Visual null check: one heatmap cell per value of the df.isnull() mask
null_mask = df.isnull()
fig, ax = plt.subplots(figsize=(5, 3))
sns.heatmap(null_mask, yticklabels=False, ax=ax)
plt.title('Missing Values Heatmap');

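## There are no NaN values in the dataset. Note, however, that missing entries are encoded as the string "?" (handled column by column below), so they do not show up in this heatmap.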

### Analysing and handling each feature

## 1.Output Feature "salary"

In [None]:
# Encode the target classes as integers: 0 for '<=50K' and 1 for '>50K'.
# Writing ints into the string column through .loc leaves it with object dtype, which
# hides the target from numeric summaries such as DataFrame.corr, so the column is
# cast to int once every value is confirmed to be 0 or 1.
salary_codes = {'<=50K': 0, '>50K': 1}
# Values that are already encoded pass through unchanged, so re-running this cell is safe.
df['salary'] = df['salary'].map(lambda label: salary_codes.get(label, label))
unexpected_labels = set(df['salary'].unique()) - {0, 1}
assert not unexpected_labels, f"Unexpected salary labels: {unexpected_labels}"
df['salary'] = df['salary'].astype(int)

In [None]:
df['salary'].unique()

In [None]:
df['salary'].value_counts()

## 2.Age Feature

In [None]:
df['age'].describe()

In [None]:
# checking the distrubution of age column
sns.displot((df['age']), kde= True);

The age data is not normally distributed, so below we apply a Box-Cox transformation to reduce the skew and the influence of outliers.

In [None]:
df['age'].median()

In [None]:
df['age'].mean()

In [None]:
(df['age'] > 70).value_counts()

In [None]:
# Box-Cox power transform of age. With no lambda supplied, scipy fits one and returns
# it together with the transformed values. Box-Cox requires strictly positive input.
transformed_age, lambda_value = stats.boxcox(df['age'])

In [None]:
sns.displot(transformed_age, kde= True, color="black");

In [None]:
df['age_transformed'] = transformed_age

In [None]:
df['age_transformed'].describe()

In [None]:
# Box plot of the Box-Cox transformed age; the title names the column that is actually
# drawn (the raw 'age' column is not plotted here).
sns.boxplot(df['age_transformed'], orient="h", color="black")
plt.title("Boxplot of 'age_transformed'");

In [None]:
# columns
df.columns

### 2.Workclass column

In [None]:
# Bar chart of how many rows fall in each workclass category.
# workclass is an unordered categorical, so a KDE curve over it has no meaning and is
# omitted; the column is referenced by name rather than passing the same Series as
# both `data` and `x`.
plt.figure(figsize=(15, 4))
sns.histplot(data=df, x='workclass', element="bars", color="purple");

In [None]:
df['workclass'].value_counts()


In [None]:
df['workclass'].unique()

In [None]:
(df['workclass'] == '?').sum()

In [None]:
df.loc[df['workclass'] == '?','education'].value_counts()

In [None]:
df['workclass'].mode()[0]

In [None]:
df['education'].unique()

In [None]:
# Rows with an unknown workclass ('?') and only early primary schooling
# are labelled 'Never-worked'.
early_primary = ['Preschool', '1st-4th', '5th-6th']
unknown_workclass = df['workclass'] == '?'
df.loc[unknown_workclass & df['education'].isin(early_primary), 'workclass'] = 'Never-worked'

In [None]:
# Unknown workclass ('?') combined with schooling from 7th to 12th grade
# is also labelled 'Never-worked'.
grades_7_to_12 = ['7th-8th', '9th', '10th', '11th', '12th']
unknown_workclass = df['workclass'] == '?'
df.loc[unknown_workclass & df['education'].isin(grades_7_to_12), 'workclass'] = 'Never-worked'

In [None]:
df['workclass'].value_counts()

In [None]:
# Unknown workclass ('?') with a high-school diploma or some college
# is labelled 'Without-pay'.
hs_or_some_college = ['HS-grad', 'Some-college']
unknown_workclass = df['workclass'] == '?'
df.loc[unknown_workclass & df['education'].isin(hs_or_some_college), 'workclass'] = 'Without-pay'

In [None]:
df['workclass'].value_counts()

In [None]:
# Unknown workclass ('?') with a bachelor's or associate degree
# is labelled 'Private'.
bachelors_or_associate = ['Bachelors', 'Assoc-acdm', 'Assoc-voc']
unknown_workclass = df['workclass'] == '?'
df.loc[unknown_workclass & df['education'].isin(bachelors_or_associate), 'workclass'] = 'Private'

In [None]:
df['workclass'].value_counts()

In [None]:
df.loc[df['workclass'] == '?','education'].value_counts().plot(kind="bar", color="black");

In [None]:
# Occupation counts among people with an advanced degree, restricted to
# rows whose workclass is known (not '?').
advanced_degrees = ['Prof-school', 'Masters', 'Doctorate']
known_workclass = df['workclass'] != '?'
df.loc[known_workclass & df['education'].isin(advanced_degrees), 'occupation'].value_counts()

In [None]:
# Workclass counts for the two professional occupations, restricted to
# rows whose workclass is known (not '?').
professional_occupations = ['Prof-specialty', 'Exec-managerial']
known_workclass = df['workclass'] != '?'
df.loc[known_workclass & df['occupation'].isin(professional_occupations), 'workclass'].value_counts()

## This shows that most Prof-specialty and Exec-managerial workers are in Private jobs


In [None]:
# Unknown workclass ('?') with an advanced degree is labelled 'Private'.
advanced_degrees = ['Prof-school', 'Masters', 'Doctorate']
unknown_workclass = df['workclass'] == '?'
df.loc[unknown_workclass & df['education'].isin(advanced_degrees), 'workclass'] = 'Private'

In [None]:
df['workclass'].value_counts()

In [None]:
df['workclass'].value_counts(normalize=True).plot(kind="bar", color="black");

In [None]:
df1  = df.copy()

In [None]:
df1.columns

### 3. fnlwgt column

In [None]:
df1['fnlwgt'].describe()

In [None]:
(df1['fnlwgt']).hist(color="black")

In [None]:
sns.boxplot(df1['fnlwgt'],orient="h")

In [None]:
sns.displot(np.log(df1['fnlwgt']), kde=True, bins=15)
plt.title("Distribution of 'fnlwgt' after log transformation");

In [None]:
df1['transformed_fnlwgt' ] = np.log(df1['fnlwgt'])

In [None]:
df1.shape

### 4.Education column


In [None]:
df1['education'].value_counts()

In [None]:
df1['education'].unique()

### 5.Education-num column

In [None]:
df1['education-num'].unique()

In [None]:
df1['education-num'].value_counts()

In [None]:
# we can drop the education column, as education-num carries the same information about education
# df1 = df1.drop(columns="education")

In [None]:
df1.columns

### 6.marital status

In [None]:
df1['marital-status'].value_counts()

In [None]:
df1['marital-status'].unique()

In [None]:
df1['marital-status'].value_counts(normalize=True).plot(kind = "bar", color="black");

In [None]:
# Count each marital status within every salary class:
# rows of `grouped` are the salary classes, columns are the marital statuses.
grouped = df1.groupby('salary')['marital-status'].value_counts().unstack()

# Plot the side-by-side bar chart.
# The bar groups along the x-axis come from the index of `grouped` (the salary classes),
# so the x-axis is labelled as salary; the marital statuses are the coloured series and
# are named through the legend title instead.
custom_colors = ['#FF0000', '#00FF00', '#0000FF', '#FF00FF', '#00FFFF', '#FFFF00', '#FFA500']
ax = grouped.plot(kind='bar', stacked=False, color=custom_colors)
ax.legend(title='Marital Status', loc='center left', bbox_to_anchor=(1, 0.5))
plt.xlabel('Salary class (0: <=50K, 1: >50K)')
plt.ylabel('Count')
plt.title('Count of Marital Status by Salary Category');


In [None]:
df2  = df1.copy()

### 7.Occupation column

In [None]:
df2['occupation'].value_counts()

In [None]:
df2['occupation'].unique()

### Now the task is to replace the "?" values in the occupation column

In [None]:
df2.loc[df2['occupation'] == '?','education'].value_counts()

### Most of the rows where occupation is "?" belong to younger people who are in school, in college, or have passed college

In [None]:
df2.loc[df2['occupation'] == '?','salary'].value_counts()

### This tells us that most of the rows where occupation is '?' have a salary of at most 50K

In [None]:
df2.loc[(df2['occupation'] == 'Handlers-cleaners'), 'education'].value_counts() 

In [None]:
df2.loc[(df2['occupation'] == 'Farming-fishing'), 'education'].value_counts() 

In [None]:
df2.loc[(df2['occupation'] == 'Farming-fishing'), 'education'].value_counts() 

In [None]:
df2.loc[(df2['occupation'] == 'Transport-moving'), 'education'].value_counts() 

In [None]:
df2.loc[(df2['occupation'] == 'Craft-repair'), 'education'].value_counts() 

In [None]:
df2.loc[df2['education'] == 'HS-grad','occupation'].value_counts()

In [None]:
# HS-grad rows with an unknown occupation ('?') are assigned 'Craft-repair'
hs_grad_unknown = (df2['occupation'] == "?") & (df2['education'] == 'HS-grad')
df2.loc[hs_grad_unknown, 'occupation'] = 'Craft-repair'

In [None]:
df2.loc[df2['education'] == 'Some-college','occupation'].value_counts()

In [None]:
# Some-college rows with an unknown occupation ('?') are assigned 'Adm-clerical'
some_college_unknown = (df2['occupation'] == "?") & (df2['education'] == 'Some-college')
df2.loc[some_college_unknown, 'occupation'] = 'Adm-clerical'

In [None]:
df2.loc[df2['education'] == 'Bachelors', 'occupation'].value_counts()

In [None]:
# Bachelors rows with an unknown occupation ('?') are assigned 'Prof-specialty'
bachelors_unknown = (df2['occupation'] == "?") & (df2['education'] == 'Bachelors')
df2.loc[bachelors_unknown, 'occupation'] = 'Prof-specialty'

In [None]:
df2.loc[df2['occupation'] == '?', 'education'].value_counts()

In [None]:
# Occupation counts among people whose schooling ended between 5th and 12th grade
grades_5_to_12 = ["11th", "12th", '9th', '10th', '7th-8th', '5th-6th']
df2.loc[df2['education'].isin(grades_5_to_12), 'occupation'].value_counts()

In [None]:
# Unknown occupations ('?') for schooling between 5th and 12th grade become 'Other-service'
grades_5_to_12 = ["11th", "12th", '9th', '10th', '7th-8th', '5th-6th']
unknown_occupation = df2['occupation'] == '?'
df2.loc[df2['education'].isin(grades_5_to_12) & unknown_occupation, 'occupation'] = 'Other-service'

In [None]:
df2['occupation'].value_counts()

In [None]:
df2.loc[df2['occupation'] == '?', 'education'].value_counts()

In [None]:
df2.loc[(df2['education'] == 'Assoc-acdm'), 'occupation'].value_counts()

In [None]:
# Fill the remaining unknown occupations ('?') with a fixed occupation per education level.
# The education levels are mutually exclusive, so the order of the entries does not matter.
occupation_by_education = {
    'Assoc-voc': 'Craft-repair',
    'Masters': 'Prof-specialty',
    'Prof-school': 'Prof-specialty',
    'Doctorate': 'Prof-specialty',
    'Assoc-acdm': 'Adm-clerical',
    '1st-4th': 'Other-service',
    'Preschool': 'Other-service',
}
for education_level, filler in occupation_by_education.items():
    still_unknown = df2['occupation'] == '?'
    df2.loc[(df2['education'] == education_level) & still_unknown, 'occupation'] = filler

In [None]:
df2['occupation'].value_counts()

In [None]:
# Bar chart of the row count per occupation after the '?' values were imputed above
df2['occupation'].value_counts().plot(kind= "bar", color="black")
plt.title("Distribution of occupation");

In [None]:
(df2['occupation'].unique())

In [None]:
# ## encoding the occupation column
# df2['occupation'] = df2['occupation'].replace({'Priv-house-serv':0,'Handlers-cleaners':1,'Farming-fishing':2,
#                                             'Other-service':3,'Adm-clerical':4,'Craft-repair':5, 'Machine-op-inspct':6,
#                                             'Tech-support':7, 'Sales':8, 'Transport-moving':9, 'Protective-serv':10,
#                                              'Prof-specialty':11, 'Exec-managerial':12, 'Armed-Forces':13
#                                             })

In [None]:
df2['occupation'].dtype

# Finally, the occupation column is cleaned

## 8.Relationship

In [None]:
df2['relationship'].unique()

In [None]:
df2['relationship'].value_counts()

In [None]:
df2['relationship'].value_counts().plot(kind= "bar")

In [None]:
# Count each relationship value within every salary class:
# rows of `grouped` are the salary classes, columns are the relationship values.
grouped = df2.groupby('salary')['relationship'].value_counts().unstack()

# Plot the side-by-side bar chart.
# The bar groups along the x-axis come from the index of `grouped` (the salary classes),
# so the x-axis is labelled as salary; the relationship values are the coloured series
# and are named through the legend title instead.
ax = grouped.plot(kind='bar', stacked=False)
ax.legend(title='Relationship', loc='center left', bbox_to_anchor=(1, 0.5))
plt.xlabel('Salary class (0: <=50K, 1: >50K)')
plt.ylabel('Count')
plt.title('Count of Relationship by Salary Category');


##  9.Capital-gain

In [None]:
df2['capital-gain'].describe()

In [None]:
sns.displot(df2['capital-gain'], kde= True);

In [None]:
sns.boxplot(df2['capital-gain'], orient="h");

In [None]:
## droping the feature 
df2  = df2.drop(columns='capital-gain')

In [None]:
df2.shape

### 10.capital-loss column

In [None]:
df2['capital-loss'].describe()

In [None]:
sns.boxplot((df2['capital-loss']), orient="h")

In [None]:
## dropping the column
df2 = df2.drop(columns='capital-loss')

In [None]:
df2.shape

### 11. Hours per week

In [None]:
df2['hours-per-week'].describe()

In [None]:
sns.boxplot(df2['hours-per-week'], orient="h")


In [None]:
df2.loc[df2['hours-per-week'] < 5, 'education']

In [None]:
df2['hours-per-week'].hist()

In [None]:
sns.boxplot(df2['hours-per-week'], orient="h")

In [None]:
df2['hours-per-week'].describe()

### 12.Country

In [None]:
df2['country'].unique()

In [None]:
df2['country']  = df2['country'].str.strip()

In [None]:
df2['country'].unique()

In [None]:
df2['country'].value_counts(normalize=True).plot(kind = "bar")

# Most people are from the United States

In [None]:
df2['country'].value_counts()

In [None]:
df2['country'].unique()

In [None]:
df2['country'] = df2['country'].replace("?","United-States")

In [None]:
df2['country'].unique()

In [None]:
## dropping the country column from the dataset
df2  = df2.drop(columns = 'country')

## 13.Race

In [None]:
df2['race'].unique()

In [None]:
df2['race'] = df2['race'].str.strip()

In [None]:
df2['race'].value_counts()

In [None]:
df2['race'].value_counts().plot(kind = "bar");

## Most people in the dataset are White

In [None]:
# Count each race within every salary class:
# rows of `grouped` are the salary classes, columns are the race values.
grouped = df2.groupby('salary')['race'].value_counts().unstack()

# Plot the side-by-side bar chart.
# The bar groups along the x-axis come from the index of `grouped` (the salary classes),
# so the x-axis is labelled as salary; the race values are the coloured series and are
# named through the legend title instead.
ax = grouped.plot(kind='bar', stacked=False)
ax.legend(title='Race', loc='center left', bbox_to_anchor=(1, 0.5))
plt.xlabel('Salary class (0: <=50K, 1: >50K)')
plt.ylabel('Count')
plt.title('Count of Race by Salary Category');


## 14. SEX

In [None]:
df2['sex'].unique()

In [None]:
df2['sex'] = df2['sex'].str.strip()

In [None]:
df2['sex'].value_counts()

In [None]:
# Count each sex within every salary class:
# rows of `grouped` are the salary classes, columns are the sex values.
grouped = df2.groupby('salary')['sex'].value_counts().unstack()

# Plot the side-by-side bar chart, labelled like the other per-salary charts in this
# notebook so the figure can be read on its own: the x-axis carries the salary classes
# (the index of `grouped`) and the legend names the sex values.
ax = grouped.plot(kind='bar', stacked=False)
ax.legend(title='Sex', loc='center left', bbox_to_anchor=(1, 0.5))
plt.xlabel('Salary class (0: <=50K, 1: >50K)')
plt.ylabel('Count')
plt.title('Count of Sex by Salary Category');

In [None]:
df2['salary'].value_counts().plot(kind= "bar", color = "black")
plt.title("Salary classes and their distribution");

### 0 represents the class with a salary of at most 50K (<=50K)
### 1 represents the class with a salary of more than 50K (>50K)

# Correlations among the columns

In [None]:
# Pairwise correlations between the numeric columns only.
# df2 still holds string columns (workclass, education, ...), which DataFrame.corr cannot
# convert to float on pandas >= 2.0, so non-numeric columns are filtered out first.
# salary holds 0/1 codes but may be stored with object dtype; it is coerced to a numeric
# dtype so the target stays in the correlation matrix.
df2.assign(salary=pd.to_numeric(df2['salary'])).select_dtypes(include='number').corr()

In [None]:
# Heatmap of the pairwise correlations between the numeric columns.
# String columns are filtered out first because DataFrame.corr cannot convert them to
# float on pandas >= 2.0; salary (0/1 codes, possibly stored as object dtype) is coerced
# to numeric so the target is part of the heatmap.
numeric_view = df2.assign(salary=pd.to_numeric(df2['salary'])).select_dtypes(include='number')
sns.heatmap(numeric_view.corr(), annot=True);

## Mean age of people in the different salary classes

In [None]:
# Average raw age within each salary class, as a two-column frame (salary, mean age)
d1 = (
    df2.groupby("salary", as_index=False)[["age"]]
    .mean()
    .rename(columns={"age": "mean age"})
)
d1


In [None]:
# Average Box-Cox transformed age within each salary class (salary, mean age_transformed)
d2 = (
    df2.groupby("salary", as_index=False)[["age_transformed"]]
    .mean()
    .rename(columns={"age_transformed": "mean age_transformed"})
)
d2

## This shows that people earning at most 50K are, on average, younger than people earning more than 50K, which is expected because salary tends to increase as job experience increases

In [None]:
sns.barplot(x="salary", y="mean age", data = d1)
plt.title("distribution of different salary classes by mean age");

In [None]:
df2.columns

In [None]:
df2.shape

In [None]:
plt.scatter(x=df2['salary'], y=df2['age']);
plt.ylabel("age")
plt.xlabel("salary")

In [None]:
# Correlation between the raw age and its Box-Cox transform.
# Both columns are taken from df2 so the comparison does not mix the working frame with
# the earlier `df` frame it was copied from.
df2['age'].corr(df2['age_transformed'])

In [None]:
df2.shape

In [None]:
df2.columns

In [None]:
df2['race'].value_counts()

In [None]:
# Drop the race column; reassigning the result matches the other drop cells in this
# notebook and avoids mutating the frame in place.
df2 = df2.drop(columns="race")

In [None]:
df2.columns

In [None]:
# Drop the two derived (transformed) columns; reassigning the result matches the other
# drop cells in this notebook and avoids mutating the frame in place.
df2 = df2.drop(columns=["age_transformed", "transformed_fnlwgt"])

In [None]:
df2.columns

In [None]:
df2.shape

In [None]:
# Drop the education-num column; reassigning the result matches the other drop cells in
# this notebook and avoids mutating the frame in place.
df2 = df2.drop(columns="education-num")

In [None]:
df2.head()

In [None]:
df2.shape

In [4]:
# Persist the cleaned frame. This cell needs df2 from the cells above, so run the whole
# notebook top to bottom before executing it.
# DataFrame.to_csv returns None when it is given a path, so assigning its result left
# cleaned_df bound to None; keep the frame itself under that name and write it out as a
# separate step.
# NOTE(review): the row index is written as the first column and the path has no .csv
# extension - confirm that is what downstream readers of this file expect.
cleaned_df = df2.copy()
cleaned_df.to_csv("datasets/cleaned_datasets/cleaned_phase1")

NameError: name 'df2' is not defined