# Categorical Variable Encoding
## Dummy Variables & One-Hot Encoding
Encoding represents the values of a categorical variable as numbers so that they can be used in numerical computations.
Dummy variables are binary indicators (0 or 1) created to represent categories in a dataset. Each category of a categorical variable is turned into a separate column, where:

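- `1` means the category is present.
- `0` means the category is absent.

Note: in the outputs below pandas displays these indicators as `True`/`False` (boolean); pass a `dtype` such as `'int8'` to `get_dummies` to get literal 0/1 integers.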

Why Use Dummy Variables?

Compatibility with Machine Learning: Many machine learning models need numerical data, so we convert categories into numbers.



In [1]:
import pandas as pd

In [2]:
# Load the job-market CSV. The path is relative, so the file must be present in
# the notebook's working directory.
df = pd.read_csv('ai_job_market_insights.csv')

In [3]:
# Preview the first rows to see which columns hold text categories and which are numeric.
df.head()

Unnamed: 0,Job_Title,Industry,Company_Size,Location,AI_Adoption_Level,Automation_Risk,Required_Skills,Salary_USD,Remote_Friendly,Job_Growth_Projection
0,Cybersecurity Analyst,Entertainment,Small,Dubai,Medium,High,UX/UI Design,111392.165243,Yes,Growth
1,Marketing Specialist,Technology,Large,Singapore,Medium,High,Marketing,93792.562466,No,Decline
2,AI Researcher,Technology,Large,Singapore,Medium,High,UX/UI Design,107170.263069,Yes,Growth
3,Sales Manager,Retail,Small,Berlin,Low,High,Project Management,93027.953758,No,Growth
4,Cybersecurity Analyst,Entertainment,Small,Tokyo,Low,Low,JavaScript,87752.922171,Yes,Decline


In [4]:
# List the column labels of the loaded frame.
df.columns

Index(['Job_Title', 'Industry', 'Company_Size', 'Location',
       'AI_Adoption_Level', 'Automation_Risk', 'Required_Skills', 'Salary_USD',
       'Remote_Friendly', 'Job_Growth_Projection'],
      dtype='object')

In [5]:
# Count missing values per column before encoding (isna is the same check as isnull).
df.isna().sum()

Job_Title                0
Industry                 0
Company_Size             0
Location                 0
AI_Adoption_Level        0
Automation_Risk          0
Required_Skills          0
Salary_USD               0
Remote_Friendly          0
Job_Growth_Projection    0
dtype: int64

In [6]:
# How many rows fall into each Job_Title category.
df['Job_Title'].value_counts()

Job_Title
Data Scientist           62
HR Manager               57
Cybersecurity Analyst    55
UX Designer              54
AI Researcher            51
Sales Manager            49
Marketing Specialist     48
Operations Manager       44
Software Engineer        41
Product Manager          39
Name: count, dtype: int64

In [7]:
# Distinct Job_Title values, in order of first appearance.
df['Job_Title'].unique()

array(['Cybersecurity Analyst', 'Marketing Specialist', 'AI Researcher',
       'Sales Manager', 'UX Designer', 'HR Manager', 'Product Manager',
       'Software Engineer', 'Data Scientist', 'Operations Manager'],
      dtype=object)

In [8]:
# One-hot encode the whole frame. With no `columns` argument every text column
# is expanded into indicator columns, while the numeric Salary_USD passes through.
dummy_df = pd.get_dummies(data=df)
dummy_df

Unnamed: 0,Salary_USD,Job_Title_AI Researcher,Job_Title_Cybersecurity Analyst,Job_Title_Data Scientist,Job_Title_HR Manager,Job_Title_Marketing Specialist,Job_Title_Operations Manager,Job_Title_Product Manager,Job_Title_Sales Manager,Job_Title_Software Engineer,...,Required_Skills_Marketing,Required_Skills_Project Management,Required_Skills_Python,Required_Skills_Sales,Required_Skills_UX/UI Design,Remote_Friendly_No,Remote_Friendly_Yes,Job_Growth_Projection_Decline,Job_Growth_Projection_Growth,Job_Growth_Projection_Stable
0,111392.165243,False,True,False,False,False,False,False,False,False,...,False,False,False,False,True,False,True,False,True,False
1,93792.562466,False,False,False,False,True,False,False,False,False,...,True,False,False,False,False,True,False,True,False,False
2,107170.263069,True,False,False,False,False,False,False,False,False,...,False,False,False,False,True,False,True,False,True,False
3,93027.953758,False,False,False,False,False,False,False,True,False,...,False,True,False,False,False,True,False,False,True,False
4,87752.922171,False,True,False,False,False,False,False,False,False,...,False,False,False,False,False,False,True,True,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
495,105821.394046,False,False,True,False,False,False,False,False,False,...,False,False,False,False,False,False,True,False,False,True
496,119794.992146,False,True,False,False,False,False,False,False,False,...,False,False,False,False,True,True,False,True,False,False
497,79644.933099,False,True,False,False,False,False,False,False,False,...,False,False,False,False,True,False,True,False,False,True
498,77642.150625,False,False,False,False,False,True,False,False,False,...,False,False,True,False,False,False,True,False,False,True


In [9]:
# Check the per-column null counts of the encoded frame.
dummy_df.isna().sum()

Salary_USD                            0
Job_Title_AI Researcher               0
Job_Title_Cybersecurity Analyst       0
Job_Title_Data Scientist              0
Job_Title_HR Manager                  0
Job_Title_Marketing Specialist        0
Job_Title_Operations Manager          0
Job_Title_Product Manager             0
Job_Title_Sales Manager               0
Job_Title_Software Engineer           0
Job_Title_UX Designer                 0
Industry_Education                    0
Industry_Energy                       0
Industry_Entertainment                0
Industry_Finance                      0
Industry_Healthcare                   0
Industry_Manufacturing                0
Industry_Retail                       0
Industry_Technology                   0
Industry_Telecommunications           0
Industry_Transportation               0
Company_Size_Large                    0
Company_Size_Medium                   0
Company_Size_Small                    0
Location_Berlin                       0


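Passing `drop_first=True` drops the first category of each encoded variable. That column is redundant: when all the remaining indicator columns of a variable are `False`, the row must belong to the dropped category. Removing it gives fewer columns and avoids perfectly correlated features.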

In [13]:
# Re-encode, omitting the first level of every categorical variable; that level
# is implied when all of its sibling indicator columns are False.
dummy_df = pd.get_dummies(data=df, drop_first=True)
dummy_df

Unnamed: 0,Salary_USD,Job_Title_Cybersecurity Analyst,Job_Title_Data Scientist,Job_Title_HR Manager,Job_Title_Marketing Specialist,Job_Title_Operations Manager,Job_Title_Product Manager,Job_Title_Sales Manager,Job_Title_Software Engineer,Job_Title_UX Designer,...,Required_Skills_JavaScript,Required_Skills_Machine Learning,Required_Skills_Marketing,Required_Skills_Project Management,Required_Skills_Python,Required_Skills_Sales,Required_Skills_UX/UI Design,Remote_Friendly_Yes,Job_Growth_Projection_Growth,Job_Growth_Projection_Stable
0,111392.165243,True,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,True,True,True,False
1,93792.562466,False,False,False,True,False,False,False,False,False,...,False,False,True,False,False,False,False,False,False,False
2,107170.263069,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,True,True,True,False
3,93027.953758,False,False,False,False,False,False,True,False,False,...,False,False,False,True,False,False,False,False,True,False
4,87752.922171,True,False,False,False,False,False,False,False,False,...,True,False,False,False,False,False,False,True,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
495,105821.394046,False,True,False,False,False,False,False,False,False,...,False,True,False,False,False,False,False,True,False,True
496,119794.992146,True,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,True,False,False,False
497,79644.933099,True,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,True,True,False,True
498,77642.150625,False,False,False,False,True,False,False,False,False,...,False,False,False,False,True,False,False,True,False,True


In [8]:
    import pandas as pd

# Create a DataFrame
df = pd.DataFrame({
    'Color': ['Red', 'Blue', 'Green', 'Red', 'Blue'],
    'Size': ['S', 'M', 'L', 'L', 'M']
})

# Apply get_dummies to create dummy variables
dummies = pd.get_dummies(df)
dummies


Unnamed: 0,Color_Blue,Color_Green,Color_Red,Size_L,Size_M,Size_S
0,False,False,True,False,False,True
1,True,False,False,False,True,False
2,False,True,False,True,False,False
3,False,False,True,True,False,False
4,True,False,False,False,True,False


In [16]:
# Encode only 'Color'; columns not named in `columns` (here 'Size') are kept unchanged.
dummies = pd.get_dummies(data=df, columns=['Color'])
dummies

Unnamed: 0,Size,Color_Blue,Color_Green,Color_Red
0,S,False,False,True
1,M,True,False,False
2,L,False,True,False
3,L,False,False,True
4,M,True,False,False


In [17]:
# With drop_first=True the first level of each column ('Color_Blue', 'Size_L')
# is omitted from the result.
dummies = pd.get_dummies(data=df, drop_first=True)
dummies

Unnamed: 0,Color_Green,Color_Red,Size_M,Size_S
0,False,True,False,True
1,False,False,True,False
2,True,False,False,False
3,False,True,False,False
4,False,False,True,False


In [18]:
# Apply get_dummies with dtype='int8' so the indicators are 0/1 integers
# instead of the default booleans; `.dtypes` confirms the column types.
dummies = pd.get_dummies(df, dtype='int8')
dummies.dtypes

Color_Blue     int8
Color_Green    int8
Color_Red      int8
Size_L         int8
Size_M         int8
Size_S         int8
dtype: object

In [None]:
# There are two types of categorical variables:
# 1. Ordinal variables: categories can be meaningfully ordered
#    (e.g. Company_Size: Small < Medium < Large).
# 2. Nominal variables: categories have no inherent order
#    (e.g. Country, Color); one-hot encoding is typically used for these.

In [1]:
# Import necessary libraries
import pandas as pd

# Toy purchase records, defined column by column
data = dict(
    Country=['USA', 'France', 'Germany', 'USA', 'Germany', 'France'],
    Purchased=['Yes', 'No', 'No', 'Yes', 'Yes', 'No'],
)

# Build the frame from the column mapping and display it
df = pd.DataFrame(data=data)
df


Unnamed: 0,Country,Purchased
0,USA,Yes
1,France,No
2,Germany,No
3,USA,Yes
4,Germany,Yes
5,France,No


In [2]:
# Use get_dummies() to one-hot encode the 'Country' column; 'Purchased' is
# not listed in `columns` and is therefore left as-is.
df_encoded = pd.get_dummies(df, columns=['Country'])

# Show the resulting DataFrame. A bare last expression renders it as a rich
# table, consistent with the other cells, instead of print()'s plain text.
df_encoded

  Purchased  Country_France  Country_Germany  Country_USA
0       Yes           False            False         True
1        No            True            False        False
2        No           False             True        False
3       Yes           False            False         True
4       Yes           False             True        False
5        No            True            False        False


In [3]:
# Same encoding minus the first level ('Country_France'), which becomes the
# implicit baseline: a France row has both remaining indicators False.
df_encoded = pd.get_dummies(
    data=df,
    columns=['Country'],
    drop_first=True,
)
df_encoded

Unnamed: 0,Purchased,Country_Germany,Country_USA
0,Yes,False,True
1,No,False,False
2,No,True,False
3,Yes,False,True
4,Yes,True,False
5,No,False,False


In [13]:
import pandas as pd

# Toy records with one missing Country (None) to illustrate NaN handling
data = dict(
    Country=['USA', None, 'Germany', 'France', 'Germany', 'USA'],
    Purchased=['Yes', 'No', 'Yes', 'No', 'Yes', 'No'],
    Age=[25, 30, 35, 40, 45, 50],
)

# Build the frame from the column mapping and display it
df = pd.DataFrame(data=data)
df

Unnamed: 0,Country,Purchased,Age
0,USA,Yes,25
1,,No,30
2,Germany,Yes,35
3,France,No,40
4,Germany,Yes,45
5,USA,No,50


In [5]:
# Custom naming: `prefix` and `prefix_sep` produce columns called
# 'Region-<value>' instead of the default 'Country_<value>'.
# Caution: with drop_first=True the row whose Country is missing encodes to
# all-False, exactly like the dropped baseline level ('France').
df_dummies_prefix = pd.get_dummies(
    data=df,
    columns=['Country'],
    prefix='Region',
    prefix_sep='-',
    drop_first=True,
)
df_dummies_prefix

Unnamed: 0,Purchased,Age,Region-Germany,Region-USA
0,Yes,25,False,True
1,No,30,False,False
2,Yes,35,True,False
3,No,40,False,False
4,Yes,45,True,False
5,No,50,False,True


In [14]:
# Re-display the source frame: row 1 has no Country value.
df

Unnamed: 0,Country,Purchased,Age
0,USA,Yes,25
1,,No,30
2,Germany,Yes,35
3,France,No,40
4,Germany,Yes,45
5,USA,No,50


In [15]:
# dummy_na=True adds an extra indicator column ('Country_nan') that flags rows
# whose Country is missing, so they are not encoded as all-False.
df_dummies_nan = pd.get_dummies(data=df, columns=['Country'], dummy_na=True)
df_dummies_nan

Unnamed: 0,Purchased,Age,Country_France,Country_Germany,Country_USA,Country_nan
0,Yes,25,False,False,True,False
1,No,30,False,False,False,True
2,Yes,35,False,True,False,False
3,No,40,True,False,False,False
4,Yes,45,False,True,False,False
5,No,50,False,False,True,False
