##  <br>FEATURE ENCODING </br>

<p style='font-size:18px'> <b>Used in the categorical non numerical data </b></p>


<b> Types of the categorical data</b>
* Nominal data:It has no inherent order
    - One hot endcoding is done for the nominal data
* Ordinal data
    - Ordinal Encoding is done for the ordinal data
    - label Encoding is done for the output labels

### 1. Ordinal Encoding ###
for eg:
| **education level**| **encoded value**|
|------------------|--------------------|
|under-graduate| 1|
|high school|0|
|graduate|2|
|high school|0|

In [65]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [66]:
df=pd.read_csv('Student_performance_10k.csv')

In [67]:
df.sample(5)

Unnamed: 0,roll_no,gender,race_ethnicity,parental_level_of_education,lunch,test_preparation_course,math_score,reading_score,writing_score,science_score,total_score,grade
1907,std-1908,male,group C,master's degree,1.0,1.0,33,92.0,64.0,66.0,255.0,B
2224,std-2225,male,group E,bachelor's degree,1.0,0.0,58,82.0,51.0,67.0,258.0,B
537,std-538,male,group C,some high school,1.0,1.0,47,48.0,78.0,89.0,262.0,B
9890,std-9891,female,group E,associate's degree,1.0,0.0,84,86.0,72.0,99.0,341.0,A
6006,std-6007,female,group C,high school,1.0,0.0,85,60.0,74.0,72.0,291.0,B


In [68]:
df.describe()

Unnamed: 0,lunch,test_preparation_course,reading_score,writing_score,science_score,total_score
count,9976.0,9977.0,9975.0,9976.0,9977.0,9981.0
mean,0.644246,0.388694,70.125915,71.415798,66.063045,264.740908
std,0.478765,0.487478,19.026245,18.24536,19.324331,42.304858
min,0.0,0.0,17.0,10.0,9.0,89.0
25%,0.0,0.0,57.0,59.0,53.0,237.0
50%,1.0,0.0,71.0,72.5,67.0,268.0
75%,1.0,1.0,85.0,85.0,81.0,294.0
max,1.0,1.0,100.0,100.0,100.0,383.0


In [69]:
df['race_ethnicity'].unique()

array(['group D', 'group B', 'group C', 'group E', 'group A', 'A', nan,
       'C', 'D', 'group C\\n', 'B', 'E'], dtype=object)

In [70]:
map_values = {
    'C': 'group C',
    'group C': 'group C',
    'group C\n': 'group C',  # Note single backslash for actual newline
    'A': 'group A',
    'E': 'group E',
    'D': 'group D',
    'B': 'group B',
    'group A':'group A',
    'group B':'group B',
    'group C':'group C',
    'group D':'group D',
    'group E':'group E'
}
df['race_ethnicity'] = df['race_ethnicity'].map(map_values)

In [71]:
df['race_ethnicity']

0       group D
1       group B
2       group C
3       group D
4       group C
         ...   
9995    group C
9996    group C
9997    group B
9998    group B
9999    group B
Name: race_ethnicity, Length: 10000, dtype: object

In [72]:
df['race_ethnicity'].unique()

array(['group D', 'group B', 'group C', 'group E', 'group A', nan],
      dtype=object)

In [73]:
df.isnull().sum()

roll_no                         1
gender                         18
race_ethnicity                 27
parental_level_of_education    22
lunch                          24
test_preparation_course        23
math_score                     24
reading_score                  25
writing_score                  24
science_score                  23
total_score                    19
grade                           3
dtype: int64

In [74]:
df.dropna(axis=0,inplace=True,subset=['race_ethnicity','lunch','gender','roll_no','test_preparation_course','parental_level_of_education'])

In [75]:
df.isnull().sum()

roll_no                         0
gender                          0
race_ethnicity                  0
parental_level_of_education     0
lunch                           0
test_preparation_course         0
math_score                     23
reading_score                  24
writing_score                  24
science_score                  23
total_score                    19
grade                           3
dtype: int64

In [76]:
df.describe()

Unnamed: 0,lunch,test_preparation_course,reading_score,writing_score,science_score,total_score
count,9888.0,9888.0,9864.0,9864.0,9865.0,9869.0
mean,0.644013,0.389159,70.120742,71.410483,66.052205,264.76948
std,0.478836,0.487584,19.028816,18.23338,19.320498,42.304509
min,0.0,0.0,17.0,10.0,9.0,89.0
25%,0.0,0.0,57.0,59.0,53.0,237.0
50%,1.0,0.0,71.0,73.0,67.0,268.0
75%,1.0,1.0,85.0,85.0,81.0,295.0
max,1.0,1.0,100.0,100.0,100.0,383.0


In [77]:
df.head()

Unnamed: 0,roll_no,gender,race_ethnicity,parental_level_of_education,lunch,test_preparation_course,math_score,reading_score,writing_score,science_score,total_score,grade
0,std-01,male,group D,some college,1.0,1.0,89,38.0,85.0,26.0,238.0,C
1,std-02,male,group B,high school,1.0,0.0,65,100.0,67.0,96.0,328.0,A
2,std-03,male,group C,master's degree,1.0,0.0,10,99.0,97.0,58.0,264.0,B
3,std-04,male,group D,some college,1.0,1.0,22,51.0,41.0,84.0,198.0,D
4,std-05,male,group C,some college,0.0,1.0,26,58.0,64.0,65.0,213.0,C


In [78]:
print(df['parental_level_of_education'].unique())
print(df['grade'].unique())

['some college' 'high school' "master's degree" "associate's degree"
 'some high school' "bachelor's degree"]
['C' 'A' 'B' 'D' nan 'Fail']


In [79]:
from sklearn.model_selection import train_test_split
X_data=df[['parental_level_of_education']]
Y_data=df[['grade']]
X_train,X_test,Y_train,Y_test=train_test_split(X_data,Y_data,test_size=0.3,random_state=20)



In [80]:
from sklearn.preprocessing import OrdinalEncoder
scaler = OrdinalEncoder(categories=[
    ['some high school', 'high school', 'some college', "bachelor's degree", "associate's degree", "master's degree"]
])
scaler.fit(X_train)
X_train_encoded = scaler.transform(X_train)
X_train_encoded = pd.DataFrame(X_train_encoded, columns=X_train.columns)


In [81]:
X_train.head()

Unnamed: 0,parental_level_of_education
8884,some high school
3211,bachelor's degree
4446,some college
5192,some college
4622,some college


In [82]:
X_train_encoded.head()

Unnamed: 0,parental_level_of_education
0,0.0
1,3.0
2,2.0
3,2.0
4,2.0


In [84]:
X_test_encoded=scaler.transform(X_test)
X_test_encoded=pd.DataFrame(X_test_encoded,columns=X_test.columns)

In [85]:
X_test.head()

Unnamed: 0,parental_level_of_education
8150,some college
4405,bachelor's degree
4563,some high school
263,associate's degree
1743,some college


In [86]:
X_test_encoded.head()

Unnamed: 0,parental_level_of_education
0,2.0
1,3.0
2,0.0
3,4.0
4,2.0


### 2. Label Encoding

In [88]:
from sklearn.preprocessing import LabelEncoder
le=LabelEncoder()
le.fit(Y_train)
Y_train_encoded=le.transform(Y_train)
Y_train_encoded=pd.DataFrame(Y_train_encoded,columns=Y_train.columns)

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)


In [91]:
le.classes_

array(['A', 'B', 'C', 'D', 'Fail', nan], dtype=object)

In [90]:
Y_train_encoded.head()

Unnamed: 0,grade
0,1
1,1
2,1
3,2
4,3
