In [1]:
import numpy as np
import pandas as pd


## Read the `Wage` File

In [2]:
wage_df = pd.read_csv("wage.csv")

In [3]:
wage_df.head(3)

Unnamed: 0,year,age,maritl,race,education,region,jobclass,health,health_ins,logwage,wage
0,2006,18,1. Never Married,1. White,1. < HS Grad,2. Middle Atlantic,1. Industrial,1. <=Good,2. No,4.318063,75.043154
1,2004,24,1. Never Married,1. White,4. College Grad,2. Middle Atlantic,2. Information,2. >=Very Good,2. No,4.255273,70.47602
2,2003,45,2. Married,1. White,3. Some College,2. Middle Atlantic,1. Industrial,1. <=Good,1. Yes,4.875061,130.982177


In [4]:
wage_df.dtypes  # every column except year, age, logwage and wage has dtype object, i.e. it holds categorical text labels

year            int64
age             int64
maritl         object
race           object
education      object
region         object
jobclass       object
health         object
health_ins     object
logwage       float64
wage          float64
dtype: object

### Check unique values in the `jobclass` column

In [5]:
wage_df.jobclass.unique() #only two -- easy to replace

array(['1. Industrial', '2. Information'], dtype=object)

In [6]:
# Indicator column: 1 for "2. Information" jobs, 0 for the only other class ("1. Industrial").
wage_df["job_information"] = wage_df["jobclass"].eq("2. Information").astype(int)

In [7]:
# Remove the columns that are not kept for the model: the jobclass text (already encoded
# as job_information), logwage (the log of the wage target), and also region and year.
wage_df.drop(columns=['jobclass', 'logwage', 'region', 'year'], inplace=True)

### Check unique values in the `health` column

In [8]:
wage_df.health.unique() #only two -- easy to replace

array(['1. <=Good', '2. >=Very Good'], dtype=object)

In [9]:
# Overwrite the text labels with an indicator: 1 = "2. >=Very Good", 0 = "1. <=Good".
# Run once per fresh load: a second run would compare the already-encoded integers
# with the label string and turn the whole column into zeros.
wage_df["health"] = wage_df["health"].eq("2. >=Very Good").astype(int)

### Apply the same for `health_ins`

In [10]:
wage_df.health_ins.unique() #only two -- easy to replace

array(['2. No', '1. Yes'], dtype=object)

In [11]:
# Overwrite the text labels with an indicator: 1 = "1. Yes" (insured), 0 = "2. No".
# Run once per fresh load: a second run would compare the already-encoded integers
# with the label string and turn the whole column into zeros.
wage_df["health_ins"] = wage_df["health_ins"].eq("1. Yes").astype(int)

### Check unique values in the `maritl` column

In [12]:
wage_df.maritl.unique() #we cannot make this 1-2-3-4-5 as this is nominal

array(['1. Never Married', '2. Married', '4. Divorced', '3. Widowed',
       '5. Separated'], dtype=object)

In [13]:
# One indicator column per marital status, named "marriage_<label>".
# dtype=int pins 0/1 integer columns; without it pandas >= 2.0 returns booleans,
# which would not reproduce the 0/1 values displayed later in the notebook.
one_hot = pd.get_dummies(wage_df["maritl"], prefix='marriage', dtype=int)

In [14]:
# Append the marriage_* indicator columns; join matches rows on the shared index.
wage_df = wage_df.join(one_hot)

In [15]:
# "Never Married" serves as the baseline category, so its indicator is removed
# together with the original text column.
wage_df.drop(columns=['maritl', 'marriage_1. Never Married'], inplace=True)

In [16]:
# Give the four remaining marital-status indicators short names. Renaming by label,
# not by position, means a changed column order can never attach a name to the
# wrong indicator.
# Why one category was dropped: the five indicators always sum to 1, which makes them
# perfectly collinear with the model intercept. "Never Married" is the baseline, so a
# row with all four indicators at 0 is a never-married person and each marriage_*
# coefficient is measured relative to that group (marriage_yes = currently married).
wage_df = wage_df.rename(columns={
    'marriage_2. Married': 'marriage_yes',
    'marriage_3. Widowed': 'marriage_widowed',
    'marriage_4. Divorced': 'marriage_divorced',
    'marriage_5. Separated': 'marriage_separated',
})

### Decide what to do for `education`

In [17]:
wage_df.education.unique() #it looks like we can take these as ordinal categories

array(['1. < HS Grad', '4. College Grad', '3. Some College', '2. HS Grad',
       '5. Advanced Degree'], dtype=object)

In [18]:
# Every label starts with its rank ("1. < HS Grad" ... "5. Advanced Degree"),
# so the leading character is used as the ordinal code 1-5.
wage_df["education"] = (
    wage_df["education"]
    .astype(str)
    .str.get(0)
    .astype(int)
)

### Question to deliver on encoding categorical variables
- Encode the `race` variable by using numerical variable(s) and drop `race`.
- Now that all the variables in our dataframe are numerical, apply a linear regression model to explain the `wage` with the other variables. Report the coefficients.
- Interpret the coefficient of the `education` variable.

#### First encode the variable. Then fit a linear model.

In [19]:
wage_df.head(10)

Unnamed: 0,age,race,education,health,health_ins,wage,job_information,marriage_yes,marriage_widowed,marriage_divorced,marriage_separated
0,18,1. White,1,0,0,75.043154,0,0,0,0,0
1,24,1. White,4,1,0,70.47602,1,0,0,0,0
2,45,1. White,3,0,1,130.982177,0,1,0,0,0
3,43,3. Asian,4,1,1,154.685293,1,1,0,0,0
4,50,1. White,2,0,1,75.043154,1,0,0,1,0
5,54,1. White,4,1,1,127.115744,1,1,0,0,0
6,44,4. Other,3,1,1,169.528538,0,1,0,0,0
7,30,3. Asian,3,0,1,111.720849,1,0,0,0,0
8,41,2. Black,3,1,1,118.884359,1,0,0,0,0
9,52,1. White,2,1,1,128.680488,1,1,0,0,0


In [20]:
wage_df.race.unique()  # four nominal categories -- needs one-hot encoding rather than a single 0/1 flag

array(['1. White', '3. Asian', '4. Other', '2. Black'], dtype=object)

In [21]:
# One-hot encode the nominal 'race' column: one indicator per category, named "race_<label>".
# dtype=int keeps the indicators as 0/1 integers; pandas >= 2.0 would otherwise emit
# booleans, which would not reproduce the 0/1 values displayed in the next cell.
race_dummies = pd.get_dummies(wage_df["race"], prefix='race', dtype=int)
wage_df = wage_df.join(race_dummies)


In [22]:
wage_df.head(3)

Unnamed: 0,age,race,education,health,health_ins,wage,job_information,marriage_yes,marriage_widowed,marriage_divorced,marriage_separated,race_1. White,race_2. Black,race_3. Asian,race_4. Other
0,18,1. White,1,0,0,75.043154,0,0,0,0,0,1,0,0,0
1,24,1. White,4,1,0,70.47602,1,0,0,0,0,1,0,0,0
2,45,1. White,3,0,1,130.982177,0,1,0,0,0,1,0,0,0


In [24]:
# Remove the original text column and the "4. Other" indicator, which leaves
# "Other" as the baseline that the remaining race_* indicators are compared with.
wage_df.drop(columns=['race', 'race_4. Other'], inplace=True)

In [25]:
wage_df.head(3)

Unnamed: 0,age,education,health,health_ins,wage,job_information,marriage_yes,marriage_widowed,marriage_divorced,marriage_separated,race_1. White,race_2. Black,race_3. Asian
0,18,1,0,0,75.043154,0,0,0,0,0,1,0,0
1,24,4,1,0,70.47602,1,0,0,0,0,1,0,0
2,45,3,0,1,130.982177,0,1,0,0,0,1,0,0


In [None]:
from sklearn.linear_model import LinearRegression

In [25]:
# Ordinary least squares: explain 'wage' with every other column of wage_df.
# NOTE(review): the saved output of this cell still lists 'race_4. Other', so it was
# produced before that column was dropped -- restart the kernel and run all cells to refresh it.
X = wage_df.drop(columns='wage')  # predictors
y = wage_df['wage']  # target

reg = LinearRegression()
reg.fit(X, y)

# Fitted slopes, in the same order as X.columns (the intercept lives in reg.intercept_)
coefficients = reg.coef_
coef_by_feature = dict(zip(X.columns, coefficients))

# Slope attached to the ordinal education code
education_coef = coef_by_feature['education']
print(f"Coefficient for education: {education_coef}")

# Full feature -> coefficient mapping
print("Coefficients for all features:")
print(coef_by_feature)

Coefficient for education: 13.344081064362944
Coefficients for all features:
{'age': 0.3037475967955372, 'education': 13.344081064362944, 'health': 6.6042296326767564, 'health_ins': 16.996044805507147, 'job_information': 3.7856459769041266, 'marriage_yes': 17.179947269886267, 'marriage_widowed': 0.9727074421567062, 'marriage_divorced': 3.491899032315769, 'marriage_separated': 12.13254898695342, 'race_1. White': 2.841194038310302, 'race_2. Black': -2.3011198843652, 'race_3. Asian': 1.3530328024171732, 'race_4. Other': -1.8931069563622769}


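[Jing]: The coefficient for education is 13.344, which is positive. Because education is encoded as an ordinal 1–5 scale, moving up one education level is associated with a wage that is about 13.34 higher, holding the other variables fixed. This suggests that higher education levels are associated with higher wages.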