<a href="https://colab.research.google.com/github/havaledar/ECON3740/blob/main/W24_ECON3740_Lab_9.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Libraries

In [1]:
!pip install gdown > /dev/null 2>&1
!pip install stargazer > /dev/null 2>&1

import gdown
import pandas as pd
import numpy as np
import statsmodels.formula.api as smf
from stargazer.stargazer import Stargazer

## Downloading

In [2]:
url = 'https://drive.google.com/file/d/1m0d9gZZBfHm7qGerKKLN7UzWaNUbQi0R/view?usp=sharing'
output_filename = 'lfs.dta'
gdown.download(url, output_filename, fuzzy=True ,quiet=False)

Downloading...
From: https://drive.google.com/uc?id=1m0d9gZZBfHm7qGerKKLN7UzWaNUbQi0R
To: /content/lfs.dta
100%|██████████| 13.6M/13.6M [00:00<00:00, 74.4MB/s]


'lfs.dta'

## Reading

In [3]:
data = pd.read_stata('lfs.dta', convert_categoricals=True)

#Questtion 1

## Dummy variables

In [33]:
pd.get_dummies(data['SEX'])

Unnamed: 0,Male,Female
0,0,1
1,1,0
2,1,0
3,1,0
4,1,0
...,...,...
103398,0,1
103399,1,0
103400,0,1
103401,0,1


In [4]:
# Convert 'SEX' column to categorical and get dummies
sex_dummies = pd.get_dummies(data['SEX'], prefix='gender')
sex_dummies

Unnamed: 0,gender_Male,gender_Female
0,0,1
1,1,0
2,1,0
3,1,0
4,1,0
...,...,...
103398,0,1
103399,1,0
103400,0,1
103401,0,1


In [5]:
# Take only one column as 0 and 1 are inversely related
data['gender'] = sex_dummies["gender_Female"]

In [6]:
# Display the first few rows of the DataFrame with the new 'gender' column
print(data[['SEX', 'gender']].head())

      SEX  gender
0  Female       1
1    Male       0
2    Male       0
3    Male       0
4    Male       0


If we import the values, in addition to 'get_dummies', we can use this method too.
```python
data['sex'] = data['SEX'] - 1
```

## Mean

In [7]:
data[data['gender'] == 1]['HRLYEARN'].mean().round(2)

34.94

In [8]:
data[data['gender'] == 0]['HRLYEARN'].mean().round(2)

30.92

In [9]:
(data[data['gender'] == 0]['HRLYEARN'].mean() - data[data['gender'] == 1]['HRLYEARN'].mean()).round(2)

4.02

## Group by

In [10]:
data.groupby('SEX')['HRLYEARN'].mean()


Unnamed: 0,SEX,Wage
0,Male,34.937034
1,Female,30.918263


## Regression

In [11]:
# Create a simple linear regression model
results_1 = smf.ols('HRLYEARN ~ SEX', data).fit()

# Display the regression results
print(results_1.summary())

                            OLS Regression Results                            
Dep. Variable:               HRLYEARN   R-squared:                       0.013
Model:                            OLS   Adj. R-squared:                  0.013
Method:                 Least Squares   F-statistic:                     692.6
Date:                Fri, 15 Mar 2024   Prob (F-statistic):          1.15e-151
Time:                        14:29:05   Log-Likelihood:            -2.2925e+05
No. Observations:               53443   AIC:                         4.585e+05
Df Residuals:                   53441   BIC:                         4.585e+05
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                    coef    std err          t      P>|t|      [0.025      0.975]
---------------------------------------------------------------------------------
Intercept        34.9370      0.107    326.087

## Log

In [12]:
data['lwage'] = np.log(data['HRLYEARN'])

In [13]:
data['HRLYEARN'].isna().sum()

49960

In [14]:
# Create a simple linear regression model
results_2 = smf.ols('lwage ~ EDUC + AGE_12 + SEX + MARSTAT + PROV', data).fit()

# Display the regression results
Stargazer([results_2])

0,1
,
,Dependent variable: lwage
,
,(1)
,
AGE_12[T.20 to 24 years],0.068***
,(0.009)
AGE_12[T.25 to 29 years],0.222***
,(0.009)
AGE_12[T.30 to 34 years],0.326***


In [15]:
# Create a simple linear regression model
results_3 = smf.ols('lwage ~ EDUC * SEX', data).fit()

# Display the regression results
Stargazer([results_3])

0,1
,
,Dependent variable: lwage
,
,(1)
,
EDUC[T.Above bachelor's degree],0.644***
,(0.022)
EDUC[T.Above bachelor's degree]:SEX[T.Female],0.130***
,(0.035)
EDUC[T.Bachelor's degree],0.496***


# Question 2

## Participation

In [16]:
data['LFSSTAT'].unique()

['Unemployed', 'Employed, at work', 'Not in labour force', 'Employed, absent from work']
Categories (4, object): ['Employed, at work' < 'Employed, absent from work' < 'Unemployed' <
                         'Not in labour force']

In [17]:
data['partic'] = (data['LFSSTAT'] != 'Not in labour force').astype(int)

In [18]:
data[['LFSSTAT', 'partic']]

Unnamed: 0,LFSSTAT,partic
0,Unemployed,1
1,"Employed, at work",1
2,"Employed, at work",1
3,Not in labour force,0
4,Not in labour force,0
...,...,...
103398,"Employed, at work",1
103399,Unemployed,1
103400,"Employed, at work",1
103401,"Employed, at work",1


## Method 1

In [19]:
data['partic'].value_counts(normalize=True)

1    0.637341
0    0.362659
Name: partic, dtype: float64

In [20]:
pd.crosstab(data['partic'], data['SEX'], normalize=1)

SEX,Male,Female
partic,Unnamed: 1_level_1,Unnamed: 2_level_1
0,0.318979,0.403955
1,0.681021,0.596045


## Method 2

In [21]:
# Filtering based on SEX == 1
dm = data[data['SEX'] == "Male"]

# Filtering based on SEX == 2
df = data[data['SEX'] == "Female"]

In [22]:
dm['partic'].describe()

count    50251.000000
mean         0.681021
std          0.466085
min          0.000000
25%          0.000000
50%          1.000000
75%          1.000000
max          1.000000
Name: partic, dtype: float64

In [23]:
df['partic'].describe()

count    53152.000000
mean         0.596045
std          0.490693
min          0.000000
25%          0.000000
50%          1.000000
75%          1.000000
max          1.000000
Name: partic, dtype: float64

## Married women

In [24]:
df_married = df[((df['MARSTAT'] == "Married") |
                 (df['MARSTAT'] == "Living in common-law")) &
                (df['SEX'] == "Female")]

In [25]:
results_4 = smf.ols('partic ~ EDUC + AGE_12 + PROV + AGYOWNK', data=df_married).fit()
Stargazer([results_4])

0,1
,
,Dependent variable: partic
,
,(1)
,
AGE_12[T.20 to 24 years],0.587**
,(0.251)
AGE_12[T.25 to 29 years],0.707***
,(0.249)
AGE_12[T.30 to 34 years],0.732***


In [26]:
df_married['AGYOWNK'].value_counts(dropna=False)

NaN                                 19360
Youngest child less than 6 years     4371
Youngest child 6 to 12 years         3465
Youngest child 13 to 17 years        2122
Youngest child 18 to 24 years        1958
Name: AGYOWNK, dtype: int64

In [27]:
df_married['AGYOWNK'] = df_married['AGYOWNK'].cat.add_categories('No children')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_married['AGYOWNK'] = df_married['AGYOWNK'].cat.add_categories('No children')


In [28]:
df_married['AGYOWNK'].fillna('No children', inplace = True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_married['AGYOWNK'].fillna('No children', inplace = True)


In [29]:
df_married['AGYOWNK'].value_counts(dropna=False)

No children                         19360
Youngest child less than 6 years     4371
Youngest child 6 to 12 years         3465
Youngest child 13 to 17 years        2122
Youngest child 18 to 24 years        1958
Name: AGYOWNK, dtype: int64

In [30]:
results_5 = smf.ols('partic ~ EDUC + AGE_12 + PROV + AGYOWNK', data=df_married).fit()
Stargazer([results_4, results_5])

0,1,2
,,
,Dependent variable: partic,Dependent variable: partic
,,
,(1),(2)
,,
AGE_12[T.20 to 24 years],0.587**,0.005
,(0.251),(0.079)
AGE_12[T.25 to 29 years],0.707***,0.047
,(0.249),(0.077)
AGE_12[T.30 to 34 years],0.732***,0.050


In [31]:
!rclone config

/bin/bash: line 1: rclone: command not found


In [32]:
!rclone lsd remote:

/bin/bash: line 1: rclone: command not found
