# Extracting more data from the locations

In [12]:
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score

In [13]:
# load data
df = pd.read_csv('data/encoded_data.csv')
df.head()

Unnamed: 0,work_year,job_title,salary,salary_currency,salary_in_usd,employee_residence,remote_ratio,company_location,experience_level_EN,experience_level_MI,experience_level_SE,experience_level_EX,employment_type_CT,employment_type_FL,employment_type_FT,employment_type_PT,same_country,company_size_L,company_size_M,company_size_S
0,2023,Data Scientist,80000,EUR,85847,ES,100,ES,True,True,True,False,False,False,True,False,True,True,False,False
1,2023,ML Engineer,30000,USD,30000,US,100,US,True,True,False,False,True,False,False,False,True,False,False,True
2,2023,ML Engineer,25500,USD,25500,US,100,US,True,True,False,False,True,False,False,False,True,False,False,True
3,2023,Data Scientist,175000,USD,175000,CA,100,CA,True,True,True,False,False,False,True,False,True,False,True,False
4,2023,Data Scientist,120000,USD,120000,CA,100,CA,True,True,True,False,False,False,True,False,True,False,True,False


In [14]:
pd.set_option('display.max_rows', None)
df['company_location'].value_counts()

company_location
US    3040
GB     172
CA      87
ES      77
IN      58
DE      56
FR      34
BR      15
AU      14
GR      14
PT      14
NL      13
MX      10
IE       7
SG       6
AT       6
JP       6
TR       5
CH       5
NG       5
PL       5
PK       4
LV       4
DK       4
IT       4
PR       4
SI       4
BE       4
CO       4
UA       4
HR       3
TH       3
RU       3
AR       3
CZ       3
AE       3
FI       3
AS       3
LU       3
HU       2
ID       2
LT       2
RO       2
SE       2
KE       2
EE       2
CF       2
IL       2
GH       2
EG       1
MD       1
CL       1
NZ       1
CN       1
IQ       1
DZ       1
HK       1
HN       1
MY       1
AL       1
MA       1
PH       1
BO       1
VN       1
AM       1
BA       1
SK       1
MK       1
BS       1
IR       1
CR       1
MT       1
Name: count, dtype: int64

In [15]:
pd.set_option('display.max_rows', None)
df['employee_residence'].value_counts()

employee_residence
US    3004
GB     167
CA      85
ES      80
IN      71
DE      48
FR      38
PT      18
BR      18
GR      16
NL      15
AU      11
MX      10
IT       8
PK       8
JP       7
IE       7
NG       7
AT       6
AR       6
PL       6
PR       5
TR       5
BE       5
SG       5
RU       4
LV       4
UA       4
CO       4
CH       4
SI       4
BO       3
DK       3
HR       3
HU       3
RO       3
TH       3
AE       3
VN       3
HK       2
UZ       2
PH       2
CF       2
CL       2
FI       2
CZ       2
SE       2
AS       2
LT       2
GH       2
KE       2
DZ       1
NZ       1
JE       1
MY       1
MD       1
IQ       1
BG       1
LU       1
RS       1
HN       1
EE       1
TN       1
CR       1
ID       1
EG       1
DO       1
CN       1
SK       1
IR       1
MA       1
IL       1
MK       1
BA       1
AM       1
CY       1
KW       1
MT       1
Name: count, dtype: int64

In [16]:
# make it so any country that shows up less than 5 times is grouped as other
df['company_location'] = df['company_location'].apply(lambda x: 'Other' if x in df['company_location'].value_counts().index[df['company_location'].value_counts() < 5] else x)
df['employee_residence'] = df['employee_residence'].apply(lambda x: 'Other' if x in df['employee_residence'].value_counts().index[df['employee_residence'].value_counts() < 5] else x)
df['company_location'].value_counts()

company_location
US       3040
GB        172
Other     106
CA         87
ES         77
IN         58
DE         56
FR         34
BR         15
GR         14
PT         14
AU         14
NL         13
MX         10
IE          7
SG          6
AT          6
JP          6
CH          5
NG          5
PL          5
TR          5
Name: count, dtype: int64

In [17]:
df['employee_residence'].value_counts()

employee_residence
US       3004
GB        167
Other      99
CA         85
ES         80
IN         71
DE         48
FR         38
BR         18
PT         18
GR         16
NL         15
AU         11
MX         10
IT          8
PK          8
JP          7
IE          7
NG          7
PL          6
AR          6
AT          6
SG          5
BE          5
TR          5
PR          5
Name: count, dtype: int64

In [18]:
# one hot encode and see how it affects the model
df = pd.get_dummies(df, columns=['company_location', 'employee_residence'])

df.head()

Unnamed: 0,work_year,job_title,salary,salary_currency,salary_in_usd,remote_ratio,experience_level_EN,experience_level_MI,experience_level_SE,experience_level_EX,...,employee_residence_NG,employee_residence_NL,employee_residence_Other,employee_residence_PK,employee_residence_PL,employee_residence_PR,employee_residence_PT,employee_residence_SG,employee_residence_TR,employee_residence_US
0,2023,Data Scientist,80000,EUR,85847,100,True,True,True,False,...,False,False,False,False,False,False,False,False,False,False
1,2023,ML Engineer,30000,USD,30000,100,True,True,False,False,...,False,False,False,False,False,False,False,False,False,True
2,2023,ML Engineer,25500,USD,25500,100,True,True,False,False,...,False,False,False,False,False,False,False,False,False,True
3,2023,Data Scientist,175000,USD,175000,100,True,True,True,False,...,False,False,False,False,False,False,False,False,False,False
4,2023,Data Scientist,120000,USD,120000,100,True,True,True,False,...,False,False,False,False,False,False,False,False,False,False


In [19]:
df = df.drop(columns=['job_title', 'salary'], axis=1)
df.head()

Unnamed: 0,work_year,salary_currency,salary_in_usd,remote_ratio,experience_level_EN,experience_level_MI,experience_level_SE,experience_level_EX,employment_type_CT,employment_type_FL,...,employee_residence_NG,employee_residence_NL,employee_residence_Other,employee_residence_PK,employee_residence_PL,employee_residence_PR,employee_residence_PT,employee_residence_SG,employee_residence_TR,employee_residence_US
0,2023,EUR,85847,100,True,True,True,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,2023,USD,30000,100,True,True,False,False,True,False,...,False,False,False,False,False,False,False,False,False,True
2,2023,USD,25500,100,True,True,False,False,True,False,...,False,False,False,False,False,False,False,False,False,True
3,2023,USD,175000,100,True,True,True,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4,2023,USD,120000,100,True,True,True,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [21]:
df['salary_currency'].value_counts()

salary_currency
USD    3224
EUR     236
GBP     161
INR      60
CAD      25
AUD       9
SGD       6
BRL       6
PLN       5
CHF       4
HUF       3
DKK       3
JPY       3
TRY       3
THB       2
ILS       1
HKD       1
CZK       1
MXN       1
CLP       1
Name: count, dtype: int64

: 

In [9]:
# one hot encode salary currency
df = pd.get_dummies(df, columns=['salary_currency'])
df.head()

Unnamed: 0,work_year,salary_in_usd,remote_ratio,experience_level_EN,experience_level_MI,experience_level_SE,experience_level_EX,employment_type_CT,employment_type_FL,employment_type_FT,...,salary_currency_HUF,salary_currency_ILS,salary_currency_INR,salary_currency_JPY,salary_currency_MXN,salary_currency_PLN,salary_currency_SGD,salary_currency_THB,salary_currency_TRY,salary_currency_USD
0,2023,85847,100,True,True,True,False,False,False,True,...,False,False,False,False,False,False,False,False,False,False
1,2023,30000,100,True,True,False,False,True,False,False,...,False,False,False,False,False,False,False,False,False,True
2,2023,25500,100,True,True,False,False,True,False,False,...,False,False,False,False,False,False,False,False,False,True
3,2023,175000,100,True,True,True,False,False,False,True,...,False,False,False,False,False,False,False,False,False,True
4,2023,120000,100,True,True,True,False,False,False,True,...,False,False,False,False,False,False,False,False,False,True


In [None]:
# linear regression model
x_train, x_test, y_train, y_test = train_test_split(df.drop('salary_in_usd', axis=1), df['salary_in_usd'], test_size=0.2, random_state=1)
model = LinearRegression()
model.fit(x_train, y_train)

# Make predictions on the test set
predictions = model.predict(x_test)

# Evaluate the model
print(f"Mean Squared Error: {mean_squared_error(y_test, predictions)}")
print(f"R-squared: {r2_score(y_test, predictions)}")
print(f"Accuracy: {model.score(x_test, y_test)}")

# Linear regression doesn't have feature_importances_, use coefficients instead
feature_importance = pd.DataFrame({'Feature': x_train.columns, 'Importance': abs(model.coef_)})
feature_importance = feature_importance.sort_values(by='Importance', ascending=False)
print(feature_importance)


Mean Squared Error: 2716889853.459754
R-squared: 0.32568961562562804
Accuracy: 0.32568961562562804

Top 10 most important features:
                     Feature    Importance
73       salary_currency_ILS  3.230641e+05
28       company_location_NG  1.060175e+05
52     employee_residence_NG  1.041989e+05
77       salary_currency_PLN  8.600192e+04
33       company_location_SG  6.296517e+04
79       salary_currency_THB  5.810472e+04
75       salary_currency_JPY  5.802274e+04
57     employee_residence_PR  5.664361e+04
62       salary_currency_AUD  5.613254e+04
65       salary_currency_CHF  5.507949e+04
36     employee_residence_AR  5.281840e+04
67       salary_currency_CZK  4.783202e+04
5        experience_level_EX  4.335965e+04
76       salary_currency_MXN  4.303754e+04
56     employee_residence_PL  4.176523e+04
16       company_location_BR  3.961300e+04
34       company_location_TR  3.827874e+04
60     employee_residence_TR  3.827874e+04
55     employee_residence_PK  3.616679e+04
39     e