### 환경준비

In [27]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split

import warnings    # suppress warning messages
# NOTE(review): no category or module is given, so this ignores every warning
# raised for the rest of the session, not just a specific noisy one.
warnings.filterwarnings(action='ignore')

In [28]:
# Load the salary dataset from a CSV file located in the working directory.
data = pd.read_csv("ds_salaries.csv")

In [29]:
# Preview the first rows of the raw frame.
data.head()

Unnamed: 0.1,Unnamed: 0,work_year,experience_level,employment_type,job_title,salary,salary_currency,salary_in_usd,employee_residence,remote_ratio,company_location,company_size
0,0,2020,MI,FT,Data Scientist,70000,EUR,79833,DE,0,DE,L
1,1,2020,SE,FT,Machine Learning Scientist,260000,USD,260000,JP,0,JP,S
2,2,2020,SE,FT,Big Data Engineer,85000,GBP,109024,GB,50,GB,M
3,3,2020,MI,FT,Product Data Analyst,20000,USD,20000,HN,0,HN,S
4,4,2020,SE,FT,Machine Learning Engineer,150000,USD,150000,US,50,US,L


In [30]:
# Summarise column names, non-null counts and dtypes.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 607 entries, 0 to 606
Data columns (total 12 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   Unnamed: 0          607 non-null    int64 
 1   work_year           607 non-null    int64 
 2   experience_level    607 non-null    object
 3   employment_type     607 non-null    object
 4   job_title           607 non-null    object
 5   salary              607 non-null    int64 
 6   salary_currency     607 non-null    object
 7   salary_in_usd       607 non-null    int64 
 8   employee_residence  607 non-null    object
 9   remote_ratio        607 non-null    int64 
 10  company_location    607 non-null    object
 11  company_size        607 non-null    object
dtypes: int64(5), object(7)
memory usage: 57.0+ KB


In [31]:
# Unnamed: 0 : 원본 CSV의 행 인덱스로 추정된다
# work_year: 급여가 지급된 연도 (예: 2020) — 근속년수가 아님
# experience_level : 경험치? EN Entry-level / Junior MI Mid-level / Intermediate SE Senior-level / Expert EX Executive-level / Director 이렇게 있다고 한다
# employment type : 고용타입  PT Part-time FT Full-time CT Contract FL Freelance 존재
# job_title: 직무 	The role worked in during the year.
# salary : 급여
# salary_currency : 급여 지급 단위 (원, 달러, 루피 등등)
# salary_in_usd : 달러기준 급여
# employee_residence : 직원 거주지
# company_location : 회사 위치
# company_size : 회사 크기,(직원 수로 정해짐 S less than 50 employees (small) M 50 to 250 employees (medium) L more than 250 employees (large))

In [32]:
# Use the currency-normalised salary (salary_in_usd) as the target; the raw
# salary, its currency and the "Unnamed: 0" column are dropped from the frame.
target = "salary_in_usd"
data.drop(columns=["Unnamed: 0", "salary", "salary_currency"], inplace=True)

# Separate the feature matrix from the target vector.
x = data.drop(columns=[target])
y = data[target]

### 이변량 분석 (타깃과의 상관관계)

In [41]:
# Correlation of every numeric column with the target, strongest first.
# The frame still contains string (object) columns at this point; pandas >= 2.0
# raises on them instead of silently skipping them, so restrict the frame to
# numeric columns explicitly. This form works on older pandas versions as well.
corr_matrix = data.select_dtypes(include="number").corr()
corr_matrix[target].sort_values(ascending=False)

salary_in_usd    1.000000
work_year        0.170493
remote_ratio     0.132122
Name: salary_in_usd, dtype: float64

#### 데이터 전처리

1. NA 확인

In [33]:
# Count missing values: one count per feature column, and a single total for
# the target series.
missing_per_feature = x.isna().sum()
missing_in_target = y.isna().sum()
print(missing_per_feature)
print(missing_in_target)

work_year             0
experience_level      0
employment_type       0
job_title             0
employee_residence    0
remote_ratio          0
company_location      0
company_size          0
dtype: int64
0


2. 가변수화

In [34]:
# Categorical columns to one-hot encode.
dummy_vals = ["experience_level", "employment_type", "job_title", "employee_residence", "company_location", "company_size"]

# The list must be passed as `columns=`: the second positional parameter of
# pd.get_dummies is `prefix`, so passing it positionally only worked because the
# list happened to line up with the frame's object columns.
# drop_first=True keeps k-1 indicator columns for each k-level feature.
x = pd.get_dummies(x, columns=dummy_vals, drop_first=True)
x.head()

Unnamed: 0,work_year,remote_ratio,experience_level_EX,experience_level_MI,experience_level_SE,employment_type_FL,employment_type_FT,employment_type_PT,job_title_AI Scientist,job_title_Analytics Engineer,...,company_location_RO,company_location_RU,company_location_SG,company_location_SI,company_location_TR,company_location_UA,company_location_US,company_location_VN,company_size_M,company_size_S
0,2020,0,0,1,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,2020,0,0,0,1,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,1
2,2020,50,0,0,1,0,1,0,0,0,...,0,0,0,0,0,0,0,0,1,0
3,2020,0,0,1,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,1
4,2020,50,0,0,1,0,1,0,0,0,...,0,0,0,0,0,0,1,0,0,0


In [35]:
# Hold out 20% of the rows as the test set. A fixed random_state makes the
# shuffle, and therefore every downstream metric, reproducible across runs.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

In [36]:
# Split 20% of the current training rows off as a validation set. A fixed
# random_state keeps the partition identical from run to run.
x_train, x_val, y_train, y_val = train_test_split(x_train, y_train, test_size=0.2, random_state=42)

#### Linear Regression

In [38]:
# 모델링용
from sklearn.linear_model import LinearRegression

# 회귀모델 평가용
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error, mean_absolute_percentage_error

In [40]:
# Fit a linear regression on the training split (fit returns the estimator
# itself), then predict salaries for the validation split.
model1 = LinearRegression().fit(x_train, y_train)
pred1 = model1.predict(x_val)