# Stack over flow salary survey

### importing libraries

In [12]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

### Data preprocessing

In [13]:
ds = pd.read_csv("../Data/survey_results_public.csv")
ds.head()

Unnamed: 0,ResponseId,Age,Employment,RemoteWork,EdLevel,YearsCode,YearsCodePro,Country,LanguageHaveWorkedWith,JobSat
0,1,Under 18 years old,"Employed, full-time",Remote,Primary/elementary school,,,United States of America,,
1,2,35-44 years old,"Employed, full-time",Remote,"Bachelor’s degree (B.A., B.S., B.Eng., etc.)",20.0,17.0,United Kingdom of Great Britain and Northern I...,Bash/Shell (all shells);Go;HTML/CSS;Java;JavaS...,
2,3,45-54 years old,"Employed, full-time",Remote,"Master’s degree (M.A., M.S., M.Eng., MBA, etc.)",37.0,27.0,United Kingdom of Great Britain and Northern I...,C#,
3,4,18-24 years old,"Student, full-time",,Some college/university study without earning ...,4.0,,Canada,C;C++;HTML/CSS;Java;JavaScript;PHP;PowerShell;...,
4,5,18-24 years old,"Student, full-time",,"Secondary school (e.g. American high school, G...",9.0,,Norway,C++;HTML/CSS;JavaScript;Lua;Python;Rust,


### Handle missing data

In [14]:
from sklearn.impute import SimpleImputer

columns_to_impute = [
    "Age",
    "Employment",
    "RemoteWork",
    "EdLevel",
    "Country",
    "LanguageHaveWorkedWith",
    "YearsCode",
    "YearsCodePro",
    "JobSat",
]

imputer = SimpleImputer(missing_values=np.nan, strategy="most_frequent")
imputer.fit(ds[columns_to_impute])
ds[columns_to_impute] = imputer.transform(ds[columns_to_impute])

In [15]:
ds.head()

Unnamed: 0,ResponseId,Age,Employment,RemoteWork,EdLevel,YearsCode,YearsCodePro,Country,LanguageHaveWorkedWith,JobSat
0,1,Under 18 years old,"Employed, full-time",Remote,Primary/elementary school,10,2,United States of America,HTML/CSS;JavaScript;TypeScript,8.0
1,2,35-44 years old,"Employed, full-time",Remote,"Bachelor’s degree (B.A., B.S., B.Eng., etc.)",20,17,United Kingdom of Great Britain and Northern I...,Bash/Shell (all shells);Go;HTML/CSS;Java;JavaS...,8.0
2,3,45-54 years old,"Employed, full-time",Remote,"Master’s degree (M.A., M.S., M.Eng., MBA, etc.)",37,27,United Kingdom of Great Britain and Northern I...,C#,8.0
3,4,18-24 years old,"Student, full-time","Hybrid (some remote, some in-person)",Some college/university study without earning ...,4,2,Canada,C;C++;HTML/CSS;Java;JavaScript;PHP;PowerShell;...,8.0
4,5,18-24 years old,"Student, full-time","Hybrid (some remote, some in-person)","Secondary school (e.g. American high school, G...",9,2,Norway,C++;HTML/CSS;JavaScript;Lua;Python;Rust,8.0


In [16]:
ds.isnull().sum()

ResponseId                0
Age                       0
Employment                0
RemoteWork                0
EdLevel                   0
YearsCode                 0
YearsCodePro              0
Country                   0
LanguageHaveWorkedWith    0
JobSat                    0
dtype: int64

### Now that we handled the missing data, we split the dataset

In [17]:
X = ds.iloc[:, 1:-1].values
y = ds.iloc[:, -1].values

print(X)

[['Under 18 years old' 'Employed, full-time' 'Remote' ... '2'
  'United States of America' 'HTML/CSS;JavaScript;TypeScript']
 ['35-44 years old' 'Employed, full-time' 'Remote' ... '17'
  'United Kingdom of Great Britain and Northern Ireland'
  'Bash/Shell (all shells);Go;HTML/CSS;Java;JavaScript;Python;TypeScript']
 ['45-54 years old' 'Employed, full-time' 'Remote' ... '27'
  'United Kingdom of Great Britain and Northern Ireland' 'C#']
 ...
 ['25-34 years old' 'Employed, full-time' 'In-person' ... '5'
  'United States of America' 'HTML/CSS;JavaScript;TypeScript']
 ['18-24 years old' 'Employed, full-time'
  'Hybrid (some remote, some in-person)' ... '2' 'Germany'
  'C;C++;Go;Lua;Objective-C;Python;Rust;SQL']
 ['18-24 years old' 'Student, full-time'
  'Hybrid (some remote, some in-person)' ... '2'
  'United States of America'
  'C;HTML/CSS;Java;JavaScript;PHP;Python;TypeScript']]


IndexError: only integers, slices (`:`), ellipsis (`...`), numpy.newaxis (`None`) and integer or boolean arrays are valid indices

### Splitting the dataset into the Training set and Test set

In [11]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)