import sys
!{sys.executable} -m pip install plotly
!{sys.executable} -m pip install statsmodels

In [64]:
import numpy as np
import pandas as pd
import plotly.graph_objects as go
import statsmodels.api as sm
import matplotlib.pylab as plt
import seaborn as sns

The data is saved as a CSV-file online so here I can just read it as it is. 

In [65]:
df = pd.read_csv("https://raw.githubusercontent.com/HaliaeetusAlbicilla/k24salary/master/kode24salary.csv")

Familiarizing with the dataset

In [66]:
df.head()

Unnamed: 0,Hva er din alder?,"Hvor mange år relevant, formell utdannelse har du?",Hvor mange års relevant arbeidserfaring har du?,Hva beskriver best din arbeidssituasjon?,I hvilket fylke jobber du?,Hva jobber du mest med?,"Hva er din grunnlønn? (årslønn før skatt, uten eventuelle bonuser eller overtidsbetaling)",Har du en bonusordning?,Alt i alt - sier du deg fornøyd med din egen lønn?
0,25-29,3,1,"in-house, startup",Oslo,"fullstack, web","300 000,00 kr",ja,Nei
1,25-29,5,2,konsulent,Trøndelag,"fullstack, web","350 000,00 kr",ja,Ja
2,30-34,5,10,konsulent,Vestland,"backend, web","350 000,00 kr",ja,Ja
3,40-44,5,18,konsulent,Rogaland,"fullstack, web","360 000,00 kr",ja,Ja
4,25-29,3,1,"in-house, privat sektor",Nordland,"backend, web","370 000,00 kr",nei,Ja


Cleaning up the column names to simplify

In [67]:
df.columns = [
    "age",
    "education",
    "experience",
    "work_situation",
    "no_county",
    "work_field",
    "wage",
    "bonus",
    "satisfied",
]
df.head()

Unnamed: 0,age,education,experience,work_situation,no_county,work_field,wage,bonus,satisfied
0,25-29,3,1,"in-house, startup",Oslo,"fullstack, web","300 000,00 kr",ja,Nei
1,25-29,5,2,konsulent,Trøndelag,"fullstack, web","350 000,00 kr",ja,Ja
2,30-34,5,10,konsulent,Vestland,"backend, web","350 000,00 kr",ja,Ja
3,40-44,5,18,konsulent,Rogaland,"fullstack, web","360 000,00 kr",ja,Ja
4,25-29,3,1,"in-house, privat sektor",Nordland,"backend, web","370 000,00 kr",nei,Ja


In [68]:
df.shape

(1203, 9)

In [69]:
df.nunique(axis=0)

age                 9
education          17
experience         43
work_situation      5
no_county          12
work_field         16
wage              334
bonus               2
satisfied           2
dtype: int64

In [70]:
df.work_situation.value_counts()

in-house, privat sektor                  552
konsulent                                426
in-house, offentlig/kommunal sektor      127
in-house, startup                         68
frilans / selvstendig næringsdrivende     30
Name: work_situation, dtype: int64

In [71]:
df.work_field.value_counts()

fullstack, web             369
backend, web               197
frontend, web              193
programvare                 74
devops                      54
arkitektur                  50
ledelse/administrativt      49
embedded/IOT/maskinvare     46
app                         45
data science                30
automatisering              19
annet                       18
sikkerhet                   17
databaser                   17
UX / design                 14
testing                     11
Name: work_field, dtype: int64

Most people work in-house in private sector and least people work independetly. 

Now let's split up the age so that we do not have an interval and also the wage input need to be changed so that we do not have "kr" at the end. 

In [72]:
# Splitting the age column into two by splitting on "-" operator. N=1 specifies number of max separations and 
# expand = True returns a data frame with different value in different columns if True. Also, we drop the old age column
df[["young_age", "old_age"]] = df["age"].str.split("-", n=1, expand=True)
df = df.drop("age", axis=1)

In [73]:
# Now let's fix the wage column
print(df.wage.max())
df.wage = df.wage.str.replace(" kr", "")
df.head()


999 000,00 kr


Unnamed: 0,education,experience,work_situation,no_county,work_field,wage,bonus,satisfied,young_age,old_age
0,3,1,"in-house, startup",Oslo,"fullstack, web","300 000,00",ja,Nei,25,29
1,5,2,konsulent,Trøndelag,"fullstack, web","350 000,00",ja,Ja,25,29
2,5,10,konsulent,Vestland,"backend, web","350 000,00",ja,Ja,30,34
3,5,18,konsulent,Rogaland,"fullstack, web","360 000,00",ja,Ja,40,44
4,3,1,"in-house, privat sektor",Nordland,"backend, web","370 000,00",nei,Ja,25,29


Okay, also need to fix the punctuation and spaces.

In [74]:
df.wage = df.wage.str.replace(",", ".")

In [75]:
print(df.dtypes)

education         object
experience        object
work_situation    object
no_county         object
work_field        object
wage              object
bonus             object
satisfied         object
young_age         object
old_age           object
dtype: object


In [76]:
df.wage = df.wage.str.replace(" ", "")

In [77]:
df.wage[1]

'350000.00'

In [78]:
df.head()

Unnamed: 0,education,experience,work_situation,no_county,work_field,wage,bonus,satisfied,young_age,old_age
0,3,1,"in-house, startup",Oslo,"fullstack, web",300000.0,ja,Nei,25,29
1,5,2,konsulent,Trøndelag,"fullstack, web",350000.0,ja,Ja,25,29
2,5,10,konsulent,Vestland,"backend, web",350000.0,ja,Ja,30,34
3,5,18,konsulent,Rogaland,"fullstack, web",360000.0,ja,Ja,40,44
4,3,1,"in-house, privat sektor",Nordland,"backend, web",370000.0,nei,Ja,25,29


In [79]:
df.wage = df.wage.astype(float)

In [80]:
df.wage.max()

2600000.0

Wow. Who is this person?

In [81]:
df[df.wage == 2600000.0]

Unnamed: 0,education,experience,work_situation,no_county,work_field,wage,bonus,satisfied,young_age,old_age
1202,5,15,frilans / selvstendig næringsdrivende,Oslo,arkitektur,2600000.0,nei,Ja,35,39


In [82]:
# Let's include this to simply output graphs in html

def figwrite(filename):
    
    fig.write_html(
        f"{filename}.html",
        include_plotlyjs="cdn",
    )

##Distribution 

Now that the data is more or less clean, let's have a look at distribution

In [83]:
import plotly.express as px

fig = px.histogram(df, x="wage")

fig.update_xaxes(fixedrange=True)
fig.update_yaxes(fixedrange=True)

fig.show()

figwrite("distribution")

In [84]:
df.wage.mean()
df.wage.median()
df.wage.max()
df.wage.min()

300000.0