In [257]:
import pandas as pd

In [258]:
# Load the raw job postings; the file is read with the latin1 encoding.
data = pd.read_csv(
    'data/jobs.csv',
    encoding='latin1',
)

# Summarize columns, non-null counts and dtypes.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3198 entries, 0 to 3197
Data columns (total 8 columns):
 #   Column                      Non-Null Count  Dtype 
---  ------                      --------------  ----- 
 0   Company                     3197 non-null   object
 1   Job Title                   3197 non-null   object
 2   Location                    3197 non-null   object
 3   Job Type                    3197 non-null   object
 4   Experience level            2962 non-null   object
 5   Salary                      3009 non-null   object
 6   Requirment of the company   3198 non-null   object
 7   Facilities                  3198 non-null   object
dtypes: object(8)
memory usage: 200.0+ KB


In [259]:
# Preview the first five rows of the raw frame.
data.head(n=5)

Unnamed: 0,Company,Job Title,Location,Job Type,Experience level,Salary,Requirment of the company,Facilities
0,SGS,Clinical Data Analyst,"Richardson, TX, United States",Full Time,Entry-level,48K+ *,"Computer Science,Data quality,Genetics,Mathema...",",,,,"
1,Ocorian,AML/CFT & Data Analyst,"Ebène, Mauritius",Full Time,Entry-level,48K+ *,"Agile,Data management,Finance,Security,,",",,,,"
2,Cricut,Machine Learning Engineer,"South Jordan, UT, United States",Full Time,,90K+ *,"Agile,Architecture,AWS,Computer Science,Comput...","Career development,,,,"
3,Bosch Group,Application Developer & Data Analyst,"Nonantola, Italy",Full Time,Entry-level,48K+ *,"Engineering,Industrial,Oracle,Power BI,R,R&D",",,,,"
4,Publicis Groupe,Data Engineer Full time (Public Sector) USA,"Arlington, VA, United States",Full Time,Mid-level,108K+,"AWS,Azure,Computer Science,Consulting,Dataflow...","Flex hours,Flex vacation,Parental leave,Unlimi..."


<h4>Cleaning</h4>

In [260]:
# Count fully duplicated rows (every occurrence after the first one).
data.duplicated(keep='first').sum()

202

In [261]:
# Drop duplicated rows, keeping the first occurrence of each.
data.drop_duplicates(keep='first', inplace=True)

In [262]:
# Count missing values per column.
data.isna().sum(axis=0)

Company                         1
Job Title                       1
Location                        1
Job Type                        1
Experience level              228
Salary                        172
Requirment of the company       0
Facilities                      0
dtype: int64

In [263]:
# Drop the rows whose 'Company' value is missing.
missing_company = data['Company'].isna()
data.drop(index=data[missing_company].index, inplace=True)

In [264]:
# Replace missing values in the 'Experience level' column with 'No experience'.
# The result is assigned back to the column on purpose: calling
# fillna(..., inplace=True) on data['Experience level'] is chained assignment,
# which pandas deprecates and which does not update `data` under Copy-on-Write.
data['Experience level'] = data['Experience level'].fillna('No experience')

In [265]:
# Replace missing values in the 'Salary' column with the string '0'.
# The result is assigned back to the column on purpose: calling
# fillna(..., inplace=True) on data['Salary'] is chained assignment,
# which pandas deprecates and which does not update `data` under Copy-on-Write.
data['Salary'] = data['Salary'].fillna('0')

In [271]:
import re

# Compiled once: the first integer or decimal number found in a salary label.
_SALARY_AMOUNT = re.compile(r"\d+(?:\.\d+)?")


def cleanSalary(salary):
    """Convert a salary label expressed in thousands into an integer amount.

    The first number found in ``salary`` is read as a count of thousands, so
    surrounding markers (currency codes, ``K``, ``+``, ``*``, spaces) do not
    need to be listed explicitly.

    Args:
        salary: Salary label as a string, e.g. ``"48K+ *"`` or ``"0"``.

    Returns:
        The amount multiplied by 1000, as an ``int``.

    Raises:
        ValueError: If ``salary`` contains no digits.
        TypeError: If ``salary`` is not a string. This is deliberate: running
            the conversion twice on already-converted values fails loudly
            instead of multiplying by 1000 again.
    """
    amount = _SALARY_AMOUNT.search(salary)
    if amount is None:
        raise ValueError("no numeric amount found in salary: %r" % (salary,))
    # round() guards against float artefacts for decimal inputs such as "1.5K".
    return int(round(float(amount.group()) * 1000))
 
# Convert every salary label to its integer amount.
data['Salary'] = data['Salary'].apply(cleanSalary)

In [300]:
def getCountryFromLocation(location):
    """Return the last comma-separated component of a location string.

    Whitespace around the component is stripped, so
    ``"Richardson, TX, United States"`` yields ``"United States"`` rather than
    ``" United States"``. A location without any comma is returned as is
    (stripped).

    Args:
        location: Location text whose parts are separated by commas.

    Returns:
        The text after the last comma, stripped, as a ``str``.
    """
    # rsplit with maxsplit=1 always yields at least one element, so no
    # empty-list fallback is needed.
    return location.rsplit(',', 1)[-1].strip()


# Derive the 'country' column from the last part of each location.
data['country'] = data['Location'].apply(getCountryFromLocation)

In [301]:
# Write the cleaned frame to CSV; the index is written as the first column.
data.to_csv('data/cleaned_jobs.csv', index=True)

<h4>Analysis and visualization</h4>

In [315]:
# Reload the cleaned export, using its first column as the index.
data = pd.read_csv(
    'data/cleaned_jobs.csv',
    index_col=0,
)

# Preview the first five rows.
data.head(n=5)

Unnamed: 0,Company,Job Title,Location,Job Type,Experience level,Salary,Requirment of the company,Facilities,country
0,SGS,Clinical Data Analyst,"Richardson, TX, United States",Full Time,Entry-level,48000,"Computer Science,Data quality,Genetics,Mathema...",",,,,",United States
1,Ocorian,AML/CFT & Data Analyst,"Ebène, Mauritius",Full Time,Entry-level,48000,"Agile,Data management,Finance,Security,,",",,,,",Mauritius
2,Cricut,Machine Learning Engineer,"South Jordan, UT, United States",Full Time,No experience,90000,"Agile,Architecture,AWS,Computer Science,Comput...","Career development,,,,",United States
3,Bosch Group,Application Developer & Data Analyst,"Nonantola, Italy",Full Time,Entry-level,48000,"Engineering,Industrial,Oracle,Power BI,R,R&D",",,,,",Italy
4,Publicis Groupe,Data Engineer Full time (Public Sector) USA,"Arlington, VA, United States",Full Time,Mid-level,108000,"AWS,Azure,Computer Science,Consulting,Dataflow...","Flex hours,Flex vacation,Parental leave,Unlimi...",United States


In [309]:
# Count rows per country and show the five largest groups.
(
    data.groupby('country')['country']
    .count()
    .sort_values(ascending=False)
    .reset_index(name='counts')
    .head()
)

Unnamed: 0,country,counts
0,United States,478
1,United Kingdom,176
2,India,168
3,France,125
4,CA,94


In [310]:
# Rank job titles by the number of rows carrying each title.
(
    data.groupby('Job Title')['Job Title']
    .count()
    .sort_values(ascending=False)
    .reset_index(name='counts')
)

Unnamed: 0,Job Title,counts
0,Data Engineer,104
1,Data Scientist,82
2,Data Analyst,77
3,Senior Data Engineer,63
4,Machine Learning Engineer,47
...,...,...
2133,"Data Scientist, Poland",1
2134,"Data Scientist, Payments ML Accelerator",1
2135,"Data Scientist, Marketing & Sales",1
2136,"Data Scientist, In-car Ads",1


In [102]:
# Rank companies by the number of rows listed for each.
(
    data.groupby('Company')['Company']
    .count()
    .sort_values(ascending=False)
    .reset_index(name='counts')
)

Unnamed: 0,Company,counts
0,Publicis Groupe,113
1,Bosch Group,93
2,Amazon.com,57
3,Block,47
4,Visa,42
...,...,...
1101,Infopro Digital,1
1102,InnovaFeed,1
1103,Insite AI,1
1104,Instabase,1


In [103]:
# Count rows per job type, most frequent first.
(
    data.groupby('Job Type')['Job Type']
    .count()
    .sort_values(ascending=False)
    .reset_index(name='counts')
)

Unnamed: 0,Job Type,counts
0,Full Time,2917
1,Internship,72
2,Part Time,6


In [104]:
# Count rows per full location string, most frequent first.
(
    data.groupby('Location')['Location']
    .count()
    .sort_values(ascending=False)
    .reset_index(name='counts')
)

Unnamed: 0,Location,counts
0,"Bengaluru, India",77
1,Remote,62
2,"Paris, France",61
3,"New York City, United States",59
4,"London, England, United Kingdom",55
...,...,...
1112,London or remote (U.K),1
1113,"London, England, GBR",1
1114,"London, Paris, Amsterdam",1
1115,"London, United Kingdom; Copenhagen, Denmark; R...",1


In [105]:
# Count rows per experience level, most frequent first.
(
    data.groupby('Experience level')['Experience level']
    .count()
    .sort_values(ascending=False)
    .reset_index(name='counts')
)

Unnamed: 0,Experience level,counts
0,Senior-level,1746
1,Entry-level,462
2,Mid-level,438
3,No experience,227
4,Executive-level,122
