<a href="https://colab.research.google.com/github/iyoushe1703/glassdoor-data-science-jobs-predictor/blob/main/Ken_Jee.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [30]:
import pandas as pd
pd.set_option('display.max_columns', 3000)
pd.set_option('max_rows', 1000)

In [31]:
df = pd.read_csv("glassdoor_jobs.csv")

# Cleaning Salary Estimates Column

#### Removing missing values

In [32]:
# method 1
#df.drop(df.index[df['Salary Estimate'] == '-1'])

In [33]:
# method 2
df = df[df['Salary Estimate'] != '-1']

In [None]:
df.drop(['Unnamed: 0'], axis = 1, inplace = True)

In [34]:
df['Salary Estimate']

0                     $53K-$91K (Glassdoor est.)
1                    $63K-$112K (Glassdoor est.)
2                     $80K-$90K (Glassdoor est.)
3                     $56K-$97K (Glassdoor est.)
4                    $86K-$143K (Glassdoor est.)
5                    $71K-$119K (Glassdoor est.)
6                     $54K-$93K (Glassdoor est.)
7                    $86K-$142K (Glassdoor est.)
8                     $38K-$84K (Glassdoor est.)
9                   $120K-$160K (Glassdoor est.)
10                  $126K-$201K (Glassdoor est.)
11                   $64K-$106K (Glassdoor est.)
12                  $106K-$172K (Glassdoor est.)
13                    $46K-$85K (Glassdoor est.)
14                   $83K-$144K (Glassdoor est.)
15                  $102K-$190K (Glassdoor est.)
16                   $67K-$137K (Glassdoor est.)
17                  $118K-$189K (Glassdoor est.)
18                  $110K-$175K (Glassdoor est.)
19                   $64K-$111K (Glassdoor est.)
20                  

#### Removing "Glassdoor Estimate"

In [35]:
df['Salary Estimate'] = df['Salary Estimate'].apply(lambda x: x.split('(')[0])

#### Removing $ and K

In [36]:
df['Salary Estimate'] = df['Salary Estimate'].apply(lambda x: x.replace('$', '').replace('K', ''))

#### Creating a column to indicate 'per hour' and 'Employer Provided Salary' entries in the ```Salary Estimate``` column

In [37]:
df['Per Hour'] = df['Salary Estimate'].apply(lambda x: 1 if 'Per Hour'.lower() in x.lower() else 0)

In [38]:
df['employer_provided'] = df['Salary Estimate'].apply(lambda x: 1 if 'employer provided salary'.lower() in x.lower() else 0)

### Removing Employer Provided Salary from Salary Estimate

In [39]:
df['Salary Estimate'] = df['Salary Estimate'].apply(lambda x: x.lower().replace("per hour", ""))

In [40]:
df['Salary Estimate'] = df['Salary Estimate'].apply(lambda x: x.lower().replace("employer provided salary:", ""))

#### Creating ```min_salary```, ```max_salary``` and ```avg_salary``` columns

In [41]:
df['min_salary'] = df['Salary Estimate'].apply(lambda x: x.split('-')[0])

In [42]:
df['min_salary']

0       53
1       63
2       80
3       56
4       86
5       71
6       54
7       86
8       38
9      120
10     126
11      64
12     106
13      46
14      83
15     102
16      67
17     118
18     110
19      64
20      81
21      73
22      86
23      63
25     109
26      63
27      75
28      34
29      63
30      80
31      56
32      72
33      86
34      93
35      85
36      77
37      82
38      83
39     115
40      74
41      64
43      68
44     110
45      52
46     110
48     150
49     158
50      20
51      56
52      63
54      68
55      86
56      41
57      86
58      80
59      56
60     120
61     111
62      54
63      71
65      82
66      84
67     107
68      49
70      61
72      88
73      60
75      41
76      96
77      65
78      52
79     139
80      50
81      85
82      74
83      99
84      79
85      57
86      83
87      86
88      94
89      37
90     100
91      55
92      60
93      39
94     106
95      86
96      64
97      31
99      34

In [43]:
df['min_salary'] = df['min_salary'].astype(int)

In [44]:
df['max_salary'] = df['Salary Estimate'].apply(lambda x: x.split('-')[-1])

In [45]:
df['max_salary'] = df['max_salary'].astype(int)

In [46]:
df["avg_salary"] = (df['min_salary'] + df['max_salary'])/2

### Cleaning ```Company Name```

In [47]:
df['company_txt'] = df['Company Name'].apply(lambda x: x.split("\n")[0])

### Separating State from city names

In [48]:
df['job_state'] = df['Location'].apply(lambda x: x[-2:])

### Make a new column that indicates whether the job location is at the company headquarters

#### ```np.select``` is more efficient than ```.apply()```

In [49]:
import numpy as np

condition = [df['Location'] == df['Headquarters'],
             df['Location'] != df['Headquarters']]

outputs = [1, 0]

df['job_at_hq'] = np.select(condition, outputs)

### Company age


In [None]:
df['Founded'].apply(lambda x: x if x < 1 else 2020 - x)

# Parsing Job Description

In [62]:
# checking if python is in the job description
df['python'] = df['Job Description'].apply(lambda x: 1 if 'python' in x.lower() else 0)

In [71]:
# checking if R is in the job description
df['rstudio'] = df['Job Description'].apply(lambda x: 1 if 'r studio' in x.lower() else 0)

In [72]:
df['spark'] = df['Job Description'].apply(lambda x: 1 if 'spark' in x.lower() else 0)

In [73]:
df['aws'] = df['Job Description'].apply(lambda x: 1 if 'aws' in x.lower() else 0)

In [None]:
df['excel'] = df['Job Description'].apply(lambda x: 1 if 'excel' in x.lower() else 0)

# Export cleaned ```df``` to a .csv file

In [81]:
df.to_csv("glassdoor_cleaned.csv", index = False)