# Importing dependencies

In [1]:
import pandas as pd
import numpy as np
from datetime import date,datetime,timedelta
import re

## Going through the data 

In [2]:
df = pd.read_csv('naukri_scrapped.csv')

In [3]:
df.head()

Unnamed: 0,Organization,Searched_as,Job_Title,Location_and_work_mode,Posted,Scrapped_Date,Experience,Salary_Range
0,Persistent,data scientist,Machine Learning (AI) Architect,"Kolkata, Mumbai, New Delhi, Hyderabad/Secunder...",30+ Days Ago,2023-06-26,5-12 Yrs,Not disclosed
1,Oprable Inc,data scientist,Data Scientist,"Kolkata, Mumbai, New Delhi, Hyderabad/Secunder...",3 Days Ago,2023-06-26,0-1 Yrs,Not disclosed
2,Dreambig It Solutions India,data scientist,Data Scientist/ Senior Data Scientist/ Manager...,"Noida, Kolkata, Mumbai, Chandigarh, Hyderabad/...",Just Now,2023-06-26,1-6 Yrs,Not disclosed
3,Axtria,data scientist,Patient Analytics and AI/ML/Databricks,"Hybrid - Noida, Uttar Pradesh, Hyderabad/ Secu...",19 Days Ago,2023-06-26,4-9 Yrs,Not disclosed
4,Evalueserve,data scientist,Looking For NLP/Deep Learning Data Scientist F...,"Hybrid - Gurgaon/ Gurugram, Haryana, Bangalore...",19 Days Ago,2023-06-26,7-10 Yrs,Not disclosed


In [4]:
df.shape

(4000, 8)

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4000 entries, 0 to 3999
Data columns (total 8 columns):
 #   Column                  Non-Null Count  Dtype 
---  ------                  --------------  ----- 
 0   Organization            4000 non-null   object
 1   Searched_as             4000 non-null   object
 2   Job_Title               4000 non-null   object
 3   Location_and_work_mode  4000 non-null   object
 4   Posted                  4000 non-null   object
 5   Scrapped_Date           4000 non-null   object
 6   Experience              4000 non-null   object
 7   Salary_Range            4000 non-null   object
dtypes: object(8)
memory usage: 250.1+ KB


In [6]:
df['Salary_Range'].value_counts()

Not disclosed               3601
4-6 Lacs PA                   31
10-20 Lacs PA                 16
20-35 Lacs PA                 14
15-30 Lacs PA                 12
                            ... 
40-55 Lacs PA                  1
70,000-1.25 Lacs PA            1
3-6.5 Lacs PA                  1
10-13 Lacs PA                  1
Less than 5,000-5,000 PA       1
Name: Salary_Range, Length: 151, dtype: int64

In [7]:
df['Posted'].value_counts()

30+ Days Ago     1818
4 Days Ago        169
3 Days Ago        129
26 Days Ago       125
6 Days Ago        115
5 Days Ago        107
10 Days Ago       106
13 Days Ago       106
12 Days Ago       103
2 Days Ago        100
11 Days Ago        97
25 Days Ago        92
19 Days Ago        79
18 Days Ago        77
9 Days Ago         76
1 Day Ago          74
20 Days Ago        72
17 Days Ago        62
27 Days Ago        58
24 Days Ago        51
16 Days Ago        48
30 Days Ago        42
23 Days Ago        40
7 Days Ago         39
Just Now           37
14 Days Ago        30
28 Days Ago        28
29 Days Ago        26
Few Hours Ago      23
22 Days Ago        21
8 Days Ago         20
15 Days Ago        18
21 Days Ago        12
Name: Posted, dtype: int64

# Working on Dates 

### Date Posted 

Defining a function to get the number of days from the posted column

In [8]:
def days_ago(text):

    if text == 'Just Now' or text == 'Few Hours Ago':
        return 0
    else:
        number = re.findall(r'\d+',text)[0]
        text = int(number)
        return text

In [9]:
# days_ago('1 Day Ago ')

# today = datetime.today()

# posting_date = today - timedelta(days=test('30+ days ago'))

# posting_date.strftime("%b %d,%Y")

In [10]:
df['Days_ago'] = df['Posted'].apply(days_ago)

Defining a function to get the month and date with respect to current date

In [11]:
def posting_date(days_):
    
    today = datetime.today()
    posting_date = today - timedelta(days=days_)
    
    return posting_date

In [12]:
df['Posting_date'] = df['Days_ago'].apply(posting_date)
df['Posting_month'] = df['Posting_date'].dt.strftime('%B') # First getting the month from the Posting Date
df['Posting_date'] = df['Posting_date'].dt.strftime('%B %d,%Y') # Then formatting the Posting Date into Month day, Year format.

# If the Posting_date column is formatted first then getting the month from it will throw an error

In [13]:
df.head()

Unnamed: 0,Organization,Searched_as,Job_Title,Location_and_work_mode,Posted,Scrapped_Date,Experience,Salary_Range,Days_ago,Posting_date,Posting_month
0,Persistent,data scientist,Machine Learning (AI) Architect,"Kolkata, Mumbai, New Delhi, Hyderabad/Secunder...",30+ Days Ago,2023-06-26,5-12 Yrs,Not disclosed,30,"May 28,2023",May
1,Oprable Inc,data scientist,Data Scientist,"Kolkata, Mumbai, New Delhi, Hyderabad/Secunder...",3 Days Ago,2023-06-26,0-1 Yrs,Not disclosed,3,"June 24,2023",June
2,Dreambig It Solutions India,data scientist,Data Scientist/ Senior Data Scientist/ Manager...,"Noida, Kolkata, Mumbai, Chandigarh, Hyderabad/...",Just Now,2023-06-26,1-6 Yrs,Not disclosed,0,"June 27,2023",June
3,Axtria,data scientist,Patient Analytics and AI/ML/Databricks,"Hybrid - Noida, Uttar Pradesh, Hyderabad/ Secu...",19 Days Ago,2023-06-26,4-9 Yrs,Not disclosed,19,"June 08,2023",June
4,Evalueserve,data scientist,Looking For NLP/Deep Learning Data Scientist F...,"Hybrid - Gurgaon/ Gurugram, Haryana, Bangalore...",19 Days Ago,2023-06-26,7-10 Yrs,Not disclosed,19,"June 08,2023",June


In [14]:
# Dropping the Days_ago column

df.drop('Days_ago',axis=1,inplace=True)

In [15]:
# df.columns

In [16]:
# rearranged_cols = ['Organization', 'Searched_as', 'Job_Title', 'Location_and_work_mode', 'Posted', 'Posting_date',
#                    'Posting_month','Scrapped_Date', 'Experience', 'Salary_Range']
# df = df[rearranged_cols]

In [17]:
df.head()

Unnamed: 0,Organization,Searched_as,Job_Title,Location_and_work_mode,Posted,Scrapped_Date,Experience,Salary_Range,Posting_date,Posting_month
0,Persistent,data scientist,Machine Learning (AI) Architect,"Kolkata, Mumbai, New Delhi, Hyderabad/Secunder...",30+ Days Ago,2023-06-26,5-12 Yrs,Not disclosed,"May 28,2023",May
1,Oprable Inc,data scientist,Data Scientist,"Kolkata, Mumbai, New Delhi, Hyderabad/Secunder...",3 Days Ago,2023-06-26,0-1 Yrs,Not disclosed,"June 24,2023",June
2,Dreambig It Solutions India,data scientist,Data Scientist/ Senior Data Scientist/ Manager...,"Noida, Kolkata, Mumbai, Chandigarh, Hyderabad/...",Just Now,2023-06-26,1-6 Yrs,Not disclosed,"June 27,2023",June
3,Axtria,data scientist,Patient Analytics and AI/ML/Databricks,"Hybrid - Noida, Uttar Pradesh, Hyderabad/ Secu...",19 Days Ago,2023-06-26,4-9 Yrs,Not disclosed,"June 08,2023",June
4,Evalueserve,data scientist,Looking For NLP/Deep Learning Data Scientist F...,"Hybrid - Gurgaon/ Gurugram, Haryana, Bangalore...",19 Days Ago,2023-06-26,7-10 Yrs,Not disclosed,"June 08,2023",June


## Work Modes and Locations 

Defining a function to get the work mode from the column

In [18]:
def work_mode(text):

    if 'Hybrid' in text:
        return text.split('-')[0].replace(' ','')
    elif 'Temp. WFH' in text:
        return text.split('-')[0].replace(' ','')
    else:
        return 'WFO'
    

In [19]:
df['Work_mode'] = df['Location_and_work_mode'].apply(work_mode)

In [20]:
df['Work_mode'].value_counts()

WFO         3808
Hybrid       160
Temp.WFH      32
Name: Work_mode, dtype: int64

In [21]:
df.head(50)

Unnamed: 0,Organization,Searched_as,Job_Title,Location_and_work_mode,Posted,Scrapped_Date,Experience,Salary_Range,Posting_date,Posting_month,Work_mode
0,Persistent,data scientist,Machine Learning (AI) Architect,"Kolkata, Mumbai, New Delhi, Hyderabad/Secunder...",30+ Days Ago,2023-06-26,5-12 Yrs,Not disclosed,"May 28,2023",May,WFO
1,Oprable Inc,data scientist,Data Scientist,"Kolkata, Mumbai, New Delhi, Hyderabad/Secunder...",3 Days Ago,2023-06-26,0-1 Yrs,Not disclosed,"June 24,2023",June,WFO
2,Dreambig It Solutions India,data scientist,Data Scientist/ Senior Data Scientist/ Manager...,"Noida, Kolkata, Mumbai, Chandigarh, Hyderabad/...",Just Now,2023-06-26,1-6 Yrs,Not disclosed,"June 27,2023",June,WFO
3,Axtria,data scientist,Patient Analytics and AI/ML/Databricks,"Hybrid - Noida, Uttar Pradesh, Hyderabad/ Secu...",19 Days Ago,2023-06-26,4-9 Yrs,Not disclosed,"June 08,2023",June,Hybrid
4,Evalueserve,data scientist,Looking For NLP/Deep Learning Data Scientist F...,"Hybrid - Gurgaon/ Gurugram, Haryana, Bangalore...",19 Days Ago,2023-06-26,7-10 Yrs,Not disclosed,"June 08,2023",June,Hybrid
5,Blackbuck,data scientist,Data Scientist,"Gurgaon/Gurugram, Bangalore/Bengaluru",30+ Days Ago,2023-06-26,3-7 Yrs,Not disclosed,"May 28,2023",May,WFO
6,Paytm,data scientist,Data Science - Technical Lead,"Noida, Bangalore/Bengaluru",30+ Days Ago,2023-06-26,6-8 Yrs,Not disclosed,"May 28,2023",May,WFO
7,Paytm,data scientist,Data Science - Senior Software Engineer,"Noida, Bangalore/Bengaluru",30+ Days Ago,2023-06-26,3-6 Yrs,Not disclosed,"May 28,2023",May,WFO
8,AVE Promagne,data scientist,Sr. Data Scientist - Python / ML / DL,"Noida, Mumbai, Chandigarh, Hyderabad/Secundera...",4 Days Ago,2023-06-26,5-8 Yrs,Not disclosed,"June 23,2023",June,WFO
9,Analytos,data scientist,Junior Data Scientist,"Kolkata, Mumbai, New Delhi, Hyderabad/Secunder...",30+ Days Ago,2023-06-26,0-2 Yrs,Not disclosed,"May 28,2023",May,WFO


Defining a function to get the first of all the Locations

In [22]:
def location(text):
    if 'Hybrid' in text:
        text = text.split('-')[1]
        loc = text.split(',')[0].replace(' ','')
        return loc
    elif 'Temp. WFH' in text:
        text = text.split('-')[1]
        loc = text.split(',')[0].replace(' ','')
        return loc
    else:
        loc = text.split(',')[0].replace(' ','')
        return loc

A function to clean the location column


In [23]:
def cleaned_loc(text):
    
    if 'Bangalore/Bengaluru' in text:
        return 'Bengaluru'
    elif 'Gurgaon/Gurugram' in text or 'Gurgaon' in text:
        return 'Gurugram'
    elif 'Hyderabad/Secunderabad' in text:
        return 'Hyderabad'
    elif 'Delhi' in text or 'delhi' in text:
        return 'Delhi'
    elif 'GreaterNoida' in text:
        return 'Noida'
    elif 'Mumbai' in text:
        return 'Mumbai'
    elif 'Cochin' in text or 'Ernakulam' in text:
        return 'Kochi'
    elif 'remote' in text:
        return 'Remote'
    else:
        return text.capitalize()

In [24]:
df['Location'] = df['Location_and_work_mode'].apply(location)
df['Location'] = df['Location'].apply(cleaned_loc)


In [25]:
df.head()

Unnamed: 0,Organization,Searched_as,Job_Title,Location_and_work_mode,Posted,Scrapped_Date,Experience,Salary_Range,Posting_date,Posting_month,Work_mode,Location
0,Persistent,data scientist,Machine Learning (AI) Architect,"Kolkata, Mumbai, New Delhi, Hyderabad/Secunder...",30+ Days Ago,2023-06-26,5-12 Yrs,Not disclosed,"May 28,2023",May,WFO,Kolkata
1,Oprable Inc,data scientist,Data Scientist,"Kolkata, Mumbai, New Delhi, Hyderabad/Secunder...",3 Days Ago,2023-06-26,0-1 Yrs,Not disclosed,"June 24,2023",June,WFO,Kolkata
2,Dreambig It Solutions India,data scientist,Data Scientist/ Senior Data Scientist/ Manager...,"Noida, Kolkata, Mumbai, Chandigarh, Hyderabad/...",Just Now,2023-06-26,1-6 Yrs,Not disclosed,"June 27,2023",June,WFO,Noida
3,Axtria,data scientist,Patient Analytics and AI/ML/Databricks,"Hybrid - Noida, Uttar Pradesh, Hyderabad/ Secu...",19 Days Ago,2023-06-26,4-9 Yrs,Not disclosed,"June 08,2023",June,Hybrid,Noida
4,Evalueserve,data scientist,Looking For NLP/Deep Learning Data Scientist F...,"Hybrid - Gurgaon/ Gurugram, Haryana, Bangalore...",19 Days Ago,2023-06-26,7-10 Yrs,Not disclosed,"June 08,2023",June,Hybrid,Gurugram


In [26]:
# Dropping the Location and work mode column
df.drop('Location_and_work_mode',axis=1,inplace=True)

In [27]:
pd.DataFrame(df['Job_Title'].value_counts()).head(50)

Unnamed: 0,Job_Title
Data Analyst,462
Data Scientist,361
Machine Learning Engineer,116
Senior Data Scientist,98
Record To Report Ops Associate-Record To Report,52
Machine Learning AI Platform Engineer,34
Lead Data Scientist,29
Deputy Area Manager - B2C Underwriting Rural,28
Senior Machine Learning Engineer,28
Team Lead(AI/ML/Data Learning),27


## The Job Title

Defining a function to get the general job title

In [28]:
def job_title(text):
    
    text = text.lower()
    
    if 'scientist' in text or 'science' in text:
        return 'Data Scientist'
    elif 'analy' in text : #or 'analytics' in text:
        return 'Data Analyst'
    elif 'data engineer' in text or 'service' in text:
        return 'Data Engineer'
    elif 'machine learning' in text or 'ml' in text:
        return 'ML Engineer'
    elif 'nlp' in text or 'deep' in text:
        return 'DL Engineer'
    elif 'ai' in text or 'artificial' in text:
        return 'AI Ops'
    elif 'ops' in text:
        return 'ML Ops'
    elif 'python' in text:
        return 'Python Developer'
    elif 'software' in text:
        return 'Software Engineer'
    else:
        return text.capitalize()


In [29]:
df['Title'] = df['Job_Title'].apply(job_title)

In [30]:
pd.DataFrame(df['Title'].value_counts()).head(30)

Unnamed: 0,Title
Data Scientist,1352
Data Analyst,1020
ML Engineer,765
ML Ops,278
AI Ops,133
Data Engineer,109
Deep Learning Engineer,46
Deputy area manager - b2c underwriting rural,28
"Hiring freshers /experience for global mnc for hr , accounts finance,",24
Deputy area manager - b2c underwriting,14


In [31]:
df.head()

Unnamed: 0,Organization,Searched_as,Job_Title,Posted,Scrapped_Date,Experience,Salary_Range,Posting_date,Posting_month,Work_mode,Location,Title
0,Persistent,data scientist,Machine Learning (AI) Architect,30+ Days Ago,2023-06-26,5-12 Yrs,Not disclosed,"May 28,2023",May,WFO,Kolkata,ML Engineer
1,Oprable Inc,data scientist,Data Scientist,3 Days Ago,2023-06-26,0-1 Yrs,Not disclosed,"June 24,2023",June,WFO,Kolkata,Data Scientist
2,Dreambig It Solutions India,data scientist,Data Scientist/ Senior Data Scientist/ Manager...,Just Now,2023-06-26,1-6 Yrs,Not disclosed,"June 27,2023",June,WFO,Noida,Data Scientist
3,Axtria,data scientist,Patient Analytics and AI/ML/Databricks,19 Days Ago,2023-06-26,4-9 Yrs,Not disclosed,"June 08,2023",June,Hybrid,Noida,Data Analyst
4,Evalueserve,data scientist,Looking For NLP/Deep Learning Data Scientist F...,19 Days Ago,2023-06-26,7-10 Yrs,Not disclosed,"June 08,2023",June,Hybrid,Gurugram,Data Scientist


columns to drop:
Job_title,
Posted

In [32]:
df.drop(['Job_Title','Posted'],axis=1,inplace=True)

In [33]:
df.head()

Unnamed: 0,Organization,Searched_as,Scrapped_Date,Experience,Salary_Range,Posting_date,Posting_month,Work_mode,Location,Title
0,Persistent,data scientist,2023-06-26,5-12 Yrs,Not disclosed,"May 28,2023",May,WFO,Kolkata,ML Engineer
1,Oprable Inc,data scientist,2023-06-26,0-1 Yrs,Not disclosed,"June 24,2023",June,WFO,Kolkata,Data Scientist
2,Dreambig It Solutions India,data scientist,2023-06-26,1-6 Yrs,Not disclosed,"June 27,2023",June,WFO,Noida,Data Scientist
3,Axtria,data scientist,2023-06-26,4-9 Yrs,Not disclosed,"June 08,2023",June,Hybrid,Noida,Data Analyst
4,Evalueserve,data scientist,2023-06-26,7-10 Yrs,Not disclosed,"June 08,2023",June,Hybrid,Gurugram,Data Scientist


In [34]:
df[['Minimum_experience', 'Maximum_experience']] = df['Experience'].str.extract(r'(\d+)-(\d+)')
df[['Minimum_experience', 'Maximum_experience']] = df[['Minimum_experience', 'Maximum_experience']].astype(float)

In [35]:
df.head()

Unnamed: 0,Organization,Searched_as,Scrapped_Date,Experience,Salary_Range,Posting_date,Posting_month,Work_mode,Location,Title,Minimum_experience,Maximum_experience
0,Persistent,data scientist,2023-06-26,5-12 Yrs,Not disclosed,"May 28,2023",May,WFO,Kolkata,ML Engineer,5.0,12.0
1,Oprable Inc,data scientist,2023-06-26,0-1 Yrs,Not disclosed,"June 24,2023",June,WFO,Kolkata,Data Scientist,0.0,1.0
2,Dreambig It Solutions India,data scientist,2023-06-26,1-6 Yrs,Not disclosed,"June 27,2023",June,WFO,Noida,Data Scientist,1.0,6.0
3,Axtria,data scientist,2023-06-26,4-9 Yrs,Not disclosed,"June 08,2023",June,Hybrid,Noida,Data Analyst,4.0,9.0
4,Evalueserve,data scientist,2023-06-26,7-10 Yrs,Not disclosed,"June 08,2023",June,Hybrid,Gurugram,Data Scientist,7.0,10.0


In [38]:
df.columns

Index(['Organization', 'Searched_as', 'Scrapped_Date', 'Experience',
       'Salary_Range', 'Posting_date', 'Posting_month', 'Work_mode',
       'Location', 'Title', 'Minimum_experience', 'Maximum_experience'],
      dtype='object')

In [39]:
rearranged_cols = ['Organization', 'Searched_as','Title', 'Work_mode', 'Location', 
                   'Posting_date', 'Posting_month','Scrapped_Date',
                   'Experience','Minimum_experience', 'Maximum_experience','Salary_Range']

In [40]:
df = df[rearranged_cols]

In [41]:
df.shape

(4000, 12)

In [42]:
df.head()

Unnamed: 0,Organization,Searched_as,Title,Work_mode,Location,Posting_date,Posting_month,Scrapped_Date,Experience,Minimum_experience,Maximum_experience,Salary_Range
0,Persistent,data scientist,ML Engineer,WFO,Kolkata,"May 28,2023",May,2023-06-26,5-12 Yrs,5.0,12.0,Not disclosed
1,Oprable Inc,data scientist,Data Scientist,WFO,Kolkata,"June 24,2023",June,2023-06-26,0-1 Yrs,0.0,1.0,Not disclosed
2,Dreambig It Solutions India,data scientist,Data Scientist,WFO,Noida,"June 27,2023",June,2023-06-26,1-6 Yrs,1.0,6.0,Not disclosed
3,Axtria,data scientist,Data Analyst,Hybrid,Noida,"June 08,2023",June,2023-06-26,4-9 Yrs,4.0,9.0,Not disclosed
4,Evalueserve,data scientist,Data Scientist,Hybrid,Gurugram,"June 08,2023",June,2023-06-26,7-10 Yrs,7.0,10.0,Not disclosed


### Saving as csv

In [43]:
df.to_csv('naukri_cleaned.csv',index=False)

In [44]:
job_titles = ['Data Scientist','Data Analyst','Data Engineer','ML Engineer','Deep Learning Engineer',
              'AI Engineer','ML Ops','Python Developer','Software Engineer']

a = df[df['Title'].isin(job_titles)]

In [45]:
a.shape

(3595, 12)