# Data Pre-Processing

## Importing Libraries

In [1]:
import pandas as pd
import numpy as np

In [2]:
import warnings
warnings.filterwarnings("ignore")

In [3]:
df=pd.read_csv("freshers_final cleaned_data.csv")

In [4]:
df

Unnamed: 0,job_title,Experience,Company,Salary,Location
0,Software Developer / Engineer Jobs Opening in ...,0 Years,Rayapuri Media OPC Pvt Ltd,20000 - 40000 Monthly,Hyderabad
1,Python Junior Software Engineer Web Scraping A...,0 to 1 Years,Insytelli services,25000 - 35000 Monthly,Bangalore
2,Data Science Jobs Opening in Faclon at Mumbai,0 Years,Faclon,Salary not disclosed,Mumbai
3,Game Development Jobs Opening in Creative Gali...,0 Years,Creative Galileo,Salary not disclosed,Pune
4,Cloud Computing Intern Jobs Opening in Quantas...,0 Years,Quantasis Private Limited,Salary not disclosed,Mumbai
...,...,...,...,...,...
1236,Web Developer Jobs Opening in Client of Freshe...,0 Years,Client of Freshersworld,50000 - 75000 Monthly,Noida
1237,Web Developer Jobs Opening in Client of Freshe...,0 Years,Client of Freshersworld,50000 - 75000 Monthly,Chennai
1238,Web Developer Jobs Opening in Client of Freshe...,0 Years,Client of Freshersworld,50000 - 75000 Monthly,Mumbai
1239,Web Developer Jobs Opening in Client of Freshe...,0 Years,Client of Freshersworld,50000 - 75000 Monthly,Gurgaon


## Checking Null value

In [5]:
df.isnull().sum()

job_title      2
Experience     2
Company        2
Salary         2
Location      44
dtype: int64

## Removing the common null value in all columns

In [6]:
df=df.dropna(how="all")

In [7]:
df.isna().sum()

job_title      0
Experience     0
Company        0
Salary         0
Location      42
dtype: int64

## Checking duplicates

In [8]:
df.duplicated().sum()

15

## Remove duplicates

In [9]:
df.drop_duplicates(inplace=True)

In [10]:
df.duplicated().sum()

0

In [11]:
df['job_title'].unique()

array(['Software Developer / Engineer Jobs Opening in Rayapuri Media OPC Pvt Ltd at Panjagutta, Hyderabad',
       'Python Junior Software Engineer Web Scraping And Automation Entry Level Jobs Opening in Insytelli services at Sahakara Nagar, Bangalore',
       'Data Science Jobs Opening in Faclon at Mumbai', ...,
       'Web Developer Jobs Opening in Client of Freshersworld at Chennai',
       'Web Developer Jobs Opening in Client of Freshersworld at Mumbai',
       'Web Developer Jobs Opening in Client of Freshersworld at Delhi'],
      dtype=object)

In [12]:
df['Experience'].unique()

array(['0 Years', '0 to 1 Years', '0 to 3+ Years', '3+ Years',
       '2 to 3+ Years', '3 to 3+ Years', '1 to 3+ Years', '1 to 2 Years',
       '1 to 2.5 Years', '2.5 to 3+ Years', '1.5 Years', '1 Years',
       '1.5 to 3+ Years', '0 to 3 Years', '0.6 to 3+ Years',
       '0 to 2 Years', '2 Years', '1 to 3 Years', '0.6 to 3 Years',
       '0.6 Years', '0.6 to 2 Years', '2 to 3 Years', '0.6 to 1 Years',
       '0 to 0.6 Years'], dtype=object)

#### Replacing specific ranges of experience with standardized values

In [13]:
df['Experience'] = df['Experience'].str.replace('0 Years', '0')
df['Experience'] = df['Experience'].str.replace('0 to 1 Years', '1')
df['Experience'] = df['Experience'].str.replace('0 to 3+ Years', '3')
df['Experience'] = df['Experience'].str.replace('2 to 3+ Years', '3')
df['Experience'] = df['Experience'].str.replace('3 to 3+ Years', '4')
df['Experience'] = df['Experience'].str.replace('1 to 3+ Years', '2')
df['Experience'] = df['Experience'].str.replace('1 to 2 Years', '1')
df['Experience'] = df['Experience'].str.replace('1 to 2.5 Years', '2')
df['Experience'] = df['Experience'].str.replace('2.5 to 3+ Years', '3')
df['Experience'] = df['Experience'].str.replace('1.5 Years', '1')
df['Experience'] = df['Experience'].str.replace('1 Years', '1')
df['Experience'] = df['Experience'].str.replace('1.5 to 3+ Years', '2')
df['Experience'] = df['Experience'].str.replace('0.6 to 3+ Years', '2')
df['Experience'] = df['Experience'].str.replace('0 to 3 Years', '2')
df['Experience'] = df['Experience'].str.replace('0 to 2 Years', '1')
df['Experience'] = df['Experience'].str.replace('2 Years', '2')
df['Experience'] = df['Experience'].str.replace('1 to 3 Years', '2')
df['Experience'] = df['Experience'].str.replace('0.6 to 3 Years', '1')
df['Experience'] = df['Experience'].str.replace('0.6 Years', '0')
df['Experience'] = df['Experience'].str.replace('0.6 to 2 Years', '1')
df['Experience'] = df['Experience'].str.replace('2 to 3 Years', '3')
df['Experience'] = df['Experience'].str.replace('0.6 to 1 Years', '1')
df['Experience'] = df['Experience'].str.replace('0 to 0.6 Years', '0')
df['Experience'] = df['Experience'].str.replace('0.6 to 2', '1')
df['Experience'] = df['Experience'].str.replace('3+ Years', '4')
df['Experience'] = df['Experience'].str.replace('0.6 to 1', '1')
df['Experience'] = df['Experience'].str.replace('0 to 0', '0')

In [14]:
df['Experience'].unique()

array(['0', '1', '3', '4', '2'], dtype=object)

In [15]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1224 entries, 0 to 1240
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   job_title   1224 non-null   object
 1   Experience  1224 non-null   object
 2   Company     1224 non-null   object
 3   Salary      1224 non-null   object
 4   Location    1182 non-null   object
dtypes: object(5)
memory usage: 57.4+ KB


### Coverting the datatype

In [16]:
df['Experience']=df['Experience'].astype("int64")

In [17]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1224 entries, 0 to 1240
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   job_title   1224 non-null   object
 1   Experience  1224 non-null   int64 
 2   Company     1224 non-null   object
 3   Salary      1224 non-null   object
 4   Location    1182 non-null   object
dtypes: int64(1), object(4)
memory usage: 57.4+ KB


In [18]:
df['Salary'].value_counts()

Salary
25000 - 50000 Monthly    289
Salary not disclosed     200
56000 - 76000 Monthly    200
55000 - 75000 Monthly    197
45000 - 65000 Monthly     99
                        ... 
40000 Monthly              1
11111 Monthly              1
5000 - 7500 Monthly        1
2000 - 15000 Monthly       1
20000 - 26000 Monthly      1
Name: count, Length: 87, dtype: int64

In [19]:
df['Salary']=df['Salary'].replace("Salary not disclosed",np.nan)

In [20]:
df.head(5)

Unnamed: 0,job_title,Experience,Company,Salary,Location
0,Software Developer / Engineer Jobs Opening in ...,0,Rayapuri Media OPC Pvt Ltd,20000 - 40000 Monthly,Hyderabad
1,Python Junior Software Engineer Web Scraping A...,1,Insytelli services,25000 - 35000 Monthly,Bangalore
2,Data Science Jobs Opening in Faclon at Mumbai,0,Faclon,,Mumbai
3,Game Development Jobs Opening in Creative Gali...,0,Creative Galileo,,Pune
4,Cloud Computing Intern Jobs Opening in Quantas...,0,Quantasis Private Limited,,Mumbai


In [21]:
df['Salary'].isnull().sum()

200

In [22]:
df['Salary'].unique()

array(['20000 - 40000 Monthly', '25000 - 35000 Monthly', nan,
       '10000 - 15000 Monthly', '5000 Monthly', '25000 - 40000 Monthly',
       '45000 - 100000 Monthly', '7000 - 20000 Monthly',
       '15000 - 20000 Monthly', '15000 - 16000 Monthly',
       '20000 - 25000 Monthly', '25000 - 30000 Monthly',
       '30000 - 35000 Monthly', '20000 - 50000 Monthly',
       '30000 - 40000 Monthly', '5000 - 15000 Monthly',
       '29000 - 47000 Monthly', '2000000 - 3200000 Yearly',
       '10000 - 12000 Monthly', '13000 - 15000 Monthly',
       '15000 - 25000 Monthly', '10000 Monthly', '22000 - 23000 Monthly',
       '20000 - 30000 Monthly', '2000 - 40000 Monthly',
       '100000 - 125000 Monthly', '15000 - 18000 Monthly',
       '2000 - 3000 Monthly', '5000 - 8000 Monthly',
       '5000 - 10000 Monthly', '50000 - 70000 Monthly',
       '6000 - 12000 Monthly', '40000 - 80000 Monthly', '8000 Monthly',
       '30000 - 70000 Monthly', '25000 - 37500 Monthly',
       '16000 - 22000 Monthly', '4000

### Extracting the yearly data only

In [23]:
df[df['Salary'].str.contains("Yearly",na=False)]

Unnamed: 0,job_title,Experience,Company,Salary,Location
95,SFDC Tester Jobs Opening in Vy systems at Asho...,4,Vy systems,2000000 - 3200000 Yearly,Hyderabad
181,Senior WordPress Developer Jobs Opening in CXO...,0,CXO Enterprises Private Limited,400000 - 600000 Yearly,Bangalore
271,Sr. Python / Lead Engineer - REMOTE Jobs Openi...,4,Anvayainfosolutions,1000000 - 3000000 Yearly,Bangalore
275,Software Trainee Engineer Jobs Opening in Tale...,0,TalentBridge Technologies Pvt Ltd,300000 Yearly,Bangalore
276,Software Engineer Trainee Jobs Opening in Tale...,0,TalentBridge Technologies Pvt Ltd,300000 Yearly,Bangalore
306,Junior Salesforce Developer Jobs Opening in To...,0,Torrance Consultancy,150000 - 200000 Yearly,Hyderabad


### Replace the yearly data to monthly

In [24]:
df['Salary'] = df['Salary'].str.replace('2000000 - 3200000 Yearly','170000 - 270000 Monthly')
df['Salary'] = df['Salary'].str.replace('400000 - 600000 Yearly','34000 - 50000 Monthly')
df['Salary'] = df['Salary'].str.replace('1000000 - 3000000 Yearly','83000 - 250000 Monthly')
df['Salary'] = df['Salary'].str.replace('300000 Yearly','25000 Monthly')
df['Salary'] = df['Salary'].str.replace('150000 - 200000 Yearly','12500 - 17000 Monthly')



In [25]:
df['Salary'].str.contains("Yearly").sum()

0

In [26]:
df['Salary'].unique()

array(['20000 - 40000 Monthly', '25000 - 35000 Monthly', nan,
       '10000 - 15000 Monthly', '5000 Monthly', '25000 - 40000 Monthly',
       '45000 - 100000 Monthly', '7000 - 20000 Monthly',
       '15000 - 20000 Monthly', '15000 - 16000 Monthly',
       '20000 - 25000 Monthly', '25000 - 30000 Monthly',
       '30000 - 35000 Monthly', '20000 - 50000 Monthly',
       '30000 - 40000 Monthly', '5000 - 15000 Monthly',
       '29000 - 47000 Monthly', '170000 - 270000 Monthly',
       '10000 - 12000 Monthly', '13000 - 15000 Monthly',
       '15000 - 25000 Monthly', '10000 Monthly', '22000 - 23000 Monthly',
       '20000 - 30000 Monthly', '2000 - 40000 Monthly',
       '100000 - 125000 Monthly', '15000 - 18000 Monthly',
       '2000 - 3000 Monthly', '5000 - 8000 Monthly',
       '5000 - 10000 Monthly', '50000 - 70000 Monthly',
       '6000 - 12000 Monthly', '40000 - 80000 Monthly', '8000 Monthly',
       '30000 - 70000 Monthly', '25000 - 37500 Monthly',
       '16000 - 22000 Monthly', '34000

In [27]:
df['Salary']=df['Salary'].str.replace("Monthly","")

In [28]:
df['Salary']

0       20000 - 40000 
1       25000 - 35000 
2                  NaN
3                  NaN
4                  NaN
             ...      
1235    50000 - 75000 
1236    50000 - 75000 
1237    50000 - 75000 
1238    50000 - 75000 
1240    50000 - 75000 
Name: Salary, Length: 1224, dtype: object

In [31]:
df['job_title'].value_counts()

job_title
Python Developer Jobs Opening in ADITHYA IT SOLUTIONS PRIVATE LTD at Salem                                                                                 2
Network Engineer Jobs Opening in SMART ENTRY IT INFRASTRUCTURE at Anna Nagar, Chennai                                                                      2
Software Developer / Engineer Jobs Opening in Rayapuri Media OPC Pvt Ltd at Panjagutta, Hyderabad                                                          1
Application Developer Jobs Opening in Cassius Technologies at Leh                                                                                          1
Application Developer Jobs Opening in Cassius Technologies at Wayanad                                                                                      1
Application Developer Jobs Opening in Cassius Technologies at Ooty                                                                                         1
Application Developer Jobs Opening in Cassius Te

In [32]:
df["job_title"]=df["job_title"].str.split("Jobs").str[0]

In [34]:
df['job_title'] = df['job_title'].replace('Beginning Your Career With Dot Net Developer ', 'Dot Net Developer')
df['job_title'] = df['job_title'].replace('Start Your Career With Frontend Developer ', 'Frontend Developer')
df['job_title'] = df['job_title'].replace('Start Your Career With Software Testing Internship/ Training ', 'Software Testing Internship/ Training')
df['job_title'] = df['job_title'].replace('Walk-In For System Operator ', 'System Operator')
df['job_title'] = df['job_title'].replace('Immediate Walkin Drive For Software Tester In Salem ', 'Software Tester')
df['job_title'] = df['job_title'].replace('We Are Hiring For IMS ', 'IMS')
df['job_title'] = df['job_title'].replace('We Are Hiring Data Analyst In Salem ', 'Data Analyst')
df['job_title'] = df['job_title'].replace('We Are Hiring For IMS ', 'IMS')
df['job_title'] = df['job_title'].replace('Beginning Your Career With Dot Net Developer ', 'Dot Net Developer')

In [36]:
df['job_title'] = df['job_title'].replace('System Administrator In Salem ', 'System Administrator')
df['job_title'] = df['job_title'].replace('System Operator In Salem ', 'System Operator')
df['job_title'] = df['job_title'].replace('Urgent Required Java Intern Fresher ', 'Java Intern')
df['job_title'] = df['job_title'].replace('.NET Developer - 1 Year Experience ', 'Dot Net Developer')
df['job_title'] = df['job_title'].replace('HE pvt ltd Raipur Remote Work From Home Recruitment 2024 | Apply Online', 'HE Remote Work')
df['job_title'] = df['job_title'].replace('We Are Hiring For IMS Role At Salem ', 'IMS')
df['job_title'] = df['job_title'].replace('.NET Developer ', 'Dot Net Developer')

In [37]:
df['job_title'].unique()

array(['Software Developer / Engineer ',
       'Python Junior Software Engineer Web Scraping And Automation Entry Level ',
       'Data Science ', 'Game Development ', 'Cloud Computing Intern ',
       '3D Unity Development ', 'Systems Development Engineer ',
       'Front End Development ', 'Flutter Development ',
       'Web Development ', 'Android App Development ', 'Big Data ',
       'Artificial Intelligence - AI ', 'Laravel Development Intern ',
       'Flutter Developer Intern ', 'WordPress Development ',
       '.NET Development ', 'Reactjs Development ',
       'AWS DevOps Engineer ', 'PHP And Laravel Developers ',
       'Python Development ', 'Embedded Systems ', 'QA Testing ',
       'SAP HANA (S/4) Consulting ', 'PHP Development ',
       'Large Language Model Researcher ', 'Full Stack Development ',
       'Data Analytics ', 'JavaScript Developer ',
       'System & Network Administration ', 'Database Development ',
       'Mobile App Development ', 'Network Engineering 

In [40]:
df['job_title'].value_counts()

job_title
PHP Programmer                                                              258
Java Developer                                                              112
Software Engineer                                                           109
Web Developer                                                               102
Application Developer                                                       100
Web Designer                                                                100
Front End Developer                                                          98
Automation Test Engineer                                                     33
Full Stack Developer                                                         11
Software Developer                                                            8
PHP Developer                                                                 8
Python Development                                                            6
IT Software Engineer          

In [41]:
df['Location'].value_counts()

Location
Bangalore                      58
Delhi                          47
Mumbai                         44
Pune                           40
Chennai                        37
Hyderabad                      34
Salem                          33
Ahmedabad                      17
Kolkata                        15
Gurgaon                        15
Noida                          12
Surat                          11
Tirupati                       10
Nellore                         9
Chandigarh                      9
Itanagar                        8
Tawang                          8
Port Blair                      8
Guntur                          8
Patna                           8
Vijayawada                      8
Nagpur                          8
Nicobar                         8
Jaipur                          8
Madurai                         8
South Goa                       7
Raipur                          7
Silchar                         7
Coimbatore                      7
Visak

## To display Maximum rows in a data

In [38]:
pd.set_option("display.max_rows",None)
df

Unnamed: 0,job_title,Experience,Company,Salary,Location
0,Software Developer / Engineer,0,Rayapuri Media OPC Pvt Ltd,20000 - 40000,Hyderabad
1,Python Junior Software Engineer Web Scraping A...,1,Insytelli services,25000 - 35000,Bangalore
2,Data Science,0,Faclon,,Mumbai
3,Game Development,0,Creative Galileo,,Pune
4,Cloud Computing Intern,0,Quantasis Private Limited,,Mumbai
5,3D Unity Development,0,Perceived Design,,Bangalore
6,Systems Development Engineer,0,COSGrid Systems Private Limited,,Chennai
7,Front End Development,0,BUCKMINT,,Delhi
8,Flutter Development,0,HealthFlex,,Bangalore
9,Flutter Development,0,Antino Labs Private Limited,,Delhi


In [39]:
df.to_csv("freshers_world.csv",index=False)