# Evaluation Portfolio (Task - 3)

## Demonstrating Data Analytics Pipeline (collection, cleaning, transformation and visualisation)
### Submitted By - Khwnasat Giri Narzary

### Step 1 - Data Collection/Loading

In [1]:
# Importing Necessary Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Reading the placement dataset from a CSV file in the working directory
original = pd.read_csv("job_placement_data.csv")
# Previewing six randomly picked rows
original.sample(n=6)

Unnamed: 0,id,name,gender,age,degree,stream,college_name,placement_status,salary,gpa,years_of_experience,skills
240,241,Olivia Smith,Female,24.0,Bachelor's,Information Technology,University of Illinois--Urbana-Champaign,Placed,65000,3.8,3.0,"Java, C++, Problem Solving"
470,471,Mia Wilson,Female,23.0,Bachelor's,Electrical Engineering,University of Texas--Dallas,Placed,63000,3.6,1.0,"Networking, Cyber Security, Linux"
292,293,Chloe Hernandez,Female,26.0,Bachelor's,Electrical Engineering,University of Texas--Dallas,Placed,63000,3.6,1.0,"Machine Learning, AI, Deep Learning"
133,134,Sophia Price,Female,,Bachelor's,Electronics and Communication,University of Illinois--Urbana-Champaign,Placed,65000,3.8,3.0,"Python, SQL, Data Analysis"
286,287,Emma Martinez,Female,26.0,Bachelor's,Mechanical Engineering,University of Rochester,Placed,62000,3.8,3.0,"Python, SQL, Data Analysis"
638,639,Ava Lee,Female,24.0,Bachelor's,Information Technology,University of California--Santa Cruz,Placed,60000,3.7,2.0,"Java, C++, Problem Solving"


#### Checking Data for inconsistency or issues

In [3]:
# Working on a deep copy so the frame loaded from disk stays untouched
data = original.copy(deep=True)

# (rows, columns) of the working copy
data.shape

(700, 12)

In [4]:
# Checking datatypes: reset_index() turns the dtypes Series into a small
# two-column table (column name, dtype) for display
data.dtypes.reset_index()

Unnamed: 0,index,0
0,id,int64
1,name,object
2,gender,object
3,age,float64
4,degree,object
5,stream,object
6,college_name,object
7,placement_status,object
8,salary,int64
9,gpa,float64


In [29]:
# Counting the missing (null/NaN) entries in every column
data.isna().sum()

id             0
name           0
gender         0
age           70
degree         0
stream         0
college        0
status         0
salary         0
gpa            0
experience     0
skills         0
dtype: int64

#### Found missing values in the "years_of_experience" and "age" columns

In [None]:
# Listing the column labels of the dataset
data.columns


(699, 12)

In [7]:
# Showing how often each distinct value occurs in the columns of interest

columns = [
    'gender',
    'age',
    'degree',
    'stream',
    'college_name',
    'placement_status',
    'years_of_experience',
]
print("==================== S T A R T =======================")

for column_name in columns:
    # One print call: the value counts, then the separator on its own line
    print(
        data[column_name].value_counts(),
        "======================================================",
        sep="\n",
    )

gender
Female    366
Male      334
Name: count, dtype: int64
age
24.0    187
23.0    180
26.0    169
25.0     94
Name: count, dtype: int64
degree
Bachelor's    700
Name: count, dtype: int64
stream
Computer Science                 214
Information Technology           152
Electrical Engineering           112
Mechanical Engineering           111
Electronics and Communication    111
Name: count, dtype: int64
college_name
University of Michigan--Ann Arbor            43
University of California--Berkeley           43
University of Colorado--Boulder              43
University of Illinois--Urbana-Champaign     43
University of Virginia                       43
University of North Carolina--Chapel Hill    42
University of Maryland--College Park         42
University of California--Santa Cruz         42
University of Pennsylvania                   40
University of Washington                     40
University of California--Riverside          34
University of California--San Francisco      34
Uni

In [8]:
# Checking for duplicate values: counts the rows flagged as fully duplicated
data.duplicated().sum()

0

### Findings/suggestions for next step: 
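1. Found missing values in two columns: age (70) and years_of_experience (1)
2. Datatypes can be changed (object columns to string, age to integer)
3. Columns can be renamed to shorter names (years_of_experience, college_name, placement_status)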
4. Skills column can be normalised

### Step 2 & 3 - Data Cleaning and Transformation

In [9]:
# Renaming columns for easier use.
# All three renames are expressed as one mapping and applied in a single call;
# the result is assigned back to `data` instead of mutating the frame in place.
data = data.rename(
    columns={
        "years_of_experience": "experience",
        "college_name": "college",
        "placement_status": "status",
    }
)
# Displaying the resulting column labels
data.columns

Index(['id', 'name', 'gender', 'age', 'degree', 'stream', 'college', 'status',
       'salary', 'gpa', 'experience', 'skills'],
      dtype='object')

In [10]:
# Summary of columns, non-null counts and dtypes after the renaming
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 700 entries, 0 to 699
Data columns (total 12 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   id          700 non-null    int64  
 1   name        700 non-null    object 
 2   gender      700 non-null    object 
 3   age         630 non-null    float64
 4   degree      700 non-null    object 
 5   stream      700 non-null    object 
 6   college     700 non-null    object 
 7   status      700 non-null    object 
 8   salary      700 non-null    int64  
 9   gpa         700 non-null    float64
 10  experience  699 non-null    float64
 11  skills      700 non-null    object 
dtypes: float64(3), int64(2), object(7)
memory usage: 65.8+ KB


In [11]:
# Converting datatypes
# 1. Every object-typed column becomes pandas' dedicated "string" dtype
object_columns = [name for name in data.columns if data[name].dtype == "object"]
for column_name in object_columns:
    data[column_name] = data[column_name].astype("string")

# 2. Age becomes the nullable integer dtype "Int64", so missing ages remain missing
data["age"] = data["age"].astype("Int64")


In [12]:
# Re-checking the column dtypes after the conversion
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 700 entries, 0 to 699
Data columns (total 12 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   id          700 non-null    int64  
 1   name        700 non-null    string 
 2   gender      700 non-null    string 
 3   age         630 non-null    Int64  
 4   degree      700 non-null    string 
 5   stream      700 non-null    string 
 6   college     700 non-null    string 
 7   status      700 non-null    string 
 8   salary      700 non-null    int64  
 9   gpa         700 non-null    float64
 10  experience  699 non-null    float64
 11  skills      700 non-null    string 
dtypes: Int64(1), float64(2), int64(2), string(7)
memory usage: 66.4 KB


In [13]:
# Option 1 - Dropping every row whose "experience" value is missing
data = data.dropna(subset=["experience"])
# Confirming the remaining row count and per-column non-null counts
data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 699 entries, 0 to 699
Data columns (total 12 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   id          699 non-null    int64  
 1   name        699 non-null    string 
 2   gender      699 non-null    string 
 3   age         629 non-null    Int64  
 4   degree      699 non-null    string 
 5   stream      699 non-null    string 
 6   college     699 non-null    string 
 7   status      699 non-null    string 
 8   salary      699 non-null    int64  
 9   gpa         699 non-null    float64
 10  experience  699 non-null    float64
 11  skills      699 non-null    string 
dtypes: Int64(1), float64(2), int64(2), string(7)
memory usage: 71.7 KB


In [14]:
# Option 2 - Replacing with Mean, Median or Mode
data.describe()

Unnamed: 0,id,age,salary,gpa,experience
count,699.0,629.0,699.0,699.0,699.0
mean,350.221745,24.400636,52463.519313,3.750501,2.177396
std,202.227533,1.162744,25176.734034,0.121283,0.779393
min,1.0,23.0,0.0,3.4,1.0
25%,175.5,23.0,61000.0,3.7,2.0
50%,350.0,24.0,64000.0,3.8,2.0
75%,524.5,26.0,66000.0,3.9,3.0
max,700.0,26.0,68000.0,3.9,3.0


In [15]:
# Checking total count of each age
data["age"].value_counts()

age
24    186
23    180
26    169
25     94
Name: count, dtype: Int64

In [16]:
# New dataframe with only null values
null_data = data[data["age"].isnull()]

In [17]:
# data distribution of age based on experience
null_data["experience"].value_counts()

experience
2.0    30
3.0    22
1.0    18
Name: count, dtype: int64

In [18]:
g = data.groupby("age")["experience"].value_counts()