# Machine Learning Project

## 1. Import the required libraries and load data

In [37]:
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split


hr_df = pd.read_csv("https://bit.ly/2ODZvLCHRDataset")


## 2. Data cleaning and Preparation:
a. View first few records
b. Check the shape of data
c. Check for and deal with missing values
d. Check for and deal with duplicates

In [38]:
# Check the first 10 records

hr_df.head(10)

Unnamed: 0,employee_id,department,region,education,gender,recruitment_channel,no_of_trainings,age,previous_year_rating,length_of_service,KPIs_met >80%,awards_won?,avg_training_score,is_promoted
0,65438,Sales & Marketing,region_7,Master's & above,f,sourcing,1,35,5.0,8,1,0,49,0
1,65141,Operations,region_22,Bachelor's,m,other,1,30,5.0,4,0,0,60,0
2,7513,Sales & Marketing,region_19,Bachelor's,m,sourcing,1,34,3.0,7,0,0,50,0
3,2542,Sales & Marketing,region_23,Bachelor's,m,other,2,39,1.0,10,0,0,50,0
4,48945,Technology,region_26,Bachelor's,m,other,1,45,3.0,2,0,0,73,0
5,58896,Analytics,region_2,Bachelor's,m,sourcing,2,31,3.0,7,0,0,85,0
6,20379,Operations,region_20,Bachelor's,f,other,1,31,3.0,5,0,0,59,0
7,16290,Operations,region_34,Master's & above,m,sourcing,1,33,3.0,6,0,0,63,0
8,73202,Analytics,region_20,Bachelor's,m,other,1,28,4.0,5,0,0,83,0
9,28911,Sales & Marketing,region_1,Master's & above,m,sourcing,1,32,5.0,5,1,0,54,0


In [39]:
# Check the shape of the data frame

hr_df.shape

(54808, 14)

In [40]:
# Check for missing values

hr_df.isnull().sum()

employee_id                0
department                 0
region                     0
education               2409
gender                     0
recruitment_channel        0
no_of_trainings            0
age                        0
previous_year_rating    4124
length_of_service          0
KPIs_met >80%              0
awards_won?                0
avg_training_score         0
is_promoted                0
dtype: int64

In [41]:
# Replace the missing values in previous_year_rating column using the mean of the column
# Recheck missing values

hr_df["previous_year_rating"].fillna(hr_df["previous_year_rating"].mean(), inplace = True)

hr_df.isnull().sum()

employee_id                0
department                 0
region                     0
education               2409
gender                     0
recruitment_channel        0
no_of_trainings            0
age                        0
previous_year_rating       0
length_of_service          0
KPIs_met >80%              0
awards_won?                0
avg_training_score         0
is_promoted                0
dtype: int64

In [42]:
# Check for duplicates

hr_df.duplicated().sum()

0

## 3. Splitting the data into training and validation sets

In [43]:
features = hr_df.drop(['is_promoted','department','region','education','gender','recruitment_channel'], axis=1)
target = hr_df['is_promoted']

features_train, features_valid, target_train, target_valid = train_test_split(
    features, target, test_size=0.25, random_state=12345
)

print(features_train.shape)
print(features_valid.shape)
print(target_train.shape)
print(target_valid.shape)


(41106, 8)
(13702, 8)
(41106,)
(13702,)


## 4. Train the model and predict

In [44]:
model = DecisionTreeClassifier()

model.fit(features, target)

predicted_valid = model.predict(features_valid)

print(predicted_valid)

[0 0 0 ... 1 1 0]
