# **Introduction**

From Amazon, Microsoft, and Google to Wayfair, the technology industry has been shaken by massive layoffs since mid-2022.

This dataset includes information on 450+ technology companies and can be used to gain insight into technology industry trends and to make informed career or business decisions.

In this notebook we perform an exploratory data analysis (EDA) of employee layoffs.

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import matplotlib
# Use a larger default font size for every figure drawn in this notebook.
matplotlib.rcParams['font.size'] = 22

In [2]:
# Location of the raw layoffs CSV on disk.
# NOTE(review): this is an absolute, machine-specific path; the notebook only
# runs unchanged on a machine with the same drive layout.
FILEPATH = "G:/CODE_playground/py/data_science/colab notebooks/tech layoffs analysis/data/tech_layoffs.csv"
techlayoff_data = FILEPATH

# Read the CSV into the working frame and show it as the cell output.
df = pd.read_csv(filepath_or_buffer=techlayoff_data)
df

Check for value anomalies

In [3]:
from pandas.api.types import is_numeric_dtype

# Columns expected to be numeric (not referenced by the loop below).
num_cols = ['total_layoffs','impacted_workforce_percentage']
cols = df.columns

# Report, column by column, whether pandas loaded it with a numeric dtype.
for i in cols:
    print(str(i) + (" is numeric type" if is_numeric_dtype(df[i]) else " is NOT numeric type"))

# Data cleaning and preprocessing

In [4]:
# Count the missing entries in every column.
df.isna().sum()

In [5]:
# Print each column's dtype and non-null count for the freshly loaded frame.
df.info()

It looks like all columns contain string values. We need to keep this in mind during the EDA.

In [6]:
# Drop the 'additional_notes' column.
# The frame is rebound (no inplace=True) and errors='ignore' is passed so the
# cell can be re-run safely: the in-place version raised KeyError on a second
# run because the column was already gone.
df = df.drop(columns=['additional_notes'], errors='ignore')
df.columns

In [7]:
# Parse the report dates and derive the calendar parts used by later cells.
# pd.to_datetime replaces .astype('datetime64'): pandas 2.x rejects that
# unit-less dtype with a TypeError.
df['reported_date'] = pd.to_datetime(df['reported_date'])
df['Month'] = df['reported_date'].dt.month
# Year is stored as a string such as '2022' (strftime output), not an integer.
df['Year'] = df['reported_date'].dt.strftime('%Y')
df.info()

Clean 'industry' column:

In [8]:
# Before: the raw 'industry' values, shown prior to cleaning
df['industry']

In [9]:
# ind1 = []
# ind2 = []

# for i in df['industry'].to_list():
#   ind1.append(i.split(', '))
#   ind2.append(i.split(', ')[-1])

In [10]:
# After: keep only the text following the last space of each industry value,
# lower-cased and with hyphens removed.
ind = [
    raw.split(' ')[-1].lower().replace("-", "")
    for raw in df['industry'].to_list()
]

df['industry'] = ind
df['industry']

Clean 'headquarter_location' column:

In [11]:
# Before: the raw 'headquarter_location' values, shown prior to cleaning
df['headquarter_location']

In [12]:
# After: keep only the text following the last comma of each location,
# lower-cased and with hyphens and spaces removed.
ind = [
    raw.split(',')[-1].lower().replace("-", "").replace(" ", "")
    for raw in df['headquarter_location'].to_list()
]

df['headquarter_location'] = ind
df['headquarter_location']

In [13]:
# Convert 'total_layoffs' to integers; any value that is not a plain digit
# string becomes 0.
# - str(value) lets the cell be re-run (values are already ints then) and
#   tolerates missing values; both crashed on `.isnumeric()` before.
# - isdecimal() is used instead of isnumeric() because isnumeric() also accepts
#   characters such as '½' that int() cannot parse.
df['total_layoffs'] = df['total_layoffs'].apply(
    lambda value: int(value) if str(value).isdecimal() else 0
)
df['total_layoffs']

In [14]:
# Kernel-density estimate of the 'total_layoffs' column.
fig, ax = plt.subplots(figsize=(10, 8))
sns.kdeplot(x=df['total_layoffs'], ax=ax)
ax.set_title('Number of Layoffs')
plt.show()

In [15]:
# Convert 'impacted_workforce_percentage' to integers; any value that is not a
# plain digit string becomes 0.
# - str(value) lets the cell be re-run (values are already ints then) and
#   tolerates missing values; both crashed on `.isnumeric()` before.
# - isdecimal() is used instead of isnumeric() because isnumeric() also accepts
#   characters such as '½' that int() cannot parse.
df['impacted_workforce_percentage'] = df['impacted_workforce_percentage'].apply(
    lambda value: int(value) if str(value).isdecimal() else 0
)
df['impacted_workforce_percentage']

In [16]:
# Print each column's dtype and non-null count again, after the conversions above.
df.info()

Now the data is ready for exploration.

In [17]:
# Persist the cleaned frame next to the notebook.
# index=False: the row index is only positional here, and writing it would add
# an unnamed extra column that shows up as 'Unnamed: 0' when the file is reloaded.
df.to_csv("tech_layoffs_clean.csv", index=False)

# Data Exploration and Visualization

In [18]:
# The ten rows with the highest 'total_layoffs', largest first.
top_10 = df.sort_values(by='total_layoffs', ascending=False).iloc[:10]
top_10

In [19]:
# !pip install matplotlib --upgrade --user

In [20]:
# Show the installed matplotlib version.
print(f'matplotlib: {matplotlib.__version__}')

### Top 10 Layoffs By Company Name

In [21]:
fig, ax = plt.subplots(figsize=(25, 10))

# Horizontal bars: one per company in top_10, bar length = total_layoffs.
p = sns.barplot(data=top_10, x="total_layoffs", y="company", ax=ax)

# Annotate every bar with its value; the negative padding shifts the label
# back from the bar's end.
for container in p.containers:
    p.bar_label(container, padding=-80)

p.set_title("Top 10 Company Layoffs")
p.set_xlabel('Layoffs')
p.set_ylabel('Company Name')
plt.show()

### Top 10 Layoffs By Industry

In [22]:
fig, ax = plt.subplots(figsize=(25, 10))

# Number of rows per industry, ten most frequent first.
# The count column is named explicitly rather than relying on the implicit
# integer label 0 that to_frame() assigns to an unnamed Series.
top_10_industries = (
    df.groupby('industry')
    .size()
    .sort_values(ascending=False)
    .head(10)
    .reset_index(name='n_records')
)

p = sns.barplot(x='n_records', y='industry', data=top_10_industries, ax=ax)
# Annotate every bar with its value (the leftover debugging print of each
# container has been removed).
for container in p.containers:
    p.bar_label(container, padding=-40)

plt.title("Top 10 Layoffs By Industry")
plt.ylabel('Industry')
# Spelled 'Layoffs' to match the axis labels of the other charts.
plt.xlabel('Layoffs')
plt.show()

### Top 10 Layoffs By HQ Location

In [23]:
fig, ax = plt.subplots(figsize=(25, 10))

# Number of rows per headquarters location, ten most frequent first.
# to_frame() on the unnamed Series yields a count column labelled 0.
top_10_locations = (
    df.groupby('headquarter_location')
    .size()
    .sort_values(ascending=False)
    .head(10)
    .to_frame()
    .reset_index()
)

p = sns.barplot(data=top_10_locations, x=0, y='headquarter_location', ax=ax)
# Annotate every bar with its value.
for container in p.containers:
    p.bar_label(container, padding=-45)

ax.set_title("Top 10 Layoffs By HQ Location")
ax.set_ylabel("headquarter_location")
ax.set_xlabel('Layoffs')
plt.show()

### Layoffs By IPO status

In [24]:
fig, ax = plt.subplots(figsize=(25, 5))
# Row count per status as a small frame; the chart below counts directly from
# df and does not read this table.
df_status = df.groupby('status').size().sort_values(ascending=False)
df_status = df_status.to_frame().reset_index()

# Horizontal count plot: one bar per status value, bar length = number of rows.
p = sns.countplot(y='status', data=df, ax=ax)
for container in p.containers:
    p.bar_label(container, padding=-40)
plt.title("Private vs Public Layoffs")
# 'status' is drawn on the y-axis and the counts on the x-axis; the two labels
# were previously swapped.
plt.ylabel('IPO status')
plt.xlabel('Number of Layoffs')
plt.show()

### Layoffs By Year

In [25]:
fig, ax = plt.subplots(figsize=(25, 8))

# One bar per 'Year' value, bar height = number of rows in that year.
p = sns.countplot(data=df, x="Year", ax=ax)

# Annotate every bar with its value.
for container in p.containers:
    p.bar_label(container, padding=-50)

ax.set_title("Layoffs in 2022 and 2023")
ax.set_ylabel("Number of Layoffs")
ax.set_xlabel('Year')
plt.show()

In [26]:
# Group the rows by month and count the non-null 'total_layoffs' entries in
# each group. This is the number of records per month, not the sum of the
# layoff head-counts.
month_layoffs = df.groupby(by="Month")["total_layoffs"].agg("count").reset_index()
month_layoffs

In [27]:
# Kernel-density estimate of the 'Month' column.
fig, ax = plt.subplots(figsize=(10, 8))
sns.kdeplot(x=df['Month'], ax=ax)
ax.set_title('Distribution of Layoffs per Month')
plt.show()

### Layoffs By Month

In [28]:
import calendar
plt.figure(figsize=(20, 10))

# Build the line chart from the per-month table `month_layoffs`, which has the
# 'Month' and 'total_layoffs' columns used here. The previous `data=gg`
# referenced a name that is not defined anywhere in the visible notebook and
# raised NameError on a fresh kernel.
sns.lineplot(x="Month", y="total_layoffs", data=month_layoffs)
plt.title("Layoffs Trends")
plt.ylabel("Layoffs")
plt.xlabel("Month")

# Set x-tick labels to the month names for ticks 1..12.
plt.xticks(range(1,13), [calendar.month_name[i] for i in range(1,13)], rotation=20)
plt.show()