In [241]:
# Importing packages
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
from geopy.geocoders import Nominatim


In [242]:
# Importing data
df= pd.read_csv('layoffs.csv')

print(df.head())
print(df.shape)
# look for NA's


      company       location   industry  total_laid_off  percentage_laid_off  \
0      Amazon        Seattle     Retail          8000.0                 0.02   
1  Salesforce    SF Bay Area      Sales          8000.0                 0.10   
2       Vimeo  New York City   Consumer             NaN                 0.11   
3     Harappa      New Delhi  Education            60.0                 0.30   
4   ByteDance       Shanghai   Consumer             NaN                 0.10   

         date     stage        country  funds_raised  
0  2023-01-04       IPO  United States         108.0  
1  2023-01-04       IPO  United States          65.0  
2  2023-01-04       IPO  United States         450.0  
3  2023-01-03  Acquired          India           NaN  
4  2023-01-03   Unknown          China        9400.0  
(1860, 9)


In [243]:
# look for NA's
print(df.isna().sum())

company                  0
location                 0
industry                 6
total_laid_off         565
percentage_laid_off    605
date                     1
stage                    4
country                  0
funds_raised           141
dtype: int64


In [244]:
# check for duplicates
df[df.duplicated()]

Unnamed: 0,company,location,industry,total_laid_off,percentage_laid_off,date,stage,country,funds_raised
1003,Cazoo,London,Transportation,750.0,0.15,2022-06-07,IPO,United Kingdom,2000.0


In [245]:
# Remove duplicates
df.drop_duplicates(inplace=True)

In [246]:
# Deal with industry Naans by doing a google search
df[df['industry'].isnull()]

Unnamed: 0,company,location,industry,total_laid_off,percentage_laid_off,date,stage,country,funds_raised
107,DataRails,Tel Aviv,,30.0,0.18,2022-12-05,Series B,Israel,103.0
139,Motional,Boston,,,,2022-11-30,Unknown,United States,
151,Proton.ai,Boston,,,,2022-11-29,Series A,United States,20.0
261,InfluxData,SF Bay Area,,65.0,0.27,2022-11-10,Series D,United States,119.0
275,AvantStay,Los Angeles,,144.0,0.22,2022-11-09,Private Equity,United States,686.0
306,Exodus,Nebraska City,,59.0,0.22,2022-11-04,Unknown,United States,60.0


In [247]:
# filling in industry NA's with correct data
df.loc[df['company']=='DataRails','industry'] = 'Finance'
df.loc[df['company']=='Motional','industry'] = 'Transportation'
df.loc[df['company']=='Proton.ai','industry'] = 'Sales'
df.loc[df['company']=='InfluxData','industry'] = 'Product'
df.loc[df['company']=='AvantStay','industry'] = 'Travel'
df.loc[df['company']=='Exodus','industry'] = 'Crypto'

In [248]:
# Filling in the 4 stage column NA's with unknown
df['stage'] = df['stage'].fillna('Unknown')

In [249]:
# looking at the observation with the NA in the date variable
df[df['date'].isnull()]

Unnamed: 0,company,location,industry,total_laid_off,percentage_laid_off,date,stage,country,funds_raised
1859,Homebound,SF Bay Area,Real Estate,,,,Unknown,United States,128.0


In [250]:
# removing Homebound observation since it has too much missing data
df = df[df.company != 'Homebound']

In [251]:
# convert date column to panda datetime series and add month, year, and month_year column rows
df['date'] = df['date'].apply(pd.to_datetime)
df['month'] = df['date'].dt.month
df['year'] = df['date'].dt.year
df['month_year'] = df['date'].apply(lambda x: x.strftime('%B-%Y')) 
df['quarter'] = df['date'].dt.to_period('Q')

# and changing the month and year columns to objects for categorical visualizations
df['year'] = df['year'].astype(str)
df['month'] = df['month'].astype(str)



In [252]:

# removing all rows that have NA in both columns(total_laid_off and percentage_laid_off) since I will need 
# at least one of these variables for the observation to be useful

bothNA = df[["total_laid_off", "percentage_laid_off"]].isnull().all(axis=1) 
df = df.loc[~bothNA, :]


In [253]:
# checking how many NA's are left
df.isna().sum()

company                  0
location                 0
industry                 0
total_laid_off         277
percentage_laid_off    317
date                     0
stage                    0
country                  0
funds_raised           106
month                    0
year                     0
month_year               0
quarter                  0
dtype: int64

In [254]:
# If a company is missing either total laid off or percentage laid off it can be calculated if the company shows
# up again in the dataset AND has both values populated. First I need to calculate total number of employees
df['total_employees'] = df['total_laid_off']/df['percentage_laid_off']

In [255]:
# How many inf, NA or zero values in the new variable total_employees?

print(f" The number of total_employees observations with NA values is {sum(df.total_employees.isna())}")
print(f" The number of total_employees observations with inf values is {np.isinf(df['total_employees']).values.sum()}")
print(f" The number of total_employee observations with 0 total employees {(df['total_employees'] == 0).sum()}")

 The number of total_employees observations with NA values is 594
 The number of total_employees observations with inf values is 1
 The number of total_employee observations with 0 total employees 0


In [256]:
# replacing inf values with NA
df = df.replace([np.inf, -np.inf], np.nan)

In [257]:
# If company total employees column has an NA Im going to fill it with the value from a differrent row where the total
# employee value is not an NA
df['total_employees'] = df['total_employees'].fillna(df.groupby('company')['total_employees'].transform('mean'))

In [258]:
# fill the percentage_laid_off NA values with total_laid_off values divided by total_employees values for companies
# where I know the total amount of employees
df['percentage_laid_off'] = df.apply(
    lambda row: row['total_laid_off']/row['total_employees'] if np.isnan(row['percentage_laid_off']) else row['percentage_laid_off'],
    axis=1
)

In [259]:
df.isna().sum() # percentage_laid_off now has 70 less NA values (317-247)

company                  0
location                 0
industry                 0
total_laid_off         277
percentage_laid_off    247
date                     0
stage                    0
country                  0
funds_raised           106
month                    0
year                     0
month_year               0
quarter                  0
total_employees        502
dtype: int64

In [260]:
# fill the total_laid_off NA values with percentage_laid_off values multiplied by total_employees values 
# for companies where I know the total amount of employees
df['total_laid_off'] = df.apply(
    lambda row: row['percentage_laid_off']*row['total_employees'] if np.isnan(row['total_laid_off']) else row['total_laid_off'],
    axis=1
)

In [261]:
df.isna().sum() #total laid off now has 23 less NA values (277-254)

company                  0
location                 0
industry                 0
total_laid_off         254
percentage_laid_off    247
date                     0
stage                    0
country                  0
funds_raised           106
month                    0
year                     0
month_year               0
quarter                  0
total_employees        502
dtype: int64

In [262]:
df.describe

<bound method NDFrame.describe of             company        location        industry  total_laid_off  \
0            Amazon         Seattle          Retail          8000.0   
1        Salesforce     SF Bay Area           Sales          8000.0   
2             Vimeo   New York City        Consumer             NaN   
3           Harappa       New Delhi       Education            60.0   
4         ByteDance        Shanghai        Consumer             NaN   
...             ...             ...             ...             ...   
1854        Service     Los Angeles          Travel             NaN   
1855   HopSkipDrive     Los Angeles  Transportation             8.0   
1856    Panda Squad     SF Bay Area        Consumer             6.0   
1857  Tamara Mellon     Los Angeles          Retail            20.0   
1858       EasyPost  Salt Lake City       Logistics            75.0   

      percentage_laid_off       date     stage        country  funds_raised  \
0                   0.020 2023-01-

In [263]:
#Checking the data types
df.dtypes

company                        object
location                       object
industry                       object
total_laid_off                float64
percentage_laid_off           float64
date                   datetime64[ns]
stage                          object
country                        object
funds_raised                  float64
month                          object
year                           object
month_year                     object
quarter                 period[Q-DEC]
total_employees               float64
dtype: object

In [264]:
# There are still a lot of NA's so filling them in with mean values wouldnt be feasible
# The Na's seem to be random so Im going to make 3 different dataframes and decide later which
# is the best to use for what is trying to be accomplished.

# first dataframe will have no NA's and I make a copy of the original df before dropping NAs
df2 = df.copy()

In [265]:
# dropping rows with Na's

df2 = df.copy()
df2 = df2.dropna()

In [266]:
# most of the visualizations will be focusing on the total number laid off so I will drop the observations
# that have NA in this variable 
df_total= df.dropna(subset=['total_laid_off'])


In [267]:
# Ill also be creating visuals focusing on the percentage laid off and will create another dataframe
df_perc= df.dropna(subset=['percentage_laid_off'])

In [268]:
# one more where total employee Na rows are removed
df_emp= df.dropna(subset=['total_employees'])

In [269]:
# Start by looking at layoffs by year

df_layoffs_year = df_total.groupby('year',as_index=False)['total_laid_off'].sum()
fig = px.bar(df_layoffs_year, x='year',y='total_laid_off',
       text_auto='.5s',
       template = 'simple_white',
       title="Layoffs by Year",
       category_orders = {'year': ['2020', '2021', '2022', '2023']}, 
       labels={'year': '', 'total_laid_off':'Amount of Layoffs'})

fig.update_traces( textposition='outside')
fig.update_layout(title_x=0.5)


In [270]:
df_total.dtypes

company                        object
location                       object
industry                       object
total_laid_off                float64
percentage_laid_off           float64
date                   datetime64[ns]
stage                          object
country                        object
funds_raised                  float64
month                          object
year                           object
month_year                     object
quarter                 period[Q-DEC]
total_employees               float64
dtype: object

In [279]:
# Looking at layoffs by quarter
df_layoffs_quarter = df_total.groupby('quarter',as_index=False)['total_laid_off'].sum()

# adding the date column back in for time series bar chart since 'quarter' object type is period[Q-DEC]
df_layoffs_quarter['date'] = pd.PeriodIndex(df_layoffs_quarter['quarter'], freq='Q').strftime('%m-%Y')


In [280]:
# displaying layoffs by quarter.  Will fix later if I end up using it

fig = px.bar(df_layoffs_quarter, x='date', y="total_laid_off")
fig.show()

In [282]:
# taking a look at when layoffs occured by using a bubble scatterplot

fig = px.scatter(df_total, x="date", y="total_laid_off",
                 template = 'presentation',
                 labels={
                     "date": "",
                     "total_laid_off": "Amount of Layoffs"
                 },
                title="Layoffs from 2020 until 2023")
fig.update_xaxes(dtick="M11",)

fig.show()

In [286]:
# Looking at layoffs by month with a bar chart

fig = px.bar(df_total.sort_values(by='date', ascending=True),
             x='month_year',
             y="total_laid_off",
             template = 'ggplot2',
             title="Layoff Timeline",
             labels={'month_year': '', 'total_laid_off':'Total Layoffs'})
fig.update_layout(title_x=0.5)
fig.show()

In [287]:
# Layoffs by year and industry with stacked barchart

fig = px.bar(df_ind_yr.sort_values(by=['total_laid_off'], ascending=False), x="industry", y="total_laid_off",
             color='year',
             height=700,
            template = 'ggplot2',
             title="Layoffs by Year and Industry",
            category_orders = {'year': ['2020', '2021', '2022', '2023']},
             labels={'industry': 'Industry', 'total_laid_off':'Total Layoffs'})
fig.update_layout(title_x=0.5)
fig.show()

In [289]:
# Layoffs by year industry with a side by side bar chart instead of stacked
df_ind_yr = df_total.groupby(['industry', 'year']).sum() 
df_ind_yr = df_ind_yr.reset_index()


fig = px.bar(df_ind_yr.sort_values(by=['total_laid_off'], ascending=False), x="industry", y="total_laid_off",
             color='year', barmode='group',
             height=700,
            template = 'ggplot2',
             title="Layoffs by Year and Industry",
             labels={'industry': 'Industry', 'total_laid_off':'Total Layoffs'},
            category_orders = {'year': ['2020', '2021', '2022', '2023']})
fig.update_layout(title_x=0.5)
fig.show()

In [291]:
# There wasnt much layoffs in 2021 so lets look at 2020 compared to 2022



fig = px.bar(df_total[(df_total['year'] == '2020') | (df_total['year'] == '2022')].sort_values(by=['total_laid_off'],
             ascending=False), 
             x="industry",
             y="total_laid_off",
             color='year', barmode='group',
             height=700,
            category_orders = {'year': ['2020', '2022']},
             template = 'seaborn',
             title="2020 vs. 2022 Layoffs by Industry",
             labels={'industry': 'Industry', 'total_laid_off':'Total Layoffs'})
fig.update_layout(title_x=0.5)
fig.show()

In [292]:
# There is only a few weeks in Jan so let's add those layoffs to 2022 to see if it changes the outcome much

df_23is22 = df_total.copy()
df_23is22['year'].mask(df_23is22['year'] == '2023', '2022', inplace=True)
df2_20_23 = df_23is22[df_23is22['year'] != '2021']
df2_20_23.year.unique()

array(['2022', '2020'], dtype=object)

In [293]:
# Bar chart that adds the few weeks of data in 2023 to 2022

fig = px.bar(df_23is22[(df_23is22['year'] == '2020') | (df_23is22['year'] == '2022')].sort_values(by=['total_laid_off'],
             ascending=False), 
             x="industry",
             y="total_laid_off",
             color='year', barmode='group',
             height=700,
            category_orders = {'year': ['2020', '2022']},
             template = 'ggplot2',
             title="Layoffs by Year and Industry (Adding 2023 Data to 2022)",
             labels={'industry': 'Industry', 'total_laid_off':'Total Layoffs'})
fig.update_layout(title_x=0.5)
fig.show()

In [294]:
# take a look at the distribution of the percentage of layoffs in a company.  If a company is laying
# off a small percentage of their staff they could just be letting go of their worst performers or extra 
# people hired in 2021
fig = px.box(df_perc, x="year", y="percentage_laid_off",
             color='year',
            category_orders = {'year': ['2020', '2021', '2022', '2023']},
             template = 'seaborn',
             title="Percent of Company's Workforce Laid Off by Year",
             labels={'percentage_laid_off': 'Percent of Company Laid Off', 'year':'Year'})
fig.layout.yaxis.tickformat = ',.0%'
fig.layout.update(showlegend=False)

fig.show()

In [295]:
# Creating a violin chart with the same parameters as the above boxplot to compare which is a better
# visual

fig = px.violin(df_perc, x="year", y="percentage_laid_off", box = True, points = 'all')
fig.show()



In [296]:
# Creating a box plot to look at percentage of a companies workforce is laid off based on what stage of 
# funding for the company

fig = px.box(df_perc, x="stage", y="percentage_laid_off")
fig.show()

In [297]:
# The boxplot above has too much going on so Im creating another with 5 common stages in order

df_perc_stage = df_perc.loc[df_perc['stage'].isin(['Seed','IPO', 'Series A', 'Series B', 'Series C'])]
fig = px.box(df_perc_stage, x="stage", y="percentage_laid_off",
            title = 'Percent of Layoffs per Company based on Funding Stage',
                          color='stage',
            template = 'seaborn',
            labels={'percentage_laid_off': 'Percent of Company Laid Off', 'stage':'Stage'},
            category_orders = {'stage': ['Seed', 'Series A', 'Series B', 'Series C', 'IPO']})
fig.layout.yaxis.tickformat = ',.0%'
fig.layout.update(showlegend=False)
fig.show()


In [87]:
# Boxplot showing what percent of workforce is laid off by company location. 

fig = px.box(df_perc, x="country", y="percentage_laid_off")
fig.show()

In [298]:
# look at the number of layoffs per location and limit it to the top 15 countries
df_layoffs_country = df_total.groupby('country',as_index=False)['total_laid_off'].sum().sort_values('total_laid_off',ascending=False).head(15)
fig = px.bar(df_layoffs_country,x='country', y='total_laid_off',
             text_auto='.4s',
             title='Layoffs per Country',
            template = 'seaborn',
            labels={'total_laid_off': 'Amount of Layoffs', 'country':'Country'},)
fig.update_traces( textposition='outside')

In [88]:
# look at the number of layoffs per location and limit it to the top 15 cities
df_layoffs_loc = df_total.groupby('location',as_index=False)['total_laid_off'].sum().sort_values('total_laid_off',ascending=False).head(15)
fig = px.bar(df_layoffs_loc,x='location', y='total_laid_off',
             text_auto='.4s',
             title='Layoffs per City',
            template = 'seaborn',
            labels={'total_laid_off': 'Amount of Layoffs', 'location':'City'})
fig.update_traces( textposition='outside')

In [299]:
# The 2 location bar charts would work better as a singular sunburst chart


df_city_country = df_total.groupby(['country', 'location'], as_index=False)['total_laid_off'].sum().sort_values('total_laid_off',ascending=False)


fig = px.sunburst(df_city_country, path=['country', 'location'],
                  template='seaborn',

                  values='total_laid_off',)
#                   color='lifeExp', 
#                   hover_data=['iso_alpha'],
#                   color_continuous_scale='RdBu',
#                   color_continuous_midpoint=np.average(df['lifeExp'], weights=df['pop']))
fig.show()


The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.


The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.



In [301]:
# Previously looked at number of layoffs and now will look at the number companies
# in each industry that had layoffs

df_layoffs_co = df.groupby('industry')[['company']].nunique().sort_values('company',ascending=False)
df_layoffs_co = df_layoffs_co.reset_index()
fig = px.bar(df_layoffs_co,x='industry', y='company',
            title='Number of Companies with Layoffs by Industry',
            template = 'seaborn',
            labels={'company': 'Amount of Companies', 'industry':'Industry'})
fig.update_traces( textposition='outside')

In [96]:
# df_ind_yer = df_total.groupby(['industry', 'year']).sum() 
# df_ind_yer #= df_ind_yr.reset_index()

Unnamed: 0_level_0,Unnamed: 1_level_0,total_laid_off,percentage_laid_off,funds_raised,total_employees
industry,year,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Aerospace,2020,561.0,1.800000,3101.0,790.588235
Aerospace,2022,125.6,1.160000,301.0,260.000000
Construction,2020,896.0,0.460000,3196.0,15773.544974
Construction,2021,2434.0,1.000000,1600.0,2434.000000
Construction,2022,503.0,1.300000,528.0,2035.038760
...,...,...,...,...,...
Transportation,2020,14656.0,7.535180,141716.2,93019.517905
Transportation,2021,200.0,0.400000,214.2,500.000000
Transportation,2022,15211.0,7.683934,86974.0,155422.379374
Travel,2020,13983.0,7.420000,18047.2,55114.657900


In [97]:
# look at the number companies in each industry that had layoffs and seperate by year instead
# of total layoffs by industry

df_layoffs_co_yr = df.groupby(['industry', 'year'])[['company']].nunique().sort_values('company',ascending=False)
df_layoffs_co_yr = df_layoffs_co_yr.reset_index()
fig = px.bar(df_layoffs_co_yr.sort_values(by=['company'], ascending=False), x="industry", y="company",
             color='year',
             barmode='group',
             height=700,
            template = 'ggplot2',
            category_orders = {'year': ['2020', '2021', '2022', '2023']},
             title="Layoffs by Year and Industry",
            labels={'company': 'Amount of Companies', 'industry':'Industry'})
fig.update_layout(title_x=0.5)
fig.show()

In [302]:
# Show the top 20 companies with the most layoffs by year of layoff


df_co_yr = df.groupby(['company', 'year'])[['total_laid_off']].sum().sort_values('total_laid_off',ascending=False).head(30)
df_co_yr = df_co_yr.reset_index()
fig = px.bar(df_co_yr.sort_values(by=['total_laid_off'], ascending=True), x="company", y="total_laid_off",
             color='year',
             height=700,
            template = 'ggplot2',
             category_orders = {'year': ['2020', '2021', '2022', '2023']},
             title="Layoffs of top 20 Companies by Year",
             labels={'company': 'Company', 'total_laid_off':'Total Layoffs'})
fig.update_layout(title_x=0.5, xaxis={'categoryorder': 'total descending'})
fig.show()

In [99]:
df.groupby(['company', 'industry']).sum().sort_values('total_laid_off',ascending=False).head(20)

Unnamed: 0_level_0,Unnamed: 1_level_0,total_laid_off,percentage_laid_off,funds_raised,total_employees
company,industry,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Amazon,Retail,18000.0,0.05,216.0,733333.333333
Meta,Consumer,11000.0,0.13,26000.0,84615.384615
Salesforce,Sales,10090.0,0.129654,260.4,312072.072072
Uber,Transportation,7585.0,0.754527,123500.0,66267.737697
Booking.com,Travel,4601.0,0.262914,0.0,35000.0
Cisco,Infrastructure,4100.0,0.05,2.0,82000.0
Peloton,Fitness,4084.0,0.45,5700.0,24197.435897
Carvana,Transportation,4000.0,0.2,3200.0,39583.333333
Better.com,Real Estate,3900.0,0.42,1810.0,19090.909091
Bytedance,Consumer,3750.0,0.0,26200.0,0.0


In [304]:
# Show the top 20 companies with the most layoffs and their respective industry


df_co_ind = df.groupby(['company', 'industry']).sum().sort_values('total_laid_off',ascending=False).head(20)
df_co_ind = df_co_ind.reset_index()
fig = px.bar(df_co_ind.sort_values(by=['total_laid_off'], ascending=True), x="company", y="total_laid_off",
             color='industry',
             height=700,
            template = 'seaborn',
             # category_orders = {'year': ['2020', '2021', '2022', '2023']},
             title="Companies with the Most Layoffs",
             labels={'company': 'Company', 'total_laid_off':'Total Layoffs', 'industry':'Industry'})
# fig.update_layout(barmode='stack', xaxis={'categoryorder': 'total ascending'})
fig.update_layout(title_x=0.5, xaxis={'categoryorder': 'total descending'})
fig.show()

In [102]:
# Show the top 20 companies with the most layoffs by year of layoff


df_co_yr = df.groupby(['company', 'year'])[['total_laid_off']].sum().sort_values('total_laid_off',ascending=False).head(25)
df_co_yr = df_co_yr.reset_index()
fig = px.bar(df_co_yr.sort_values(by=['total_laid_off'], ascending=True), x="company", y="total_laid_off",
             color='year',
             height=700,
            template = 'ggplot2',
             category_orders = {'year': ['2020', '2021', '2022', '2023']},
             title="Layoffs of top 25 Companies",
             labels={'company': 'Company', 'total_laid_off':'Total Layoffs'})
fig.update_layout(title_x=0.5, xaxis={'categoryorder': 'total descending'})
fig.show()

In [305]:
# Pie chart showing percentage of layoffs by industry although probaly wont use
df_ind_yr = df_total.groupby(['industry', 'year']).sum() 
df_ind_yr = df_ind_yr.reset_index()
fig = px.pie(df_ind_yr[df_ind_yr['year'].str.contains('2020')], values='total_laid_off', names='industry', title='Amount of Layoffs by Industry')
fig.show()

In [306]:
# Pie chart showing percentage of layoffs by industry in 2022
df_ind_yr = df_total.groupby(['industry', 'year']).sum() 
df_ind_yr = df_ind_yr.reset_index()
fig = px.pie(df_ind_yr[df_ind_yr['year'].str.contains('2022')], values='total_laid_off', names='industry', title='Amount of Layoffs by Industry')
fig.show()

In [307]:
# Pie chart showing percentage of layoffs by industry in 2020

df_ind = df_total.groupby(['industry']).sum() 
df_ind = df_ind_yr.reset_index()
fig = px.pie(df_ind_yr, values='total_laid_off', names='industry', title='Amount of Layoffs by Industry')
fig.show()

In [308]:
# Bubble Scatterplot showing layoff timeline with the larger bubble equal to more layoffs 
# and the y axis showing the percent of the company that was laid off

fig = px.scatter(df2, x="date",  y="percentage_laid_off",
                 size="total_laid_off",
                 template = 'ggplot2',
                 labels={
                     "date": "",
                     "total_laid_off": "Amount of Layoffs"
                 },
                 size_max=30,
                title="Layoffs from 2020 until 2023")
fig.update_xaxes(
    dtick="M11",)

fig.show()

In [309]:
# import plotly.graph_objects as go

# df_3_ind = df_total[df_total['industry'].str.contains('|'.join(['Healthcare', 'Crypto', 'Education']))]

# fig2 = go.Figure()
# sizeref = 2.*max(df_3_ind['total_laid_off'])/(150**2)
# fig2.add_trace(go.Scatter(
#                           x = df_3_ind['date'], y = df_3_ind['industry'],
#                           mode = 'markers',
# #                           name = 'Size = vitamins * Color = sodium',
#                           marker = dict(color = df_3_ind['industry'],
#                          # colorscale = 'portland',
#                           opacity = 0.8,
#                           size = df_3_ind['total_laid_off'],
#                           sizemode = 'area', sizeref= sizeref,
#                           sizemin= 4, showscale = True
#                           )))
# # fig2.update_layout(title = "Layoff Timeline ",
# #                    title_font_size = 40, template = 'seaborn',
# #                    width = 1600, height = 1400)
# # fig2.update_layout(legend=dict(
# # #                                yanchor="top", y=0.99,
# # #                                xanchor="left",x=0.01),
# # #                                legend_font_size= 20,
# #                                showlegend = False)
# # fig2.update_xaxes(title_text = 'Timeline',
# #                   title_font=dict(size=30, family='Verdana', 
# #                                   color='purple'),
# #                   tickfont=dict(family='Calibri', color='black', 
# #                                  size=25))
# # fig2.update_yaxes(title_text = "Industry", 
# # #                   range = (0,80),
# #                   title_font=dict(size=30, family='Verdana', 
# #                                   color='orange'),
# #                   tickfont=dict(family='Calibri', color='black', 
# #                                 size=25))
# # fig2.write_image(path + "figbubble2.png")
# fig2.show()

ValueError: 
    Invalid element(s) received for the 'color' property of scatter.marker
        Invalid elements include: ['Education', 'Education', 'Healthcare', 'Healthcare', 'Education', 'Education', 'Healthcare', 'Healthcare', 'Education', 'Crypto']

    The 'color' property is a color and may be specified as:
      - A hex string (e.g. '#ff0000')
      - An rgb/rgba string (e.g. 'rgb(255,0,0)')
      - An hsl/hsla string (e.g. 'hsl(0,100%,50%)')
      - An hsv/hsva string (e.g. 'hsv(0,100%,100%)')
      - A named CSS color:
            aliceblue, antiquewhite, aqua, aquamarine, azure,
            beige, bisque, black, blanchedalmond, blue,
            blueviolet, brown, burlywood, cadetblue,
            chartreuse, chocolate, coral, cornflowerblue,
            cornsilk, crimson, cyan, darkblue, darkcyan,
            darkgoldenrod, darkgray, darkgrey, darkgreen,
            darkkhaki, darkmagenta, darkolivegreen, darkorange,
            darkorchid, darkred, darksalmon, darkseagreen,
            darkslateblue, darkslategray, darkslategrey,
            darkturquoise, darkviolet, deeppink, deepskyblue,
            dimgray, dimgrey, dodgerblue, firebrick,
            floralwhite, forestgreen, fuchsia, gainsboro,
            ghostwhite, gold, goldenrod, gray, grey, green,
            greenyellow, honeydew, hotpink, indianred, indigo,
            ivory, khaki, lavender, lavenderblush, lawngreen,
            lemonchiffon, lightblue, lightcoral, lightcyan,
            lightgoldenrodyellow, lightgray, lightgrey,
            lightgreen, lightpink, lightsalmon, lightseagreen,
            lightskyblue, lightslategray, lightslategrey,
            lightsteelblue, lightyellow, lime, limegreen,
            linen, magenta, maroon, mediumaquamarine,
            mediumblue, mediumorchid, mediumpurple,
            mediumseagreen, mediumslateblue, mediumspringgreen,
            mediumturquoise, mediumvioletred, midnightblue,
            mintcream, mistyrose, moccasin, navajowhite, navy,
            oldlace, olive, olivedrab, orange, orangered,
            orchid, palegoldenrod, palegreen, paleturquoise,
            palevioletred, papayawhip, peachpuff, peru, pink,
            plum, powderblue, purple, red, rosybrown,
            royalblue, rebeccapurple, saddlebrown, salmon,
            sandybrown, seagreen, seashell, sienna, silver,
            skyblue, slateblue, slategray, slategrey, snow,
            springgreen, steelblue, tan, teal, thistle, tomato,
            turquoise, violet, wheat, white, whitesmoke,
            yellow, yellowgreen
      - A number that will be interpreted as a color
        according to scatter.marker.colorscale
      - A list or array of any of the above

In [311]:
df_3_ind = df_total[df_total['industry'].str.contains('|'.join(['Healthcare', 'Crypto', 'Education']))]

fig = px.scatter(df_3_ind, 
                 x="date",
                 y="industry",
                 size="total_laid_off",
                 color = 'industry',
                 template = 'gridon',
                 labels={
                     "date": "",
                     "industry": ""
                 },
                 size_max=60,
                title="Layoff Timeline")
fig.update_xaxes(dtick="M11",)
fig.update_yaxes(gridcolor='lightsteelblue')
fig.layout.update(showlegend=False)
fig.show()

In [200]:

# Working with data to change size of bubbles.  If I want the difference to be greater between the smallest
# and largest bubbles I should be able to square the data.  And If I want the bubbles to be more similiar in 
# size I can apply the square root to the data 

# copy the data
df_max_scale = df_total.copy()
  
# apply normalization techniques on Column 1 to increase the size difference between smallest and largest bubble
column = 'total_laid_off'
df_max_scale[column] = df_max_scale[column]**1.2

fig = px.scatter(df_max_scale[df_max_scale['industry'].str.contains('|'.join(['Healthcare', 'Crypto', 'Education']))], 
                 x="date",
                 y="industry",
                 size="total_laid_off",
                 color = 'industry',
                 template = 'gridon',
                 labels={
                     "date": "",
                     "industry": ""
                 },
                size_max=60,
                title="Layoff Timeline")
fig.update_xaxes(dtick="M11",)
fig.update_yaxes(gridcolor='lightsteelblue')
fig.layout.update(showlegend=False)
fig.show()

In [199]:
# same as above but with industries that laid off more in 2020 as opposed to 2022

fig = px.scatter(df_max_scale[df_max_scale['industry'].str.contains('|'.join(['Travel', 'Recruiting']))], x="date",  y="industry",
                 size="total_laid_off",
                 color = 'industry',
                 template = 'gridon',
                 labels={
                     "date": "",
                     "industry": ""
                 },
                size_max=60,
                title="Layoff Timeline")
fig.update_xaxes(dtick="M11",)
fig.update_yaxes(gridcolor='lightsteelblue')
fig.layout.update(showlegend=False)
fig.show()

In [312]:
# Looking at a few more industries 

# copy the data
df_max_scale = df_total.copy()
  
# apply normalization techniques on Column 1 to increase bubble size variation
column = 'total_laid_off'
df_max_scale[column] = df_max_scale[column]**1.1

fig = px.scatter(df_max_scale[df_max_scale['industry'].str.contains('|'.join(['Retail', 'Finance', 'Consumer', 'Fitness']))], 
                 x="date",
                 y="industry",
                 size="total_laid_off",
                 color = 'industry',
                 template = 'gridon',
                 labels={
                     "date": "",
                     "industry": ""
                 },
                size_max=60,
                title="Layoff Timeline")
fig.update_xaxes(dtick="M11",)
fig.update_yaxes(gridcolor='lightsteelblue')
fig.layout.update(showlegend=False)
fig.show()

In [117]:
# # copy the data
# df_max_scaled = df_total.copy()
  
# # apply normalization techniques on Column 1
# column = 'total_laid_off'
# df_max_scaled[column] = df_max_scaled[column]**2

In [315]:
# df_perc_stage = df_perc.loc[df_perc['stage'].isin(['Seed','IPO', 'Series A', 'Series B', 'Series C'])]
# fig = px.box(df_perc_stage, x="stage", y="percentage_laid_off",
#             title = 'Percent of Layoffs per Company based on Funding Stage',
#                           color='stage',
#             template = 'seaborn',
#             labels={'percentage_laid_off': 'Percent of Company Laid Off', 'stage':'Stage'},
#             category_orders = {'stage': ['Seed', 'Series A', 'Series B', 'Series C', 'IPO']})
# fig.layout.yaxis.tickformat = ',.0%'
# fig.layout.update(showlegend=False)
# fig.show()


In [316]:
# Bubble Scatterplot of the Food industry showing layoff timeline with the larger bubble equal to more layoffs 
# and the y axis showing the percent of the company that was laid off



fig = px.scatter(df2[df2['industry'].str.contains('Food')], x="date",  y="percentage_laid_off",
                 size="total_laid_off",
                 template = 'seaborn',
                 labels={
                     "date": "",
                     "percentage_laid_off": "Percent of Company Laid Off"
                 },
                                 size_max=50,
                title="Layoffs in the Food Industry from 2020 until 2023")
fig.update_xaxes(dtick="M11",)
fig.layout.yaxis.tickformat = ',.0%'
fig.layout.update(showlegend=False)
fig.show()

In [317]:
# Bubble Scatterplot of the Travel industry showing layoff timeline with the larger bubble equal to more layoffs 
# and the y axis showing the percent of the company that was laid off

fig = px.scatter(df2[df2['industry'].str.contains('Travel')], x="date",  y="percentage_laid_off",
                 size="total_laid_off",
                 template = 'seaborn',
                 labels={
                     "date": "",
                     "percentage_laid_off": "Percent of Company Laid Off"
                 },
                                 size_max=50,
                title="Layoffs in the Travel Industry from 2020 until 2023")
fig.update_xaxes(dtick="M11",)
fig.layout.yaxis.tickformat = ',.0%'
fig.layout.update(showlegend=False)
fig.show()

In [318]:
# Bubble Scatterplot of the Crypto industry showing layoff timeline with the larger bubble equal to more layoffs 
# and the y axis showing the percent of the company that was laid off

fig = px.scatter(df2[df2['industry'].str.contains('Crypto')], x="date",  y="percentage_laid_off",
                 size="total_laid_off",
                 template = 'seaborn',
                 labels={
                     "date": "",
                     "percentage_laid_off": "Percent of Company Laid Off"
                 },
                                 size_max=50,
                title="Layoffs in the Crypto Industry from 2020 until 2023")
fig.update_xaxes(dtick="M11",)
fig.layout.yaxis.tickformat = ',.0%'
fig.layout.update(showlegend=False)
fig.show()

In [210]:
# Bubble Scatterplot of the Education industry showing layoff timeline with the larger bubble equal to more layoffs 
# and the y axis showing the percent of the company that was laid off

fig = px.scatter(df2[df2['industry'].str.contains('Education')], x="date",  y="percentage_laid_off",
                 size="total_laid_off",
                 template = 'seaborn',
                 labels={
                     "date": "",
                     "percentage_laid_off": "Percent of Company Laid Off"
                 },
                                 size_max=50,
                title="Layoffs in the Education Industry from 2020 until 2023")
fig.update_xaxes(dtick="M11",)
fig.layout.yaxis.tickformat = ',.0%'
fig.layout.update(showlegend=False)
fig.show()

In [211]:
# Bubble Scatterplot of the Healthcare industry showing layoff timeline with the larger bubble equal to more layoffs 
# and the y axis showing the percent of the company that was laid off

fig = px.scatter(df2[df2['industry'].str.contains('Healthcare')], x="date",  y="percentage_laid_off",
                 size="total_laid_off",
                 template = 'seaborn',
                 labels={
                     "date": "",
                     "percentage_laid_off": "Percent of Company Laid Off"
                 },
                                 size_max=50,
                title="Layoffs in the Healthcare Industry from 2020 until 2023")
fig.update_xaxes(dtick="M11",)
fig.layout.yaxis.tickformat = ',.0%'
fig.layout.update(showlegend=False)
fig.show()

In [233]:
# Bubble Scatterplot of the Recruiting industry showing layoff timeline with the larger bubble equal to more layoffs 
# and the y axis showing the percent of the company that was laid off

fig = px.scatter(df2[df2['industry'].str.contains('Recruiting')], x="date",  y="percentage_laid_off",
                 size="total_laid_off",
                 template = 'seaborn',
                 labels={
                     "date": "",
                     "percentage_laid_off": "Percent of Company Laid Off"
                 },
                                 size_max=50,
                title="Layoffs in the Recruiting from 2020 until 2023")
fig.update_xaxes(dtick="M11")
fig.layout.yaxis.tickformat = ',.0%'
fig.update_yaxes(range=(-.09,1))
fig.layout.update(showlegend=False,)
fig.show()

In [214]:
# Bubble Scatterplot of the Real Estate industry showing layoff timeline with the larger bubble equal to more layoffs 
# and the y axis showing the percent of the company that was laid off

fig = px.scatter(df2[df2['industry'].str.contains('Real Estate')], x="date",  y="percentage_laid_off",
                 size="total_laid_off",
                 template = 'seaborn',
                 labels={
                     "date": "",
                     "percentage_laid_off": "Percent of Company Laid Off"
                 },
                                 size_max=50,
                title="Layoffs in the Real Estate Industry from 2020 until 2023")
fig.update_xaxes(dtick="M11",)
fig.layout.yaxis.tickformat = ',.0%'
fig.layout.update(showlegend=False)
fig.show()

In [225]:
# Bubble Scatterplot of the Finance industry showing layoff timeline with the larger bubble equal to more layoffs 
# and the y axis showing the percent of the company that was laid off

fig = px.scatter(df2[df2['industry'].str.contains('Finance')], x="date",  y="percentage_laid_off",
                 size="total_laid_off",
                 template = 'seaborn',
                 labels={
                     "date": "",
                     "percentage_laid_off": "Percent of Company Laid Off"
                 },
                                 size_max=50,
                title="Layoffs in the Finance Industry from 2020 until 2023")
fig.update_xaxes(dtick="M11",)
fig.layout.yaxis.tickformat = ',.0%'
fig.update_yaxes(range=(-.09,1))
fig.layout.update(showlegend=False)
fig.show()

In [217]:
# Bubble Scatterplot of the Transportation industry showing layoff timeline with the larger bubble equal to more layoffs 
# and the y axis showing the percent of the company that was laid off

fig = px.scatter(df2[df2['industry'].str.contains('Transportation')], x="date",  y="percentage_laid_off",
                 size="total_laid_off",
                 template = 'seaborn',
                 labels={
                     "date": "",
                     "percentage_laid_off": "Percent of Company Laid Off"
                 },
                                 size_max=50,
                title="Layoffs in the Transportation Industry from 2020 until 2023")
fig.update_xaxes(dtick="M11",)
fig.layout.yaxis.tickformat = ',.0%'
fig.layout.update(showlegend=False)
fig.show()

In [232]:
# Bubble Scatterplot of the Retail industry showing layoff timeline with the larger bubble equal to more layoffs 
# and the y axis showing the percent of the company that was laid off

fig = px.scatter(df2[df2['industry'].str.contains('Retail')], x="date",  y="percentage_laid_off",
                 size="total_laid_off",
                 template = 'seaborn',
                 labels={
                     "date": "",
                     "percentage_laid_off": "Percent of Company Laid Off"
                 },
                                 size_max=50,
                title="Layoffs in the Retail Industry from 2020 until 2023")
fig.update_xaxes(dtick="M11",)
fig.layout.yaxis.tickformat = ',.0%'
fig.layout.update(showlegend=False)
fig.show()

In [235]:
# Bubble Scatterplot of the Consumer industry showing layoff timeline with the larger bubble equal to more layoffs 
# and the y axis showing the percent of the company that was laid off

fig = px.scatter(df2[df2['industry'].str.contains('Consumer')], x="date",  y="percentage_laid_off",
                 size="total_laid_off",
                 template = 'seaborn',
                 labels={
                     "date": "",
                     "percentage_laid_off": "Percent of Company Laid Off"
                 },
                                 size_max=50,
                title="Layoffs in the Consumer Industry from 2020 until 2023")
fig.update_xaxes(dtick="M11",)
fig.layout.yaxis.tickformat = ',.0%'
fig.layout.update(showlegend=False)
fig.show()

In [None]:
# delete all Non US rows
df_US = df.copy()
NonUS = df_US[df_US['country'] != 'United States'].index
df_US.drop(NonUS , inplace=True)
df_US

In [143]:
pip install geopy

Collecting geopy
  Downloading geopy-2.3.0-py3-none-any.whl (119 kB)
[K     |████████████████████████████████| 119 kB 5.4 MB/s eta 0:00:01
[?25hCollecting geographiclib<3,>=1.52
  Downloading geographiclib-2.0-py3-none-any.whl (40 kB)
[K     |████████████████████████████████| 40 kB 13.4 MB/s eta 0:00:01
[?25hInstalling collected packages: geographiclib, geopy
Successfully installed geographiclib-2.0 geopy-2.3.0
Note: you may need to restart the kernel to use updated packages.


In [174]:
# This takes too much(more than 5 minutes) time.  Need to look into it. 

from geopy.geocoders import Nominatim

geolocator = Nominatim(user_agent="Your_Name")

df_US_cord = (
        df_US.assign(Geocodes= (df_US['location']).apply(geolocator.geocode))
          .assign(Latitude = lambda x: [g.latitude for g in x['Geocodes']],
                  Longitude= lambda x: [g.longitude for g in x['Geocodes']])
          .drop(columns='Geocodes')
      )


In [176]:
# sum total layoffs by city

df_count_US = df_US_cord.groupby(['location', 'Longitude', 'Latitude'])[['company']].nunique().sort_values('company',ascending=False)
df_count_US = df_count_US.reset_index()

In [321]:

df_count_loc = df_US.groupby(['location'])[['company']].nunique().sort_values('company',ascending=False)
df_count_loc = df_count_loc.reset_index()


df_count_US['text'] = df_count_US['location'] + '<br>' +df_count_US['company'].astype(str)  #+ (df_cities['pop']/1e6).astype(str)+' million'
limits = [(0,3),(4,10),(11,20),(21,50),(51,400)]
colors = ["royalblue","brown","lightseagreen","orange","palevioletred"]
cities = []
scale = .2

fig = go.Figure()

for i in range(len(limits)):
    lim = limits[i]
    df_sub = df_count_US[df_count_US.company.between(lim[0], lim[1])]
#     df_sub = df_count_US[lim[0]:lim[1]]
    fig.add_trace(go.Scattergeo(
        locationmode = 'USA-states',
        lon = df_sub['Longitude'],
        lat = df_sub['Latitude'],
        text = df_sub['text'],
        marker = dict(
            size = df_sub['company']/scale,
            color = colors[i],
            line_color='rgb(40,40,40)',
            line_width=0.5,
            sizemode = 'area'
        ),
        name = '{0} - {1}'.format(lim[0],lim[1])))

fig.update_layout(
        title_text = 'Companies Announcing Layoffs (2020-2023)',
        title_x=0.5,
        showlegend = True,
        legend_title_text = '<b>Amount of Companies</b>',
        geo = dict(
            scope = 'usa',
            landcolor = 'rgb(217, 217, 217)',
        )
    )
# fig.update_layout(title_x=0.5)
fig.show()

In [324]:
# Lets look at the 2021 layoffs by industry



fig = px.bar(df_total[(df_total['year'] == '2021')].sort_values(by=['total_laid_off'],
             ascending=False), 
             x="industry",
             y="total_laid_off",
             color='year', barmode='group',
             height=700,
            category_orders = {'year': ['2020', '2022']},
             template = 'ggplot2',
             title="Layoffs by Year and Industry",
             labels={'industry': 'Industry', 'total_laid_off':'Total Layoffs'})
fig.update_layout(title_x=0.5, xaxis={'categoryorder': 'total descending'})
fig.show()

In [326]:
# Lets look at the 2020 layoffs by industry

fig = px.bar(df_total[(df_total['year'] == '2020')].sort_values(by=['total_laid_off'],
             ascending=False), 
             x="industry",
             y="total_laid_off",
             color='year', barmode='group',
             height=700,
            category_orders = {'year': ['2020', '2022']},
             template = 'ggplot2',
             title="Layoffs by Year and Industry",
             labels={'industry': 'Industry', 'total_laid_off':'Total Layoffs'})
fig.update_layout(title_x=0.5, xaxis={'categoryorder': 'total descending'})
fig.show()

In [329]:
# Lets look at the 2021 layoffs by industry



fig = px.bar(df_total[(df_total['year'] == '2021')].sort_values(by=['total_laid_off'],
             ascending=False), 
             x="industry",
             y="total_laid_off",
             color='year', barmode='group',
             height=700,
            category_orders = {'year': ['2020', '2022']},
             template = 'ggplot2',
             title="Layoffs by Year and Industry",
             labels={'industry': 'Industry', 'total_laid_off':'Total Layoffs'})
fig.update_layout(title_x=0.5, xaxis={'categoryorder': 'total descending'})
fig.show()

In [327]:

fig = px.bar(df_total[(df_total['year'] == '2022')].sort_values(by=['total_laid_off'],
             ascending=False), 
             x="industry",
             y="total_laid_off",
             color='year', barmode='group',
             height=700,
            category_orders = {'year': ['2020', '2022']},
             template = 'ggplot2',
             title="Layoffs by Year and Industry",
             labels={'industry': 'Industry', 'total_laid_off':'Total Layoffs'})
fig.update_layout(title_x=0.5, xaxis={'categoryorder': 'total descending'})
fig.show()

In [328]:
# Lets look at the 2023 layoffs by industry


fig = px.bar(df_total[(df_total['year'] == '2023')].sort_values(by=['total_laid_off'],
             ascending=False), 
             x="industry",
             y="total_laid_off",
             color='year', barmode='group',
             height=700,
            category_orders = {'year': ['2020', '2022']},
             template = 'ggplot2',
             title="Layoffs by Year and Industry",
             labels={'industry': 'Industry', 'total_laid_off':'Total Layoffs'})
fig.update_layout(title_x=0.5, xaxis={'categoryorder': 'total descending'})
fig.show()

In [331]:
# look at the number companies in each industry that had layoffs
df_layoffs_stage = df.groupby('stage')[['company']].nunique().sort_values('company',ascending=False)
df_layoffs_stage = df_layoffs_stage.reset_index()
fig = px.bar(df_layoffs_stage,x='stage', y='company',
            title='Number of Companies with Layoffs by Stage',
            template = 'seaborn',
            labels={'company': 'Amount of Companies', 'industry':'Industry'})
fig.update_traces( textposition='outside')