In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Enumerate every file below the Kaggle input directory and print its full path.
input_file_paths = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for file_path in input_file_paths:
    print(file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

### Import Libraries

In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

### Load The Dataset

In [None]:
# Read the diversity CSV from the Kaggle input directory and preview its first five rows.
dataset_path = '/kaggle/input/diversity-in-tech-companies/Diversity in tech companies.csv'
df = pd.read_csv(dataset_path)
df.head()

### Inspect the column names

In [None]:
# Check all columns
# Printing the column Index shows the exact header strings; later cells refer
# to columns by these literal names (e.g. 'Female %', '% White').
print(df.columns)

### Check for missing values

In [None]:
# Count the missing (NaN) entries in each column
missing_per_column = df.isna().sum()
print(missing_per_column)

### Summary statistics

In [None]:
# Summary statistics for numerical columns
# NOTE(review): describe() summarises only numeric columns by default. The
# percentage columns are converted to float in a later cell, so any of them
# still stored as text at this point would be absent from this table - confirm
# against the dtypes output.
print(df.describe())

### Distribution of numerical variables

In [None]:
# Histogram of every numeric column in the frame
sns.set_theme(palette="pink", style="whitegrid")
histogram_axes = df.hist(color='purple', figsize=(8, 8))
plt.tight_layout()
plt.show()

### Record counts by company

In [None]:
# Number of rows recorded for each company
sns.set_palette("pink")
fig, ax = plt.subplots(figsize=(12, 6))
sns.countplot(data=df, x='Company', ax=ax)
ax.set_title('Count of Records by Company')
# Company names are long; vertical tick labels keep them from overlapping.
plt.xticks(rotation=90)
plt.show()

### Pairwise relationships between variables

In [None]:
# Visualize pairwise relationships between the numeric columns.
# `palette` only takes effect in pairplot when a `hue` variable is supplied,
# and "purple" is a single colour rather than a palette name, so it is passed
# as `color` to the off-diagonal and diagonal plots instead.
sns.pairplot(df, plot_kws={'color': 'purple'}, diag_kws={'color': 'purple'})
plt.show()

### Check data types

In [None]:
# Checking data types
# Lists the dtype of every column of df as it stands after loading the CSV.
print(df.dtypes)

### Data cleaning and type conversion

In [None]:
# Cleaning and converting percentage columns
percentage_columns = ['Female %', 'Male %', '% White', '% Asian', '% Latino', '% Black', '% Multi', '% Other', '% Undeclared']

for col in percentage_columns:
    cleaned = (
        df[col]
        # Drop the '<' / '>' qualifiers so that e.g. '<1' becomes '1'.
        .replace(r'[<>]', '', regex=True)
        # Only a value consisting solely of '-' (optionally padded with spaces)
        # is a "no data" placeholder. Anchoring the pattern keeps hyphens inside
        # other values (such as a leading minus sign) from being turned into a digit.
        .replace(r'^\s*-\s*$', '0', regex=True)
    )
    # astype(float) still raises on anything that is not numeric, so unexpected
    # values surface here; missing entries are then treated as 0.
    df[col] = cleaned.astype(float).fillna(0)

df.head(10)

In [None]:
# Group by company and year
# Average every numeric column per (Company, Year) pair. numeric_only=True keeps
# the aggregation from raising TypeError on pandas >= 2.0 if the frame holds any
# text column other than the two grouping keys.
grouped = df.groupby(['Company', 'Year']).mean(numeric_only=True).reset_index()

In [None]:
# Set a color palette
# Makes seaborn's "pink" palette the default colour cycle for the plots that follow.
sns.set_palette("pink")

In [None]:
# One figure per diversity metric, with one line for every company
year_ticks = grouped['Year'].unique()
for metric in percentage_columns:
    plt.figure(figsize=(12, 6))
    # sort=False keeps the companies in their order of appearance in `grouped`.
    for company_name, company_rows in grouped.groupby('Company', sort=False):
        plt.plot(company_rows['Year'], company_rows[metric], marker='o', label=company_name)

    plt.title(f'Trend in {metric} Across Companies Over Years')
    plt.xlabel('Year')
    plt.ylabel(f'{metric}')
    plt.xticks(year_ticks)
    # Anchor the legend outside the axes so it does not cover the lines.
    plt.legend(bbox_to_anchor=(1, 1), loc='upper left')
    plt.grid(True)
    plt.show()

In [None]:
# Female share per company, with one bar per year
fig, ax = plt.subplots(figsize=(12, 6))
sns.barplot(data=df, x='Company', y='Female %', hue='Year', ax=ax)
ax.set_title('Female % Across Companies')
ax.set_xlabel('Company')
ax.set_ylabel('Female %')
plt.xticks(rotation=45)
ax.legend(title='Year')
plt.show()

In [None]:
# Black share per company, with one bar per year
fig, ax = plt.subplots(figsize=(12, 6))
sns.barplot(data=df, x='Company', y='% Black', hue='Year', ax=ax)
ax.set(title='Black % Across Companies', xlabel='Company', ylabel='% Black')
plt.xticks(rotation=45)
ax.legend(title='Year')
plt.show()

### Overall Diversity Distribution
#### Boxplot for Diversity Metrics: Visualized the distribution of all diversity metrics.

In [None]:
# Box plot of each percentage column, drawn side by side
metric_values = df[percentage_columns]
fig, ax = plt.subplots(figsize=(12, 6))
sns.boxplot(data=metric_values, ax=ax)
ax.set_title('Overall Diversity Distribution')
ax.set_ylabel('Percentage')
plt.xticks(rotation=45)
plt.show()

### Correlation Analysis
#### Heatmap of Correlation Matrix: Analyzed the correlations between different diversity metrics.

In [None]:
# Pairwise correlation between the percentage columns, shown as an annotated heatmap
correlation_matrix = df.loc[:, percentage_columns].corr()
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(correlation_matrix, cmap='coolwarm', annot=True, square=True, ax=ax)
ax.set_title('Correlation Matrix of Diversity Metrics')
plt.show()

In [None]:
# Company Comparison for FAANG companies (Facebook, Amazon, Apple, Netflix, and Google)
# The list names all five FAANG members so the selection matches the plot title;
# isin() simply ignores any name that does not occur in the 'Company' column.
selected_companies = ['Facebook', 'Amazon', 'Apple', 'Netflix', 'Google']
selected_df = df[df['Company'].isin(selected_companies)]
plt.figure(figsize=(12, 6))
sns.lineplot(x='Year', y='Female %', hue='Company', data=selected_df)
plt.title('Trend in Female % Among FAANG Companies (2014-2018)')
plt.xlabel('Year')
plt.ylabel('Female %')
plt.xticks(grouped['Year'].unique())  # Set x-axis ticks to be the unique years
plt.legend(title='Company')
plt.grid(True)
plt.show()

In [None]:
# Melt the DataFrame to have a single 'Ethnicity' column
# Only the ethnicity shares are melted: without value_vars, melt() stacks every
# non-id column (including 'Female %' and 'Male %') under 'Ethnicity'.
ethnicity_columns = [c for c in percentage_columns if c not in ('Female %', 'Male %')]
# Restrict the rows to the companies in selected_companies so the data matches
# the FAANG scope stated in the plot title.
faang_df = df[df['Company'].isin(selected_companies)]
melted_df = faang_df.melt(
    id_vars=['Year', 'Company'],
    value_vars=ethnicity_columns,
    var_name='Ethnicity',
    value_name='Percentage',
)
# Plotting
plt.figure(figsize=(12, 6))
sns.lineplot(x='Year', y='Percentage', hue='Ethnicity', data=melted_df, marker='o')
plt.title('Rise of Multiple Ethnicities in FAANG Companies (2014-2018)')
plt.xlabel('Year')
plt.ylabel('Percentage')
plt.xticks(grouped['Year'].unique())  # Set x-axis ticks to be the unique years
plt.grid(True)
plt.show()

### Company Ranking Based on Total Diversity
#### Total Diversity Calculation: Summed all diversity percentages for each company.
#### Ranking Plot: Visualized the ranking of companies based on total diversity.

In [None]:
# 'Total Diversity' is the row-wise sum of every column listed in percentage_columns.
df['Total Diversity'] = df.loc[:, percentage_columns].sum(axis='columns')
# Average that total per company and order the companies from highest to lowest.
company_means = df.groupby('Company')['Total Diversity'].mean()
ranked_df = company_means.sort_values(ascending=False).reset_index()
fig, ax = plt.subplots(figsize=(12, 6))
sns.barplot(data=ranked_df, x='Company', y='Total Diversity', ax=ax)
ax.set_title('Ranking of Companies Based on Total Diversity')
ax.set_xlabel('Company')
ax.set_ylabel('Total Diversity')
plt.xticks(rotation=45)
plt.show()

### FAANG Companies Analysis
#### Female % Trend in FAANG Companies: Compared trends among Amazon, Apple, Netflix, and Google.

In [None]:
# Female share over time for the four selected companies
selected_companies = ['Amazon', 'Apple', 'Netflix', 'Google']
is_selected = df['Company'].isin(selected_companies)
selected_df = df[is_selected]
fig, ax = plt.subplots(figsize=(12, 6))
sns.lineplot(data=selected_df, x='Year', y='Female %', hue='Company', ax=ax)
ax.set_title('Trend in Female % Among FAANG Companies (2014-2018)')
ax.set_xlabel('Year')
ax.set_ylabel('Female %')
# One tick per year present in the grouped data.
plt.xticks(grouped['Year'].unique())
ax.legend(title='Company')
ax.grid(True)
plt.show()

### Ethnicity Analysis
#### Melted DataFrame for Ethnicities: Reshaped the data to analyze multiple ethnicities over the years.
#### Ethnicity Trends in FAANG Companies: Visualized the rise of multiple ethnicities.

In [None]:
# Reshape to long form with one row per (Year, Company, Ethnicity).
# value_vars limits the melt to the ethnicity shares. Without it, every non-id
# column is stacked under 'Ethnicity', including 'Female %', 'Male %' and the
# 'Total Diversity' column added to df earlier, which would distort the plot.
ethnicity_columns = [c for c in percentage_columns if c not in ('Female %', 'Male %')]
# Restrict the rows to the companies in selected_companies so the data matches
# the FAANG scope stated in the plot title.
faang_df = df[df['Company'].isin(selected_companies)]
melted_df = faang_df.melt(
    id_vars=['Year', 'Company'],
    value_vars=ethnicity_columns,
    var_name='Ethnicity',
    value_name='Percentage',
)
plt.figure(figsize=(12, 6))
sns.lineplot(x='Year', y='Percentage', hue='Ethnicity', data=melted_df, marker='o')
plt.title('Rise of Multiple Ethnicities in FAANG Companies (2014-2018)')
plt.xlabel('Year')
plt.ylabel('Percentage')
plt.xticks(grouped['Year'].unique())
plt.grid(True)
plt.show()

### Conclusion

#### This project provides a comprehensive analysis of diversity metrics in tech companies. Through detailed visualizations and analyses, we have highlighted key trends, compared diversity across companies, and explored the overall distribution and correlations among various diversity metrics. This analysis can help stakeholders understand the current state and progression of diversity in the tech industry, guiding future diversity initiatives and policies.