In [None]:
import textwrap
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.patches as patches
import statsmodels.api as sm
import statsmodels.formula.api as smf
import numpy
from matplotlib import pyplot, transforms
from matplotlib.text import OffsetFrom

In [None]:
# Load the raw survey responses. skiprows=[0] discards the first physical line of
# the CSV, so the second line of the file is the one parsed as the header row.
df = pd.read_csv('./data/kaggle_survey_2022_responses.csv', skiprows=[0])

In [None]:
col_mapping = pd.read_csv('./data/column_mapping.csv')

In [None]:
# Build the {first column: second column} lookup from the rows of the mapping
# table that have no missing value. Each row must hold exactly two values.
col_mapping_dict = dict(col_mapping.dropna().values)

In [None]:
# Keep only the columns listed in the mapping, then rename them to the mapped names.
df = df.loc[:, list(col_mapping_dict)].rename(columns=col_mapping_dict)

In [None]:
df

In [None]:
# Keep only the rows whose `student` answer is exactly "No".
df = df[df['student'] == "No"]

In [None]:
# After the filter above the `student` column no longer carries information,
# so remove it and keep the focus on industry insights from professionals.
df = df.drop(columns='student')
df

In [None]:
df['title'].unique()

In [None]:
# Keep respondents who gave a title other than "Currently not employed".
df = df[df['title'].notna() & df['title'].ne("Currently not employed")]


In [None]:
df['title'].unique()

In [None]:
df

In [None]:
# Keep only the rows where an industry was reported.
df = df[df['industry'].notna()]
df

In [None]:
# Number of respondents per country, turned from a Series into a table.
num_country = df['country'].value_counts().reset_index()
num_country


## 1. Gender & Title distribution in different industries

In [None]:
# Count respondents per job title.
# The two columns are named explicitly -- 'index' holds the job title and
# 'title' holds the respondent count -- because the default names produced by
# value_counts().reset_index() differ between pandas 1.x ('index', 'title')
# and pandas 2.x ('title', 'count'). The following cells read
# position['index'] and position['title'], so relying on the defaults breaks
# on newer pandas.
position = (
    df['title']
    .value_counts()
    .rename_axis('index')
    .reset_index(name='title')
)
position

In [None]:
# Titles whose count is not above 100 are relabelled 'Other'; the rest keep
# their own name as the group label.
position['Group'] = position['index'].where(position['title'].gt(100), other='Other')
position

In [None]:
# Total count per group, largest group first.
group_position = (
    position
    .groupby('Group', as_index=False)['title']
    .sum()
    .sort_values(by='title', ascending=False)
)
group_position



In [None]:
def plot_positions(group_position, ax):
    """Draw a donut chart of counts per title group on ``ax``.

    Parameters
    ----------
    group_position : DataFrame-like
        Must provide a ``'Group'`` column (used as legend labels) and a
        ``'title'`` column (used as wedge sizes).
    ax : matplotlib Axes
        Axes to draw into. Its axis decorations are switched off.

    Returns
    -------
    None
        The chart is drawn on ``ax`` as a side effect.
    """
    labels = group_position['Group']
    sizes = group_position['title']
    # Nine colours are listed; pie() cycles through them if there are more wedges.
    colors = ['#A6ABAD','#00587A', '#0073A1', '#00A1E0','#00BCE3','#87CEEB', '#89BCC4', '#9BD3DD', '#A4E0EB']

    # With autopct set, pie() returns (wedges, label texts, percentage texts).
    # Only the wedges are needed afterwards, as explicit legend handles.
    wedges, _, _ = ax.pie(
        sizes, colors=colors,
        wedgeprops={'linewidth': 3.0, 'edgecolor': 'white'},
        textprops={'color': 'white', 'fontweight': 'bold','fontname': "Sans Serif"},
        startangle=90, frame=True,
        autopct="%.2f%%",
        pctdistance=0.85,
    )

    ax.axis('off')

    # A white disc over the centre turns the pie into a donut.
    ax.add_artist(plt.Circle((0, 0), 0.6, color='white', linewidth=0))

    # Adding Title of chart
    ax.set_title('Popular positions from surveyors', fontweight = 'bold', size = 15, fontfamily='serif', ha="center", color="#4d4d4d")

    # Pass the wedges explicitly so each label is tied to its own wedge rather
    # than to whatever artists happen to be on the axes, in creation order.
    ax.legend(wedges, list(labels), loc='upper right', bbox_to_anchor=(1.35, 0.75))


In [None]:
# Single-axes, high-resolution figure holding the donut chart of title groups.
fig, axs = plt.subplots(nrows=1, ncols=1, figsize=(15, 9), dpi=400)
plot_positions(group_position=group_position, ax=axs)
plt.show()

## Gender and Title distribution across and within industries

In [None]:
# Add a constant helper column so that summing it in a pivot table counts rows.
# assign() returns a new frame instead of writing into `df`, which was produced
# by row filtering; writing a column into such a frame in place can trigger
# pandas' SettingWithCopyWarning.
df = df.assign(count=1)

In [None]:
# Distinct titles in order of first appearance; reused below as the title ordering.
position_order = df['title'].drop_duplicates().tolist()
position_order

In [None]:
# Distinct industries in order of first appearance; reused below as the industry ordering.
industry_order = df['industry'].drop_duplicates().tolist()
industry_order

In [None]:
def _title_by_industry_counts(frame, titles, industries):
    """Return respondent counts per (title, industry) pair as a stacked Series.

    ``frame`` must contain the columns 'title', 'industry' and 'count'.
    The result has one integer entry for every combination of ``titles`` and
    ``industries``. Combinations that do not occur in ``frame`` are filled
    with 0 instead of raising KeyError, which matters for the gender subsets
    where a title or an industry may be absent.
    """
    counts = pd.pivot_table(
        frame,
        values='count',
        index=['title'],
        columns=['industry'],
        aggfunc='sum',  # string alias; passing np.sum is deprecated in recent pandas
    )
    return (
        counts
        .reindex(index=titles, columns=industries)
        .fillna(0)
        .astype(int)
        .stack()
    )


# Counts for all respondents, and separately for the 'Man' and 'Woman' subsets.
data_q5q15 = _title_by_industry_counts(df, position_order, industry_order)
data_q5q15_man = _title_by_industry_counts(df[df['gender'] == 'Man'], position_order, industry_order)
data_q5q15_woman = _title_by_industry_counts(df[df['gender'] == 'Woman'], position_order, industry_order)

In [None]:
def drawPieMarker(xs, ys, ratios, sizes, colors, ax):
    """Scatter pie-shaped markers at the points (xs, ys) on ``ax``.

    Each (colour, ratio) pair becomes one wedge covering ``ratio`` of a full
    turn, starting where the previous wedge ended. Every wedge is drawn with
    its own ``ax.scatter`` call, using the wedge outline as a custom marker.

    Parameters
    ----------
    xs, ys : point coordinates, passed to ``ax.scatter`` unchanged.
    ratios : iterable of wedge fractions, paired with ``colors``.
    sizes : marker sizes, scaled per wedge before being passed as ``s``.
    colors : iterable of wedge face colours.
    ax : object providing a ``scatter`` method (a matplotlib Axes).
    """
    wedge_specs = []
    start = 0
    # Phase 1: build the outline of every wedge.
    for wedge_color, share in zip(colors, ratios):
        stop = 2 * np.pi * share + start
        arc = np.linspace(start, stop, 30)

        # Outline = centre, 30 points along the arc, centre again.
        outline = np.zeros((32, 2))
        outline[1:-1, 0] = np.cos(arc)
        outline[1:-1, 1] = np.sin(arc)

        # Scale the requested size by the square of the outline's largest
        # absolute coordinate.
        scaled = np.abs(outline).max() ** 2 * np.array(sizes)

        wedge_specs.append({'marker': outline, 's': scaled, 'facecolor': wedge_color})
        start = stop

    # Phase 2: one scatter call per wedge.
    for spec in wedge_specs:
        ax.scatter(xs, ys, alpha=0.7, **spec)

In [None]:
# Define function for marker
# def count_to_size(val):
#     max_sz = 20
#     min_sz = 3
#     return np.sqrt((val - 1) / (80 - 1)) * (max_sz - min_sz) + min_sz

In [None]:
# Composite figure: a grid of gender-split pie markers (industry on x, title
# on y), with a stacked bar chart per industry above it and a stacked
# horizontal bar chart per title to its right.
fig = plt.figure(figsize=(22, 25), dpi=200)

gs = fig.add_gridspec(5, 5)


# Central panel: one pie marker per (industry, title) cell.
ax_plot = fig.add_subplot(gs[1:4, 0:4])
for q5_idx in position_order[::-1]:
    for q15_idx in industry_order:
        man = data_q5q15_man[q5_idx][q15_idx]
        woman = data_q5q15_woman[q5_idx][q15_idx]
        tot = data_q5q15[q5_idx][q15_idx]
        gendered = man + woman
        if gendered == 0:
            # No Man/Woman respondents in this cell: the ratios would be 0/0
            # (NaN). Plot an invisible zero-size point instead, so the
            # categorical axes still register this cell in the intended order.
            ax_plot.scatter([q15_idx], [q5_idx], s=0)
            continue
        # Marker size follows the total count; wedge split follows Man vs Woman.
        drawPieMarker([q15_idx], [q5_idx], [man / gendered, woman / gendered], [tot * 7], ['#004c70', '#990000'], ax=ax_plot)

ax_plot.grid(linewidth=0.2, zorder=0)

ax_plot.tick_params(axis='x', labelrotation=90)

# Industry
# Top panel shares the x axis with the pie grid. reindex(..., fill_value=0)
# keeps every industry in the plot order even when a gender has no respondents
# in it (plain [] indexing would raise KeyError for a missing industry).
ax_int = fig.add_subplot(gs[0, :4], sharex=ax_plot)
data_q15_woman = df.loc[df['gender'] == 'Woman', 'industry'].value_counts().reindex(industry_order, fill_value=0)
ax_int.bar(data_q15_woman.index, data_q15_woman, width=0.45, alpha=0.7, color='#990000')

data_q15_man = df.loc[df['gender'] == 'Man', 'industry'].value_counts().reindex(industry_order, fill_value=0)
ax_int.bar(data_q15_man.index, data_q15_man, bottom=data_q15_woman , width=0.45, alpha=0.7, color='#004c70')

plt.setp(ax_int.get_xticklabels(), visible=False)


# Title
# Right panel shares the y axis with the pie grid; same missing-category
# handling as for the industries above.
ax_tit = fig.add_subplot(gs[1:4, 4], sharey=ax_plot)

data_q5_woman = df.loc[df['gender'] == 'Woman', 'title'].value_counts().reindex(position_order, fill_value=0)
ax_tit.barh(data_q5_woman.index[::-1], data_q5_woman[::-1], height=0.55, alpha=0.7, color='#990000')

data_q5_man = df.loc[df['gender'] == 'Man', 'title'].value_counts().reindex(position_order, fill_value=0)
ax_tit.barh(data_q5_man.index[::-1], data_q5_man[::-1], left= data_q5_woman[::-1],height=0.55, alpha=0.7, color='#004c70')

plt.setp(ax_tit.get_yticklabels(), visible=False)

# Spines
for s in ['top', 'left', 'right', 'bottom']:
    ax_plot.spines[s].set_visible(False)
    ax_int.spines[s].set_visible(False)
    ax_tit.spines[s].set_visible(False)


fig.text(0.6, 0.9, 'Gender & Title distribution by Industry', fontweight='bold', fontfamily='serif', fontsize=35, ha='right')
fig.text(0.6, 0.88, 'Source: Data Professionals - Kaggle Survey 2022', fontweight='light', style= 'italic', fontfamily='serif', fontsize=15, ha='right')


# Hand-made colour key: two boxed labels on the top panel.
# NOTE(review): xy=(14, 2000) is a hard-coded position in data coordinates --
# confirm it still lands in a sensible spot if the data changes.
an1 = ax_int.annotate("Male", xy=(14, 2000), xycoords="data",
                  va="center", ha="center",
                  fontweight = 'bold', fontfamily='serif', fontsize=18,
                  bbox=dict(boxstyle="round", fc="w"), color='#004c70')

# Position the second label relative to the first one's box.
offset_from = OffsetFrom(an1, (0.5, -0.2))
an2 = ax_int.annotate("Female", xy=(0.1, 0.1), xycoords="data",
                  xytext=(0, -10), textcoords=offset_from,
                  # xytext is offset points from "xy=(0.5, 0), xycoords=an1"
                  va="top", ha="center", color='#990000',
                  fontweight = 'bold', fontfamily='serif',
                  fontsize=18,
                  bbox=dict(boxstyle="round", fc="w"))

# Disabled draft of a marker-size legend, kept for reference (not executed).
# # legend for marker size
# plt.text(5,10, "Count of Professionals", fontname="Sans Serif", fontsize=8, ha="center")
# marker_x = [10.6, 11, 11.4, 11.9]
# marker_count = [1, 100, 200, 500]
# marker_size = count_to_size(np.array(marker_count))
# for i in range(len(marker_x)):
#     plt.plot(marker_x[i], 54, markersize=marker_size[i], marker="o", color="#4d4d4d")
#     plt.text(marker_x[i], 49, str(marker_count[i]), ha="center", fontsize=8, color="#4d4d4d")
# ax_int.add_patch(plt.Rectangle((5,12), 2, 110, fill=False))

plt.show()



## 2. Education levels in different titles