In [1]:
import plotly.graph_objs as go
import chart_studio.plotly as py

# Cufflinks wrapper on plotly
import cufflinks

# Data science imports
import pandas as pd
import numpy as np

# Options for pandas
pd.options.display.max_columns = 30

# Display all cell outputs
from IPython.core.interactiveshell import InteractiveShell

InteractiveShell.ast_node_interactivity = "all"

from plotly.offline import iplot

cufflinks.go_offline()

# Set global theme
cufflinks.set_config_file(world_readable=True, theme="pearl")

In [2]:

df = pd.read_csv("graduates.csv")
previous_rows, previous_columns = df.shape
print(df.shape)
df.head()

(173, 19)


Unnamed: 0,Rank,Major_code,Major,Total,Major_category,ShareWomen,Sample_size,Employed,Full_time,Part_time,Full_time_year_round,Unemployed,Unemployment_rate,Median,P25th,P75th,College_jobs,Non_college_jobs,Low_wage_jobs
0,1,2419,PETROLEUM ENGINEERING,2339.0,Engineering,0.120564,36,1976,1849,270,1207,37,0.018381,110000,95000,125000,1534,364,193
1,2,2416,MINING AND MINERAL ENGINEERING,756.0,Engineering,0.101852,7,640,556,170,388,85,0.117241,75000,55000,90000,350,257,50
2,3,2415,METALLURGICAL ENGINEERING,856.0,Engineering,0.153037,3,648,558,133,340,16,0.024096,73000,50000,105000,456,176,0
3,4,2417,NAVAL ARCHITECTURE AND MARINE ENGINEERING,1258.0,Engineering,0.107313,16,758,1069,150,692,40,0.050125,70000,43000,80000,529,102,0
4,5,2405,CHEMICAL ENGINEERING,32260.0,Engineering,0.341631,289,25694,23170,5180,16697,1672,0.061098,65000,50000,75000,18314,4440,972


In [3]:
df = df.dropna()
rows, columns = df.shape

print(f"{previous_rows - rows} row(s) dropped in dropna() step.")


1 row(s) dropped in dropna() step.


In [5]:
#Lägg till kolumnen ”ShareMen” som anger andelen män som jobbar på varje Major. (stödfunktion för apply)
def share_men(share):

    return 1 - share

#Lägg till antal män och kvinnor  som jobbar i respektive Major och se till att avrunda till integer. (stödfunktion för apply)
def amount_people(total, share, row):
    try:
        return int(total*share)
    except:
        print(f"NaN-values in following row: {list(row)}")
        return 0

In [6]:
#Lägg till kolumnen ”ShareMen” som anger andelen män som jobbar på varje Major.
df["ShareMen"] = df["ShareWomen"].apply(share_men)
df["Amount Men"] = df.apply(lambda row: amount_people(row['Total'], row['ShareMen'], row), axis=1)
df["Amount Women"] = df.apply(lambda row: amount_people(row['Total'], row['ShareWomen'], row), axis=1)

df.head()


Unnamed: 0,Rank,Major_code,Major,Total,Major_category,ShareWomen,Sample_size,Employed,Full_time,Part_time,Full_time_year_round,Unemployed,Unemployment_rate,Median,P25th,P75th,College_jobs,Non_college_jobs,Low_wage_jobs,ShareMen,Amount Men,Amount Women
0,1,2419,PETROLEUM ENGINEERING,2339.0,Engineering,0.120564,36,1976,1849,270,1207,37,0.018381,110000,95000,125000,1534,364,193,0.879436,2056,282
1,2,2416,MINING AND MINERAL ENGINEERING,756.0,Engineering,0.101852,7,640,556,170,388,85,0.117241,75000,55000,90000,350,257,50,0.898148,678,77
2,3,2415,METALLURGICAL ENGINEERING,856.0,Engineering,0.153037,3,648,558,133,340,16,0.024096,73000,50000,105000,456,176,0,0.846963,725,130
3,4,2417,NAVAL ARCHITECTURE AND MARINE ENGINEERING,1258.0,Engineering,0.107313,16,758,1069,150,692,40,0.050125,70000,43000,80000,529,102,0,0.892687,1122,135
4,5,2405,CHEMICAL ENGINEERING,32260.0,Engineering,0.341631,289,25694,23170,5180,16697,1672,0.061098,65000,50000,75000,18314,4440,972,0.658369,21239,11020


In [7]:
df.groupby("Major_category").sum()["Amount Men"].iplot(
    kind="bar",
    yTitle="Amount Men",
    linecolor="black",
    title="Amount Men",
)

In [14]:
#Plotta i histogram – först de 6 största, sedan sorterat på average av ShareWomen.

df.groupby("Major_category").mean()[["ShareWomen", 'ShareMen']].iplot(
    kind="bar",
    barmode = 'overlay',
    yTitle="Proportion Women",
    linecolor="black",
    title="Proportion of Women per Category",
)

In [10]:
#Plotta i histogram – först de 6 största, sedan sorterat på average av ShareWomen.
df.groupby("Major_category").mean()["ShareWomen"].sort_values(ascending = True).iplot(
    kind="bar",
    yTitle="Proportion Women",
    linecolor="black",
    title="Proportion of Women per Category",
)

In [11]:
df.groupby("Major_category").mean()["ShareWomen"].nlargest(4).iplot(
    kind="bar",
    yTitle="Number of Articles",
    linecolor="black",
    title="Articles by Publication",
)

In [17]:
tds = df[df["Major_category"] == "Engineering"]
#Full_time_year_round,Unemployed,Unemployment_rate,Median,P25th,P75th,College_jobs,Non_college_jobs,Low_wage_jobs
tds.iplot(
    x="Unemployment_rate",
    y="Full_time_year_round",
    xTitle="Unemployment rate",
    yTitle="Amount of full time year round employees",
    text="Major",
    mode="markers",
    title="Job Prospects Engineering",
)

tds = df[df["Major_category"] == "Business"]
#Full_time_year_round,Unemployed,Unemployment_rate,Median,P25th,P75th,College_jobs,Non_college_jobs,Low_wage_jobs
tds.iplot(
    x="Unemployment_rate",
    y="Full_time_year_round",
    xTitle="Unemployment rate",
    yTitle="Amount of full time year round employees",
    text="Major",
    mode="markers",
    title="Job Prospects Business",
)

In [18]:
df.iplot(
    x="Unemployment_rate",
    y="Full_time_year_round",
    categories="Major_category",
    xTitle="Unemployment rate",
    yTitle="Amount of full time year round employees",
    text="Major",
    mode="markers",
    title="Job Prospects",
)


The pandas.np module is deprecated and will be removed from pandas in a future version. Import numpy directly instead.


The pandas.np module is deprecated and will be removed from pandas in a future version. Import numpy directly instead.



In [28]:
df.iplot(
    x="Unemployment_rate",
    y="Full_time_year_round",
    categories="Major_category",
    size = 0.0002*df['Total'],
    #size = 1.5*np.log(df["Total"]),
    xTitle="Unemployment rate",
    yTitle="Amount of full time year round employees",
    text="Major",
    mode="markers",
    title="Job Prospects",
)

In [30]:
import plotly.figure_factory as ff
corrs = df.corr()

figure = ff.create_annotated_heatmap(
    z=corrs.values,
    x=list(corrs.columns),
    y=list(corrs.index),
    colorscale="Earth",
    annotation_text=corrs.round(2).values,
    showscale=True,
    reversescale=True,
)

figure.layout.margin = dict(l=200, t=200)
figure.layout.height = 800
figure.layout.width = 1000

iplot(figure)

In [35]:
def shortener(text):
    return text[:4]
df['Majort_cat_short'] = df['Major_category'].apply(shortener)
# df["Short"] = df.apply(lambda row: shortener(row["Major_category"]),axis = 1)
figure = ff.create_scatterplotmatrix(
    df[["Majort_cat_short", "Median","Unemployment_rate"]],
    height=1000,
    width=1000,
    text=df["Major_category"],
    diag="histogram"
)
iplot(figure)
df.pop("Majort_cat_short")
# print(df.head())

0      Engi
1      Engi
2      Engi
3      Engi
4      Engi
       ... 
168    Biol
169    Psyc
170    Psyc
171    Psyc
172    Educ
Name: Majort_cat_short, Length: 172, dtype: object

In [37]:
#print(df.head())
#Rank,Major_code,Major,Total,Major_category,ShareWomen,Sample_size,Employed,
# #Full_time,Part_time,Full_time_year_round,Unemployed,Unemployment_rate,Median,P25th,P75th,College_jobs,Non_college_jobs,Low_wage_jobs
data = [
    go.Scatter(
        x=grouped["P75th"],#grouped["word_count"],
        y=grouped["P25th"],#["reads"]
        text=grouped["Major"],#"title"]
        mode="markers",
        name=name,
        marker=dict(
            symbol=i,
            sizemin=2,
            sizeref=2,
            size=300*grouped["Unemployment_rate"],#["read_ratio"],
            colorscale="BlueRed",
            showscale=True,
            color=df["Sample_size"],#["read_time"]
            line=dict(color="black", width=1.2),
        ),
    )
    for i, (name, grouped) in enumerate(df.groupby("Major_category"))#groupby("publication")
]

figure = go.Figure(
    data=data,
    layout=go.Layout(
        legend=dict(x=1.2),
        width=1200,
        height = 1200,
        margin=dict(r=20),
        xaxis=dict(type="log", title="75th Wage Percentile"),
        yaxis=dict(title="25th Wage Percentile"),
        title="P25th vs P75th wage percentile Colored by Sample size, Sized by Unemployment Rate, and Shaped by Major Category",
    ),
)

iplot(figure)

[Scatter({
     'marker': {'color': array([  36,    7,    3,   16,  289,   17,   51,   10, 1029,  631,  399,  147,
                                  79,   22,   30,   55,  183,  425,   26,   14, 1196,   97,   22,  278,
                                 565,  295,  156,  118,   55,   26,   39,   30,    5, 2554, 2189, 1322,
                                 199,   73,   31, 2042,  541,  425,  142,  190,  158,   37,   45,    3,
                                  36,   29,   67,    7,   43,   25,    4,  219, 2380,  362,  260,   38,
                                  90,  244,  273,  158,  125,   71,   11,   28,   22,   24,   44,   10,
                                   4,  353,   37, 4212, 2684, 1387,  179,   62,  103,  174,    9,   97,
                                  78,  264,   92,   81,   46,   18,    8,  225, 2394, 1728, 1186,  843,
                                 427,  681,  249,  246,   99,  214,  118,  184,  208,  180,  152,   53,
                                 128,   32,   48,    