In [20]:
# Execute the companion setup notebook inside this kernel before anything else.
# NOTE(review): helpers called later in this notebook (remove_redundancy,
# create_figure_with_dropdown, data_checker) are not defined here; presumably
# they come from setup_notebook.ipynb. Confirm, and re-run this cell first
# after a kernel restart.
%run ./setup_notebook.ipynb

In [2]:
import numpy as np # linear algebra
import pandas as pd 
import plotly.express as px
import plotly.graph_objects as go
# import ipywidgets as widgets
import re
from ipywidgets import interact, interact_manual
# from jupyter_dash import JupyterDash

<div style="font-family:'Trebuchet MS', 'Impact', sans-serif; 
            margin-bottom: 20px; letter-spacing: 6px; text-align:center;
            font-size: 200%;"><b>University of Toronto ECE2T2 Class Profile</b></div>

<div style="font-size: 16px;text-align:center;"><i>Number of Respondents: 52</i></div>

<p style="text-align:start;">Welcome to the University of Toronto ECE2T2 Class Profile, inspired by the Waterloo Software Engineering 2021 Class Profile. 
We are here to give you all the deets, all the information on our graduating class: their passions, compassions, and the future they have built during their time at UofT.
   
Let's find out about who these 52 people of University of Toronto ECE2T2 are, and where they be at!</p>

In [3]:
# Raw survey export; the path is relative to the notebook's working directory.
survey_csv_path = "Electrical_and_Computer_Engineering_2023_-_Class_Profile_Survey_Submissions_2023-03-29.csv"
df = pd.read_csv(survey_csv_path)

In [4]:
# Number of survey submissions (rows), followed by a preview of the first five.
print(df.shape[0])
df.head()

52


Unnamed: 0,Submission ID,Respondent ID,Submitted at,What program were you in first year?,What program are you in now?,What 2 areas did you specialize in?,What 2 areas did you specialize in? (Area 1: Photonics & Semiconductor Physics),What 2 areas did you specialize in? (Area 2: Electromagnetics & Energy Systems),What 2 areas did you specialize in? (Area 3: Digital & Analog Electronics),"What 2 areas did you specialize in? (Area 4: Systems Control, Communications & Signal Processing)",...,Which of the following have you done during university? (Kissed someone romantically),Which of the following have you done during university? (Been in a committed relationship),Which of the following have you done during university? (Been in a long distance relationship),Which of the following have you done during university? (Had 'friends with benefits'),Which of the following have you done during university? (Used a dating app),List the recreational controlled substances you have used during university,Untitled long answer field (1),Untitled long answer field (2),Untitled long answer field (3),How many sexual partners have you had
0,qaRxpG,VpyqOy,2023-02-21 01:01:21,Computer,Computer,"Area 5: Computer Hardware & Computer Networks,...",False,False,False,False,...,False,False,False,False,True,,Interesting to see myself - as a person who kn...,Maybe not doing enough design teams and not sp...,Hope for the best for everyone 🙂,0.0
1,07e206,J1YGy4,2023-02-21 01:10:02,Electrical,Electrical,"Area 6: Computer Software, Area 4: Systems Con...",False,False,False,True,...,,,,,,Cannabis,The pandemic allowed me to realize I did not h...,I wish I had learned to focus on my own person...,Do not lose the person that you are in spite o...,0.0
2,E5xj2L,kbxgVM,2023-02-21 01:18:09,TrackOne,Computer,"Area 5: Computer Hardware & Computer Networks,...",False,False,False,False,...,,,,,,Cannabis,,,Live long and prosper,0.0
3,PDOjb0,9N5avV,2023-02-21 01:53:57,TrackOne,Electrical,"Area 1: Photonics & Semiconductor Physics, Are...",True,False,False,True,...,,,,,,,Celebrating my birthday with my U of T friends:),Not playing intramurals and going to more soci...,Find joy in simple things:),
4,9XQGoK,6D59QJ,2023-02-21 01:55:39,TrackOne,Computer,"Area 6: Computer Software, Area 5: Computer Ha...",False,False,False,False,...,,,,,,,,,,


## Data Cleaning



In [9]:
## Dump every column header, one per line, to a text file for offline analysis
column_listing = "\n".join(df.columns)
with open("all_columns.txt", mode="w", encoding="utf-8") as columns_file:
    columns_file.write(column_listing)

In [10]:
# Give the three untitled long-answer columns their actual question text.
# The rename is guarded so the cell can be re-run after the columns have
# already been renamed (the old names no longer exist at that point).
long_answer_titles = {
    "Untitled long answer field (1)": "1. Share a story, happy or sad, from your time in ECE",
    "Untitled long answer field (2)": "2. What is something you regret over your time at UofT?",
    "Untitled long answer field (3)": "3. Give a piece of advice to your fellow ECE 2T2 - 2023 grads",
}

if set(long_answer_titles).issubset(df.columns):
    df.rename(columns=long_answer_titles, inplace=True)

    # Check column data has indeed been changed.
    # Exact-name membership is used instead of str.contains: the titles contain
    # "." and "?", which str.contains would interpret as regex metacharacters,
    # and a substring search could also match unrelated columns.
    index = df.columns.isin(list(long_answer_titles.values()))

    indices = df.columns[index]
    assert len(indices) == 3

In [11]:
# The export contains a column whose header is an article URL. It carries no
# answers, so it is removed. Guarded so the cell can be re-run after the drop.
article_column = (
    "https://money.usnews.com/money/personal-finance/family-finance/articles/"
    "where-do-i-fall-in-the-american-economic-class-system"
)

if article_column in df.columns:
    # Proof that all data under this column are null
    assert df[article_column].isna().all()

    df.drop(columns=article_column, inplace=True)

## Data Exploration

# Class Profile

## What Program Were You in First Year?

In [12]:
# TO-DO! Make trace subplots with this
# Pie chart of first-year programs; each slice shows its label and share, and
# the hover box shows the raw respondent count.
first_year_col = "What program were you in first year?"
fig = px.pie(df, names=first_year_col, color=first_year_col).update_traces(
    hovertemplate="<b>%{label}</b><br><i>Count</i>: %{value}",
    textinfo="label+percent",
)
fig.show()

## What Program Are You in Now?

In [13]:
# Pie chart of current programs; each slice shows its label and share, and the
# hover box shows the raw respondent count.
current_program_col = "What program are you in now?"
fig = px.pie(df, names=current_program_col, color=current_program_col).update_traces(
    hovertemplate="<b>%{label}</b><br><i>Count</i>: %{value}",
    textinfo="label+percent",
)
fig.show()

## What 2 Areas Did You Specialize In?

In [14]:
# Normalize the order of the two chosen areas so that equivalent answers collapse
# into one value (e.g. "Area 6 + Area 5" is the same choice as "Area 5 + Area 6").
# NOTE(review): remove_redundancy is not defined in this notebook; presumably it
# is provided by the setup notebook. Confirm.
areas_col = "What 2 areas did you specialize in?"
df[areas_col] = remove_redundancy(df[areas_col])

In [15]:
# Build the one-column frame that feeds the specialization pie chart, plus two
# parallel label sequences in the same (sorted) row order:
#   areas_long_form  - full area names joined with " + " (hover text)
#   areas_simplified - just "Area N + Area M" (slice labels)
specialization_col = "What 2 areas did you specialize in?"

# sort_values returns a new frame, so `df` itself is left untouched.
df_specialization = df.loc[:, [specialization_col]].sort_values(specialization_col, ascending=False)

# hover text
# Only a ", " that is immediately followed by the next "Area N:" label separates
# the two choices. A plain split on ", " would also break area names that contain
# a comma themselves, e.g. "Area 4: Systems Control, Communications & Signal Processing".
areas_long_form = (
    df_specialization[specialization_col]
    .str.replace(r", (?=Area \d+:)", " + ", regex=True)
    .tolist()
)

# Simplified areas: keep only the "Area N" part in front of each colon.
areas_simplified = (
    df_specialization[specialization_col]
    .str.findall("(Area.*?):")
    .str.join(" + ")
)

df_specialization[specialization_col] = areas_simplified




In [16]:
# df.groupby("What 2 areas did you specialize in?", as_index=False).count()[["What 2 areas did you specialize in?", "Submission ID"]]
# Pie chart of specialization pairs. The short "Area N + Area M" labels are passed
# as slice text and the full area names as hover text; both sequences are in the
# same row order as df_specialization.
specialization_pie_col = "What 2 areas did you specialize in?"
fig = px.pie(df_specialization, names=specialization_pie_col, color=specialization_pie_col)
fig.update_traces(
    text=areas_simplified,
    hovertext=areas_long_form,
    hovertemplate="%{text}<br><i>Count:</i> %{value}<br> %{percent}",
    textinfo="label+percent",
)
fig.update_layout(legend_traceorder="reversed")
fig.show()

## What Is Your Gender?

In [23]:
# data_checker(df["Are you a domestic or international student?"], check_unique=True)

In [24]:
# Give non-responses an explicit label instead of leaving them as NaN.
# The result is assigned back to the column: calling fillna(inplace=True) on
# df[...] acts on a temporary selection, which pandas flags as chained
# assignment and which leaves `df` unchanged when Copy-on-Write is enabled.
df["Are you a domestic or international student?"] = (
    df["Are you a domestic or international student?"].fillna("Did not wish to disclose")
)

In [25]:
# df["Are you a domestic or international student?"].unique()

In [26]:
# Build and display a figure from the survey frame.
# NOTE(review): create_figure_with_dropdown is not defined in this notebook;
# presumably it comes from the setup notebook and adds a dropdown to switch
# between questions. Confirm what it plots.
fig = create_figure_with_dropdown(df)
fig.show()

In [27]:
# Bar per gender answer, coloured by answer. The x-axis title is blanked because
# the category labels speak for themselves; hover shows "<answer>=<count>".
gender_col = "What is your gender?"
fig = (
    px.histogram(df, x=gender_col, color=gender_col)
    .update_layout(xaxis_title="")
    .update_traces(hovertemplate="%{x}=%{y}")
)
fig.show()

## Are You a Domestic or International Student?

In [None]:
# Bar per domestic/international answer, coloured by answer. The x-axis title is
# blanked because the category labels speak for themselves; hover shows
# "<answer>=<count>".
student_status_col = "Are you a domestic or international student?"
fig = (
    px.histogram(df, x=student_status_col, color=student_status_col)
    .update_layout(xaxis_title="")
    .update_traces(hovertemplate="%{x}=%{y}")
)
fig.show()

## Where Did You Attend High School?

In [None]:
# df["Where did you attend high school?"].unique()

In [None]:
# Tally how often each distinct high-school answer occurs, then move the answers
# out of the index into a regular column.
high_school_tally = df["Where did you attend high school?"].value_counts()
df_high_school = high_school_tally.to_frame().reset_index()
# df_high_school.head(5)

In [None]:
# NOTE(review): this cell plots Plotly's sample US airport-traffic dataset, not
# the survey's high-school answers. It looks like a placeholder for the intended
# map; confirm before publishing. It also needs network access to run.
df_traffic = pd.read_csv('https://raw.githubusercontent.com/plotly/datasets/master/2011_february_us_airport_traffic.csv')

# Hover label: airport name, "city, state", and arrival count. The parts are
# separated with <br> so each appears on its own line instead of running together.
df_traffic['text'] = (
    df_traffic['airport']
    + '<br>' + df_traffic['city'] + ', ' + df_traffic['state']
    + '<br>' + 'Arrivals: ' + df_traffic['cnt'].astype(str)
)

# One marker per airport, coloured by its arrival count.
fig = go.Figure(data=go.Scattergeo(
        lon = df_traffic['long'],
        lat = df_traffic['lat'],
        text = df_traffic['text'],
        mode = 'markers',
        marker_color = df_traffic['cnt'],
        ))

# Render on a globe rather than a flat map.
fig.update_geos(projection_type="orthographic")
fig.update_layout(height=600,
                  title = 'Most trafficked US airports<br>(Hover for airport names)',
                  )
fig.show()

## Top 5 Questions You Did Not Want to Answer

In [None]:
# Missing-answer count for every survey column.
num_nulls = df.isna().sum()

# One row per question: the question text, how many respondents left it blank,
# and that count as a percentage of all submissions (formatted as text with "%").
non_response_table = (
    num_nulls.to_frame(name="Number of Non-Responses")
    .reset_index()
    .rename(columns={"index": "Question"})
)
non_response_share = non_response_table["Number of Non-Responses"] / len(df) * 100
non_response_table["Percentage of Non-Responses"] = non_response_share.round(2).astype(str) + "%"

# Keep only the five questions with the most blanks.
df_top_5_values_with_no_answers = non_response_table.sort_values(
    "Number of Non-Responses", ascending=False
).head(5)

In [None]:
# Save the per-question missing-answer counts as "question: count" lines
nulls_list = list(zip(num_nulls.index.tolist(), num_nulls.tolist()))
nulls_report = "\n".join(f"{question}: {missing}" for question, missing in nulls_list)

with open("nulls.txt", mode="w", encoding="utf-8") as nulls_file:
    nulls_file.write(nulls_report)

In [None]:
# Render the five least-answered questions as a left-aligned Plotly table.
top_unanswered = df_top_5_values_with_no_answers

table_header = dict(
    values=["Question", "No Response", "Percentage of Non-Responses"],
    align="left",
)
table_cells = dict(
    values=[
        top_unanswered["Question"],
        top_unanswered["Number of Non-Responses"],
        top_unanswered["Percentage of Non-Responses"],
    ],
    align='left',
)

fig = go.Figure(data=[go.Table(header=table_header, cells=table_cells)])
fig.update_layout(height=320, margin_t=20)
fig.show()

# Test/Archive

In [None]:
# a = pd.DataFrame([[True, False, False, False], [False, True, True, False], [True, True, False, False]], columns=[""])
# a.head()

In [None]:
# Correlation Matrix could be fun to unravel some insight
# df.corr()

In [None]:
# DOES NOT WORK: this approach needs live communication between Python and JavaScript while the interactive plots are being generated:
#
# def generate_profile(x):
#     fig = px.histogram(df, x=x, color=x)
#     _ = fig.update_layout(xaxis_title="")
#     _ = fig.update_traces(hovertemplate="%{x}=%{y}")
#     return fig

# interact(generate_profile, x=["What is your gender?", "Are you a domestic or international student?"])



In [None]:
# from dash import Dash, dcc, html, Input, Output
# app = JupyterDash(__name__)
# app.layout = html.Div([
#     dcc.Dropdown(
#                 ["What is your gender?", "Are you a domestic or international student?"],
#                 "What is your gender?",
#                 id='yaxis-column'
#     ),
#     dcc.Graph(id="output_graph")
# ])

# @app.callback(
#     Output("output_graph", 'figure'),
#     Input("yaxis-column", 'value')
# )
# def get_graph(yaxis_column_name):
#     fig = px.histogram(df, x=yaxis_column_name, color=yaxis_column_name)
#     _ = fig.update_traces(hovertemplate="%{x}=%{y}")
#     return fig

# app.run_server(mode="inline")

In [None]:
# import plotly.graph_objects as go

# df_new = pd.DataFrame(
#     {
#         "Date": ["2020-01-27", "2020-02-27", "2020-03-27"],
#         "A_item": [2, 8, 0],
#         "B_item": [1, 7, 10],
#         "C_item": [9, 2, 9],
#         "Channel_type": ["Channel_1", "Channel_1", "Channel_2"],
#     }
# )

# fig = go.Figure(go.Table(header={"values": df_new.columns}, cells={"values": df_new.T.values}))
# fig.update_layout(
#     updatemenus=[
#         {
#             "buttons": [
#                 {
#                     "label": c,
#                     "method": "update",
#                     "args": [
#                         {
#                             "cells": {
#                                 "values": df_new.T.values
#                                 if c == "All"
#                                 else df_new.loc[df_new["Channel_type"].eq(c)].T.values
#                             }
#                         }
#                     ],
#                 }
#                 for c in ["All"] + df_new["Channel_type"].unique().tolist()
#             ]
#         }
#     ]
# )

In [106]:
# df_new = pd.DataFrame(
#     {
#         "Date": ["2020-01-27", "2020-02-27", "2020-03-27", "2020-01-27"],
#         "A_item": [2, 8, 0, 3],
#         "B_item": [1, 7, 10, 4],
#         "C_item": [9, 2, 9, 5],
#         "Channel_type": ["Channel_1", "Channel_1", "Channel_2", "Channel_2"],
#     }
# )

# fig = go.Figure(go.Table(header={"values": df_new.columns}, cells={"values": df_new.T.values}))
# fig.update_layout(
#     updatemenus=[
#         {
#             "y": 1 - (i / 5),
#             "buttons": [
#                 {
#                     "label": c,
#                     "method": "restyle",
#                     "args": [
#                         {
#                             "cells": {
#                                 "values": df_new.T.values
#                                 if c == "All"
#                                 else df_new.loc[df_new[menu].eq(c)].T.values
#                             }
#                         }
#                     ],
#                 }
#                 for c in ["All"] + df_new[menu].unique().tolist()
#             ],
#         }
#         for i, menu in enumerate(["Channel_type", "Date"])
#     ]
# )

## Sexual Orientation Based on Gender

In [None]:
# One pie of sexual-orientation answers per gender, laid out side by side.
orientation_col = "What is your sexual orientation?"
fig = px.pie(
    df,
    names=orientation_col,
    color=orientation_col,
    facet_col="What is your gender?",
    facet_col_spacing=0.04,
)


def _keep_facet_value(annotation):
    """Reduce a facet title to the text after its last "=" sign."""
    annotation.update(text=annotation.text.rsplit("=", 1)[-1])


fig.for_each_annotation(_keep_facet_value)
fig.update_traces(hovertemplate="<i>%{label}</i><br>Count: %{value}")
fig.update_layout(showlegend=False)
fig.show()