In [3]:
import numpy as np

import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

import plotly.offline as py
py.init_notebook_mode(connected=True)
import plotly.graph_objs as go
import plotly.figure_factory as ff
import plotly.express as px

In [None]:
# Load the two CSV files that the rest of the notebook analyses.
majors = pd.read_csv("data/majors.csv")
# Extension corrected from ".scv" to ".csv" (typo in the original path).
names = pd.read_csv("data/names.csv")

In [None]:
# Preview the first 20 rows of the majors table.
majors.iloc[:20]

In [None]:
# Preview the first five rows of the names table.
names.head(n=5)

In [None]:
# Lower-case every name so that differently-cased spellings compare equal later on.
names['Name'] = names.loc[:, 'Name'].str.lower()

In [None]:
# Re-inspect the first five rows after the 'Name' column has been lower-cased.
names.head(n=5)

In [None]:
# Tally how often each initial character occurs in the 'Name' column.
first_letter = (
    names['Name']
    .str.get(0)
    .value_counts()
)
first_letter.head(n=5)

In [None]:
# Bar chart of how many names start with each character
# (x: initial character, y: number of names).
plt.bar(first_letter.index, first_letter.values)
plt.xlabel('First Letter')
plt.ylabel('Frequency')  # label typo fixed ('Frequence'); now matches the title
plt.title('First Letter Frequency Distribution')
plt.show()

In [None]:
# Row counts, one per line: names first, then majors.
print(len(names), len(majors), sep="\n")

In [None]:
# Number of rows per distinct value of 'Role'.
names.loc[:, "Role"].value_counts()

In [None]:
# Show the rows whose name is the literal "#ref!"
# (presumably a spreadsheet error marker, lower-cased by the earlier cell).
names.loc[names['Name'].eq("#ref!")]

In [None]:
# Keep only the rows whose name is not the literal "#ref!".
names = names.loc[names['Name'].ne("#ref!")]

In [None]:
# Role counts again (after the "#ref!" rows were removed), shown as a DataFrame.
(
    names['Role']
    .value_counts()
    .to_frame()
)

In [None]:
# How many times each name occurs, shown as a DataFrame.
(
    names['Name']
    .value_counts()
    .to_frame()
)

In [None]:
# List the column labels of the majors table.
majors.columns

In [None]:
# Number of rows per distinct 'Terms in Attendance' value, shown as a DataFrame.
(
    majors['Terms in Attendance']
    .value_counts()
    .to_frame()
)

In [None]:
# Remove rows whose 'Terms in Attendance' is the literal "#REF!".
# .copy() makes the result an independent frame, so the later .loc assignments
# on `majors` modify real data rather than a possible view of the unfiltered
# table (avoids SettingWithCopyWarning).
majors = majors[majors['Terms in Attendance'] != "#REF!"].copy()

# Re-display the counts to confirm the marker is gone
# (method name fixed: `.tp_frame()` -> `.to_frame()`).
majors['Terms in Attendance'].value_counts().to_frame()

In [None]:
# Summary statistics for the names table.
names.describe()

In [None]:
# Summary statistics for the majors table.
majors.describe()

In [None]:
# Count rows per value of 'Majors', order by descending count,
# and keep the 20 most frequent entries as a DataFrame.
majors_count = majors['Majors'].value_counts()
majors_count = majors_count.sort_values(ascending=False)
majors_count = majors_count.to_frame().head(20)
majors_count

In [None]:
# Horizontal bar chart of the top-20 counts. Rows are passed in reversed order
# (presumably so the largest count is drawn at the top — confirm in the rendered figure).
fig = px.bar(majors_count.iloc[::-1], orientation='h')
fig.update_layout(
    autosize=False,
    width=800,
    height=500,
    xaxis_title='Count',
    yaxis_title='Major',
    showlegend=False,
)

In [None]:
# Normalised histogram ('probability' mode) of 'Terms in Attendance';
# values are sorted first so the categories are passed to plotly in order.
fig = px.histogram(
    majors['Terms in Attendance'].sort_values(),
    histnorm='probability',
)

fig.update_layout(
    autosize=False,
    width=800,
    height=250,
    xaxis_title="Term",
    yaxis_title="Fraction of Class",
    showlegend=False,
)

In [None]:
# Collapse 'Terms in Attendance' into two categories: the value 'G' becomes
# 'Graduate' and every other value becomes 'Undergraduate'.
# Order matters: the non-'G' rows are relabelled first, so the second
# comparison against 'G' still finds the graduate rows.
majors.loc[majors.loc[:, 'Terms in Attendance'] != 'G', 'Terms in Attendance'] = 'Undergraduate'
# Fixed NameError: the frame was misspelled as `maojrs`.
majors.loc[majors.loc[:, 'Terms in Attendance'] == 'G', 'Terms in Attendance'] = 'Graduate'

# Rename to 'Ungrad Grad' (sic). That is the column name used by the campus-wide
# dataset and by the later cell that reads majors['Ungrad Grad']; the previous
# name 'Undergrad Grad' made that lookup raise KeyError.
# NOTE: re-running this cell after the rename raises KeyError, because
# 'Terms in Attendance' no longer exists.
majors = majors.rename(columns={'Terms in Attendance': 'Ungrad Grad'})

majors.describe()

In [None]:
# Column labels of both tables, one Index per line: majors first, then names.
print(majors.columns, names.columns, sep="\n")

In [4]:
# CSV export endpoint of the Google Sheet that holds the headcount data.
url = "https://docs.google.com/spreadsheets/d/1J7tz3GQLs3M6hFseJCE9KhjVhe4vKga8Q2ezu0oG5sQ/gviz/tq?tqx=out:csv"

# Read only the six columns used by the analysis below.
university_majors = pd.read_csv(
    url,
    usecols=[
        'Academic Yr', 'Semester', 'Ungrad Grad',
        'Entry Status', 'Major Short Nm', 'Student Headcount',
    ],
)

In [5]:
# Display the loaded headcount table (one row per year, semester, level,
# entry status and major, as shown in the output below).
university_majors

Unnamed: 0,Academic Yr,Semester,Ungrad Grad,Entry Status,Major Short Nm,Student Headcount
0,2014-15,Fall,Graduate,Graduate,Education,335
1,2014-15,Fall,Graduate,Graduate,Educational Leadership Jnt Pgm,1
2,2014-15,Fall,Graduate,Graduate,Special Education,18
3,2014-15,Fall,Graduate,Graduate,Science & Math Education,15
4,2014-15,Fall,Graduate,Graduate,Chemical Engineering,136
...,...,...,...,...,...,...
7199,2023-24,Spring,Undergraduate,Transfer Entrant,Nut Sci-Physio & Metabol,13
7200,2023-24,Spring,Undergraduate,Transfer Entrant,Nutritional Sci-Dietetics,1
7201,2023-24,Spring,Undergraduate,Transfer Entrant,Nutritional Sci-Toxicology,2
7202,2023-24,Spring,Undergraduate,Transfer Entrant,Genetics & Plant Biology,11


In [7]:
# Collapse the 'Semester' dimension: for every (year, level, entry status, major)
# combination, take the mean of 'Student Headcount' over the rows in that group.
# Note that this overwrites `university_majors` with the aggregated frame.
university_majors = (
    university_majors
    .groupby(
        ['Academic Yr', 'Ungrad Grad', 'Entry Status', 'Major Short Nm'],
        as_index=False,
    )[["Student Headcount"]]
    .mean()
)

university_majors

Unnamed: 0,Academic Yr,Ungrad Grad,Entry Status,Major Short Nm,Student Headcount
0,2014-15,Graduate,Graduate,African American Studies,30.0
1,2014-15,Graduate,Graduate,Ag & Resource Economics,73.5
2,2014-15,Graduate,Graduate,Anc Hist & Medit Archae,14.0
3,2014-15,Graduate,Graduate,Anthropology,76.5
4,2014-15,Graduate,Graduate,Applied Mathematics,18.5
...,...,...,...,...,...
3697,2023-24,Undergraduate,Transfer Entrant,Spanish and Portuguese,16.5
3698,2023-24,Undergraduate,Transfer Entrant,Statistics,46.0
3699,2023-24,Undergraduate,Transfer Entrant,Sustainable Environ Dsgn,4.0
3700,2023-24,Undergraduate,Transfer Entrant,Theater & Perf Studies,44.0


In [15]:
# Total headcount per academic year and level (summing over entry status and major).
university_grad_vs_ungrd = (university_majors.groupby(
    ['Academic Yr', 'Ungrad Grad'], as_index = False)[["Student Headcount"]]
                            .sum()
)

# Reshape to one row per academic year with one column per 'Ungrad Grad' value,
# then derive the yearly total and each level's share of it.
proportions = university_grad_vs_ungrd.pivot(index='Academic Yr', columns='Ungrad Grad', values='Student Headcount')
proportions['Total'] = proportions['Undergraduate'] + proportions['Graduate']
proportions['Undergrad Proportion'] = proportions['Undergraduate'] / proportions['Total']
proportions['Grad Proportion'] = proportions['Graduate'] / proportions['Total']

# Stacked bars of absolute headcounts per year.
fig = px.bar(proportions.reset_index(),
             x='Academic Yr',
             y=['Undergraduate', 'Graduate'],
             title='Number of Grad vs. Undergrad Students',  # title typo fixed ('Ggrad')
             labels={'value': 'Number of Students'},
             color_discrete_map={'Undergraduate':'blue', 'Graduate': 'orange'})

fig.update_layout(barmode='relative', autosize=False, width=800, height=600)
fig.show()

In [None]:
# Count the rows of the class roster at each level. Requires that the roster's
# level column is called 'Ungrad Grad' with values 'Graduate' / 'Undergraduate'.
data100_grad = majors.loc[majors['Ungrad Grad'] == 'Graduate', 'Ungrad Grad'].count()
data100_undergrad = majors.loc[majors['Ungrad Grad'] == 'Undergraduate', 'Ungrad Grad'].count()

print("Number of graduate students in Data 100: ", data100_grad)
print("Number of undergraduate students in Data 100: ", data100_undergrad)

In [None]:
# Build a one-row frame for the class with the same columns as `proportions`.
data100_row = {
    'Graduate': [data100_grad],
    'Undergraduate': [data100_undergrad],
    'Total': [data100_grad + data100_undergrad],
    'Undergrad Proportion': [data100_undergrad / (data100_grad + data100_undergrad)],
    'Grad Proportion': [data100_grad / (data100_grad + data100_undergrad)],
}
new_row_df = pd.DataFrame(data100_row)

# Add (or overwrite) a row labelled 'Data 100' next to the academic-year rows.
proportions.loc['Data 100'] = new_row_df.iloc[0]

# Stacked bars of the two shares for each year and for the class.
fig = px.bar(
    proportions.reset_index(),
    x='Academic Yr',
    y=['Undergrad Proportion', 'Grad Proportion'],
    title='Proportions of Grad vs. Undergrad Students',
    labels={'value':'Proportion'},
    color_discrete_map={'Undergrad Proportion': 'blue', 'Grad Proportion': 'orange'},
)

fig.update_layout(autosize=False, width=800, height=600, barmode='relative')
fig.show()

In [None]:
# The 20 most frequent 'Majors' values in the class roster, as a DataFrame.
data100_top_20_majors = majors['Majors'].value_counts()
data100_top_20_majors = data100_top_20_majors.sort_values(ascending=False)
data100_top_20_majors = data100_top_20_majors.to_frame().head(20)

# Campus headcount per academic year and major (summed over level and entry status).
major_trends = (
    university_majors
    .groupby(['Academic Yr', 'Major Short Nm'], as_index=False)[["Student Headcount"]]
    .sum()
)

print("Top 20 majors at Berkeley in 2022-23")
(
    major_trends
    .loc[major_trends['Academic Yr'] == '2022-23']
    .sort_values('Student Headcount', ascending=False)
    .head(20)
)

In [None]:
# Sum each major's headcount over every academic year in `major_trends`
# and show the 20 largest totals.
print("Top 20 majors at Berkeley since 2013")
(
    major_trends
    .groupby(['Major Short Nm'], as_index=False)[['Student Headcount']]
    .sum()
    .sort_values('Student Headcount', ascending=False)
    .head(20)
)

In [None]:
# Drop the last space-separated token from every major label (presumably a
# degree suffix such as "BA"/"BS" — confirm against the roster) so the labels
# can be matched against 'Major Short Nm' in the campus data.
# Fixed: the separator was '' (empty string), which makes str.rsplit raise
# "ValueError: empty separator"; a single space is required.
# NOTE: this cell is not idempotent — re-running it strips one more word.
data100_top_20_majors.index = data100_top_20_majors.index.str.rsplit(' ', n=1).str[0]

print("Top 20 majors at Berkeley in Data 100")
# Bare expression instead of print() so the frame gets the notebook's rich display,
# consistent with the neighbouring "Top 20" cells.
data100_top_20_majors

In [None]:
# One line per major: campus headcount over the academic years, restricted to
# majors whose name appears in the index of the class's top-20 table.
fig = px.line(
    major_trends.loc[major_trends["Major Short Nm"].isin(data100_top_20_majors.index)],
    x="Academic Yr",
    y="Student Headcount",
    color="Major Short Nm",
)

fig.update_layout(width=800, height=600, autosize=False)
fig.show()

In [None]:
# Same trend plot without the first row of the top-20 table, i.e. the most
# frequent major in the class (presumably removed so it does not dominate the
# y-axis — confirm intent).
# Fixed NameError: the original sliced `data100_top_19_majors` before it was
# defined; it must be derived from `data100_top_20_majors`.
data100_top_19_majors = data100_top_20_majors.iloc[1:,:]

fig = px.line(major_trends[major_trends["Major Short Nm"].isin(data100_top_19_majors.index)],
              x = "Academic Yr", y="Student Headcount", color="Major Short Nm")

fig.update_layout(autosize=False, width=800, height=600)
fig.show()