# EDA - Bivariate Analysis

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import httpimport

from pathlib import Path

In [4]:
# Import personal library
with httpimport.github_repo("junclemente", "jcds", ref="master"):
    import jcds.eda as jq

In [5]:
datasets = Path("../datasets")
full_df = pd.read_csv(datasets / "school_clean.csv")
full_df.head()

Unnamed: 0,Student ID,Undergrad Degree,Undergrad Grade,MBA Grade,Work Experience,Employability (Before),Employability (After),Status,Annual Salary
0,1,Business,68.4,90.2,No,252.0,276.0,Placed,111000.0
1,2,Business,62.1,92.8,No,423.0,410.0,Not Placed,0.0
2,3,Computer Science,70.2,68.7,Yes,101.0,119.0,Placed,107000.0
3,4,Engineering,75.1,80.7,No,288.0,334.0,Not Placed,0.0
4,5,Finance,60.9,74.9,No,248.0,252.0,Not Placed,0.0


Since the goal is to determine which students have the highest chance of being placed, the project will focus on data that would be available during admissions. 

The columns are ['Undergrad Degree', 'Undegrad Grade', 'Work Experience', 'Employability (Before), 'Status'].

In [8]:
columns_for_dataset = ['Undergrad Degree', 'Undegrad Grade', 'Work Experience', 'Employability (Before)', 'Status']
# Create dataframe of target features
df = full_df[columns_for_dataset]

KeyError: "['Undegrad Grade'] not in index"

In [6]:
# Set global color palette
global_color = "colorblind"
# Set global color for Seaborn
sns.set_palette(global_color)
# Set global color palette for Matplotlib
colors = sns.color_palette(global_color)
plt.rcParams["axes.prop_cycle"] = plt.cycler(color=colors)

## Pair Plot Visualization

In [7]:
sns.pairplot(df)
plt.show()

NameError: name 'df' is not defined

## Work Experience vs Status

In [None]:
crosstab = pd.crosstab(df["Status"], df["Work Experience"])
display(crosstab)

In [None]:
plt.figure(figsize=(8, 5))

# Plot the bars manually for 'No' and 'Yes' work experience
bars1 = plt.bar(
    crosstab.index, crosstab["No"], label="No Work Experience", color="#1f77b4"
)
bars2 = plt.bar(
    crosstab.index,
    crosstab["Yes"],
    bottom=crosstab["No"],
    label="Yes Work Experience",
    color="#ff7f0e",
)

# Add number counts on top of each bar for 'No Work Experience'
for bar in bars1:
    height = bar.get_height()
    plt.text(
        bar.get_x() + bar.get_width() / 2.0,
        height - 30,
        "%d" % int(height),
        ha="center",
        color="white",
        fontweight="bold",
    )

# Add number counts on top of each bar for 'Yes Work Experience'
for bar in bars2:
    height = bar.get_height() + bar.get_y()
    plt.text(
        bar.get_x() + bar.get_width() / 2.0,
        height + 10,
        "%d" % int(height),
        ha="center",
        color="black",
        fontweight="bold",
    )

# Add labels and title
plt.xlabel("Status")
plt.ylabel("Count")
plt.title("Stacked Bar Chart: Work Experience vs Status After Graduation")
plt.legend()

# Show the plot
plt.show()