In [2]:
#Data Preprocessing
import pandas as pd

# Load the College Scorecard sheet into a DataFrame.
file_path = '/content/CollegeScoreCard_19-20 (1).xlsx'
data = pd.read_excel(file_path, sheet_name='CollegeScoreCard_19-20')

# Column name sets by dtype: float64/int64 are treated as numerical,
# object columns as categorical.
numerical_columns = data.select_dtypes(include=['float64', 'int64']).columns
categorical_columns = data.select_dtypes(include=['object']).columns

# dropna() with no arguments removes every row that has a missing value in
# ANY column, so the surviving row labels are generally non-contiguous.
# NOTE(review): this can discard most of the sheet if some columns are sparse;
# consider dropna(subset=[...]) restricted to the columns actually analysed.
cleaned_data = data.dropna()

# Standardize numerical data (optional, for dimensionality reduction techniques).
# The original index is passed through so that `standardized_data` stays
# row-aligned with `cleaned_data`; without it the frame would get a fresh
# 0..n-1 index that no longer matches the post-dropna labels.
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
standardized_data = pd.DataFrame(
    scaler.fit_transform(cleaned_data[numerical_columns]),
    columns=numerical_columns,
    index=cleaned_data.index,
)

# One-hot encode every object-dtype column (first level dropped).
# NOTE(review): this also encodes identifier-like text columns if they are
# object dtype (presumably INSTNM/CITY), which creates one column per distinct
# value - confirm that is intended before using `cleaned_data_encoded`.
cleaned_data_encoded = pd.get_dummies(cleaned_data, columns=categorical_columns, drop_first=True)


In [6]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.manifold import TSNE
import plotly.express as px

# Requires `cleaned_data` from the preprocessing cell to be defined already.

# Features fed into t-SNE.
columns_for_tsne = ['Average Net Price Tuition', '6-Yr Grad Rate', 'Endowment in 2020',
                    'Percent of Pell-Grant', 'Tuition Out-of-State', 'Undergrad Enrollment']

# Keep only complete rows for the selected features. The surviving index
# labels are reused below to attach the matching name/state to each point.
data_tsne = cleaned_data[columns_for_tsne].dropna()

# Standardize so no single feature dominates the pairwise distances.
scaler = StandardScaler()
data_scaled = scaler.fit_transform(data_tsne)

# t-SNE requires perplexity < n_samples; cap it so a small sample does not
# raise. For samples larger than 50 rows this is the original value of 50.
tsne_perplexity = min(50, len(data_tsne) - 1)

# `max_iter` replaces `n_iter`, which was renamed in scikit-learn 1.5 and is
# scheduled for removal in 1.7 (see the warning this cell used to emit).
tsne = TSNE(n_components=3, random_state=42, perplexity=tsne_perplexity, max_iter=500)
data_tsne_3d = tsne.fit_transform(data_scaled)

# Positional copy of cleaned_data, kept because later cells may reference it.
cleaned_data_reset = cleaned_data.reset_index(drop=True)

# Build the embedding frame on the SAME index as the rows that were embedded,
# so the column assignments below align by label instead of relying on two
# independently reset indexes happening to match.
tsne_df = pd.DataFrame(data_tsne_3d, columns=['Dim 1', 'Dim 2', 'Dim 3'],
                       index=data_tsne.index)
tsne_df['INSTNM'] = cleaned_data['INSTNM']
tsne_df['STABBR'] = cleaned_data['STABBR']
tsne_df['6-Yr Grad Rate'] = data_tsne['6-Yr Grad Rate']
tsne_df = tsne_df.reset_index(drop=True)

# Bin the actual 6-year graduation rate into five equal-width groups for
# similarity-based coloring. (Previously the bins were cut from 'Dim 3', an
# arbitrary t-SNE coordinate, so the legend did not describe graduation rate.)
grad_rate_labels = ['Low', 'Medium-Low', 'Medium', 'Medium-High', 'High']
tsne_df['Grad Rate Group'] = pd.cut(tsne_df['6-Yr Grad Rate'], bins=5, labels=grad_rate_labels)

# Create the interactive 3D scatter plot using Plotly.
# `size_max` and `opacity` were dropped from this call: `size_max` has no
# effect without a `size` column, and marker opacity is set in update_traces.
fig = px.scatter_3d(tsne_df,
                    x='Dim 1',
                    y='Dim 2',
                    z='Dim 3',
                    color='Grad Rate Group',  # Color by Graduation Rate Group
                    category_orders={'Grad Rate Group': grad_rate_labels},  # legend Low -> High
                    hover_name='INSTNM',  # Hover over to show the institution name
                    hover_data=['STABBR', '6-Yr Grad Rate'],  # State and the value behind the color
                    title='3D t-SNE Visualization of College Data with Gestalt Principles',
                    labels={'Dim 1': 't-SNE Dimension 1',
                            'Dim 2': 't-SNE Dimension 2',
                            'Dim 3': 't-SNE Dimension 3'})

# Customize layout
fig.update_layout(scene=dict(
                    xaxis_title='t-SNE Dimension 1',
                    yaxis_title='t-SNE Dimension 2',
                    zaxis_title='t-SNE Dimension 3'),
                  margin=dict(l=0, r=0, b=0, t=40))

# Apply Gestalt Principle: Proximity & Similarity - uniform marker size and
# outline so that color is the only grouping cue.
fig.update_traces(marker=dict(size=5, opacity=0.6, line=dict(width=0.5, color='DarkSlateGrey')),
                  selector=dict(mode='markers'))

# Show the plot
fig.show()



'n_iter' was renamed to 'max_iter' in version 1.5 and will be removed in 1.7.



In [7]:
import pandas as pd
import plotly.express as px
import folium
from folium.plugins import MarkerCluster

# Read the College Scorecard sheet again so this cell works on its own frame.
file_path = '/content/CollegeScoreCard_19-20 (1).xlsx'
df = pd.read_excel(file_path, sheet_name='CollegeScoreCard_19-20')

# Columns the map story needs: identity, location, outcomes, enrollment, endowment.
# NOTE(review): 'Men Enrolled-Ungergrad' is spelled as written here on purpose -
# presumably it mirrors the spreadsheet header; confirm before "fixing" it.
story_columns = [
    'INSTNM', 'CITY', 'STABBR', 'LATITUDE', 'LONGITUDE',
    '4-Yr Grad Rate', '6-Yr Grad Rate',
    'Men Enrolled-Ungergrad', 'Women Enrolled-Undergrad',
    'Endowment in 2020',
]

# Project onto those columns, then discard any row missing one of them.
df_filtered = df[story_columns]
df_filtered = df_filtered.dropna()

# 1. Interactive Map with Folium
def create_interactive_map(data, center=(64.2008, -149.4937), zoom_start=5):
    """Build a folium map with one clustered marker per row of ``data``.

    Parameters
    ----------
    data : pandas.DataFrame
        Must provide the columns 'INSTNM', 'CITY', 'LATITUDE', 'LONGITUDE',
        '4-Yr Grad Rate' and '6-Yr Grad Rate'.
    center : sequence of two floats, optional
        (latitude, longitude) the map is initially centred on. Defaults to
        the coordinates this function previously hard-coded.
    zoom_start : int, optional
        Initial zoom level passed to ``folium.Map`` (default 5, as before).

    Returns
    -------
    folium.Map
        The map, with all markers added through a ``MarkerCluster``.
    """
    # Local import keeps the function self-contained within the notebook.
    from html import escape

    college_map = folium.Map(location=list(center), zoom_start=zoom_start)

    # Add markers with MarkerCluster
    marker_cluster = MarkerCluster().add_to(college_map)
    for _, row in data.iterrows():
        # Popup and tooltip strings are rendered as HTML, so free-text fields
        # are escaped; otherwise a name containing '&', '<' or quotes would be
        # interpreted as markup instead of being shown literally.
        name = escape(str(row['INSTNM']))
        city = escape(str(row['CITY']))
        popup_info = (
            f"<b>{name}</b><br>City: {city}"
            f"<br>4-Yr Grad Rate: {row['4-Yr Grad Rate']}"
            f"<br>6-Yr Grad Rate: {row['6-Yr Grad Rate']}"
        )
        folium.Marker(
            location=[row['LATITUDE'], row['LONGITUDE']],
            popup=popup_info,
            tooltip=name
        ).add_to(marker_cluster)

    return college_map

# Build the clustered college map from the filtered rows ...
interactive_map = create_interactive_map(data=df_filtered)
# ... and write it out as a standalone HTML page in the working directory.
interactive_map.save("interactive_college_map.html")

