In [None]:
# EDA Project - Engineering Colleges Dataset

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Read the colleges CSV into `df`, the frame the following cells work on,
# and preview its first rows. The path is relative, so it resolves against
# the kernel's current working directory.
DATASET_CSV = "engineering colleges in India.csv"
df = pd.read_csv(DATASET_CSV)
df.head()

In [None]:
# Preview both ends of the frame: head() and tail() each return five rows
# by default, and display() renders its arguments one after another.
display(df.head(), df.tail())

In [None]:
# Split the columns into the target ('Rating') and the predictors (every
# other column), then report how many variables fall on each side.
dependent_var = 'Rating'
independent_vars = list(filter(lambda name: name != dependent_var, df.columns))
print("Number of Independent Variables:", len(independent_vars))
print("Number of Dependent Variables:", 1)

In [None]:
# Summary statistics for every column; include='all' also covers the
# non-numeric columns, not just the numeric ones.
summary_stats = df.describe(include='all')
display(summary_stats)

In [None]:
# Coerce the columns that should be numeric; errors='coerce' turns any value
# that cannot be parsed into NaN instead of raising.
numeric_cols = [
    'Total Student Enrollments',
    'Total Faculty',
    'Established Year',
    'Average Fees',
]
for col in numeric_cols:
    coerced = pd.to_numeric(df[col], errors='coerce')
    df[col] = coerced

# Per-column mean and standard deviation over the coerced columns.
numeric_view = df[numeric_cols]
avg_values = numeric_view.mean()
std_values = numeric_view.std()

# Report the column with the smallest mean and the one with the largest spread.
print("Independent Variable with Minimum Average Value:", avg_values.idxmin())
print("Independent Variable with Highest Standard Deviation:", std_values.idxmax())

In [None]:
# Count the null entries in each independent variable: build the boolean
# null mask first, then sum it column by column.
null_mask = df[independent_vars].isnull()
missing_counts = null_mask.sum()
print("\nMissing Values in Independent Variables:\n", missing_counts)

In [None]:
# Heatmap of the null mask for the independent variables: one row of cells
# per record, one column per variable. Row labels and the colour bar are
# switched off.
null_mask = df[independent_vars].isnull()
plt.figure(figsize=(10, 6))
ax = sns.heatmap(null_mask, cmap='viridis', cbar=False, yticklabels=False)
ax.set_title("Missing Values Heatmap")
plt.show()

In [None]:
# Variable with max missing values.
# Series.idxmax returns the first label when several values tie, so with no
# missing data at all (every count is 0) it would name the first column as
# having the "maximum" missing values; it also raises on an empty Series.
# Report those cases explicitly instead.
if missing_counts.empty or missing_counts.max() == 0:
    print("Independent Variable with Maximum Missing Values:", None, "(no missing values found)")
else:
    print("Independent Variable with Maximum Missing Values:", missing_counts.idxmax())

In [None]:
# Mean-impute 'Average Fees': compute the column mean, then write it into
# every NaN slot of that column.
fees_mean = df['Average Fees'].mean()
df['Average Fees'] = df['Average Fees'].fillna(fees_mean)

In [None]:
# Frequency distribution of 'Established Year' in 30 bins. NaN years are
# dropped first so the remaining values can be cast to int.
established_years = df['Established Year'].dropna().astype(int)
plt.figure(figsize=(8, 5))
ax = established_years.hist(bins=30)
ax.set_title("Frequency Distribution of Established Year")
ax.set_xlabel("Year")
ax.set_ylabel("Frequency")
plt.show()

In [None]:
# Horizontal box plot of 'Average Fees'; points drawn beyond the whiskers
# are the outlier candidates.
plt.figure(figsize=(8, 5))
ax = sns.boxplot(x='Average Fees', data=df)
ax.set_title("Boxplot for Average Fees")
plt.show()

In [None]:
# Line chart for correlation (e.g. between 'Established Year' and 'Average Fees').
# Several rows can share an 'Established Year'; joining every row with a
# single line then draws vertical spikes at those years, in whatever order
# the sort left the tied rows. Plot the mean fee per year instead, so each
# x value appears exactly once and the trend is readable.
df_sorted = df.sort_values('Established Year')  # kept under its original name in case another cell reuses it
# groupby sorts the year keys and leaves out rows whose year is NaN.
fees_by_year = df_sorted.groupby('Established Year')['Average Fees'].mean()

fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(fees_by_year.index, fees_by_year.values)
ax.set_title("Line Chart: Established Year vs Average Fees")
ax.set_xlabel("Established Year")
ax.set_ylabel("Average Fees (mean per year)")
plt.show()

In [None]:
# Pairwise correlations between the numeric columns, drawn as an annotated
# heatmap (each cell shows its coefficient).
numeric_view = df[numeric_cols]
correlation_matrix = numeric_view.corr()
ax = sns.heatmap(correlation_matrix, cmap='coolwarm', annot=True)
ax.set_title("Correlation Matrix")
plt.show()

In [None]:
# Scatter plot, pair 1 of 2: 'Total Faculty' against
# 'Total Student Enrollments'. The pair is hard-coded on the assumption that
# it is strongly positively correlated; it is not picked from the
# correlation matrix in code.
ax = sns.scatterplot(x='Total Faculty', y='Total Student Enrollments', data=df)
ax.set_title("Scatter Plot - Total Faculty vs Student Enrollments")
plt.show()

In [None]:
# Scatter plot, pair 2 of 2: 'Average Fees' against 'Rating', chosen on the
# assumption that the pair is weakly or negatively correlated.
# NOTE(review): 'Rating' is not in numeric_cols, so it is never passed through
# pd.to_numeric and is absent from the correlation matrix; confirm it is
# already numeric and that the assumption holds.
ax = sns.scatterplot(x='Average Fees', y='Rating', data=df)
ax.set_title("Scatter Plot - Average Fees vs Rating")
plt.show()