In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats

In [None]:
# Load the Excel workbook into a DataFrame; the location is kept in its own
# variable so it is easy to find and change.
excel_path = '/content/drive/MyDrive/Colab Notebooks/Innomatics/EDA Project/data.xlsx'
data = pd.read_excel(excel_path)

In [None]:
# Peek at the first five rows (head() returns five rows by default)
data.head()

In [None]:
# Peek at the last five rows (tail() returns five rows by default)
data.tail()

In [None]:
# (number of rows, number of columns) of the loaded frame
data.shape

In [None]:
# Number of missing values in each column
data.isna().sum()

In [None]:
# Number of rows that duplicated() flags as repeats
data.duplicated().sum()

In [None]:
# Data type of every column
data.dtypes

In [None]:
# Remove the listed columns (judging by their names: record identifiers and a
# leftover index column) before the analysis.
# Reassigning instead of using inplace=True, together with errors='ignore',
# keeps this cell idempotent: running it a second time no longer raises a
# KeyError because the columns are already gone.
data = data.drop(columns=['Unnamed: 0', 'ID', 'CollegeID', 'CollegeCityID'], errors='ignore')

## **Univariate Analysis**

In [None]:
# Collect the names of the int64 / float64 columns; these drive every
# numeric summary and plot below.
numeric_frame = data.select_dtypes(include=['int64', 'float64'])
numerical_cols = numeric_frame.columns
numerical_cols

In [None]:
# Summary statistics (describe) for the numeric columns
data[numerical_cols].describe()

In [None]:
# Mode(s) of each numeric column
data[numerical_cols].mode()

In [None]:
# Range (max - min) of each numeric column
numeric_values = data[numerical_cols]
numeric_values.max() - numeric_values.min()

In [None]:
# Skewness of each numeric column
data[numerical_cols].skew()

In [None]:
# Kurtosis of each numeric column
data[numerical_cols].kurtosis()

In [None]:
# One row of three panels per numeric column:
#   [0] plain histogram (counts), [1] density histogram with KDE overlay, [2] boxplot.
# squeeze=False keeps `axs` two-dimensional even when there is a single numeric
# column, so the axs[i][j] indexing below always works.
fig, axs = plt.subplots(len(numerical_cols), 3, figsize=(15, len(numerical_cols) * 5), squeeze=False)

for i, col in enumerate(numerical_cols):
    # Histogram
    sns.histplot(data[col], kde=False, ax=axs[i][0])
    axs[i][0].set_title(f'Histogram of {col}')
    axs[i][0].set_xlabel(col)
    axs[i][0].set_ylabel('Frequency')

    # PDF: stat='density' normalises the bars so the y-axis really is a
    # density, matching the axis label (the default would plot raw counts).
    sns.histplot(data[col], kde=True, stat='density', ax=axs[i][1])
    axs[i][1].set_title(f'PDF of {col}')
    axs[i][1].set_xlabel(col)
    axs[i][1].set_ylabel('Density')

    # Boxplot: the column is passed explicitly as `x`. A bare positional Series
    # is interpreted as `data` by current seaborn versions, which draws the box
    # vertically and makes the x-axis label below misleading.
    sns.boxplot(x=data[col], ax=axs[i][2])
    axs[i][2].set_title(f'Boxplot of {col}')
    axs[i][2].set_xlabel(col)
    axs[i][2].set_ylabel('')

plt.tight_layout()
plt.show()

In [None]:
# One row of two panels per numeric column: a normal QQ plot and an ECDF.
# squeeze=False keeps `axs` two-dimensional even for a single numeric column.
fig, axs = plt.subplots(len(numerical_cols), 2, figsize=(15, len(numerical_cols) * 5), squeeze=False)

for i, col in enumerate(numerical_cols):
    # Work on the observed values only: missing values would otherwise turn the
    # fitted QQ line into NaN and inflate the ECDF denominator.
    observed = data[col].dropna()

    # QQ Plot against the normal distribution
    stats.probplot(observed, dist="norm", plot=axs[i][0])
    axs[i][0].set_title(f'QQ Plot of {col}')
    axs[i][0].set_xlabel('Theoretical Quantiles')
    axs[i][0].set_ylabel('Ordered Values')

    # ECDF Plot: the k-th smallest value is paired with k / n
    sorted_values = observed.sort_values()
    ecdf = pd.Series(range(1, len(sorted_values) + 1), index=sorted_values) / len(sorted_values)
    axs[i][1].plot(ecdf.index, ecdf, marker='.', linestyle='none')
    axs[i][1].set_title(f'ECDF of {col}')
    axs[i][1].set_xlabel(col)
    axs[i][1].set_ylabel('ECDF')

plt.tight_layout()
plt.show()

In [None]:
# Outlier detection using the IQR method.
# For each numeric column a value is flagged when it lies more than 1.5 * IQR
# below Q1 or above Q3. The per-column flags are accumulated, so `outliers`
# ends up holding every row flagged in at least one column (instead of only
# the rows flagged for whichever column the loop happened to visit last), and
# `outlier_counts` records how many rows each individual column flagged.
outlier_mask = pd.Series(False, index=data.index)
outlier_counts = {}

for col in numerical_cols:
    Q1 = data[col].quantile(0.25)
    Q3 = data[col].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR

    # Rows whose value in this column falls outside the fences
    col_outlier_mask = (data[col] < lower_bound) | (data[col] > upper_bound)
    outlier_counts[col] = int(col_outlier_mask.sum())
    outlier_mask = outlier_mask | col_outlier_mask

outlier_counts = pd.Series(outlier_counts, name='n_outliers', dtype='int64')
outliers = data[outlier_mask]

In [None]:
# Number of rows currently held in `outliers`
len(outliers)

In [None]:
# Display the rows currently held in `outliers`
outliers

In [None]:
# For every numeric column, overlay the distribution of the full data with the
# distribution after removing rows whose z-score magnitude reaches the cutoff.
Z_THRESHOLD = 3  # |z| cutoff for treating a value as an outlier; adjust as needed

# squeeze=False + ravel() gives a 1-D array of axes even for a single column,
# so axs[i] always works.
fig, axs = plt.subplots(len(numerical_cols), 1, figsize=(10, len(numerical_cols) * 5), squeeze=False)
axs = axs.ravel()

for i, col in enumerate(numerical_cols):
    # Original Data (stat='density' so the y-axis matches its label and the two
    # differently sized samples are comparable)
    sns.histplot(data[col], kde=True, stat='density', color='blue', ax=axs[i], label='Original Data')

    # Remove outliers. The z-scores are computed with pandas so that missing
    # values are skipped when estimating mean / std; a single NaN no longer
    # turns every z-score into NaN and empties the filtered frame.
    # ddof=0 matches the population standard deviation used by scipy's zscore.
    z_scores = (data[col] - data[col].mean()) / data[col].std(ddof=0)
    data_no_outliers = data[z_scores.abs() < Z_THRESHOLD]

    # Data without outliers
    sns.histplot(data_no_outliers[col], kde=True, stat='density', color='orange', ax=axs[i], label='Data without Outliers')

    axs[i].set_title(f'Distribution of {col} (with and without outliers)')
    axs[i].set_xlabel(col)
    axs[i].set_ylabel('Density')
    axs[i].legend()

plt.tight_layout()
plt.show()

In [None]:
# Collect the names of the object-dtype columns; these drive every
# categorical summary and plot below.
object_frame = data.select_dtypes(include=['object'])
categorical_cols = object_frame.columns
categorical_cols

In [None]:
# describe() summary of the object-dtype columns
data[categorical_cols].describe()

In [None]:
# Print the distinct values found in each categorical column
for col, col_values in data[categorical_cols].items():
    unique_values = col_values.unique()
    print(f'Unique values of column {col}: {unique_values}')

In [None]:
# Print a frequency table (value -> number of rows) for each categorical column
for col in categorical_cols:
    column_values = data[col]
    value_counts = column_values.value_counts()
    report = f'Value counts of column {col}:\n{value_counts}\n'
    print(report)

In [None]:
# Frequency distribution of every categorical column, one panel per column
# stacked vertically (each panel is given 35 inches of figure height).
num_plots = len(categorical_cols)
# squeeze=False + ravel() gives a 1-D array of axes even for a single column,
# so axs[i] always works.
fig, axs = plt.subplots(num_plots, 1, figsize=(15, 35*num_plots), squeeze=False)
axs = axs.ravel()

for i, col in enumerate(categorical_cols):
    # The column is passed explicitly as `x`: a bare positional Series is
    # interpreted as `data` by current seaborn versions and does not produce
    # one bar per category.
    sns.countplot(x=data[col], ax=axs[i])
    axs[i].set_title(f'Frequency distribution of {col}')
    axs[i].set_xlabel(col)
    axs[i].set_ylabel('Frequency')
    axs[i].tick_params(axis='x', rotation=45)

plt.tight_layout()
plt.show()

## **Bivariate Analysis**

In [None]:
# Pairwise Pearson correlation between all numeric columns
numeric_subset = data[numerical_cols]
correlation_matrix = numeric_subset.corr(method='pearson')

print("Pearson correlation coefficient for all numerical columns:")
correlation_matrix

In [None]:
# Annotated heatmap of the Pearson correlation matrix, drawn on an explicit axes
pearson_fig, pearson_ax = plt.subplots(figsize=(15, 8))
sns.heatmap(
    correlation_matrix,
    annot=True,
    cmap='viridis',
    fmt=".2f",
    linewidths=.5,
    ax=pearson_ax,
)
pearson_ax.set_title('Pearson Correlation Coefficient Heatmap')
plt.show()

In [None]:
# Pairwise Spearman rank correlation between all numeric columns
spearman_correlation_matrix = data.loc[:, numerical_cols].corr(method='spearman')

print("Spearman Rank Correlation Coefficient for all numerical columns:")
spearman_correlation_matrix

In [None]:
# Annotated heatmap of the Spearman rank correlation matrix, drawn on an explicit axes
heatmap_options = dict(annot=True, cmap='viridis', fmt=".2f", linewidths=.5)
spearman_fig, spearman_ax = plt.subplots(figsize=(15, 8))
sns.heatmap(spearman_correlation_matrix, ax=spearman_ax, **heatmap_options)
spearman_ax.set_title('Spearman Rank Correlation Coefficient Heatmap')
plt.show()

In [None]:
# Relationships between numerical columns: scatter-plot matrix of every pair,
# with the title placed slightly above the grid (y=1.02)
pair_grid = sns.pairplot(data[numerical_cols])
pair_grid.figure.suptitle('Pairplot of Numerical Columns', y=1.02)
plt.show()

In [None]:
# One scatter plot per ordered pair of distinct numeric columns
# (so each pair appears twice, once with the axes swapped)
for col1 in numerical_cols:
    for col2 in numerical_cols:
        if col1 == col2:
            continue  # skip a column plotted against itself

        scatter_fig, scatter_ax = plt.subplots()
        scatter_ax.scatter(data[col1], data[col2], alpha=0.5)
        scatter_ax.set_xlabel(col1)
        scatter_ax.set_ylabel(col2)
        scatter_ax.set_title(f'Scatter plot between {col1} and {col2}')
        plt.show()

In [None]:
# One hexbin (2-D binned count) plot per ordered pair of distinct numeric columns
for col1 in numerical_cols:
    other_cols = [name for name in numerical_cols if name != col1]
    for col2 in other_cols:
        hex_fig, hex_ax = plt.subplots()
        hex_bins = hex_ax.hexbin(data[col1], data[col2], gridsize=20)
        hex_ax.set_xlabel(col1)
        hex_ax.set_ylabel(col2)
        hex_ax.set_title(f'Hexbin plot between {col1} and {col2}')
        # Colour scale shows how many points fall in each hexagon
        hex_fig.colorbar(hex_bins, ax=hex_ax, label='count')
        plt.show()

In [None]:
# Stacked bar plots: for each ordered pair of distinct categorical columns,
# cross-tabulate them and draw one stacked bar per value of the first column
for cat_col1 in categorical_cols:
    for cat_col2 in categorical_cols:
        if cat_col1 == cat_col2:
            continue  # a column against itself is not informative

        # Contingency table: rows are cat_col1 values, columns are cat_col2 values
        cross_tab = pd.crosstab(data[cat_col1], data[cat_col2])

        bar_ax = cross_tab.plot.bar(stacked=True, figsize=(30, 30))
        bar_ax.set_title(f'Stacked bar plot between {cat_col1} and {cat_col2}')
        bar_ax.set_xlabel(cat_col1)
        bar_ax.set_ylabel('Frequency')
        plt.xticks(rotation=90)
        # Anchor the legend outside the plot area so it does not cover the bars
        bar_ax.legend(title=cat_col2, loc='upper left', bbox_to_anchor=(1.05, 1))
        plt.show()

In [None]:
# Patterns between categorical and numerical columns:
# one swarm plot for every (categorical, numerical) column combination
for cat_col in categorical_cols:
    for num_col in numerical_cols:
        swarm_ax = sns.swarmplot(data=data, x=cat_col, y=num_col)
        swarm_ax.set_title(f'Swarm plot of {num_col} vs {cat_col}')
        swarm_ax.set_xlabel(cat_col)
        swarm_ax.set_ylabel(num_col)
        plt.show()

In [None]:
# One box plot of each numerical column grouped by each categorical column
for cat_col in categorical_cols:
    for num_col in numerical_cols:
        box_fig, box_ax = plt.subplots()
        sns.boxplot(x=cat_col, y=num_col, data=data, ax=box_ax)
        box_ax.set(
            title=f'Box plot of {num_col} vs {cat_col}',
            xlabel=cat_col,
            ylabel=num_col,
        )
        plt.show()

In [None]:
# One bar plot of each numerical column aggregated per category,
# for every (categorical, numerical) column combination
cat_num_pairs = [(c, n) for c in categorical_cols for n in numerical_cols]

for cat_col, num_col in cat_num_pairs:
    bar_axes = sns.barplot(x=cat_col, y=num_col, data=data)
    bar_axes.set_title(f'Bar plot of {num_col} vs {cat_col}')
    bar_axes.set_xlabel(cat_col)
    bar_axes.set_ylabel(num_col)
    plt.show()