In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Read the workbook into a DataFrame. The file name below is a placeholder:
# point it at the actual .xlsx file before running the notebook.
df = pd.read_excel(io="your_file_name.xlsx")

In [None]:
# Preview the first five rows to check that the file loaded as expected.
df.head(n=5)

In [None]:
# Boolean frame with the same shape as df: True wherever a cell is missing.
df.isna()

In [None]:
# Count the missing cells in each column, then add those per-column counts
# together to get the total number of missing cells in the whole frame.
null_count_per_column = df.isna().sum()
x = null_count_per_column.sum()

In [None]:
# Total number of missing cells in df.
x

### Listing all the columns with missing values

In [None]:
# any(axis=0) yields True for every column that holds at least one null;
# indexing df.columns with that mask keeps only those column labels.
missing_cols = df.columns[df.isna().any(axis=0)]

In [None]:
# Turn the Index of column labels into a plain Python list.
# list() accepts both an Index and an existing list, so this cell can be
# re-run safely; calling .to_list() on the already-converted list raised
# AttributeError.
missing_cols = list(missing_cols)

In [None]:
# Names of the columns that contain at least one null value
missing_cols

In [None]:
# Number of columns that contain at least one null value
print(len(missing_cols))

### Cleaning the data (optional, since it is not mentioned in the description; done here so that duplicates can be found efficiently in later cells)

In [None]:
# Missing-value count for every column (Series indexed by column label).
number_of_null = df.isnull().sum(axis=0)

In [None]:
# Per-column count of missing values
number_of_null

Dropping the rows that contain null values. If we want to avoid losing those rows, a better approach would be to interpolate the missing values, so that a whole row isn't discarded because of a few nulls.

In [None]:
# Drop every row (axis=0) that has at least one missing value (how="any").
# NOTE: this rebinds df, so the frame as originally loaded is no longer
# available under this name after the cell runs.
df = df.dropna(axis=0, how="any")

In [None]:
# Show df after the rows containing nulls have been dropped
df

###   Categorizing the columns based upon their data types

In [None]:
# Split df by dtype: one frame holding the numeric columns and one holding
# the object-dtype columns.
numeric_df = df.select_dtypes(include=["number"])
object_df = df.select_dtypes(include=["object"])

In [None]:
# Names of the numeric columns, as a plain list.
# Derived from df rather than from numeric_df itself: this cell rebinds
# numeric_df from a DataFrame to a list, so re-running the original
# `numeric_df.columns` form raised AttributeError on the second run.
numeric_df = df.select_dtypes(include="number").columns.to_list()

In [None]:
# Names of the object-dtype columns, as a plain list.
# Derived from df rather than from object_df itself: this cell rebinds
# object_df from a DataFrame to a list, so re-running the original
# `object_df.columns` form raised AttributeError on the second run.
object_df = df.select_dtypes(include="object").columns.to_list()

In [None]:
# Names of the numeric columns (a list at this point, not a DataFrame)
numeric_df

In [None]:
# Names of the object-dtype columns (a list at this point, not a DataFrame)
object_df

In [None]:
# Print how many object columns there are, then how many numeric columns.
for column_names in (object_df, numeric_df):
    print(len(column_names))

In [None]:
# Check for dtypes other than number and object: pandas "string" columns,
# datetime columns, and anything that is neither number nor object.
# Each printed value is the number of columns found for that selection.
string_df = df.select_dtypes(include="string")
datetime_df = df.select_dtypes(include="datetime")
excluding_number_object_df = df.select_dtypes(exclude=["number", "object"])
for other_frame in (string_df, datetime_df, excluding_number_object_df):
    print(other_frame.shape[1])

### Removing Duplicate columns and printing df before and after

In [None]:
# Column-wise duplicate check.
# Transposing turns columns into rows, so duplicated() marks every column
# whose values exactly repeat an earlier column (the first occurrence is kept).
# The mask is computed once and reused: transposing and scanning the whole
# frame is the expensive step, and the original did it twice.
duplicate_mask = df.T.duplicated()

# Keep only the columns that are not flagged as duplicates.
df_cleaned = df.loc[:, ~duplicate_mask]

print("Duplicate columns before removal:", df.columns[duplicate_mask].tolist())


In [None]:
# Label the output, then display df (duplicate columns still present)
print("\nDataFrame before removing duplicate columns:")
df

In [None]:

# Label the output, then display df_cleaned (duplicate columns removed)
print("\nDataFrame after removing duplicate columns:")
df_cleaned

### Identifying constant columns and printing df before and after removing them

In [None]:
# A constant column has exactly one distinct value across all of its rows.
# nunique() gives the distinct-value count per column; keep the labels
# whose count is 1.
unique_counts = df.nunique()
constant_cols = [label for label, count in unique_counts.items() if count == 1]
print("Constant columns : ", constant_cols)

In [None]:
# New frame without the constant columns; df itself is left untouched.
cleaned_df = df.drop(labels=constant_cols, axis="columns")

In [None]:
# df before removing constant columns (drop() returned a new frame, so df is unchanged)
df

In [None]:
# Result of dropping the constant columns from df
cleaned_df


### Creating box plot to visualise the outliers of all the numeric columns

In [None]:
# Sub-frame of df holding only the numeric columns; this is the data
# the box plots below are drawn from.
plot_cols = df.select_dtypes(include=["number"])

In [None]:
# Numeric-only view of df used for the box plots
plot_cols

 using seaborn and matplotlib

In [None]:
# Reshape from wide to long: one row per (column name, value) pair, which is
# the layout seaborn needs to draw one box per original column.
melted_df = pd.melt(plot_cols, var_name="Column", value_name="Value")

# Draw all boxes on a single large figure.
plt.figure(figsize=(30, 40))
ax = sns.boxplot(data=melted_df, x="Column", y="Value")
ax.set_title("Box Plot of Numeric Columns")
# Rotate the column names so they do not overlap along the x axis.
plt.xticks(rotation=90)
plt.show()

The plot above is too dense to read properly, so to tackle this issue we can split the columns into two halves and plot each half separately.

In [None]:
# Split the numeric columns into two groups by position so that each plot
# shows half as many boxes. With an odd column count the second group gets
# the extra column.
half = plot_cols.shape[1] // 2
first_df, second_df = plot_cols.iloc[:, :half], plot_cols.iloc[:, half:]

In [None]:
# Reshape both halves from wide to long (one row per column/value pair).
# Each half is sliced from plot_cols again instead of melting first_df /
# second_df into themselves: the original form melted an already-melted
# frame when the cell was re-run, which fails because "Value" would then
# already be a column of the input.
first_df = plot_cols.iloc[:, :half].melt(var_name="Column", value_name="Value")
second_df = plot_cols.iloc[:, half:].melt(var_name="Column", value_name="Value")

In [None]:
# Draw one box-plot figure per half, first half first.
half_plots = (
    ("Box Plot of First Half of Numeric Columns", first_df),
    ("Box Plot of Second Half of Numeric Columns", second_df),
)
for plot_title, long_frame in half_plots:
    plt.figure(figsize=(14, 15))
    sns.boxplot(data=long_frame, x="Column", y="Value")
    plt.title(plot_title)
    # Rotate the column names so they stay readable along the x axis.
    plt.xticks(rotation=90)
    plt.show()


### Creating charts for any 6 columns and show their distribution

using seaborn and matplotlib

In [None]:
# Pick the numeric columns at positions 2 through 7 (six columns at most).
columns_to_plot = df.select_dtypes(include='number').columns[2:8]

# One figure with a 2-row by 3-column grid, one histogram per selected column.
fig = plt.figure(figsize=(15, 10))
for position, col in enumerate(columns_to_plot, start=1):
    ax = fig.add_subplot(2, 3, position)
    # kde=True overlays a kernel density estimate (a smooth curve over the
    # histogram); set kde=False to draw the bars only.
    sns.histplot(df[col], kde=True, color='skyblue', ax=ax)
    ax.set_title(f"Distribution of {col}")
    ax.set_xlabel(col)
    ax.set_ylabel("Frequency")

fig.tight_layout()
plt.show()
