In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


#Exploring Numeric Data

In [None]:
import pandas as pd

# Load dataset
data = pd.read_csv("/content/drive/MyDrive/ML Datasets/student_data.csv")
#data = pd.read_csv("student_data.csv")

# Display numerical statistics
print("Statistical Summary of Numerical Data:")
print(data.describe())


Statistical Summary of Numerical Data:
         RollNo       Math    Science    English
count  50.00000  50.000000  50.000000  50.000000
mean   25.50000  65.000000  68.000000  69.400000
std    14.57738   7.142857   7.142857   7.159979
min     1.00000  55.000000  58.000000  60.000000
25%    13.25000  60.000000  63.000000  64.000000
50%    25.50000  65.000000  68.000000  69.000000
75%    37.75000  70.000000  73.000000  74.000000
max    50.00000  75.000000  78.000000  80.000000


In [None]:
print(data[['Math', 'Science', 'English']].describe())


            Math    Science    English
count  50.000000  50.000000  50.000000
mean   65.000000  68.000000  69.400000
std     7.142857   7.142857   7.159979
min    55.000000  58.000000  60.000000
25%    60.000000  63.000000  64.000000
50%    65.000000  68.000000  69.000000
75%    70.000000  73.000000  74.000000
max    75.000000  78.000000  80.000000


#Exploring Categorical Data

In [None]:
import pandas as pd

# Load dataset
data = pd.read_csv("/content/drive/MyDrive/ML Datasets/student_data.csv")
# Display categorical column values
print("Gender Distribution:")
print(data['Gender'].value_counts())

print("\nDepartment Distribution:")
print(data['Department'].value_counts())

Gender Distribution:
Gender
Male      25
Female    25
Name: count, dtype: int64

Department Distribution:
Department
CSE    20
ECE    20
EEE    10
Name: count, dtype: int64


#Exploring Relationship Between Variables

In [None]:
import pandas as pd

# Load dataset
data = pd.read_csv("/content/drive/MyDrive/ML Datasets/student_data_1.csv")

# Check missing values
print("Missing Values in Each Column:")
print(data.isnull().sum())

# Check duplicate records
print("\nNumber of Duplicate Rows:")
print(data.duplicated().sum())

Missing Values in Each Column:
RollNo        0
Math          2
Science       1
English       0
Gender        0
Department    3
dtype: int64

Number of Duplicate Rows:
0


#Data Remediation

In [None]:
# Fill missing numerical values with mean
data.fillna(data.mean(numeric_only=True), inplace=True)


# Remove duplicate rows
data.drop_duplicates(inplace=True)

print("Data after cleaning:")
print(data.isnull().sum())

Data after cleaning:
RollNo        0
Math          0
Science       0
English       0
Gender        0
Department    3
dtype: int64


In [None]:
# One-Hot Encoding
encoded_data = pd.get_dummies(data, columns=['Gender', 'Department'])

print("Encoded Dataset:")
print(encoded_data.head())


Encoded Dataset:
   RollNo  Math  Science  English  Gender_Female  Gender_Male  Department_CSE  \
0       1  55.0     58.0       60          False         True            True   
1       2  60.0     63.0       64           True        False           False   
2       3  65.0     68.0       69          False         True           False   
3       4  70.0     73.0       74           True        False            True   
4       5  75.0     78.0       80          False         True           False   

   Department_ECE  Department_EEE  
0           False           False  
1            True           False  
2           False            True  
3           False           False  
4            True           False  


In [None]:
import pandas as pd

# Load dataset
data = pd.read_csv("/content/drive/MyDrive/ML Datasets/student_data_1.csv")

print("Missing values BEFORE handling:")
print(data.isnull().sum())

# Identify numerical and categorical columns
numerical_cols = data.select_dtypes(include=['int64', 'float64']).columns
categorical_cols = data.select_dtypes(include=['object']).columns

# Fill numerical columns with mean
for col in numerical_cols:
    data[col].fillna(data[col].mean(), inplace=True)

# Fill categorical columns with mode
for col in categorical_cols:
    data[col].fillna(data[col].mode()[0], inplace=True)

print("\nMissing values AFTER handling:")
print(data.isnull().sum())


Missing values BEFORE handling:
RollNo        0
Math          2
Science       1
English       0
Gender        0
Department    3
dtype: int64

Missing values AFTER handling:
RollNo        0
Math          0
Science       0
English       0
Gender        0
Department    0
dtype: int64


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data[col].fillna(data[col].mean(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data[col].fillna(data[col].mode()[0], inplace=True)


In [None]:
import pandas as pd

# Load dataset
data = pd.read_csv("/content/drive/MyDrive/ML Datasets/student_data_1.csv")

print("Missing values BEFORE handling:")
print(data.isnull().sum())

# Identify numerical and categorical columns
numerical_cols = data.select_dtypes(include=['int64', 'float64']).columns
categorical_cols = data.select_dtypes(include=['object']).columns

# Fill numerical columns with mean
for col in numerical_cols:
    data[col] = data[col].fillna(data[col].mean())

# Fill categorical columns with mode
for col in categorical_cols:
    data[col] = data[col].fillna(data[col].mode()[0])

print("\nMissing values AFTER handling:")
print(data.isnull().sum())

# ðŸ”¹ Save the preprocessed data to a new CSV file
data.to_csv("student_data_preprocessed.csv", index=False)

print("\nPreprocessed data saved as 'student_data_preprocessed.csv'")


Missing values BEFORE handling:
RollNo        0
Math          2
Science       1
English       0
Gender        0
Department    3
dtype: int64

Missing values AFTER handling:
RollNo        0
Math          0
Science       0
English       0
Gender        0
Department    0
dtype: int64

Preprocessed data saved as 'student_data_preprocessed.csv'


In [23]:
import pandas as pd

# Load dataset
data = pd.read_csv("/content/drive/MyDrive/ML Datasets/Titanic-Dataset.csv")

print("Missing values BEFORE handling:")
print(data.isnull().sum())

# Identify numerical and categorical columns
numerical_cols = data.select_dtypes(include=['int64', 'float64']).columns
categorical_cols = data.select_dtypes(include=['object']).columns

# Fill numerical columns with mean
for col in numerical_cols:
    data[col] = data[col].fillna(data[col].mean())

# Fill categorical columns with mode
for col in categorical_cols:
    data[col] = data[col].fillna(data[col].mode()[0])

print("\nMissing values AFTER handling:")
print(data.isnull().sum())

# ðŸ”¹ Save the preprocessed data to a new CSV file
data.to_csv("Titanic_data_preprocessed.csv", index=False)

print("\nPreprocessed data saved as 'Titanic_data_preprocessed.csv'")


Missing values BEFORE handling:
PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

Missing values AFTER handling:
PassengerId    0
Survived       0
Pclass         0
Name           0
Sex            0
Age            0
SibSp          0
Parch          0
Ticket         0
Fare           0
Cabin          0
Embarked       0
dtype: int64

Preprocessed data saved as 'student_data_preprocessed.csv'
