In [None]:
import pandas as pd
import numpy as np
import random
import string

# Parameters of the synthetic dataset, named so each number's role is explicit
N_ROWS = 1000
N_MISSING = 100
N_OUTLIERS = 5
OUTLIER_STD_MULTIPLIER = 10

# Set a seed for reproducibility
np.random.seed(0)

# Generate a dataframe with N_ROWS rows and 5 columns of standard-normal draws
df = pd.DataFrame(np.random.randn(N_ROWS, 5), columns=list('ABCDE'))

# Introduce missing values in the first column.
# replace=False guarantees N_MISSING *distinct* rows: np.random.choice samples
# with replacement by default, which can pick the same row more than once and
# silently leave fewer missing values than requested.
missing_rows = np.random.choice(df.index, N_MISSING, replace=False)
df.loc[missing_rows, 'A'] = np.nan

# Introduce some outliers in the second column: N_OUTLIERS randomly sampled
# rows are set to a value OUTLIER_STD_MULTIPLIER standard deviations above the
# column mean (both statistics are taken before the outliers are written).
outlier_value = df['B'].mean() + OUTLIER_STD_MULTIPLIER * df['B'].std()
df.loc[df.sample(N_OUTLIERS).index, 'B'] = outlier_value

# Create a categorical variable in the third column; this overwrites the
# numeric draws in 'C' with one random uppercase letter per row.
df['C'] = np.random.choice(list(string.ascii_uppercase), df.shape[0])

# Display the first few rows of the dataframe
df.head()

In [None]:
# Quick structural overview of the dataframe. Each print below emits a heading
# followed by the corresponding value; sep='\n' places them on separate lines.

# Dimensions as a (rows, columns) tuple
print('Shape of the dataframe:', df.shape)

# dtype of every column
print('\nData types of the columns:', df.dtypes, sep='\n')

# Count of null entries per column
print('\nNumber of missing values in each column:', df.isna().sum(), sep='\n')

# Number of distinct values found in column 'C'
print('\nNumber of unique values in the categorical column:', df['C'].nunique(), sep='\n')

# describe() output for the dataframe
print('\nSummary statistics of the numerical columns:', df.describe(), sep='\n')

In [None]:
from sklearn.impute import SimpleImputer
from scipy import stats

# Fill the gaps in column 'A' with the column mean.
# The column is reshaped to an (n, 1) array before being handed to the imputer.
imputer = SimpleImputer(strategy='mean')
column_a_2d = df['A'].to_numpy().reshape(-1, 1)
df['A'] = imputer.fit_transform(column_a_2d)

# Replace outliers in column 'B' with the column median.
# A row counts as an outlier when the absolute z-score of its 'B' value
# exceeds 3; the median is taken before any value is replaced.
z_scores = np.abs(stats.zscore(df['B']))
is_outlier = z_scores > 3
median_b = df['B'].median()
df.loc[is_outlier, 'B'] = median_b

# Show the first rows of the cleaned dataframe
df.head()

In [None]:
from sklearn.preprocessing import LabelEncoder, StandardScaler

# Columns that receive standard scaling ('C' is handled by the label encoder)
numeric_cols = ['A', 'B', 'D', 'E']

# Label-encode column 'C': fit the encoder on the column and store the
# resulting codes back into the same column.
encoder = LabelEncoder()
encoded_c = encoder.fit_transform(df['C'])
df['C'] = encoded_c

# Standard-scale the numeric columns: fit the scaler on them and write the
# transformed values back in place of the originals.
scaler = StandardScaler()
scaled_values = scaler.fit_transform(df[numeric_cols])
df[numeric_cols] = scaled_values

# Show the first rows of the preprocessed dataframe
df.head()

In [None]:
from sklearn.preprocessing import LabelEncoder, StandardScaler

# NOTE: this cell repeats the preprocessing performed by the previous cell.
# Re-fitting unconditionally on data that is already encoded/scaled would make
# the new LabelEncoder learn the integer codes instead of the original labels,
# and make the new StandardScaler learn mean ~0 / std ~1, so neither object
# could map the data back to its original values. The guards below make the
# cell idempotent: raw data is transformed, already-preprocessed data is left
# untouched. (The "already preprocessed" branches assume the previous cell has
# run, i.e. that `encoder` and `scaler` exist.)
numeric_columns = ['A', 'B', 'D', 'E']

# Encode the categorical variable 'C' using label encoding
if pd.api.types.is_numeric_dtype(df['C']):
    # 'C' already holds integer codes: keep the label mapping learned by the
    # encoder fitted in the previous cell rather than re-fitting on the codes.
    label_encoder = encoder
else:
    label_encoder = LabelEncoder()
    df['C'] = label_encoder.fit_transform(df['C'])

# Normalize the numerical variables using standard scaling
numeric_values = df[numeric_columns]
# ddof=0 is the population standard deviation; the check below treats the
# columns as already standardized when every mean is ~0 and every std is ~1.
already_standardized = (
    np.allclose(numeric_values.mean(), 0.0, atol=1e-8)
    and np.allclose(numeric_values.std(ddof=0), 1.0, atol=1e-8)
)
if not already_standardized:
    scaler = StandardScaler()
    df[numeric_columns] = scaler.fit_transform(numeric_values)
# Otherwise the existing `scaler` is kept, so the statistics it learned from
# the unscaled data are not overwritten.

# Display the first few rows of the preprocessed dataframe
df.head()