In [None]:
import pandas as pd

# Column labels, in file order. They are supplied through `names=`, so pandas
# treats the first line of the file as data rather than as a header.
column_names = [
    'age',
    'workclass',
    'fnlwgt',
    'education',
    'education-num',
    'marital-status',
    'occupation',
    'relationship',
    'race',
    'sex',
    'capital-gain',
    'capital-loss',
    'hours-per-week',
    'native-country',
    'income',
]

# Location of the Adult data file in the UCI Machine Learning Repository
url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data'

# Fields are split on a comma followed by a space; a multi-character separator
# requires the Python parsing engine. Entries equal to '?' are read as NaN.
df = pd.read_csv(
    url,
    sep=', ',
    engine='python',
    names=column_names,
    na_values='?',
)

# Preview the first rows of the loaded frame
df.head()

In [None]:
# Overall size of the frame as (rows, columns)
print('Shape of the dataframe:', df.shape)

# Each remaining summary is printed as a heading line followed by the
# corresponding per-column (or per-class) Series.
summaries = [
    # dtype of every column
    ('\nData types of the columns:', df.dtypes),
    # count of NaN entries per column
    ('\nNumber of missing values in each column:', df.isnull().sum()),
    # count of distinct non-null values per column
    ('\nNumber of unique values in each column:', df.nunique()),
    # how many rows fall into each value of the target column
    ('\nDistribution of the target variable (income):', df['income'].value_counts()),
]
for heading, summary in summaries:
    print(heading)
    print(summary)

In [None]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder

# Handle missing values by filling each column with its most frequent value.
# For a mixed-type frame SimpleImputer returns a single object-dtype array, so
# the original column dtypes are restored afterwards. Without this, every
# numeric column would also be seen as 'object' below and have its real values
# replaced by label codes.
# NOTE: the imputer and the encoder are fit on the full dataset, i.e. before
# the train/test split performed later in the notebook.
imputer = SimpleImputer(strategy='most_frequent')
df_imputed = pd.DataFrame(
    imputer.fit_transform(df),
    columns=df.columns,
    index=df.index,
).astype(df.dtypes.to_dict())

# Encode categorical variables as integer codes. Only columns that are still
# object-typed after the dtype restoration (the true categoricals) are touched.
# The single encoder is re-fit per column, so after the loop it holds the
# classes of the last encoded column only.
encoder = LabelEncoder()
for column in df_imputed.select_dtypes(include=['object']).columns:
    df_imputed[column] = encoder.fit_transform(df_imputed[column])

# Display the first few rows of the preprocessed dataframe
df_imputed.head()

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Set the style of the visualization
sns.set(style='whitegrid')


def plot_category_counts(data, column, title, xlabel, figsize=(14, 6), rotation=None):
    """Draw a count plot of how often each value of ``column`` occurs in ``data``.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame that contains ``column``.
    column : str
        Name of the column whose values are counted along the x-axis.
    title : str
        Title shown above the plot.
    xlabel : str
        Label for the x-axis; the y-axis is always labelled 'Count'.
    figsize : tuple, optional
        Figure size passed to ``plt.figure``.
    rotation : float or None, optional
        Rotation in degrees applied to the x tick labels; left untouched when None.
    """
    plt.figure(figsize=figsize)
    sns.countplot(x=column, data=data, palette='Set3')
    plt.title(title)
    plt.xlabel(xlabel)
    plt.ylabel('Count')
    if rotation is not None:
        plt.xticks(rotation=rotation)
    plt.show()


# Plot from the raw frame `df` rather than `df_imputed`: the categorical
# columns of `df_imputed` have been label-encoded to integers, which would put
# meaningless numeric codes on the x-axis instead of the category names.
# Missing occupations are left as-is in `df` (they are not imputed here).

# Create a count plot of the 'education' column
plot_category_counts(df, 'education', 'Distribution of Education Levels', 'Education Level', rotation=45)

# Create a count plot of the 'occupation' column
plot_category_counts(df, 'occupation', 'Distribution of Occupations', 'Occupation', rotation=45)

# Create a count plot of the 'income' column
plot_category_counts(df, 'income', 'Distribution of Income', 'Income', figsize=(6, 6))

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

# Separate the target column ('income') from the predictor columns
y = df_imputed['income']
X = df_imputed.drop(columns='income')

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Learn the scaling statistics from the training rows only, then apply the
# same transformation to both splits
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Fit a logistic regression classifier with its default settings
model = LogisticRegression().fit(X_train_scaled, y_train)

# Predict the target for the held-out rows
y_pred = model.predict(X_test_scaled)

# Print per-class precision, recall and F1 on the held-out rows
print(classification_report(y_true=y_test, y_pred=y_pred))