# Loan Prediction: Exploratory Data Analysis & Preprocessing

## 1. Load Data

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.preprocessing import LabelEncoder

# Read the training data into a DataFrame.
df = pd.read_csv('data/loan_train.csv')

# Preview the first five rows.
df.head()

## 2. Basic Information

In [None]:
# Print a concise summary of the DataFrame: column names, non-null counts,
# dtypes and memory usage.
df.info()

## 3. Summary Statistics

In [None]:
# Descriptive statistics (count, mean, std, min, quartiles, max) for the
# numeric columns.
df.describe()

## 4. Missing Values

In [None]:
# Number of missing entries in each column.
df.isna().sum()

## 5. Handle Missing Values

In [None]:
# Impute missing values column by column. Each result is assigned back to the
# DataFrame instead of calling fillna(..., inplace=True) on df[col]: that
# chained in-place form emits a FutureWarning in recent pandas 2.x releases
# and no longer updates df once Copy-on-Write is enabled (pandas 3.0 default).

# Fill missing values for categorical features with the mode
for col in ['Gender', 'Married', 'Dependents', 'Self_Employed']:
    df[col] = df[col].fillna(df[col].mode()[0])

# Fill missing values for numerical features with the median
for col in ['LoanAmount', 'Loan_Amount_Term', 'Credit_History']:
    df[col] = df[col].fillna(df[col].median())

# Check if all missing values are handled
df.isnull().sum()

## 6. Data Visualization

### 6.1 Categorical Features

In [None]:
# Frequency bar charts, one panel per categorical feature on a 3x3 grid.
categorical_features = [
    'Gender', 'Married', 'Dependents', 'Education',
    'Self_Employed', 'Property_Area', 'Loan_Status',
]
plt.figure(figsize=(12, 8))
for panel, feature in enumerate(categorical_features, start=1):
    ax = plt.subplot(3, 3, panel)
    sns.countplot(x=feature, data=df, ax=ax)
    ax.set_title(f'Count of {feature}')
plt.tight_layout()

### 6.2 Numerical Features

In [None]:
# Histograms with a KDE overlay, one panel per numerical feature in a single row.
numerical_features = ['ApplicantIncome', 'CoapplicantIncome', 'LoanAmount']
plt.figure(figsize=(12, 5))
for panel, feature in enumerate(numerical_features, start=1):
    ax = plt.subplot(1, 3, panel)
    sns.histplot(df[feature], kde=True, ax=ax)
    ax.set_title(f'Distribution of {feature}')
plt.tight_layout()

### 6.3 Correlation Heatmap

In [None]:
# Pairwise correlations of the numeric columns, drawn as an annotated heatmap.
corr = df.corr(numeric_only=True)
plt.figure(figsize=(10, 8))
heatmap_ax = sns.heatmap(corr, annot=True, cmap='coolwarm')
heatmap_ax.set_title('Correlation Heatmap')

## 7. Categorical Feature Encoding

In [None]:
# Integer-encode the categorical columns. A separate LabelEncoder is fitted
# for each column and kept in `encoders`, so every column's label -> code
# mapping stays available afterwards (e.g. for inverse_transform, or for
# applying the same codes to another dataset). Re-fitting a single shared
# encoder would retain only the mapping of the last column processed.
encoders = {}
for col in ['Gender', 'Married', 'Education', 'Self_Employed', 'Property_Area', 'Loan_Status', 'Dependents']:
    le = LabelEncoder()  # after the loop `le` is the last column's encoder, as before
    df[col] = le.fit_transform(df[col])
    encoders[col] = le

df.head()