# Coding for Economists - Session 7

***

## 1. Setup Environment

In [None]:
%conda install imbalanced-learn -y

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
# Turn on copy-on-write so that objects derived from a DataFrame (slices, columns)
# behave as copies and modifying them never silently changes the parent frame.
pd.options.mode.copy_on_write = True

## 2. Read Data

In [None]:
# We use the Titanic dataset hosted on GitHub as an example.
# The CSV is read straight from the URL, so this cell needs network access.
data_url = "https://raw.githubusercontent.com/datasciencedojo/datasets/master/titanic.csv"
df = pd.read_csv(data_url)
# df.shape is (number of rows, number of columns)
print("Initial dataset shape:", df.shape)
# Preview the first rows of the table
df.head()

- __PassengerId__: A unique identifier for each passenger.
- __Survived__: Survival status of the passenger (0 = Did not survive, 1 = Survived).
- __Pclass__: Ticket class (1 = First class, 2 = Second class, 3 = Third class).
- __Name__: The full name of the passenger.
- __Sex__: The gender of the passenger.
- __Age__: The age of the passenger.
- __SibSp__: Number of siblings and spouses aboard.
- __Parch__: Number of parents and children aboard.
- __Ticket__: Ticket number of the passenger.
- __Fare__: The amount of money paid for the ticket.
- __Cabin__: Cabin number where the passenger was accommodated.
- __Embarked__: Port of embarkation (S = Southampton, C = Cherbourg, Q = Queenstown).

In [None]:
# Show the data type pandas assigned to each column
df.dtypes

## 3. Clean Data

### 3.1 Handle Missing Data

In [None]:
# Count the missing (NaN) entries in every column
print("Missing values per column:\n", df.isna().sum())

In [None]:
# For demonstration purposes, impute two of the columns in a single step:
# - 'Age': missing values are replaced with the median age.
# - 'Embarked': missing values are replaced with the most frequent port (the mode).
df = df.assign(
    Age=df['Age'].fillna(df['Age'].median()),
    Embarked=df['Embarked'].fillna(df['Embarked'].mode().iloc[0]),
)

### 3.2 Remove Features

In [None]:
# Remove columns that may not be useful for modeling:
# identifiers and columns with many missing values.
columns_to_drop = ['PassengerId', 'Name', 'Ticket', 'Cabin']
df = df.drop(columns=columns_to_drop)
# Re-check the missing-value counts on the reduced frame
print("Missing values per column:\n", df.isnull().sum())
df.head()

## 4. Preprocessing

### 4.1 Encode the Categorical Features

In [None]:
# We will encode 'Pclass', 'Sex' and 'Embarked'. Pandas' get_dummies creates binary indicator columns.
# drop_first=True omits the first level of each encoded column, leaving k-1 indicators for k levels.
df_encoded = pd.get_dummies(df, columns=['Pclass', 'Sex', 'Embarked'], drop_first=True)
df_encoded.head()

### 4.2 Scale the Numeric Features

In [None]:
# We'll scale the numeric features for better model performance.
# First, list the numeric columns to scale (the target column 'Survived' is excluded)
numeric_features = ['Age', 'Fare', 'SibSp', 'Parch']

# Import and initialize the StandardScaler
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
# NOTE(review): the scaler is fitted on the full dataset here, before the train/test
# split in section 5.1, so rows that later land in the test set contribute to the
# scaling statistics. Consider fitting on the training rows only and applying the
# fitted scaler to the test rows.
df_encoded[numeric_features] = scaler.fit_transform(df_encoded[numeric_features])
df_encoded.head()

## 5. Prepare Train/Test Sets

### 5.1 Train/Test Split

In [None]:
# Separate the target (y) from the feature columns (X)
y = df_encoded['Survived']
X = df_encoded.drop(columns=['Survived'])

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing and keep the remaining 80% for training.
# stratify=y ensures the distribution of y is preserved in both the training and
# testing sets; random_state fixes the shuffle so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.20,
    random_state=1234,
    stratify=y,
)
print("Training set shape:", X_train.shape)
print("Test set shape:", X_test.shape)

### 5.2 Re-balance the Training Set

In [None]:
# This step is useful if the classes in the target variable are imbalanced.
# Synthetic Minority Oversampling Technique (SMOTE)
# Only the training split is resampled here; X_test / y_test are not passed to SMOTE.
from imblearn.over_sampling import SMOTE
smote = SMOTE(random_state=1234)  # fixed seed so the resampling is reproducible
# NOTE(review): SMOTE builds synthetic rows by interpolating between neighbours, so the
# dummy columns created by get_dummies may not stay clean 0/1 indicators in the
# synthetic rows. imbalanced-learn's SMOTENC is presumably the better fit for mixed
# numeric/categorical features; confirm whether this matters for the lesson.
X_train_res, y_train_res = smote.fit_resample(X_train, y_train)
# Compare the class counts before and after resampling
print("Original training target distribution:\n", y_train.value_counts())
print("Resampled training target distribution:\n", pd.Series(y_train_res).value_counts())

In [None]:
# Visualize the distribution of the target variable before and after re-balancing
fig, ax = plt.subplots(1, 2, figsize=(12, 5))
# value_counts() orders the classes by frequency, so once the counts are equalized
# the order of the bars is no longer tied to the class label. sort_index() pins the
# bars to class-label order in both panels so they can be compared side by side.
y_train.value_counts().sort_index().plot(kind='bar', ax=ax[0], title='Before SMOTE')
pd.Series(y_train_res).value_counts().sort_index().plot(kind='bar', ax=ax[1], title='After SMOTE')
plt.tight_layout()
plt.show()