# Part 1: Machine Learning Workflow and Ethics
*   Step 1: Data Preparation
*   Step 2: Data Visualization
*   Step 3: Model Evaluation








# 1.0 Data Preparation

# 1.1 Data Import

In [None]:
# 1.1 DATA IMPORT
!pip install datasets
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datasets import load_dataset

print("Loading California Housing dataset...")
california_housing = load_dataset("leostelon/california-housing")
housing_data = pd.DataFrame(california_housing['train'])  # Convert to Pandas DataFrame
print("California Housing dataset loaded successfully.")

# Display dataset
print(housing_data.head())

: 

# 1.2 Data Exploration

In [None]:
# 1.2 DATA EXPLORATION (not essential to the pipeline; shows the data from several angles)
# Structure of data
print("****** Structure of Data ******")
# DataFrame.info() prints its report itself and returns None, so wrapping it in
# print() would only append a stray "None" line.
housing_data.info()

# Summary statistics
print("\n\n****** Summary Statistics ******")
print(housing_data.describe())

# Dimensions of the dataset
print("\n\n****** Dataset Dimensions ******")
print(f'Dataset dimensions: {housing_data.shape}')

# Column names
print("\n\n****** Column Names ******")
print(f'Column names: {housing_data.columns.tolist()}')

# First few and last few rows
print("\n\n****** First Few Rows ******")
print(housing_data.head())
print("\n\n****** Last Few Rows ******")
print(housing_data.tail())

# Check unique values in categorical column
print("\n\n****** Unique Values in Categorical Column (ocean_proximity) ******")
print(housing_data['ocean_proximity'].value_counts())

# Display missing values count (only columns that actually contain missing values)
print("\n\n****** Missing Values in Dataset ******")
missing_values = housing_data.isnull().sum()
missing_values = missing_values[missing_values > 0]
print(missing_values)

# 1.3 Data Cleaning

In [None]:
# 1.3 DATA CLEANING
# Report how many values are missing in each column.
print("****** Checking for Missing Values ******")
missing_values = housing_data.isna().sum()
print(f'Missing Values per Column:\n{missing_values}')

# Rows containing any missing value are discarded rather than imputed with the median/mean.
print("\n****** Removing Rows with Missing Values ******")
housing_data = housing_data.dropna(axis=0, how='any')
print(f'Dataset dimensions after removing missing values: {housing_data.shape}')

# Count fully duplicated rows (reported only; nothing is removed here).
print("\n****** Checking for Duplicate Rows ******")
duplicates = housing_data.duplicated(keep='first').sum()
print(f'Number of duplicate rows: {duplicates}')

# 1.4 Data Preprocessing

In [None]:
# 1.4 DATA PRE-PROCESSING
# Keep a copy of the raw category labels (used by the count plot in section 2.4),
# then one-hot encode the categorical column.
housing_data.loc[:, 'ocean_proximity_original'] = housing_data['ocean_proximity']
housing_data = pd.get_dummies(housing_data, columns=['ocean_proximity'], drop_first=True)

# Correlation Heatmap (Check for irrelevant data)
plt.figure(figsize=(10, 8))
numeric_housing_data = housing_data.select_dtypes(include=['number'])
sns.heatmap(numeric_housing_data.corr(), annot=True, cmap='coolwarm', fmt=".2f", linewidths=0.5)
plt.title("Correlation Heatmap of Housing Features")
plt.show()

# Feature Engineering (New features generated)
# Average household size is people per household, i.e. population / households
# (households / population would be the inverse: households per person).
housing_data['avg_household_size'] = housing_data['population'] / housing_data['households']
# NOTE: price_per_capita is computed from the target (median_house_value). It is fine
# for exploration, but using it as a model input leaks the target into the features.
housing_data['price_per_capita'] = housing_data['median_house_value'] / housing_data['population']

# Feature Selection [ Drop columns with high correlation (>0.9) ]
print("\n****** Removing Columns with High Correlation ******")
housing_data = housing_data.drop(columns=["total_rooms", "total_bedrooms"])

# Display cleaned and preprocessed data
print(housing_data.head())

# 2.0 Data Visualization


# 2.1 Distribution of House Prices

In [None]:
# Histogram (with KDE overlay) of the target variable.
fig, ax = plt.subplots(figsize=(8, 5))
sns.histplot(data=housing_data, x='median_house_value', bins=50, kde=True, color='blue', ax=ax)
ax.set_xlabel('Median House Value')
ax.set_ylabel('Frequency')
ax.set_title('Distribution of House Prices')
plt.show()

# 2.2 House Value vs. Median Income

In [None]:
# Scatter of house value against median income; alpha reveals point density.
fig, ax = plt.subplots(figsize=(8, 5))
sns.scatterplot(data=housing_data, x='median_income', y='median_house_value', alpha=0.5, ax=ax)
ax.set_xlabel("Median Income")
ax.set_ylabel("Median House Value")
ax.set_title("House Value vs. Median Income")
plt.show()

# 2.3 Population vs. Median House Value

In [None]:
# Scatter of house value against district population.
fig, ax = plt.subplots(figsize=(8, 5))
sns.scatterplot(data=housing_data, x='population', y='median_house_value', alpha=0.3, ax=ax)
ax.set_xlabel("Population")
ax.set_ylabel("Median House Value")
ax.set_title("House Value vs. Population")
plt.show()

# 2.4 Number of Houses by Ocean Proximity

In [None]:
# Bar chart of row counts per raw ocean-proximity category.
fig, ax = plt.subplots(figsize=(8, 5))
sns.countplot(data=housing_data, x='ocean_proximity_original', palette='Set2', ax=ax)
ax.set_xlabel("Ocean Proximity")
ax.set_ylabel("Count")
ax.set_title("Number of Houses by Ocean Proximity")
# Tilt the category labels so they do not overlap.
ax.tick_params(axis='x', rotation=45)
plt.show()

# 2.5 Housing Density (Households vs. Population)

In [None]:
# Scatter of population against number of households.
fig, ax = plt.subplots(figsize=(8, 5))
sns.scatterplot(data=housing_data, x='households', y='population', alpha=0.5, ax=ax)
ax.set_xlabel("Number of Households")
ax.set_ylabel("Population")
ax.set_title("Households vs. Population")
plt.show()

# 2.6 Distribution of Housing Median Age

In [None]:
# Histogram (with KDE overlay) of the housing median age.
fig, ax = plt.subplots(figsize=(8, 5))
sns.histplot(data=housing_data, x='housing_median_age', bins=30, kde=True, color='purple', ax=ax)
ax.set_xlabel("Housing Median Age")
ax.set_ylabel("Frequency")
ax.set_title("Distribution of Housing Age")
plt.show()

# 3.0 Model Development

# 3.1 Preprocess before Model Development

In [None]:
# Standardise the numeric input features. Scale-sensitive models benefit from this,
# for example: Linear Regression, Logistic Regression, Support Vector Machines,
# K-Nearest Neighbors, K-Means Clustering.
from sklearn.preprocessing import StandardScaler

# The target (median_house_value) is deliberately left unscaled so that MAE/MSE and
# the actual-vs-predicted plot are expressed in house-value units, not standard deviations.
numerical_features = (
    housing_data.select_dtypes(include=['float64', 'int64'])
    .columns
    .drop('median_house_value', errors='ignore')
)
scaler = StandardScaler()
# NOTE(review): the scaler is fitted on all rows, i.e. before the train/test split in
# section 3.2, so test-set statistics influence the scaling. Fit on the training rows
# only if a strict hold-out evaluation is required.
housing_data[numerical_features] = scaler.fit_transform(housing_data[numerical_features])
print(housing_data.head())

# 3.2 Supervised Learning (Linear Regression)

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error

# Columns that must not be fed to the models:
#   median_house_value       - the target itself
#   price_per_capita         - computed from the target, so it would leak the answer
#   ocean_proximity_original - raw string copy of a column that section 1.4 already
#                              one-hot encoded; encoding it again would only add
#                              duplicate, perfectly collinear dummy columns
#   cluster                  - K-Means labels added later in the notebook; excluded so
#                              re-running this cell afterwards gives the same features
# errors='ignore' keeps the cell re-runnable whichever of these columns exist.
non_feature_columns = ['median_house_value', 'price_per_capita', 'ocean_proximity_original', 'cluster']

# Define features and target
X = housing_data.drop(columns=non_feature_columns, errors='ignore')  # Features
y = housing_data['median_house_value']  # Target variable

# Split data into training and test sets (80-20 split)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train the model
lr_model = LinearRegression()
lr_model.fit(X_train, y_train)

# Predictions
y_pred = lr_model.predict(X_test)

# Evaluate model performance on the held-out test set
r2 = r2_score(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)

print(f"Linear Regression Performance:\nR-squared: {r2:.2f}\nMAE: {mae:.2f}\nMSE: {mse:.2f}")


# 3.3 Supervised Learning (Decision Tree)

In [None]:
from sklearn.tree import DecisionTreeRegressor

# Fit a depth-limited regression tree on the training split (fit() returns the estimator).
dt_model = DecisionTreeRegressor(random_state=42, max_depth=5).fit(X_train, y_train)

# Predict house values for the test split.
y_pred_dt = dt_model.predict(X_test)

# Score the predictions against the true test targets.
r2_dt, mae_dt, mse_dt = (
    r2_score(y_test, y_pred_dt),
    mean_absolute_error(y_test, y_pred_dt),
    mean_squared_error(y_test, y_pred_dt),
)

print(f"Decision Tree Performance:\nR-squared: {r2_dt:.2f}\nMAE: {mae_dt:.2f}\nMSE: {mse_dt:.2f}")

# 3.4 Unsupervised Learning (K-Means Clustering)

# 4.0 Model Evaluation

In [None]:
from sklearn.cluster import KMeans

# Choose number of clusters (elbow method can be used to find optimal k).
# n_init is set explicitly: its default differs between scikit-learn releases, so
# pinning it keeps the clustering reproducible and avoids the FutureWarning.
kmeans = KMeans(n_clusters=3, n_init=10, random_state=42)
housing_data['cluster'] = kmeans.fit_predict(X)
print("K-Means clustering applied. Cluster labels added to dataset.")

# Visualise the clusters on the first two feature columns, labelling the axes with
# the real column names rather than generic placeholders.
x_feature, y_feature = X.columns[0], X.columns[1]
plt.figure(figsize=(8, 6))
sns.scatterplot(x=X.iloc[:, 0], y=X.iloc[:, 1], hue=housing_data['cluster'], palette='viridis')
plt.title("K-Means Clustering of Housing Data")
plt.xlabel(x_feature)
plt.ylabel(y_feature)
plt.legend(title="Cluster")
plt.show()

# 5.0 Model Interpretation


# 5.1 Regression Model Performance Comparison

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Actual vs. predicted targets for the linear regression model on the test split.
fig, ax = plt.subplots(figsize=(8, 5))
sns.scatterplot(x=y_test, y=y_pred, alpha=0.5, ax=ax)
ax.set_xlabel("Actual Median House Value")
ax.set_ylabel("Predicted Median House Value")
ax.set_title("Linear Regression: Actual vs Predicted")
plt.show()

# 5.2 Decision Tree Feature Importance

In [None]:
import pandas as pd

# Importance scores reported by the fitted decision tree, largest first.
feature_importance = pd.Series(dt_model.feature_importances_, index=X.columns)
feature_importance = feature_importance.sort_values(ascending=False)

# Horizontal bar chart: one bar per feature.
fig, ax = plt.subplots(figsize=(8, 5))
sns.barplot(x=feature_importance, y=feature_importance.index, ax=ax)
ax.set_xlabel("Feature Importance Score")
ax.set_ylabel("Features")
ax.set_title("Feature Importance from Decision Tree")
plt.show()

# Part 2: Natural Language Processing and Deep Learning

# 6.0 Data Preparation

# 6.1 Data Import

In [None]:
!pip install datasets
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datasets import load_dataset

print("Loading Amazon Polarity dataset...")
# Load the Amazon Polarity dataset
amazon_polarity = load_dataset("fancyzhx/amazon_polarity")

# Convert to Pandas DataFrame
amazon_data = pd.DataFrame(amazon_polarity['train'])  # Training data
print("Amazon Polarity dataset loaded successfully.")

# Extract a sample of 100000 rows from the dataset
amazon_data = amazon_data.head(100000)

# Display first 5 rows of the dataset
print(amazon_data.head())

# 6.2 Text Preprocessing

In [None]:
# # # Uninstall conflicting packages
!pip uninstall -y numpy scipy gensim tensorflow keras

# Install compatible versions
!pip install numpy==1.26.4
!pip install scipy==1.13.1
!pip install gensim==4.3.3
!pip install --upgrade tensorflow
!pip install --upgrade keras
import re
import nltk
from nltk.corpus import stopwords
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Download NLTK stopwords
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

# Function to clean and preprocess text
def clean_text(text, stopword_set=None):
    """Normalise a piece of text for tokenization.

    The text is lowercased, every character that is not an ASCII letter or
    whitespace is removed, and the remaining whitespace-separated words are
    filtered against a stopword collection.

    Args:
        text: The string to clean.
        stopword_set: Optional collection of words to drop. When omitted, the
            notebook-level ``stop_words`` set is used, so existing calls such
            as ``Series.apply(clean_text)`` behave exactly as before.

    Returns:
        The kept words joined by single spaces (an empty string if none remain).
    """
    if stopword_set is None:
        stopword_set = stop_words  # notebook-level default built from NLTK
    text = text.lower()  # Convert to lowercase
    # Remove special characters and numbers (A-Z is redundant after lower() but harmless)
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    # split() with no argument also collapses runs of whitespace
    tokens = [word for word in text.split() if word not in stopword_set]
    return ' '.join(tokens)

# Clean every review body and store the result in a new 'cleaned_content' column
amazon_data['cleaned_content'] = amazon_data['content'].map(clean_text)

# Show the raw and cleaned text side by side for the first 5 rows
preview_columns = ['content', 'cleaned_content']
print(amazon_data[preview_columns].head())

# 6.3 Tokenization

In [None]:
from sklearn.model_selection import train_test_split

cleaned_texts = amazon_data['cleaned_content']

# Get the labels
labels = amazon_data['label'].values

# Decide the train/test rows first (80-20 split), so that the vocabulary can be
# learned from the training rows only and the test rows stay unseen.
row_ids = np.arange(len(cleaned_texts))
train_ids, test_ids = train_test_split(row_ids, test_size=0.2, random_state=42)

# Tokenize the cleaned content; the vocabulary is fitted on the training texts only
tokenizer = Tokenizer(num_words=10000, oov_token='<OOV>')  # Limit vocab size to 10,000
tokenizer.fit_on_texts(cleaned_texts.iloc[train_ids])

# Convert text to sequences (words outside the vocabulary map to the OOV token)
sequences = tokenizer.texts_to_sequences(cleaned_texts)

# Pad sequences to ensure equal length
max_length = 100
padded_sequences = pad_sequences(sequences, maxlen=max_length, padding='post', truncating='post')

print(f"Tokenized and padded sequences shape: {padded_sequences.shape}")

# Split the data into training and test sets using the row ids chosen above
X_train, X_test = padded_sequences[train_ids], padded_sequences[test_ids]
y_train, y_test = labels[train_ids], labels[test_ids]


# 6.4 Pre-trained Embeddings

In [None]:
# Used to divide the words into different vector dimensions
# Download GloVe embeddings
!wget http://nlp.stanford.edu/data/glove.6B.zip

# Unzip the file
!unzip -q glove.6B.zip

print("GloVe embeddings downloaded and extracted successfully!")

# Load GloVe embeddings
embedding_dim = 100
glove_path = "glove.6B.100d.txt"  # Make sure to download GloVe embeddings before running
embedding_index = {}

with open(glove_path, 'r', encoding='utf-8') as f:
    for line in f:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embedding_index[word] = coefs

print(f"Loaded {len(embedding_index)} word vectors.")

# Create embedding matrix
word_index = tokenizer.word_index
num_words = min(10000, len(word_index) + 1)
embedding_matrix = np.zeros((num_words, embedding_dim))

for word, i in word_index.items():
    if i < num_words:
        embedding_vector = embedding_index.get(word)
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector

print("Embedding matrix created successfully.")

# 7.0 Deep Learning Model Implementation

# 7.1 LSTM Model (Long Short-Term Memory)

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
from sklearn.model_selection import train_test_split
from sklearn.utils import class_weight

# Define the LSTM model
# NOTE(review): the sequences are post-padded with zeros (section 6.3) and no masking
# is configured, so the LSTM also steps over the padding; consider mask_zero=True on
# the Embedding layer - confirm the impact before changing.
model = Sequential()
model.add(Embedding(num_words, embedding_dim, weights=[embedding_matrix], input_length=max_length, trainable=False))
model.add(LSTM(64, return_sequences=False))  # 64 LSTM units, return_sequences=False for final output
model.add(Dropout(0.5))  # Dropout to prevent overfitting
model.add(Dense(1, activation='sigmoid'))  # Binary classification (sigmoid for probability)

# Compute class weights from training data, keyed by the actual class label
# (not by position) so the mapping stays correct for any label values.
classes = np.unique(y_train)
class_weights = class_weight.compute_class_weight(
    class_weight='balanced',
    classes=classes,
    y=y_train
)
class_weight_dict = {int(label): float(weight) for label, weight in zip(classes, class_weights)}

# Compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
model.build(input_shape=(None, max_length))

# Print model summary (summary() prints itself and returns None)
model.summary()

# Train the model.
# Validation uses a slice of the training data only. Passing
# validation_data=(X_test, y_test) as well would take precedence over validation_split
# and expose the test set during training, making the final evaluation optimistic.
history = model.fit(
    X_train,              # Training inputs
    y_train,              # Training labels
    validation_split=0.2, # Fraction of the training data held out for validation
    epochs=10,            # Number of training epochs
    batch_size=32,        # Batch size for training
    class_weight=class_weight_dict
)


# 8.0 Model Evaluation

In [None]:
from sklearn.metrics import classification_report, confusion_matrix
import seaborn as sns

# Predict on the held-out test data: threshold the sigmoid output at 0.5 and flatten
# the (n, 1) column into a 1-D label vector, the shape the metric functions expect.
predictions = (model.predict(X_test) > 0.5).astype(int).ravel()

# Classification report
print(classification_report(y_test, predictions))

# Confusion matrix (rows: actual class, columns: predicted class)
conf_matrix = confusion_matrix(y_test, predictions, labels=[0, 1])
plt.figure(figsize=(6, 4))
sns.heatmap(conf_matrix, annot=True, fmt="d", cmap="Blues", xticklabels=["Negative", "Positive"], yticklabels=["Negative", "Positive"])
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.title('Confusion Matrix')
plt.show()

# Training history: accuracy on the left panel, loss on the right panel
fig, (acc_ax, loss_ax) = plt.subplots(1, 2, figsize=(12, 6))
metrics_log = history.history

acc_ax.plot(metrics_log['accuracy'], label='Training Accuracy')
acc_ax.plot(metrics_log['val_accuracy'], label='Validation Accuracy')
acc_ax.set_title('Model Accuracy')
acc_ax.set_xlabel('Epochs')
acc_ax.set_ylabel('Accuracy')
acc_ax.legend()

loss_ax.plot(metrics_log['loss'], label='Training Loss')
loss_ax.plot(metrics_log['val_loss'], label='Validation Loss')
loss_ax.set_title('Model Loss')
loss_ax.set_xlabel('Epochs')
loss_ax.set_ylabel('Loss')
loss_ax.legend()
plt.show()