In [1]:
# Make Google Drive visible inside the Colab VM; the dataset read in the
# next cell lives under this mount point.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive


In [2]:
import pandas as pd
# Load the assignment dataset from the mounted Drive into a DataFrame.
DATA_CSV_PATH = '/content/drive/MyDrive/MLP/GA/Week8/data_for_large_scale.csv'
df = pd.read_csv(DATA_CSV_PATH)

In [10]:
from pathlib import Path
from urllib.request import urlretrieve

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, Normalizer
from sklearn.linear_model import SGDRegressor
from sklearn.metrics import mean_squared_error, r2_score, accuracy_score, f1_score
from sklearn.datasets import load_iris
from sklearn.neighbors import KNeighborsClassifier


In [4]:
# Mini-batch training of an SGDRegressor on the tabular dataset held in `df`,
# followed by evaluation on a held-out split and a report of selected
# model parameters.
print("First five rows of the dataset:")
print(df.head())

# Step 3: Separate features (X) and target (y)
X = df.drop(columns=['Target'])
y = df['Target']

# Step 4: Convert dataframe X and series y into arrays
X_array = X.values
y_array = y.values

# Step 5: Split the dataset (test_size=0.3 and random_state=10)
X_train, X_test, y_train, y_test = train_test_split(X_array, y_array, test_size=0.3, random_state=10)

# Step 6: Reshape the dataset so each entry has 90 samples
# Only complete blocks of 90 rows are kept; any trailing rows that do not
# fill a whole block are dropped from both the training and the test split.
n_samples = 90
n_features = X_train.shape[1]
num_train_samples = (len(X_train) // n_samples) * n_samples
num_test_samples = (len(X_test) // n_samples) * n_samples

# Resulting shapes: X_* -> (n_batches, 90, n_features), y_* -> (n_batches, 90)
X_train = X_train[:num_train_samples].reshape(-1, n_samples, n_features)
y_train = y_train[:num_train_samples].reshape(-1, n_samples)
X_test = X_test[:num_test_samples].reshape(-1, n_samples, n_features)
y_test = y_test[:num_test_samples].reshape(-1, n_samples)

# Step 7: Initialize and train the SGDRegressor using partial_fit for 5 iterations
# partial_fit always performs a single pass over the batch it receives and
# continues from the current weights, so max_iter / warm_start (which only
# influence fit()) are not set here.
sgd_regressor = SGDRegressor(random_state=10)

# Train for 5 iterations over the training batches
n_iterations = 5
for iteration in range(n_iterations):
    for X_batch, y_batch in zip(X_train, y_train):
        sgd_regressor.partial_fit(X_batch, y_batch)

# Step 8: Evaluate the model using the test set
# Predict on each batch and then concatenate predictions
y_true = y_test.ravel()
y_pred = np.concatenate([sgd_regressor.predict(X_batch) for X_batch in X_test])
mse = mean_squared_error(y_true, y_pred)
r2 = r2_score(y_true, y_pred)

# Extract model parameters
intercept = sgd_regressor.intercept_[0]
coefficients = sgd_regressor.coef_
# Look the coefficients up by column name instead of assuming a column order;
# coef_ follows the column order of X, which is what the model was trained on.
feature_3_coeff = coefficients[X.columns.get_loc('Feature-3')]
feature_5_coeff = coefficients[X.columns.get_loc('Feature-5')]

# Print the answers to the questions
print("\nResults:")
print("1. Number of features in the dataset:", X.shape[1])
print("2. Intercept value (closest answer):", intercept)
print("3. Coefficient corresponding to 'Feature-3':", feature_3_coeff)
print("4. R² score for test data:", r2)
print("5. Coefficient corresponding to 'Feature-5' after 5 iterations:", feature_5_coeff)
print("Mean Squared Error on test data:", mse)

First five rows of the dataset:
   Feature-1  Feature-2  Feature-3  Feature-4  Feature-5  Feature-6  \
0     -1.580     1.0500      1.060     -0.440      0.451    -0.0348   
1     -0.832    -0.8660     -1.340      0.138      1.180     0.7330   
2     -0.237     2.0900     -3.930      0.296      0.352    -0.5010   
3     -1.170    -1.1300     -1.090      1.120      0.312     0.1830   
4      0.260    -0.0273      0.925     -1.150     -1.390     0.0251   

   Feature-7  Feature-8  Feature-9  Feature-10  Target  
0      0.643     0.2650      0.268      -0.851    84.7  
1     -1.410     0.1350     -0.088      -1.550  -211.0  
2      0.961    -0.0287      1.820       0.938   -96.9  
3      0.448    -0.8190     -1.010      -1.080  -152.0  
4      0.627     0.0950     -0.280      -0.848   -57.7  

Results:
1. Number of features in the dataset: 10
2. Intercept value (closest answer): -0.003559159504897959
3. Coefficient corresponding to 'Feature-3': 81.23931579719752
4. R² score for test data:

In [9]:
# Out-of-core regression on the 3D spatial network dataset: the file is
# streamed in chunks, first to fit a StandardScaler and count rows, then to
# train an SGDRegressor incrementally for several epochs.

# Dataset URL and column names
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/00246/3D_spatial_network.txt"
col_names = ['OSM_ID', 'LONGITUDE', 'LATITUDE', 'ALTITUDE']
feature_cols = ['OSM_ID', 'LONGITUDE', 'LATITUDE']
target_col = 'ALTITUDE'

# Parameters for reading in chunks
chunksize = 20000
n_epochs = 7

# The file is read 1 + n_epochs times. Fetch it from the network once and
# stream every pass from the local copy instead of re-downloading each time.
# The download goes to a temporary name first so that an interrupted transfer
# never leaves a truncated file behind under the final name.
local_copy = Path("3D_spatial_network.txt")
if not local_copy.exists():
    partial_copy = local_copy.with_name(local_copy.name + ".part")
    urlretrieve(url, partial_copy)
    partial_copy.replace(local_copy)


def read_chunks():
    """Return a fresh chunk iterator over the local copy (comma-separated, no header row)."""
    return pd.read_csv(local_copy, sep=",", header=None, names=col_names, chunksize=chunksize)


# ----- Step 1: First Pass - Scale the Data and Count Total Samples -----
scaler = StandardScaler()
total_samples = 0

# Process each chunk to update the scaler and count samples
for chunk in read_chunks():
    total_samples += len(chunk)
    # Extract features to be scaled: OSM_ID, LONGITUDE, LATITUDE
    X_chunk = chunk[feature_cols].values.astype(float)
    scaler.partial_fit(X_chunk)

print("Total number of samples in the dataset:", total_samples)
# (Use the printed number to answer Q6)

# ----- Step 2: Train the SGDRegressor using partial_fit in 7 iterations -----
# partial_fit continues from the model's current weights on every call, so no
# warm_start flag is needed (warm_start only affects repeated calls to fit()).
sgd_reg = SGDRegressor(random_state=10)

for epoch in range(n_epochs):
    # A new iterator is required for each epoch: the previous one is exhausted
    for chunk in read_chunks():
        # Prepare features and target
        X_chunk = chunk[feature_cols].values.astype(float)
        y_chunk = chunk[target_col].values.astype(float)

        # Scale features using the fitted scaler
        X_chunk_scaled = scaler.transform(X_chunk)

        # Incrementally train the model on this chunk
        sgd_reg.partial_fit(X_chunk_scaled, y_chunk)

# ----- Step 3: Report the Model Parameters -----
# The intercept and coefficients are available after the final epoch.
# coef_ follows the order of feature_cols, so look LONGITUDE up by name.
longitude_idx = feature_cols.index('LONGITUDE')
print(f"\nModel Parameters after {n_epochs} iterations:")
print("Intercept (closest option):", sgd_reg.intercept_[0])
print("Coefficient for LONGITUDE (closest option):", sgd_reg.coef_[longitude_idx])

Total number of samples in the dataset: 434874

Model Parameters after 7 iterations:
Intercept (closest option): 20.950528897559135
Coefficient for LONGITUDE (closest option): 2.7507401692594216


In [11]:
# K-nearest-neighbours on Iris: compare a few values of k on a held-out
# split, then report accuracy and weighted F1 for k = 3.

# Step 1: Load Iris dataset and split (test_size=0.2, random_state=10)
iris = load_iris()
X, y = iris.data, iris.target
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=10)

# Step 2: Normalize each sample with Normalizer
# Normalizer rescales every row independently (unit norm per sample); it does
# not learn per-feature statistics from the training data.
normalizer = Normalizer()
X_train_norm = normalizer.fit_transform(X_train)
X_test_norm = normalizer.transform(X_test)

# Step 3: Evaluate KNN classifier for different k values
# NOTE(review): k is compared on the test set itself, so the winning accuracy
# is an optimistic estimate; use a validation split or cross-validation if
# this selection is meant to generalize.
k_values = [2, 3, 4]
accuracies = {}
models = {}
predictions = {}

print("Test set accuracy for different k values:")
for k in k_values:
    knn = KNeighborsClassifier(n_neighbors=k)
    knn.fit(X_train_norm, y_train)
    y_pred = knn.predict(X_test_norm)
    acc = accuracy_score(y_test, y_pred)
    accuracies[k] = acc
    # Keep the fitted model and its predictions so k = 3 need not be refitted below
    models[k] = knn
    predictions[k] = y_pred
    print(f"k = {k}: Accuracy = {acc:.2f}")

# Determine which k gave the best accuracy
# Several k values can share the top accuracy; the first one tried is kept as
# best_k, and the tie is reported rather than hidden.
best_accuracy = max(accuracies.values())
tied_ks = [k_val for k_val, acc_val in accuracies.items() if acc_val == best_accuracy]
best_k = tied_ks[0]
print("\nBest k value based on test set accuracy:", best_k)
if len(tied_ks) > 1:
    print("Note: k values", tied_ks, "are tied at this accuracy; the first one tried is reported.")

# For k = 3: Compute accuracy and weighted F1 score
# Reuse the model and predictions produced in the loop above.
knn_k3 = models[3]
y_pred_k3 = predictions[3]
accuracy_k3 = accuracies[3]
f1_weighted_k3 = f1_score(y_test, y_pred_k3, average='weighted')

print("\nFor k = 3:")
print("Test Accuracy:", accuracy_k3)
print("Weighted F1 Score:", f1_weighted_k3)

Test set accuracy for different k values:
k = 2: Accuracy = 0.97
k = 3: Accuracy = 0.97
k = 4: Accuracy = 0.97

Best k value based on test set accuracy: 2

For k = 3:
Test Accuracy: 0.9666666666666667
Weighted F1 Score: 0.9671111111111111
