# Kriss Sitapara Assignment 1B


In [48]:
#libraries
import pandas as pd
import torch
import matplotlib.pyplot as plt
import torch.nn as nn

To load the dataset, I copied the `kaggle.json` API token file into this notebook's working directory and downloaded the dataset through the Kaggle command-line tool.

# Optimization algorithms for linear regression

# Logistic Regression


**______________________________________________________________**

**Task 1: Data Preprocessing**

**______________________________________________________________**

In [None]:

# Install the Kaggle command-line client into the runtime.
!pip install kaggle
# Create the per-user Kaggle config directory (no error if it already exists).
!mkdir -p ~/.kaggle
# Copy the API token from the working directory into that config directory.
# NOTE(review): kaggle.json holds credentials - keep it out of version control.
!cp kaggle.json ~/.kaggle/
# Restrict the token file to owner read/write only.
!chmod 600 ~/.kaggle/kaggle.json
# Download the Avazu CTR competition archive into the current directory.
!kaggle competitions download -c avazu-ctr-prediction
# Extract the archive into /content/ (train.gz, test.gz, sampleSubmission.gz per the output below).
!unzip avazu-ctr-prediction.zip -d /content/


Downloading avazu-ctr-prediction.zip to /content
 99% 1.18G/1.19G [00:09<00:00, 238MB/s]
100% 1.19G/1.19G [00:09<00:00, 140MB/s]
Archive:  avazu-ctr-prediction.zip
  inflating: /content/sampleSubmission.gz  
  inflating: /content/test.gz        
  inflating: /content/train.gz       


In [51]:
#loading data
def load_data(n_rows=100_000, filename='/content/train.gz', chunksize=10 ** 6):
    """Read the first ``n_rows`` rows of a gzip-compressed CSV into a DataFrame.

    The file is streamed in chunks so that only ``chunksize`` rows are parsed
    at a time, and the chunks are concatenated into a single frame.

    Args:
        n_rows: Maximum number of data rows to read from the top of the file.
        filename: Path to the gzip-compressed CSV file. Defaults to the
            location the training archive is extracted to in this notebook.
        chunksize: Number of rows parsed per chunk.

    Returns:
        A ``pandas.DataFrame`` holding at most ``n_rows`` rows. If no rows are
        read, an empty frame carrying the file's header columns is returned.
    """
    with pd.read_csv(filename, chunksize=chunksize, compression='gzip', nrows=n_rows) as reader:
        chunks = list(reader)

    if not chunks:
        # pd.concat([]) raises ValueError; fall back to a header-only frame.
        return pd.read_csv(filename, compression='gzip', nrows=0)

    return pd.concat(chunks)

# Read the first 300,000 rows of the training file.
data = load_data(300_000)
# Draw a 10,000-row random subset to train on. The fixed random_state makes
# the subset (and everything derived from it) identical on every run.
data_train = data.sample(10000, random_state=42)


In [52]:
#preprocess
def preprocess_data(data):
    """Turn raw click-log rows into a numeric feature table.

    Steps, in order:
      1. Drop the identifier-like columns (``id``, device/site/app ids and
         domains).
      2. Drop every row that still contains a missing value.
      3. Keep only the target ``click`` and the selected feature columns.
      4. One-hot encode the non-numeric columns with ``pd.get_dummies``,
         dropping the first level of each.
      5. Standardise ``hour`` to zero mean and unit (sample) standard
         deviation.

    Args:
        data: DataFrame containing at least the dropped and the selected
            columns named below.

    Returns:
        A new DataFrame with ``click``, the standardised ``hour``, the
        remaining numeric features and the dummy columns. The input frame is
        not modified.
    """
    data = data.drop(columns=['id', 'device_id', 'device_ip', 'site_id', 'site_domain', 'app_id', 'app_domain'])
    data = data.dropna()

    selected_features = ['click', 'hour', 'banner_pos', 'site_category', 'app_category',
                         'device_type', 'device_conn_type']
    data = data[selected_features]
    data_encoded = pd.get_dummies(data, drop_first=True)

    if 'hour' in data_encoded.columns:
        hour_mean = data_encoded['hour'].mean()
        hour_std = data_encoded['hour'].std()
        if pd.isna(hour_std) or hour_std == 0:
            # A constant column (std == 0) or a single row (std is NaN) would
            # turn the feature into NaN/inf; it carries no information, so
            # centre it at 0 instead.
            data_encoded['hour'] = 0.0
        else:
            data_encoded['hour'] = (data_encoded['hour'] - hour_mean) / hour_std

    return data_encoded

# Build the encoded, standardised feature table from the sampled training rows.
data_preprocessed = preprocess_data(data_train)

In [53]:

# Split the table into predictors (everything except the target) and the
# 'click' target, both as float32 NumPy arrays.
X = data_preprocessed.drop(columns=['click']).to_numpy().astype('float32')
y = data_preprocessed['click'].to_numpy().astype('float32')

# Copy the arrays into float32 torch tensors for training.
X_tensor = torch.tensor(X, dtype=torch.float32)
y_tensor = torch.tensor(y, dtype=torch.float32)

# Report the resulting shapes as a quick sanity check.
print("X Tensor shape:", X_tensor.shape)
print("y Tensor shape:", y_tensor.shape)


X Tensor shape: torch.Size([10000, 31])
y Tensor shape: torch.Size([10000])


**______________________________________________________________**

**Task 2: Logistic Regression**

**______________________________________________________________**

In [None]:

class LogisticRegressionModel(nn.Module):
    """Logistic regression: one linear layer followed by a sigmoid.

    Args:
        input_dim: Number of input features per sample.
    """

    def __init__(self, input_dim):
        super().__init__()
        # Single affine map from input_dim features to one output value.
        self.linear = nn.Linear(input_dim, 1)

    def forward(self, x):
        """Return sigmoid(linear(x)), one value in (0, 1) per input row."""
        logits = self.linear(x)
        return torch.sigmoid(logits)

# Seed torch so the random weight initialisation (and therefore the loss
# curve) is the same on every run.
torch.manual_seed(0)

input_dim = X_tensor.shape[1]
model = LogisticRegressionModel(input_dim)

# Binary cross-entropy on the sigmoid outputs, optimised with plain SGD.
criterion = nn.BCELoss()
learning_rate = 0.15
optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)


num_epochs = 100
loss_values = []

# Full-batch gradient descent: every epoch uses the whole training tensor.
for epoch in range(num_epochs):
    optimizer.zero_grad()

    y_pred = model(X_tensor)
    # Drop only the trailing size-1 dimension so the output matches y_tensor;
    # a bare squeeze() would also collapse a batch of one sample.
    loss = criterion(y_pred.squeeze(-1), y_tensor)
    loss.backward()
    optimizer.step()

    loss_values.append(loss.item())
    if (epoch+1) % 10 == 0:
        print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}')

plt.plot(range(num_epochs), loss_values, color='purple')
plt.xlabel('Epoch')
plt.ylabel('Loss')
# Take the learning rate from the variable so the title cannot drift from the
# value actually used (it previously read 0.01 while training used 0.15).
plt.title(f'Loss Over Time (learning rate {learning_rate})')
plt.show()


In [None]:
# Recompute the model's predicted probabilities from its learned parameters.
# No gradients are needed for evaluation.
weights = model.linear.weight
bias = model.linear.bias
with torch.no_grad():
    alpha = torch.matmul(X_tensor, weights.T) + bias
    y_scores = torch.sigmoid(alpha)

# alpha and y_scores have shape (N, 1) while y_tensor has shape (N,).
# Comparing them directly broadcasts to an (N, N) matrix and yields wrong
# TP/FP counts, so use a flat 1-D view for every element-wise comparison.
y_scores_flat = y_scores.reshape(-1)

# Each distinct score is used as a decision threshold, highest first.
y_scores_unique = torch.unique(y_scores)
sorted_scores_unique, _ = torch.sort(y_scores_unique, descending=True)

precisions = []
recalls = []
thresholds = []
tp, fp = 0, 0
total_positives = torch.sum(y_tensor).item()
if total_positives == 0:
    raise ValueError("y_tensor contains no positive labels; recall is undefined")

# Label masks do not depend on the threshold, so build them once.
is_positive = y_tensor == 1
is_negative = y_tensor == 0

for threshold in sorted_scores_unique:
    y_pred = (y_scores_flat >= threshold).float()
    tp = ((y_pred == 1) & is_positive).sum().item()
    fp = ((y_pred == 1) & is_negative).sum().item()
    # The small epsilon guards against 0/0 if nothing is predicted positive.
    precision = tp / (tp + fp + 1e-10)
    recall = tp / total_positives

    precisions.append(precision)
    recalls.append(recall)
    thresholds.append(threshold.item())


precisions = torch.tensor(precisions)
recalls = torch.tensor(recalls)
plt.plot(recalls.numpy(), precisions.numpy(), marker='.', color='purple')
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.title('Precision vs Recall Curve (Logistic Regression)')
plt.show()
