Packages to install (pinned versions):
torch==2.1.0
torchvision==0.16.0
tqdm==4.66.1
transformers==4.35.2
matplotlib==3.7.3
numpy==1.24.4
pandas==2.0.3
scikit-learn==1.3.1

In [None]:
import numpy as np 
import torch
import math
from torch.utils.data import Dataset
from typing import List
import pandas as pd
import torch.nn as nn
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt


# Load the augmented UTKFace metadata table from the working directory.
csvname = 'UTKFaceAugmented.csv'
dataset = csvname

df = pd.read_csv(dataset)

print(len(df))
print(df.columns)
# DataFrame.info() prints its report itself and returns None, so wrapping it
# in print() only adds a stray "None" line.
df.info()

# I dropped these columns since they have no effect on what age one looks.
# DataFrame.drop returns a new frame, so the result has to be assigned back;
# otherwise the columns silently stay in df.
df = df.drop(columns=['gender', 'race', 'uses_skincare'])

# Prepare data: show the label frequencies of every column treated as categorical.
# NOTE(review): max_annual_earnings and num_haircuts_life look numeric by name;
# if they have many distinct values this prints (and below one-hot encodes)
# one entry per value -- confirm that this is intended.
categoric_columns = ['has_tiktok', 'remembers_disco', 'max_annual_earnings', 'num_haircuts_life']
for column in categoric_columns:
    print(f"Column: {column}")
    for label, count in df[column].value_counts().items():
        print(f"Label: '{label}' | Frequency: {count}")

# One-hot encode the kept categorical columns: each distinct value becomes its
# own 0/1 column named '<column>_cat_<value>', joined back onto df by index.
keep_categoric_columns = ['has_tiktok', 'remembers_disco', 'max_annual_earnings', 'num_haircuts_life']
for col in keep_categoric_columns:
    df = df.join(pd.get_dummies(df[col], dtype='int', prefix=col + '_cat'), how='outer')






In [None]:

# Features are the inputs used to predict age; the targets are what we are
# trying to figure out (the numeric age and its age-range bucket).
# NOTE(review): the '<column>_cat_<value>' dummy columns joined onto df in the
# previous cell are not selected here -- only the raw columns are used.
features = keep_categoric_columns
target = ['age', 'age_range']

train_df = df[features + target]

# Fixed seed so the split, and every metric computed downstream, is the same
# on each Restart & Run All.
SPLIT_SEED = 42

# First split: 70% train, 30% held out.
x_train, x_val, y_train, y_val = train_test_split(
    train_df[features], train_df[target], train_size=0.7, random_state=SPLIT_SEED
)

# Second split: halve the held-out 30% into a validation set and a test set.
x_val, x_test, y_val, y_test = train_test_split(
    x_val, y_val, train_size=0.5, random_state=SPLIT_SEED
)

# Resulting proportions: 70% train / 15% validation / 15% test.

print(x_train.shape)
print(x_test.shape)


In [None]:
from dataloader import CustomDataloader

In [None]:
# NOTE: this cell rebinds x_train / x_val / y_train / y_val from DataFrames to
# tensors (the following cells read those names), so re-run the split cell
# before re-running this one.

# One-hot encode any non-numeric columns, using the training split to define
# the column layout.
x_train_encoded = pd.get_dummies(x_train)
y_train_encoded = pd.get_dummies(y_train)

# Encoding each split on its own can yield different columns when a category
# is missing from one split. Align validation to the training layout: columns
# absent from validation are filled with 0, and categories that only occur in
# validation are dropped, so both splits have the same width and order.
x_val_encoded = pd.get_dummies(x_val).reindex(columns=x_train_encoded.columns, fill_value=0)
y_val_encoded = pd.get_dummies(y_val).reindex(columns=y_train_encoded.columns, fill_value=0)

# Convert the encoded DataFrames to float32 NumPy arrays
x_train_np = x_train_encoded.to_numpy(dtype=np.float32)
x_val_np = x_val_encoded.to_numpy(dtype=np.float32)
y_train_np = y_train_encoded.to_numpy(dtype=np.float32)
y_val_np = y_val_encoded.to_numpy(dtype=np.float32)

# Format data as tensors
x_train, x_val = torch.tensor(x_train_np), torch.tensor(x_val_np)
y_train, y_val = torch.tensor(y_train_np), torch.tensor(y_val_np)

# Only the loaders that were actually in effect are built: the earlier
# batch_size=32 pair was overwritten immediately and never used.
BATCH_SIZE = 16
train_dataloader = CustomDataloader(x = x_train, y = y_train, batch_size=BATCH_SIZE, randomize=True)
val_dataloader = CustomDataloader(x = x_val, y = y_val, batch_size=BATCH_SIZE, randomize=False)


In [None]:
# Build the linear regression model and fit it on the training split
# (fit() returns the estimator itself, so both steps chain).
linear_regression = LinearRegression().fit(x_train, y_train)

# Predict on the same training data the model was fitted on
y_pred = linear_regression.predict(x_train)

# Training-set Mean Squared Error between the targets and the predictions
mse = mean_squared_error(y_true=y_train, y_pred=y_pred)
print(f"Mean Squared Error (MSE): {mse:.4f}")

In [None]:

# Scatter of actual vs. predicted training values, with the diagonal
# (prediction == actual) drawn as a dashed reference line.
fig, ax = plt.subplots(figsize=(8, 6))
ax.scatter(y_train, y_pred, color='blue', label='Actual vs Predicted')

# The reference line spans the full range of the actual values.
lo, hi = y_train.min(), y_train.max()
ax.plot([lo, hi], [lo, hi], color='red', linestyle='--', label='Perfect Prediction')

ax.set_xlabel('Actual Values')
ax.set_ylabel('Predicted Values')
ax.set_title('Actual vs Predicted Values (Linear Regression)')
ax.legend()
plt.show()

In [None]:

# Distribution of the predicted values; y_pred must already hold the model's
# predictions from an earlier cell.
fig, ax = plt.subplots(figsize=(8, 6))
ax.hist(y_pred, bins=30, edgecolor='black')
ax.set_xlabel('Predicted Values')
ax.set_ylabel('Frequency')
ax.set_title('Histogram of Predicted Values')
ax.grid(True)
plt.show()

In [None]:


# Initialize the Linear Regression model
linear_regression = LinearRegression()

# Training loss recorded once per simulated iteration
loss_per_iteration = []

# Number of iterations (simulating epochs)
iterations = 100

# LinearRegression solves ordinary least squares in a single fit() call rather
# than by iterative updates, so refitting on the same data gives the same model
# and the same MSE every time. The fit, prediction and metric are therefore
# computed once instead of once per iteration; the printed lines and the
# recorded losses are unchanged.
# NOTE(review): the resulting loss curve is flat by construction. To get a real
# per-epoch curve, use an estimator that is trained incrementally.
linear_regression.fit(x_train, y_train)
y_pred = linear_regression.predict(x_train)
mse = mean_squared_error(y_train, y_pred)

for iteration in range(iterations):
    print(f"Iteration {iteration + 1}/{iterations} - Mean Squared Error (MSE): {mse:.4f}")

    # Append the MSE to the list for plotting
    loss_per_iteration.append(mse)

