In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

import torch
from ignite.engine import create_supervised_trainer, create_supervised_evaluator, Events
from ignite.metrics import Accuracy, Loss
from torch.utils.data.dataloader import DataLoader
from torch import tensor
from sklearn.model_selection import train_test_split
import numpy as np

ModuleNotFoundError: No module named 'torch'

In [2]:
# Load the train and test like relations; the unnamed CSV index column is
# dropped right after reading.
relation_df_train = pd.read_csv('../../new_data/Train/Relation/Relation.csv')
relation_df_train = relation_df_train.drop(columns=['Unnamed: 0'])
relation_df_test = pd.read_csv('../../new_data/Public_Test/Relation/Relation.csv')
relation_df_test = relation_df_test.drop(columns=['Unnamed: 0'])

# Distinct page ids per split, and the page ids that occur in both splits.
train_set_like_ids = set(relation_df_train['like_id'])
test_set_like_ids = set(relation_df_test['like_id'])
common_like_ids = set(sorted(train_set_like_ids & test_set_like_ids))

In [19]:
# Report how many distinct page ids each collection holds.
for template, page_ids in (
    ("There are {} training page ids", train_set_like_ids),
    ("There are {} test page ids", test_set_like_ids),
    ("We have {} page ids that exist both in the train set and the validation set", common_like_ids),
):
    print(template.format(len(page_ids)))

There are 536204 training page ids
There are 37073 test page ids
We have 23581 page ids that exist both in the train set and the validation set


In [20]:
def get_num_constructable_vectors(df, like_ids=None):
    """Count the users that liked at least one of the given page ids.

    Args:
        df: DataFrame with a ``userid`` and a ``like_id`` column.
        like_ids: Iterable of page ids to look for. When omitted, the
            notebook-level ``common_like_ids`` is used.

    Returns:
        Number of distinct ``userid`` values having at least one row whose
        ``like_id`` is in ``like_ids``.
    """
    if like_ids is None:
        like_ids = common_like_ids
    # One vectorised membership test over all rows instead of re-filtering the
    # whole frame once per user.
    has_known_like = df['like_id'].isin(list(like_ids))
    return int(df.loc[has_known_like, 'userid'].nunique())

In [22]:
# Users for which at least one common page id is available, per split.
train_constructable_vectors = get_num_constructable_vectors(relation_df_train)
test_constructable_vectors = get_num_constructable_vectors(relation_df_test)

# Express the counts as a percentage of all distinct users in each split.
num_train_users = len(relation_df_train['userid'].unique())
num_test_users = len(relation_df_test['userid'].unique())
train_percentage = 100*train_constructable_vectors/num_train_users
test_percentage = 100*test_constructable_vectors/num_test_users

print("Using only the common ids:")
print("We are able to construct a train vector for {}% of users".format(train_percentage))
print("We are able to construct a test vector for {}% of users".format(test_percentage))

We are able to construct a train vector for 90.46315789473684% of users
We are able to construct a test vector for 95.20958083832335% of users


We will now construct a vector for each user, of length len(common_like_ids)+1, where 0 in the vector means the user did not like that page and 1 means they did. If all of the values end up as 0, we set the last index to 1, meaning the user liked a different page (one of the other ~500,000 pages we left out).
Whenever a user did not like any of the common ids, we therefore still get a non-zero vector: only its last ("other") entry is set.

In [None]:
def get_user_likes(dataframe):
    """Return the ``like_id`` column of ``dataframe``."""
    like_ids = dataframe['like_id']
    return like_ids

def _as_like_id_array(common_like_ids):
    """Return the common like ids as a 1-D array with a stable column order.

    Sets have no defined order, so they are sorted; any other sequence or
    array keeps the order it was given in.
    """
    if isinstance(common_like_ids, (set, frozenset)):
        return np.array(sorted(common_like_ids))
    return np.asarray(common_like_ids)


def get_user_like_vector(user_likes, common_like_ids):
    """Encode one user's likes as a 0/1 vector over the common like ids.

    Args:
        user_likes: Iterable with the page ids liked by a single user.
        common_like_ids: Page ids that define the vector columns (set, list or
            array; a set is sorted to fix the column order).

    Returns:
        Array of length ``len(common_like_ids) + 1``. Entry ``i`` is 1 when
        the user liked the i-th common page id. The final entry is the "other"
        slot: it is set to 1 only when the user liked none of the common ids.
    """
    like_id_columns = _as_like_id_array(common_like_ids)
    user_like_vector = np.zeros(len(like_id_columns) + 1)

    liked_mask = np.isin(like_id_columns, np.asarray(list(user_likes)))
    # The slice is a view, so this writes into user_like_vector itself.
    user_like_vector[:-1][liked_mask] = 1
    if not liked_mask.any():
        user_like_vector[-1] = 1
    return user_like_vector


def get_features(dataframe, common_like_ids):
    """Build one like vector per user.

    Args:
        dataframe: DataFrame with a ``userid`` and a ``like_id`` column.
        common_like_ids: Page ids that define the vector columns.

    Returns:
        Tuple ``(user_ids, like_feature_vectors)``: the user ids in order of
        first appearance, and an array of shape
        ``(len(user_ids), len(common_like_ids) + 1)`` whose row ``i`` is the
        like vector of ``user_ids[i]``.
    """
    # Resolve the column order once so every user row uses the same layout.
    like_id_columns = _as_like_id_array(common_like_ids)

    user_ids = []
    like_feature_vectors = []
    # sort=False keeps users in the order they first appear in the frame.
    for user_id, user_likes in dataframe.groupby('userid', sort=False)['like_id']:
        user_ids.append(user_id)
        like_feature_vectors.append(get_user_like_vector(user_likes, like_id_columns))

    # The reshape keeps the result two-dimensional even when there are no users.
    feature_matrix = np.array(like_feature_vectors).reshape(
        len(user_ids), len(like_id_columns) + 1
    )
    return np.array(user_ids), feature_matrix
            
# One like vector per training user, restricted to the page ids shared by the
# train and test sets. relation_df_train is the frame holding the 'userid'
# column; train_set_like_ids is only a set of page ids.
user_ids, like_feature_vectors = get_features(relation_df_train, common_like_ids)

In [5]:
import os.path
from typing import Tuple

import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data.dataset import Dataset

class FBRelationV2PreprocessedDataset(Dataset):
    """Dataset over pre-computed feature rows with optional labels.

    With labels, an item is the pair ``(features[idx], labels[idx])``;
    without labels, an item is just ``features[idx]``.
    """

    def __init__(self, features: torch.tensor, labels: torch.tensor or None):
        # Features and labels must line up one-to-one when labels are given.
        assert labels is None or len(features) == len(labels)
        self.features = features
        self.labels = labels

    def __getitem__(self, idx: int) -> Tuple[np.ndarray, int]:
        sample = self.features[idx]
        if self.labels is not None:
            return sample, self.labels[idx]
        return sample

    def __len__(self):
        # The number of feature rows defines the dataset size.
        return len(self.features)


class BasicNN(nn.Module):
    """
    Fully connected network with three hidden layers. ReLU is applied after
    every layer, including the output layer.
    """
    def __init__(self, num_inputs, hidden_layer_sizes, num_outputs):
        super().__init__()

        # Only the first three hidden sizes are used.
        first, second, third = (hidden_layer_sizes[position] for position in range(3))

        # Layers are created in forward order: input -> hidden x3 -> output.
        self.fc1 = nn.Linear(num_inputs, first)
        self.fc2 = nn.Linear(first, second)
        self.fc3 = nn.Linear(second, third)
        self.fc4 = nn.Linear(third, num_outputs)

    def forward(self, input):
        # The input is cast to float before the first layer.
        activations = input.float()
        for layer in (self.fc1, self.fc2, self.fc3, self.fc4):
            activations = F.relu(layer(activations))
        return activations


class RelationV2AgeEstimator():
    """Age estimator trained on per-user page-like vectors.

    The network maps a like vector to one output per age; ``fit`` encodes an
    age ``a`` as class index ``a - 1`` and ``predict`` inverts that encoding.

    Args:
        num_features: Width of a like vector. The default is the 23581 like
            ids common to training and validation, + 1 for "other".
        num_ages: Number of age classes (ages 1 to ``num_ages``).
        hidden_layer_sizes: Sizes of the three hidden layers of the network.
    """

    def __init__(self, num_features=23582, num_ages=112, hidden_layer_sizes=(10000, 5000, 2500)):
        self.num_features = num_features
        self.num_ages = num_ages
        self.neural_net = BasicNN(num_features, list(hidden_layer_sizes), num_ages)
        self.batch_size = 10
        self.learning_rate = 0.01
        self.num_epochs = 100
        self.predictions = []

    def fit(self, features, labels):
        """Train the network on an 80/20 train/validation split.

        Args:
            features: Array-like reshapeable to ``(-1, num_features)``.
            labels: One age per feature row, in ``1..num_ages``. Values are
                converted to integers.

        Raises:
            ValueError: If an age lies outside ``1..num_ages``.
        """
        ages = np.asarray(labels, dtype=int).reshape(-1)
        # Without this check an age of 0 would silently wrap around to the
        # last class when indexing the identity matrix below.
        if ages.size and (ages.min() < 1 or ages.max() > self.num_ages):
            raise ValueError(
                "ages must be between 1 and {}".format(self.num_ages)
            )
        # Converting an age to one-hot. Example: '3' -> [0, 0, 1, 0, ...]
        one_hot_ages = np.eye(self.num_ages)[ages - 1]

        x_train, x_test, y_train, y_test = train_test_split(
            np.asarray(features).reshape(-1, self.num_features),
            one_hot_ages,
            train_size=0.8,
            shuffle=True
        )
        x_train = tensor(x_train).float()
        x_test = tensor(x_test).float()
        y_train = tensor(y_train).float()
        y_test = tensor(y_test).float()

        train_data_loader = DataLoader(
            dataset=FBRelationV2PreprocessedDataset(x_train, y_train),
            batch_size=self.batch_size,
            shuffle=True
        )

        valid_data_loader = DataLoader(
            dataset=FBRelationV2PreprocessedDataset(x_test, y_test),
            batch_size=self.batch_size,
            shuffle=True
        )

        trainer = create_supervised_trainer(
            model=self.neural_net,
            optimizer=torch.optim.Adam(self.neural_net.parameters(), self.learning_rate),
            loss_fn=torch.nn.MSELoss()
        )

        evaluator = create_supervised_evaluator(
            model=self.neural_net,
            metrics={
                'MSE': Loss(torch.nn.MSELoss())
            }
        )

        # Both handlers run after every epoch and reuse the same evaluator.
        @trainer.on(Events.EPOCH_COMPLETED)
        def log_training_results(trainer):
            evaluator.run(train_data_loader)
            metrics = evaluator.state.metrics
            print("Training Results - Epoch: {}. Avg MSE loss: {:.8f}"
                  .format(trainer.state.epoch, metrics['MSE']))

        @trainer.on(Events.EPOCH_COMPLETED)
        def log_validation_results(trainer):
            evaluator.run(valid_data_loader)
            metrics = evaluator.state.metrics

            print("Validation Results - Epoch {}. Avg MSE loss: {:.8f}".format(
                trainer.state.epoch,
                metrics['MSE']
            ))

        trainer.run(train_data_loader, max_epochs=self.num_epochs)

    def predict(self, features):
        """Predict one age category per feature row, in input order.

        Args:
            features: Array-like reshapeable to ``(-1, num_features)``, in the
                same layout as the features passed to ``fit``.

        Returns:
            List with one prediction per row; also stored in
            ``self.predictions``, replacing the result of any earlier call.
        """
        features = tensor(
            np.asarray(features).reshape(-1, self.num_features)
        ).float()

        # No shuffling: predictions must line up with the input rows.
        test_data_loader = DataLoader(
            dataset=FBRelationV2PreprocessedDataset(features, None),
            batch_size=self.batch_size,
            shuffle=False
        )

        self.predictions = []
        self.neural_net.eval()
        with torch.no_grad():
            for data in test_data_loader:
                output = self.neural_net(data)
                # argmax over the class dimension gives one index per sample.
                for class_index in torch.argmax(output, dim=1).tolist():
                    # fit() encodes age a as class index a - 1, so + 1
                    # recovers the age.
                    # NOTE(review): int_to_age_category is not defined in this
                    # notebook; confirm it is in scope and that it expects an
                    # age rather than a 0-based class index.
                    prediction = int_to_age_category(class_index + 1)
                    self.predictions.append(prediction)

        return self.predictions

    
def pre_process_likes_v2(data_path: str) -> pd.DataFrame:
    """Load or compute standardized per-user like features.

    If ``<data_path>/Relation/relation_preprocessed_raw_v2.csv`` exists it is
    loaded; otherwise the features are computed from
    ``<data_path>/Relation/Relation.csv``. In both cases every column after
    the first is standardized before returning.

    Args:
        data_path: Directory that contains the ``Relation`` folder.

    Returns:
        When computed from the raw relation file, a DataFrame with one row per
        user and the columns ``user_id``, ``likes_given`` and
        ``pages_liked_sum_likes``.
    """
    original_csv_file_path = os.path.join(data_path, 'Relation', 'Relation.csv')
    preprocessed_csv_file_path = os.path.join(data_path, 'Relation', 'relation_preprocessed_raw_v2.csv')

    # NOTE(review): load_likes_csv_file is not defined in this notebook. The
    # code assumes it returns a frame with 'user_id' and 'like_id' columns —
    # TODO confirm; the raw Relation.csv read elsewhere in this notebook names
    # the user column 'userid'.
    if os.path.isfile(preprocessed_csv_file_path):
        features = load_likes_csv_file(preprocessed_csv_file_path)
    else:
        relation_df = load_likes_csv_file(original_csv_file_path)
        # TODO: the per-user like-vector step (users_like_vectors) was left
        # unfinished here and is not part of the returned features.
        like_counts_per_user = relation_df['user_id'].value_counts()
        like_counts_per_page = relation_df['like_id'].value_counts()

        # For every like row, the total number of likes its page received;
        # summed per user this gives pages_liked_sum_likes.
        # NOTE(review): presumably what the missing get_page_total_likes /
        # get_page_ids_liked_by_user helpers computed — verify.
        page_likes_per_row = relation_df['like_id'].map(like_counts_per_page)
        pages_liked_sum_likes = page_likes_per_row.groupby(relation_df['user_id']).sum()

        features = pd.DataFrame({
            'user_id': like_counts_per_user.index,
            'likes_given': like_counts_per_user.values,
            # Reindex so the sums follow the same user order as the counts.
            'pages_liked_sum_likes': pages_liked_sum_likes.reindex(like_counts_per_user.index).values,
        })
        # NOTE(review): the computed features are never written to
        # preprocessed_csv_file_path, so the cache branch above only triggers
        # if that file is created elsewhere.

    # Standardize features by removing the mean and scaling to unit variance
    # (constant columns yield NaN and are filled with 0).
    features[features.columns[1:]] = features[features.columns[1:]].apply(
        lambda df: (df-df.mean())/df.std()
    ).fillna(0)

    return features
    
def create_common_like_ids(data_path, output_path=None):
    """Compute the page ids that occur in both the train and the test set.

    Args:
        data_path: Directory containing ``Train/Relation/Relation.csv`` and
            ``Public_Test/Relation/Relation.csv``.
        output_path: Optional file to write the ids to, one integer per line,
            readable with ``np.genfromtxt(output_path, delimiter=',')``.
            Nothing is written when omitted.

    Returns:
        Sorted array of the like ids present in both relation files.
    """
    relation_csv_paths = [
        os.path.join(data_path, 'Train/Relation/Relation.csv'),
        os.path.join(data_path, 'Public_Test/Relation/Relation.csv'),
    ]
    # Only the like_id column is needed to compute the intersection.
    train_set_like_ids, test_set_like_ids = (
        set(pd.read_csv(path, usecols=['like_id'])['like_id'])
        for path in relation_csv_paths
    )
    common_like_ids = np.array(sorted(train_set_like_ids & test_set_like_ids))

    if output_path is not None:
        # assumes like ids are integers — TODO confirm
        np.savetxt(output_path, common_like_ids, fmt='%d', delimiter=',')
    return common_like_ids
    
    
    
def read_training_data():
    """Load the preprocessed like features of the training set.

    Returns:
        The DataFrame produced by ``pre_process_likes_v2`` for the training
        data directory.
    """
    # pre_process_likes_v2 expects the directory that contains 'Relation/',
    # not the path of the CSV file itself, and takes a single path.
    # TODO: the page ids shared by train and test
    # ('data/relation_v2_common_like_ids.csv') are not used yet.
    # NOTE(review): no labels are loaded here, although the disabled driver
    # code at the end of the notebook unpacks `features, labels`.
    return pre_process_likes_v2('../../new_data/Train')

In [None]:
# Run the train/test like-id intersection for the data under ../../new_data/.
# NOTE(review): the call's return value is not captured here.
create_common_like_ids('../../new_data/')

# Disabled end-to-end driver: read the training data, then fit (and
# optionally predict with) RelationV2AgeEstimator.
# # =============
# # main function
# # =============
# features, labels = read_training_data()
# #features = read_prediction_data()

# relation_age_estimator_v2 = RelationV2AgeEstimator()
# relation_age_estimator_v2.fit(features, labels)
# #relation_age_estimator_v2.predict(features)