From 7e1df0e0c4bffdeb8249536aea5ec6b3b043135f Mon Sep 17 00:00:00 2001 From: IsaVia777 Date: Wed, 16 Oct 2019 15:03:18 -0400 Subject: [PATCH 1/3] make_dataset returns a dataframe [userID|like_id_multihot] --- project/likes_preprocessing.py | 38 ++++++++++++++++++++++++++++++++++ 1 file changed, 38 insertions(+) diff --git a/project/likes_preprocessing.py b/project/likes_preprocessing.py index 5b43c0f..b6409bc 100644 --- a/project/likes_preprocessing.py +++ b/project/likes_preprocessing.py @@ -1,5 +1,6 @@ import tensorflow as tf import tensorflow_hub as hub +import pandas as pd from typing import * def make_dataset(input_dir: str, userids: List[str]) -> tf.data.Dataset: @@ -13,5 +14,42 @@ def make_dataset(input_dir: str, userids: List[str]) -> tf.data.Dataset: tf.data.Dataset -- the preprocessed text dataset, where each entry is the feature vector. """ # TODO + #open Relationship.csv file + df = pd.read_csv(input_dir) + df = df.drop(['index'], axis=1) + + #Save unique userID column + userid = df['userid'].unique() + + #Create the multihot matrix + relHot1 = pd.get_dummies(df, columns=["like_id"]) + relHot1 = relHot1.groupby(['userid']).sum() + + #Insert the userID column + relHot1.insert(0, "userid", userid) + + # save to csv + #relHot1.to_csv('multiHot.csv', index=None, header=True) + + return relHot1 + + + raise NotImplementedError() + +#Isa: working on making the likes onehotmatrix + +#It isnt possible to make a oneHot matrix with all profile entries + +#Sol 1: Make batches +""" +slice the data frame into batches (for loop) +create onehot_mat_i +store it into an array? 
+List of onehot_mat +""" + +#Sol 1.5: Get ride of the likes with a freq < LIKE_FREQ_CUTOFF + +#Sol2: Dimension reduction solutions ex:PCA \ No newline at end of file From f8703e41ca535b2519e6bdda4a620fb2c0d3b98a Mon Sep 17 00:00:00 2001 From: IsaVia777 Date: Fri, 18 Oct 2019 15:13:07 -0400 Subject: [PATCH 2/3] make_multihot_like_mat returns a multihot mat of likes with count > COUNT_CUTOFF --- project/likes_preprocessing.py | 54 +++++++++++++++------------------- 1 file changed, 24 insertions(+), 30 deletions(-) diff --git a/project/likes_preprocessing.py b/project/likes_preprocessing.py index b6409bc..d32cb01 100644 --- a/project/likes_preprocessing.py +++ b/project/likes_preprocessing.py @@ -3,7 +3,10 @@ import pandas as pd from typing import * -def make_dataset(input_dir: str, userids: List[str]) -> tf.data.Dataset: +#def make_dataset(input_dir: str, userids: List[str], COUNT_CUTOFF: int, saveTocsv: bool) -> tf.data.Dataset: + + +def make_multihot_like_mat(input_dir: str, userids: List[str], COUNT_CUTOFF: int, saveTocsv: bool): """Creates the preprocessed text dataset for the given userid's. Arguments: @@ -13,43 +16,34 @@ def make_dataset(input_dir: str, userids: List[str]) -> tf.data.Dataset: Returns: tf.data.Dataset -- the preprocessed text dataset, where each entry is the feature vector. 
""" - # TODO - #open Relationship.csv file + # Get raw data df = pd.read_csv(input_dir) - df = df.drop(['index'], axis=1) - - #Save unique userID column - userid = df['userid'].unique() - - #Create the multihot matrix - relHot1 = pd.get_dummies(df, columns=["like_id"]) - relHot1 = relHot1.groupby(['userid']).sum() - - #Insert the userID column - relHot1.insert(0, "userid", userid) - - # save to csv - #relHot1.to_csv('multiHot.csv', index=None, header=True) + df = df.drop(['Unnamed: 0'], axis=1) - return relHot1 + freq_like_id = df["like_id"].value_counts() + likes_kept = freq_like_id[freq_like_id > COUNT_CUTOFF] + likes_kept_inds = likes_kept.keys() + filtered_table = df[df["like_id"].isin(likes_kept_inds)] + relHot = pd.get_dummies(filtered_table, columns=["like_id"]) + relHot = relHot.groupby(['userid']).sum() + if saveTocsv: + # create a userid row + userid = relHot.index + relHot.insert(0, "userid", userid) - raise NotImplementedError() + # create string: Relation_Multihot_CUTOFF.csv + PATH = "/home/mila/teaching/user07/IsabelleWorkshop/" + output_filename = "Relation_Multihot_" + str(COUNT_CUTOFF) + ".csv" + # save to csv + relHot.to_csv(PATH + output_filename, index=None, header=True) + relHot = relHot.drop(["userid"], axis=1) -#Isa: working on making the likes onehotmatrix -#It isnt possible to make a oneHot matrix with all profile entries + return relHot -#Sol 1: Make batches -""" -slice the data frame into batches (for loop) -create onehot_mat_i -store it into an array? 
-List of onehot_mat -""" -#Sol 1.5: Get ride of the likes with a freq < LIKE_FREQ_CUTOFF -#Sol2: Dimension reduction solutions ex:PCA \ No newline at end of file + raise NotImplementedError() \ No newline at end of file From 8d54aa6b8fec82a0c83be2870df87628439742ff Mon Sep 17 00:00:00 2001 From: IsaVia777 Date: Fri, 18 Oct 2019 15:27:55 -0400 Subject: [PATCH 3/3] added some explanations of the arguments and returned object --- project/likes_preprocessing.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/project/likes_preprocessing.py b/project/likes_preprocessing.py index d32cb01..4d5a1ba 100644 --- a/project/likes_preprocessing.py +++ b/project/likes_preprocessing.py @@ -12,9 +12,11 @@ def make_multihot_like_mat(input_dir: str, userids: List[str], COUNT_CUTOFF: int Arguments: input_dir {str} -- the parent input directory userids {List[str]} -- the list of userids + COUNT_CUTOFF {int} -- frequency cutoff; only likes occurring more than COUNT_CUTOFF times are kept + saveTocsv {bool} -- save the multihot matrix to Isabelle's directory Returns: - tf.data.Dataset -- the preprocessed text dataset, where each entry is the feature vector. + relHot -- multihot matrix of the like_id. Rows are indexed with userid """ # Get raw data df = pd.read_csv(input_dir)