From 7e1df0e0c4bffdeb8249536aea5ec6b3b043135f Mon Sep 17 00:00:00 2001
From: IsaVia777 <isabelle.viarouge@gmail.com>
Date: Wed, 16 Oct 2019 15:03:18 -0400
Subject: [PATCH 1/3] make_dataset returns a dataframe
 [userID|like_id_multihot]

---
 project/likes_preprocessing.py | 38 ++++++++++++++++++++++++++++++++++
 1 file changed, 38 insertions(+)

diff --git a/project/likes_preprocessing.py b/project/likes_preprocessing.py
index 5b43c0f..b6409bc 100644
--- a/project/likes_preprocessing.py
+++ b/project/likes_preprocessing.py
@@ -1,5 +1,6 @@
 import tensorflow as tf
 import tensorflow_hub as hub
+import pandas as pd
 from typing import *
 
 def make_dataset(input_dir: str, userids: List[str]) -> tf.data.Dataset:
@@ -13,5 +14,42 @@ def make_dataset(input_dir: str, userids: List[str]) -> tf.data.Dataset:
         tf.data.Dataset -- the preprocessed text dataset, where each entry is the feature vector.
     """
     # TODO
+    #open Relationship.csv file
+    df = pd.read_csv(input_dir)
+    df = df.drop(['index'], axis=1)
+
+    #Save unique userID column
+    userid = df['userid'].unique()
+
+    #Create the multihot matrix
+    relHot1 = pd.get_dummies(df, columns=["like_id"])
+    relHot1 = relHot1.groupby(['userid']).sum()
+
+    #Insert the userID column
+    relHot1.insert(0, "userid", userid)
+
+    # save to csv
+    #relHot1.to_csv('multiHot.csv', index=None, header=True)
+
+    return relHot1
+
+
+
     raise NotImplementedError()
 
+
+#Isa: working on making the likes onehotmatrix
+
+#It isnt possible to make a oneHot matrix with all profile entries
+
+#Sol 1: Make batches
+""" 
+slice the data frame into batches (for loop)
+create onehot_mat_i
+store it into an array?
+List of onehot_mat
+"""
+
+#Sol 1.5: Get ride of the likes with a freq < LIKE_FREQ_CUTOFF
+
+#Sol2: Dimension reduction solutions ex:PCA
\ No newline at end of file

From f8703e41ca535b2519e6bdda4a620fb2c0d3b98a Mon Sep 17 00:00:00 2001
From: IsaVia777 <isabelle.viarouge@gmail.com>
Date: Fri, 18 Oct 2019 15:13:07 -0400
Subject: [PATCH 2/3] make_multihot_like_mat returns a multihot mat of likes
 with count > COUNT_CUTOFF

---
 project/likes_preprocessing.py | 54 +++++++++++++++-------------------
 1 file changed, 24 insertions(+), 30 deletions(-)

diff --git a/project/likes_preprocessing.py b/project/likes_preprocessing.py
index b6409bc..d32cb01 100644
--- a/project/likes_preprocessing.py
+++ b/project/likes_preprocessing.py
@@ -3,7 +3,10 @@
 import pandas as pd
 from typing import *
 
-def make_dataset(input_dir: str, userids: List[str]) -> tf.data.Dataset:
+#def make_dataset(input_dir: str, userids: List[str], COUNT_CUTOFF: int, saveTocsv: bool) -> tf.data.Dataset:
+
+
+def make_multihot_like_mat(input_dir: str, userids: List[str], COUNT_CUTOFF: int, saveTocsv: bool):
     """Creates the preprocessed text dataset for the given userid's.
     
     Arguments:
@@ -13,43 +16,34 @@ def make_dataset(input_dir: str, userids: List[str]) -> tf.data.Dataset:
     Returns:
         tf.data.Dataset -- the preprocessed text dataset, where each entry is the feature vector.
     """
-    # TODO
-    #open Relationship.csv file
+    # Get raw data
     df = pd.read_csv(input_dir)
-    df = df.drop(['index'], axis=1)
-
-    #Save unique userID column
-    userid = df['userid'].unique()
-
-    #Create the multihot matrix
-    relHot1 = pd.get_dummies(df, columns=["like_id"])
-    relHot1 = relHot1.groupby(['userid']).sum()
-
-    #Insert the userID column
-    relHot1.insert(0, "userid", userid)
-
-    # save to csv
-    #relHot1.to_csv('multiHot.csv', index=None, header=True)
+    df = df.drop(['Unnamed: 0'], axis=1)
 
-    return relHot1
+    freq_like_id = df["like_id"].value_counts()
+    likes_kept = freq_like_id[freq_like_id > COUNT_CUTOFF]
+    likes_kept_inds = likes_kept.keys()
+    filtered_table = df[df["like_id"].isin(likes_kept_inds)]
 
+    relHot = pd.get_dummies(filtered_table, columns=["like_id"])
+    relHot = relHot.groupby(['userid']).sum()
 
+    if saveTocsv:
+        # create a userid row
+        userid = relHot.index
+        relHot.insert(0, "userid", userid)
 
-    raise NotImplementedError()
+        # create string: Relation_Multihot_CUTOFF.csv
+        PATH = "/home/mila/teaching/user07/IsabelleWorkshop/"
+        output_filename = "Relation_Multihot_" + str(COUNT_CUTOFF) + ".csv"
+        # save to csv
+        relHot.to_csv(PATH + output_filename, index=None, header=True)
 
+        relHot = relHot.drop(["userid"], axis=1)
 
-#Isa: working on making the likes onehotmatrix
 
-#It isnt possible to make a oneHot matrix with all profile entries
+    return relHot
 
-#Sol 1: Make batches
-""" 
-slice the data frame into batches (for loop)
-create onehot_mat_i
-store it into an array?
-List of onehot_mat
-"""
 
-#Sol 1.5: Get ride of the likes with a freq < LIKE_FREQ_CUTOFF
 
-#Sol2: Dimension reduction solutions ex:PCA
\ No newline at end of file
+    raise NotImplementedError()
\ No newline at end of file

From 8d54aa6b8fec82a0c83be2870df87628439742ff Mon Sep 17 00:00:00 2001
From: IsaVia777 <isabelle.viarouge@gmail.com>
Date: Fri, 18 Oct 2019 15:27:55 -0400
Subject: [PATCH 3/3] added some explanations of the arguments and returned
 object

---
 project/likes_preprocessing.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/project/likes_preprocessing.py b/project/likes_preprocessing.py
index d32cb01..4d5a1ba 100644
--- a/project/likes_preprocessing.py
+++ b/project/likes_preprocessing.py
@@ -12,9 +12,11 @@ def make_multihot_like_mat(input_dir: str, userids: List[str], COUNT_CUTOFF: int
     Arguments:
         input_dir {str} -- the parent input directory
         userids {List[str]} -- the list of userids
+        COUNT_CUTOFF {int} -- frequency cutoff; only likes occurring more than COUNT_CUTOFF times are kept
+        saveTocsv {bool} -- if True, also write the matrix to Relation_Multihot_<COUNT_CUTOFF>.csv in a hard-coded directory
     
     Returns:
-        tf.data.Dataset -- the preprocessed text dataset, where each entry is the feature vector.
+        relHot {pandas.DataFrame} -- multihot matrix of the kept likes: index is userid, one summed indicator column per like_id
     """
     # Get raw data
     df = pd.read_csv(input_dir)