In [1]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction import DictVectorizer
import pickle

## 01 - Preprocess the data

* Likes Data
* Profile Data

### 1.1) - Import the 'LIKES' from the training dataset and process

Import the "likes" data

In [2]:
# Load the raw user -> liked-page relations. The unnamed first CSV column is
# renamed to "data_id" and promoted to the frame's index.
# NOTE(review): the path is absolute and machine-specific; anyone else running
# this notebook has to point it at their own copy of the data.
likes = (
    pd.read_csv("/home/jamster/old-repos/ml2018-projectDATA/tcss555/training/relation/relation.csv")
    .rename(columns={'Unnamed: 0': "data_id"})
    .set_index("data_id")
)
likes.head()

Unnamed: 0_level_0,userid,like_id
data_id,Unnamed: 1_level_1,Unnamed: 2_level_1
0,c6a9a43058c8cc8398ca6e97324c0fae,8628204013
1,c6a9a43058c8cc8398ca6e97324c0fae,19040608370
2,c6a9a43058c8cc8398ca6e97324c0fae,24054583527
3,c6a9a43058c8cc8398ca6e97324c0fae,26726417043
4,c6a9a43058c8cc8398ca6e97324c0fae,31796936730


Extract individual columns and convert to lists

In [3]:
# Pull both columns out as NumPy arrays first, then as plain Python lists
# (one entry per row of 'likes', in row order).
likesUIDs, likesLIDs = likes['userid'].values, likes['like_id'].values
lsLikesUIDs, lsLikesLIDs = likesUIDs.tolist(), likesLIDs.tolist()

Convert columns to sets

In [4]:
# De-duplicate the user IDs and the like IDs.
setLikesUIDs, setLikesLIDs = set(lsLikesUIDs), set(lsLikesLIDs)

Convert columns to list of unique items

In [5]:
# Turn the sets into lists with a *stable* order.
# The position of a user in 'unqLikesUIDs' defines the row order of everything
# built from it further down. Iterating a set of strings depends on the
# per-process hash seed, so a plain list(set) would give a different order
# after every kernel restart. Sorting makes the order reproducible.
# Assumes all IDs within a column are mutually comparable (no NaN mixed with
# strings) - TODO confirm against the raw CSV.
unqLikesUIDs = sorted(setLikesUIDs)
unqLikesLIDs = sorted(setLikesLIDs)

Get list of all User IDs (UIDs) paired with the Like IDs (LIDs) of the posts the user has liked

In [6]:
# One [user_id, like_id-as-string] pair per row of the likes table.
allLikesLS = [
    [user_id, str(like_id)]
    for user_id, like_id in zip(lsLikesUIDs, lsLikesLIDs)
]

Convert list of UID and LID pairs into a dictionary indexed by UIDs

In [7]:
# Map every unique user ID to the list of like IDs (strings) of that user.
# Keys are created up front, in 'unqLikesUIDs' order, each with an empty list.
aDictLikes2 = {user_id: [] for user_id in unqLikesUIDs}

# Distribute the (user, like) pairs over the per-user lists.
for user_id, like_id in allLikesLS:
    aDictLikes2[user_id].append(like_id)

Convert into a dictionary (by UIDs) of dictionaries (by LIDs)

In [8]:
# For each user, build an indicator dict {like_id: 1} over the pages they liked;
# 'combDICT' maps user ID -> that indicator dict, in 'unqLikesUIDs' order.
combDICT = {
    user_id: {str(like_id): 1 for like_id in aDictLikes2[user_id]}
    for user_id in unqLikesUIDs
}

Convert 'combDICT' into a list of dictionaries (of LIDs)

In [9]:
# Flatten 'combDICT' into a list of the per-user indicator dicts; element i
# belongs to the user at unqLikesUIDs[i].
tryTHIS = [combDICT[user_id] for user_id in unqLikesUIDs]

Vectorize the list of dictionaries in 'tryTHIS' to get the UID/LID matrix for the training data

In [10]:
# Fit a DictVectorizer on the per-user {like_id: 1} dicts and transform them in
# one step. 'tryTHIS' is ordered like 'unqLikesUIDs', so row i of 'likesMAT'
# belongs to the user at unqLikesUIDs[i].
# The fitted vectorizer 'v' stays in scope after this cell.
# NOTE(review): presumably 'v' is needed later to encode unseen data with the
# same columns - it is not exported here; confirm whether it should be.
v = DictVectorizer()
likesMAT=v.fit_transform(tryTHIS)

Export transformed likes data

In [11]:
# Persist the user/like matrix. The 'with' block guarantees the file is flushed
# and closed even if pickling fails (the bare open() call leaked the handle).
with open("likesMAT.pkl", "wb") as likes_file:
    pickle.dump(likesMAT, likes_file)

### 1.2) - Import the profiles from the training dataset and process

For the profile aspects:

* ages
* sexes
* psychological traits

With the psychological traits being:
* openness (*ope*)
* conscientiousness (*con*)
* extraversion (*ext*)
* agreeableness (*agr*)
* neuroticism (*neu*)

Import the profiles data

In [12]:
# Load the user profiles (age, gender and the five trait scores). The unnamed
# first CSV column is renamed to "data_id" and promoted to the frame's index.
# NOTE(review): the path is absolute and machine-specific; anyone else running
# this notebook has to point it at their own copy of the data.
profilesDF = (
    pd.read_csv("/home/jamster/old-repos/ml2018-projectDATA/tcss555/training/profile/profile.csv")
    .rename(columns={"Unnamed: 0": "data_id"})
    .set_index("data_id")
)
profilesDF.head()

Unnamed: 0_level_0,userid,age,gender,ope,con,ext,agr,neu
data_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0,7267f43c71fcf53f4580fd3cd808bd48,26.0,0.0,4.0,2.25,2.2,3.6,2.8
1,e1cdac10d136e76e064e4860007d786d,24.0,0.0,4.2,3.35,2.45,3.9,2.6
2,ea4b8b534a35f59713f61f29b8725d09,27.0,1.0,2.75,4.8,3.15,3.85,4.15
3,c7bb07b4b9ccd28cd0bc285194da5c72,29.0,1.0,3.4,3.2,2.1,3.7,2.35
4,e8b8f7f4f40ea6babfa9a2a967ff1866,25.0,0.0,4.4,3.25,4.0,2.89,2.35


Get the values of the relevant columns and convert them to a list

In [13]:
# Snapshot the relevant columns as an array, then as a list of per-user rows:
# [userid, age, gender, ope, con, ext, agr, neu].
profiles = profilesDF.loc[
    :, ['userid', 'age', 'gender', 'ope', 'con', 'ext', 'agr', 'neu']
].values.copy()
profilesLSo = list(profiles.tolist())

Categorize the ages

In [14]:
# Replace the raw age (column 1) of every profile row by an age-group code:
#   1: age < 25, 2: 25 <= age < 35, 3: 35 <= age < 50, 4: everything else.
# Each row is copied before it is modified, so 'profilesLSo' keeps the raw ages
# and this cell can be re-run safely. Previously the rows were modified in
# place, so a second run re-binned the already-binned codes and put everyone
# in group 1.
# NOTE(review): a missing age (NaN) fails all three comparisons and therefore
# lands in group 4 - confirm that this is intended.
profilesLS = []
for row in profilesLSo:
    binned_row = list(row)
    age = row[1]

    if age < 25:
        binned_row[1] = 1
    elif age < 35:
        binned_row[1] = 2
    elif age < 50:
        binned_row[1] = 3
    else:
        binned_row[1] = 4

    profilesLS.append(binned_row)

Align the profiles data with the row order of the likes data (i.e. the order of the users in `unqLikesUIDs`)

In [15]:
# Reorder the profile rows so that row i belongs to the user at unqLikesUIDs[i]
# (the same order as the rows of the likes matrix), then transpose the result
# into one list per column: [userids, ages, genders, ope, con, ext, agr, neu].

# Position of every user in the likes ordering. A dict lookup replaces the
# linear list.index() scan per profile, which was quadratic overall.
uid_to_pos = {uid: pos for pos, uid in enumerate(unqLikesUIDs)}

profsTOlikes = [[] for _ in profilesLS]
for row in profilesLS:
    uid = row[0]
    if uid not in uid_to_pos:
        # Same exception type list.index() raised for an unknown user.
        raise ValueError("%r is not in list" % (uid,))
    profsTOlikes[uid_to_pos[uid]] = row

# An unfilled slot (duplicate or missing profile) would make zip(*...) below
# silently return an empty result, so fail loudly instead.
unfilled = sum(1 for slot in profsTOlikes if len(slot) == 0)
if unfilled:
    raise ValueError(
        "%d row position(s) have no profile; profiles and likes do not "
        "cover the same users" % unfilled
    )

profsTOlikes1 = list(map(list, zip(*profsTOlikes)))

Export the transformed profiles data for possible later use

In [19]:
# Persist the aligned, column-wise profile data. The 'with' block guarantees
# the file is flushed and closed (the bare open() call leaked the handle).
with open("profsTOlikes1.pkl", "wb") as profiles_file:
    pickle.dump(profsTOlikes1, profiles_file)

#### 1.2.1) Extract Data for AGEs

In [20]:
# Column 1 of the aligned profile data holds the age-group codes; round-trip it
# through a NumPy array to get a plain list of scalars.
agesARRo = np.array(profsTOlikes1[1]).tolist()

Convert data for AGEs to binary vectors

In [21]:
# One-hot encode the age-group codes into 4 columns: codes 1, 2 and 3 set
# columns 0, 1 and 2 respectively; any other value sets the last column.
agesARR = np.array([
    [
        int(code == 1),
        int(code == 2),
        int(code == 3),
        int(code not in (1, 2, 3)),
    ]
    for code in agesARRo
])

Export the transformed ages data

In [22]:
# Persist the one-hot age-group matrix. The 'with' block guarantees the file is
# flushed and closed (the bare open() call leaked the handle).
with open("agesARR.pkl", "wb") as ages_file:
    pickle.dump(agesARR, ages_file)

#### 1.2.2) Extract Data for SEXes

In [23]:
# Column 2 of the aligned profile data holds the 'gender' values; keep them as
# a 1-D array in the same user order as the likes matrix rows.
sexsARR = np.asarray(profsTOlikes1[2])

Export the transformed sexes data

In [24]:
# Persist the gender vector. The 'with' block guarantees the file is flushed
# and closed (the bare open() call leaked the handle).
with open("sexsARR.pkl", "wb") as sexes_file:
    pickle.dump(sexsARR, sexes_file)