### Introduction

__Les scripts développés, stockés dans un système de gestion de version__ (Git en local avec push sur Github) permettant le déploiement de l’application de bout-en-bout.

- Ce livrable vous servira à présenter le caractère “industrialisable” de votre travail.

### import

In [1]:
import os
import numpy as np
import pandas as pd

globo_source = "../data/source/news-portal-user-interactions-by-globocom/"
data_source = "../data/source/"
data_cleaned = "../data/cleaned/"
data_target = "../irfantoor-recommend/recommend/data/"

apply_pca = True
pca_components = 100 # originally in the embedding : 250

# Préparer les données

In [2]:
# item_metadata_nunique

item_metadata = pd.read_csv(os.path.join(globo_source, 'articles_metadata.csv'))
item_metadata.rename(columns={"article_id": "item", "category_id": "group"}, inplace=True)
item_metadat_nunique = item_metadata.nunique()
item_metadat_nunique.to_csv(os.path.join(data_cleaned, 'item_metadata_nunique.csv'))

In [3]:
# users_clicks : ∑ clicks_hour_{i}.csv, where i=000 to 384

import os

# initialize
data_path = os.path.join(globo_source, 'clicks')
file_ids = range(385)
clicks = []

# read all files
for i in file_ids:
    file = f'clicks_hour_%03d.csv'%(i)
    file_path = os.path.join(data_path, file)
    ds = pd.read_csv(file_path)

    for r in ds.to_numpy():
        clicks.append(r)

# convert to DataFrame
clicks = np.array(clicks)
users_clicks = pd.DataFrame(clicks, columns=ds.columns)
users_clicks = users_clicks[['user_id', 'click_article_id', 'click_timestamp']].rename(columns={"user_id":"user", "click_article_id":"item", "click_timestamp":"timestamp"})

users_clicks

Unnamed: 0,user,item,timestamp
0,0,157541,1506826828020
1,0,68866,1506826858020
2,1,235840,1506827017951
3,1,96663,1506827047951
4,2,119592,1506827090575
...,...,...,...
2988176,10051,84911,1508211557302
2988177,322896,30760,1508211672520
2988178,322896,157507,1508211702520
2988179,123718,234481,1508211513583


In [4]:
# item_clicks.csv

item_clicks = users_clicks.groupby('item').count().sort_values(by='user', ascending=False)['user']
item_clicks = pd.DataFrame(
    {
        'items': item_clicks.keys(),
        'clicks': item_clicks.values,
    }
)
item_clicks.to_csv(os.path.join(data_cleaned, 'item_clicks.csv'), index=False)

item_clicks[:5]

Unnamed: 0,items,clicks
0,160974,37213
1,272143,28943
2,336221,23851
3,234698,23499
4,123909,23122


In [5]:
users_clicks.join(item_metadata.set_index('item'), on='item')

Unnamed: 0,user,item,timestamp,group,created_at_ts,publisher_id,words_count
0,0,157541,1506826828020,281,1506800518000,0,280
1,0,68866,1506826858020,136,1506816482000,0,226
2,1,235840,1506827017951,375,1506807839000,0,159
3,1,96663,1506827047951,209,1506788025000,0,206
4,2,119592,1506827090575,247,1506784316000,0,239
...,...,...,...,...,...,...,...
2988176,10051,84911,1508211557302,174,1508187619000,0,171
2988177,322896,30760,1508211672520,26,1508185091000,0,162
2988178,322896,157507,1508211702520,281,1508236945000,0,370
2988179,123718,234481,1508211513583,375,1508181572000,0,212


In [6]:
# user_interactions_with_groups.csv

user_interactions_with_groups = users_clicks.join(item_metadata.set_index('item'), on='item')[['user', 'item', 'timestamp', 'group']]
user_interactions_with_groups.to_csv(
    os.path.join(data_cleaned, 'user_interactions_with_groups.csv'),
    index=False
)
user_interactions_with_groups

Unnamed: 0,user,item,timestamp,group
0,0,157541,1506826828020,281
1,0,68866,1506826858020,136
2,1,235840,1506827017951,375
3,1,96663,1506827047951,209
4,2,119592,1506827090575,247
...,...,...,...,...
2988176,10051,84911,1508211557302,174
2988177,322896,30760,1508211672520,26
2988178,322896,157507,1508211702520,281
2988179,123718,234481,1508211513583,375


In [7]:
# user_interactions_nunique.csv

user_interactions_nunique = user_interactions_with_groups.nunique()
user_interactions_nunique.to_csv(os.path.join(data_cleaned, 'user_interactions_nunique.csv'))
user_interactions_nunique

user          322897
item           46033
timestamp    2983198
group            316
dtype: int64

In [8]:
# group_clicks.csv

group_clicks = user_interactions_with_groups[['group', 'item']].groupby(by='group').count().sort_values(by='item', ascending=False)['item']
group_clicks = pd.DataFrame(
    {
        'group': group_clicks.keys(),
        'clicks': group_clicks.values
    }
)
group_clicks.to_csv(os.path.join(data_cleaned, 'group_clicks.csv'), index=False)
group_clicks

Unnamed: 0,group,clicks
0,281,370843
1,375,268257
2,412,178894
3,437,157085
4,250,140454
...,...,...
311,58,1
312,370,1
313,367,1
314,363,1


In [9]:
# group_items.csv

item_metadata[['group', 'item']].to_csv(os.path.join(data_cleaned, 'group_items.csv'), index=False)

In [10]:
# item_features.csv

item_features = pd.DataFrame(
    pd.read_pickle(os.path.join(globo_source, 'articles_embeddings.pickle'))
)
item_features

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,240,241,242,243,244,245,246,247,248,249
0,-0.161183,-0.957233,-0.137944,0.050855,0.830055,0.901365,-0.335148,-0.559561,-0.500603,0.165183,...,0.321248,0.313999,0.636412,0.169179,0.540524,-0.813182,0.286870,-0.231686,0.597416,0.409623
1,-0.523216,-0.974058,0.738608,0.155234,0.626294,0.485297,-0.715657,-0.897996,-0.359747,0.398246,...,-0.487843,0.823124,0.412688,-0.338654,0.320787,0.588643,-0.594137,0.182828,0.397090,-0.834364
2,-0.619619,-0.972960,-0.207360,-0.128861,0.044748,-0.387535,-0.730477,-0.066126,-0.754899,-0.242004,...,0.454756,0.473184,0.377866,-0.863887,-0.383365,0.137721,-0.810877,-0.447580,0.805932,-0.285284
3,-0.740843,-0.975749,0.391698,0.641738,-0.268645,0.191745,-0.825593,-0.710591,-0.040099,-0.110514,...,0.271535,0.036040,0.480029,-0.763173,0.022627,0.565165,-0.910286,-0.537838,0.243541,-0.885329
4,-0.279052,-0.972315,0.685374,0.113056,0.238315,0.271913,-0.568816,0.341194,-0.600554,-0.125644,...,0.238286,0.809268,0.427521,-0.615932,-0.503697,0.614450,-0.917760,-0.424061,0.185484,-0.580292
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
364042,-0.055038,-0.962136,0.869436,-0.071523,-0.725294,0.434320,0.198312,-0.581154,0.702346,-0.124813,...,-0.410549,0.564252,-0.463959,0.167907,-0.480068,0.652090,0.380880,0.433195,-0.662455,-0.222850
364043,-0.136932,-0.995471,0.991298,0.031871,-0.915622,-0.658517,0.633090,-0.564356,0.676551,-0.446068,...,-0.681986,-0.574185,-0.536908,0.688934,0.528204,0.162435,0.940364,0.989298,-0.761595,-0.414652
364044,-0.251390,-0.976243,0.586097,0.643631,-0.663359,-0.093480,0.691553,-0.588281,0.902999,0.124571,...,-0.162220,-0.242030,-0.476131,0.352132,-0.311279,0.460574,-0.653077,-0.143725,0.068093,-0.705010
364045,0.224342,-0.923288,-0.381742,0.687890,-0.773911,-0.103629,-0.406486,0.246004,0.255191,-0.329587,...,-0.422999,0.390324,0.655911,-0.646753,-0.174031,0.698037,-0.317102,0.687132,-0.531512,0.010726


In [11]:
# from sklearn.linear_model import LogisticRegression
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

if apply_pca:
    # intialize pca and logistic regression model
    pca = PCA(n_components=pca_components)
    # lr = LogisticRegression(multi_class='auto', solver='liblinear')

    # fit and transform data
    sc = StandardScaler()
    X, y = item_features.iloc[:, 1:].values, item_features.iloc[:, 0].values

    X_std = sc.fit_transform(X)

    X_pca = pca.fit_transform(X_std)
    X_pca.shape

    item_features = pd.DataFrame(X_pca)
    # lr.fit(X_pca, y)

item_features.shape

(364047, 100)

In [12]:
item_features.to_csv(os.path.join(data_cleaned, 'item_features.csv'), index=False)
item_features

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,-1.862672,5.723353,1.295251,3.760629,-1.339288,0.258499,0.499738,0.234135,-0.833622,-1.530625,...,0.083527,0.314536,0.035763,-0.142197,-0.168138,-0.339069,-0.252711,-0.302009,0.116992,0.005792
1,-5.355302,-4.158480,-2.193896,3.679254,0.275466,3.709864,2.094987,-0.620395,2.168455,-3.031487,...,0.168414,-0.145401,-0.164960,-0.043074,-0.401082,0.247115,0.354626,0.515652,-0.226743,-0.224340
2,-3.477614,-0.217643,-6.067393,-0.929503,-0.185815,4.586033,-1.833935,-2.666209,-2.394186,-2.719801,...,0.151395,-0.254909,0.336824,-0.087771,0.013096,-0.310162,-0.100069,0.370328,-0.104044,0.269899
3,-1.148532,-7.751267,-5.384951,1.434982,-2.380125,-1.732646,0.744800,-1.535889,2.032019,-3.908678,...,-0.191930,-0.166317,0.141136,0.116055,-0.318686,-0.105910,-0.081072,0.132329,-0.094210,0.024787
4,-1.188575,-2.703682,-2.738934,0.263688,-2.914537,5.661136,0.197531,0.140514,-1.332735,-2.037284,...,-0.060031,0.138981,-0.110404,0.116517,-0.485129,-0.006179,0.006372,0.302123,-0.096474,0.069899
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
364042,1.671414,-6.919379,8.900601,-0.865235,1.575611,-1.744921,1.657069,1.368295,0.470783,-1.859085,...,0.050297,-0.009907,0.048047,0.046406,0.183309,-0.156443,0.000946,-0.149448,0.083897,0.013248
364043,0.918237,-1.242474,6.571383,3.312153,14.442499,3.125157,1.647155,3.716390,-0.674086,-1.645005,...,-0.200897,-0.155292,0.842660,-0.274838,-0.133272,0.685077,0.278995,0.152086,0.001707,0.396890
364044,0.980561,-9.341313,3.667895,3.047872,2.154193,0.479300,-2.441297,1.294296,2.929951,4.246388,...,-0.164372,-0.120039,0.351074,0.199175,-0.326996,0.126648,0.330260,-0.197747,0.159638,0.145999
364045,11.418889,-1.805978,0.067951,-0.604595,-2.730290,-2.045381,-2.689163,1.538153,0.353602,-2.158859,...,0.262621,-0.301809,0.058418,-0.132688,0.102299,-0.020031,0.135164,0.240501,-0.078720,0.184582


In [13]:
# item_features_shape.csv

item_features_shape = pd.DataFrame(
    {
        'shape':item_features.shape
    }
)

item_features_shape.to_csv(os.path.join(data_cleaned, 'item_features_shape.csv'), index=False)
item_features_shape

Unnamed: 0,shape
0,364047
1,100


In [14]:
# top_group_clicks.csv

top_group_clicks = group_clicks[group_clicks['clicks']>=100000]
top_group_clicks.to_csv(
    os.path.join(data_cleaned, 'top_group_clicks.csv'),
    index=False
)
top_group_clicks

Unnamed: 0,group,clicks
0,281,370843
1,375,268257
2,412,178894
3,437,157085
4,250,140454
5,331,115901
6,399,104464


In [15]:
# top_item_clicks.csv

top_item_clicks = item_clicks[item_clicks['clicks']>=10000]
top_item_clicks.to_csv(
    os.path.join(data_cleaned, 'top_item_clicks.csv'),
    index=False
)
top_item_clicks

Unnamed: 0,items,clicks
0,160974,37213
1,272143,28943
2,336221,23851
3,234698,23499
4,123909,23122
5,336223,21855
6,96210,21577
7,162655,21062
8,183176,20303
9,168623,19526


# Copier les fichier dans azure-recommend

In [16]:
import shutil

source_list = [
    [data_source, '100k', '100k.txt'],
    
    [globo_source, '', 'articles_embeddings.pickle'],
    [globo_source, '', 'articles_metadata.csv'],

    [data_cleaned, '', 'group_clicks.csv'],
    [data_cleaned, '', 'group_items.csv'],
    [data_cleaned, '', 'item_clicks.csv'],
    [data_cleaned, '', 'item_features_shape.csv'],
    [data_cleaned, '', 'item_features.csv'],
    [data_cleaned, '', 'item_metadata_nunique.csv'],
    [data_cleaned, '', 'top_group_clicks.csv'],
    [data_cleaned, '', 'top_item_clicks.csv'],
    [data_cleaned, '', 'user_interactions_nunique.csv'],
    [data_cleaned, '', 'user_interactions_with_groups.csv'],
]

for item in source_list:
    src = os.path.join(item[0], item[1], item[2])
    dst = os.path.join(data_target, item[2])
    if not os.path.exists(dst):
        print(f"creating file: {dst}")
    else:
        print(f"file: {dst}, already exists, overwriting")


file: ../irfantoor-recommend/recommend/data/100k.txt, already exists, overwriting
file: ../irfantoor-recommend/recommend/data/articles_embeddings.pickle, already exists, overwriting
file: ../irfantoor-recommend/recommend/data/articles_metadata.csv, already exists, overwriting
file: ../irfantoor-recommend/recommend/data/group_clicks.csv, already exists, overwriting
file: ../irfantoor-recommend/recommend/data/group_items.csv, already exists, overwriting
file: ../irfantoor-recommend/recommend/data/item_clicks.csv, already exists, overwriting
file: ../irfantoor-recommend/recommend/data/item_features_shape.csv, already exists, overwriting
file: ../irfantoor-recommend/recommend/data/item_features.csv, already exists, overwriting
file: ../irfantoor-recommend/recommend/data/item_metadata_nunique.csv, already exists, overwriting
file: ../irfantoor-recommend/recommend/data/top_group_clicks.csv, already exists, overwriting
file: ../irfantoor-recommend/recommend/data/top_item_clicks.csv, already e

In [17]:
# summary.json

import json

n_items = item_metadat_nunique['item']
n_clicked_items = user_interactions_nunique['item']
n_groups = item_metadat_nunique['group']
n_clicked_groups = user_interactions_nunique['group']

summary = {
    "n_items": int(n_items),
    "n_groups": int(n_groups),
    "n_features": int(item_features.shape[1]),
    "n_users": int(user_interactions_nunique['user']),
    "n_clicked_items": int(n_clicked_items),
    "n_clicked_items_percent": f"%2.2f%%"%(n_clicked_items/n_items*100),
    "n_clicked_groups": int(n_clicked_groups),
    "n_clicked_groups_percent": f"%2.2f%%"%(n_clicked_groups/n_groups*100),
}

with open(os.path.join(data_target, "summary.json"), "w+") as fp:
    json.dump(summary, fp)

summary

{'n_items': 364047,
 'n_groups': 461,
 'n_features': 100,
 'n_users': 322897,
 'n_clicked_items': 46033,
 'n_clicked_items_percent': '12.64%',
 'n_clicked_groups': 316,
 'n_clicked_groups_percent': '68.55%'}