In [1]:
from lenskit.datasets import ML100K
from transform_data_representation import transform_dense_to_sparse_data
import pandas as pd
import numpy as np
from scipy import stats
import matplotlib.pyplot as plt
import os

Required directory 'C:\Users\Jorane Rogier\Documents\studie\year2\Research Internship\RecSysProject/output/' exists 
Required directory 'C:\Users\Jorane Rogier\Documents\studie\year2\Research Internship\RecSysProject/output//synthetic_data/' exists 
Required directory 'C:\Users\Jorane Rogier\Documents\studie\year2\Research Internship\RecSysProject/output//partitioned_data/' exists 


In [2]:
# Load input data
ml100k = ML100K('ml-100k')
ratings = ml100k.ratings
ratings = ratings[['user', 'item', 'rating']]
user_item_matrix = ratings.pivot(*ratings.columns)
user_item_matrix = user_item_matrix.fillna(0)
user_item_matrix.columns = user_item_matrix.columns.astype(str)
df = pd.DataFrame(user_item_matrix)

In [3]:
# Create Global Item Ranking (GIR), looping over all items, count how often the item is watched
GIR = {}
for col in df:
    # does not take into account movies that have not been watched
    summed_rating = df[col].sum()

    # only take into account items that have been rated more than 5 times
    if (df[col].astype(bool).sum(axis=0) > 100):
        GIR[col] = summed_rating

ranks_GIR = {k: v for k,v in sorted(GIR.items(), key=lambda item: item[1], reverse=True)}
ranks_GIR_items = [*ranks_GIR]
print(len(ranks_GIR_items))

334


In [28]:
# Compute user mainstreaminess (UM)
user_mainstreamnesses = {}
taus = []
for uid in range(len(df)):
    user_dict = {}
    for item_id in df.columns:
        rating = df.iloc[uid][item_id]
        # only take into account items that have been rated > 5 times
        if item_id in GIR:
            user_dict[item_id] = rating
    ranks_user = {k: v for k,v in sorted(user_dict.items(), key=lambda item: item[1], reverse=True)}
    ranks_user_items = [*ranks_user]
    # compute the mainstreaminess measure with Kendall's rank-order correlation
    tau, p_value = stats.kendalltau(ranks_GIR_items, ranks_user_items)
    user_mainstreamnesses[uid] = [tau, p_value]
    taus.append(round(tau, 3))
    pass


### Divide users into two groups, based on mainstreaminess score, and compare group-size characteristics

First test with cut-off points at tau = 0.02

In [15]:
# Get the user-ids which have tau < 0.02 (beyond mainstream users, bmu)
bmu = [k for k, v in user_mainstreamnesses.items() if float(v[0]) < 0.02]
print(len(bmu))

# Get the user-ids which have tau >= 0.02 (mainstream users, mu)
mu = [k for k, v in user_mainstreamnesses.items() if float(v[0]) >= 0.02]
print(len(mu))

273
670


In [35]:
df_bmu = df.iloc[bmu]
df_mu = df.iloc[mu]

Then, compare the characteristics.

In [42]:
# compute non-zero values per row for both dataframes
df_bmu['total'] = df.gt(0).sum(axis=1)
df_mu['total'] = df.gt(0).sum(axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_bmu['total'] = df.gt(0).sum(axis=1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_mu['total'] = df.gt(0).sum(axis=1)


In [50]:
print(df_mu['total'].min())
print(df_mu['total'].max())
print(df_mu['total'].mean())

20
685
97.27611940298507


In [49]:
print(df_bmu['total'].min())
print(df_bmu['total'].max())
print(df_bmu['total'].mean())

20
737
127.56410256410257


In [59]:
data = {'Characteristic': ['Min. #Ratings', 'Max. #Ratings', 'Mean #Ratings'],
                'Beyond-mainstream': [df_mu['total'].min(), df_mu['total'].max(), df_mu['total'].mean()],
                'Mainstream': [df_bmu['total'].min(), df_bmu['total'].max(), df_bmu['total'].mean()]}

compare_df = pd.DataFrame(data)
print(compare_df)

  Characteristic  Beyond-mainstream  Mainstream
0  Min. #Ratings          20.000000   20.000000
1  Max. #Ratings         685.000000  737.000000
2  Mean #Ratings          97.276119  127.564103
