In [1]:
import pandas as pd
import numpy as np 
from scipy import sparse

In [2]:
def abrir_e_criar_lista(diretorio, separador, coluna):
    """Read a delimited text file and return one of its columns as a list.

    Parameters
    ----------
    diretorio : str
        Path of the file to read. It is decoded as ISO-8859-1.
    separador : str
        Field separator used to split every line.
    coluna : int
        Zero-based index of the field to keep from each line.

    Returns
    -------
    list of str
        The selected field of every line, in file order. Values are not
        converted or stripped, so the last field of a line keeps its
        trailing newline.

    Raises
    ------
    IndexError
        If a line has fewer than ``coluna + 1`` fields.
    """
    # The context manager closes the handle even when a malformed line raises;
    # the previous version opened the file and never closed it.
    with open(diretorio, 'r', encoding='iso-8859-1') as arquivo:
        return [linha.split(separador)[coluna] for linha in arquivo]

In [3]:
# Tab-separated ratings file of the ml-100k data set.
# NOTE(review): absolute, machine-specific path - consider a configurable data directory.
caminho_u_data = '/home/laryssacosta/workspace/python-warmup/ml-100k/u.data'

# Column 0 is the userID, column 1 the movieID and column 2 the rating.
userID_data, movieID_data, ratings_data = (
    abrir_e_criar_lista(caminho_u_data, '\t', indice_coluna)
    for indice_coluna in range(3)
)

In [4]:
# Stack the three column lists into one 2-D array: one row per field
# (userID, movieID, rating), one column per rating record. Values are still text.
array_data = np.array([userID_data, movieID_data, ratings_data])

In [5]:
# Sanity check: expect (number of fields, number of rating records).
array_data.shape

(3, 100000)

In [6]:
# Inspect the ratings with pandas: transpose the array so that every rating
# record becomes a row, and name the three columns.
df_data = pd.DataFrame(array_data.T, columns=['userID', 'movieID', 'rating'])
df_data

Unnamed: 0,userID,movieID,rating
0,196,242,3
1,186,302,3
2,22,377,1
3,244,51,2
4,166,346,1
...,...,...,...
99995,880,476,3
99996,716,204,5
99997,276,1090,1
99998,13,225,2


In [7]:
# Pipe-separated movie catalogue of the ml-100k data set.
# NOTE(review): absolute, machine-specific path - consider a configurable data directory.
caminho_u_item = '/home/laryssacosta/workspace/python-warmup/ml-100k/u.item'

# Field 0 holds the movie ID and field 1 the movie title.
movieID_unicos = abrir_e_criar_lista(caminho_u_item, '|', 0)
titulo_movie = abrir_e_criar_lista(caminho_u_item, '|', 1)

# Peek at the first five titles.
titulo_movie[:5]

['Toy Story (1995)',
 'GoldenEye (1995)',
 'Four Rooms (1995)',
 'Get Shorty (1995)',
 'Copycat (1995)']

In [8]:
# 2-D array with the movie IDs in the first row and the movie titles in the second.
array_item = np.array([movieID_unicos, titulo_movie]) # movie IDs and movie titles
array_item

array([['1', '2', '3', ..., '1680', '1681', '1682'],
       ['Toy Story (1995)', 'GoldenEye (1995)', 'Four Rooms (1995)', ...,
        'Sliding Doors (1998)', 'You So Crazy (1994)',
        'Scream of Stone (Schrei aus Stein) (1991)']], dtype='<U81')

In [9]:
# One row per movie. The ID column is called 'movieID', the same name used in
# df_data, so that the two frames can later be merged on it.
df_item = pd.DataFrame(array_item.T, columns=['movieID', 'titulo'])
df_item

Unnamed: 0,movieID,titulo
0,1,Toy Story (1995)
1,2,GoldenEye (1995)
2,3,Four Rooms (1995)
3,4,Get Shorty (1995)
4,5,Copycat (1995)
...,...,...
1677,1678,Mat' i syn (1997)
1678,1679,B. Monkey (1998)
1679,1680,Sliding Doors (1998)
1680,1681,You So Crazy (1994)


In [10]:
# Attach the movie title to every rating. 'movieID' is the only column the two
# frames share, so it is spelled out as the join key; inner join is the default.
df_data_item = df_data.merge(df_item, on='movieID', how='inner')
df_data_item

Unnamed: 0,userID,movieID,rating,titulo
0,196,242,3,Kolya (1996)
1,63,242,3,Kolya (1996)
2,226,242,5,Kolya (1996)
3,154,242,3,Kolya (1996)
4,306,242,5,Kolya (1996)
...,...,...,...,...
99995,840,1674,4,Mamma Roma (1962)
99996,655,1640,3,"Eighth Day, The (1996)"
99997,655,1637,3,Girls Town (1996)
99998,655,1630,3,"Silence of the Palace, The (Saimt el Qusur) (1..."


In [11]:
# The ratings were read from a text file, so they are still strings here;
# convert them to numbers before they are aggregated in the pivot table.
df_data_item['rating'] = pd.to_numeric(df_data_item['rating'])

In [12]:
# Build the user x movie matrix: one row per userID, one column per title,
# cells holding the rating (NaN where the user did not rate the movie).
# pivot_table aggregates repeated (userID, titulo) pairs, using the mean by default.
matriz_user_movie = df_data_item.pivot_table(
    index='userID',
    columns='titulo',
    values='rating',
)

In [13]:
# Display the user x movie rating matrix.
matriz_user_movie 

titulo,'Til There Was You (1997),1-900 (1994),101 Dalmatians (1996),12 Angry Men (1957),187 (1997),2 Days in the Valley (1996),"20,000 Leagues Under the Sea (1954)",2001: A Space Odyssey (1968),3 Ninjas: High Noon At Mega Mountain (1998),"39 Steps, The (1935)",...,Yankee Zulu (1994),Year of the Horse (1997),You So Crazy (1994),Young Frankenstein (1974),Young Guns (1988),Young Guns II (1990),"Young Poisoner's Handbook, The (1995)",Zeus and Roxanne (1997),unknown,Á köldum klaka (Cold Fever) (1994)
userID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,,,2.0,5.0,,,3.0,4.0,,,...,,,,5.0,3.0,,,,4.0,
10,,,,5.0,,,,5.0,,4.0,...,,,,,,,,,,
100,,,,,,,,,,,...,,,,,,,,,,
101,,,3.0,,,,,,,,...,,,,,,,,,,
102,,,,,,,,,,,...,,,,4.0,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,,,,5.0,,,4.0,3.0,,,...,,,,4.0,4.0,2.0,,,,
96,,,,,,,,,,,...,,,,,,,,,,
97,,,,,,,,5.0,,,...,,,,5.0,,,,,,
98,,,,,,,,,,,...,,,,,,,,,,


In [14]:
# Keep only the movies (columns) that have at least 20 non-missing ratings.
# No NaN is replaced in this step; that happens later with fillna.
matriz_user_movie = matriz_user_movie.dropna(axis='columns', thresh=20)

In [15]:
# Preview the first rows after dropping the rarely rated movies.
matriz_user_movie.head()

titulo,101 Dalmatians (1996),12 Angry Men (1957),187 (1997),2 Days in the Valley (1996),"20,000 Leagues Under the Sea (1954)",2001: A Space Odyssey (1968),"39 Steps, The (1935)",8 1/2 (1963),Absolute Power (1997),"Abyss, The (1989)",...,Wishmaster (1997),With Honors (1994),"Wizard of Oz, The (1939)",Wolf (1994),"Wrong Trousers, The (1993)",Wyatt Earp (1994),Young Frankenstein (1974),Young Guns (1988),Young Guns II (1990),"Young Poisoner's Handbook, The (1995)"
userID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,2.0,5.0,,,3.0,4.0,,,,3.0,...,,,4.0,,5.0,,5.0,3.0,,
10,,5.0,,,,5.0,4.0,,,4.0,...,,,5.0,,,,,,,
100,,,,,,,,,,,...,,,,,,,,,,
101,3.0,,,,,,,,,,...,,,,,,,,,,
102,,,,,,,,,,3.0,...,,,,,,,4.0,,,


In [17]:
# Replace every remaining NaN (movie not rated by the user) with 0.
# NOTE(review): from here on an unrated movie is indistinguishable from a rating
# of 0, and the similarity computations below use these zero-filled values.
matriz_user_movie = matriz_user_movie.fillna(0) # put 0 wherever there is a NaN
matriz_user_movie.head()

titulo,101 Dalmatians (1996),12 Angry Men (1957),187 (1997),2 Days in the Valley (1996),"20,000 Leagues Under the Sea (1954)",2001: A Space Odyssey (1968),"39 Steps, The (1935)",8 1/2 (1963),Absolute Power (1997),"Abyss, The (1989)",...,Wishmaster (1997),With Honors (1994),"Wizard of Oz, The (1939)",Wolf (1994),"Wrong Trousers, The (1993)",Wyatt Earp (1994),Young Frankenstein (1974),Young Guns (1988),Young Guns II (1990),"Young Poisoner's Handbook, The (1995)"
userID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,2.0,5.0,0.0,0.0,3.0,4.0,0.0,0.0,0.0,3.0,...,0.0,0.0,4.0,0.0,5.0,0.0,5.0,3.0,0.0,0.0
10,0.0,5.0,0.0,0.0,0.0,5.0,4.0,0.0,0.0,4.0,...,0.0,0.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
100,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
101,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
102,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,...,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0


In [18]:
# Mean normalisation (not a z-score): centre on the mean and scale by the range.
def padronizacao(linha):
    """Centre ``linha`` on its mean and divide by its range (max - min).

    Note that this is mean/range normalisation, not a z-score: the divisor is
    the range of the values, not their standard deviation.

    Parameters
    ----------
    linha : pandas.Series
        Values to normalise. When the function is passed to
        ``DataFrame.apply`` with the default axis, this is one column.

    Returns
    -------
    pandas.Series
        ``(linha - linha.mean()) / (linha.max() - linha.min())``. If all values
        are equal (range 0) the centred values are returned as zeros instead of
        the NaN that a 0/0 division would produce.
    """
    centrado = linha - linha.mean()
    amplitude = linha.max() - linha.min()
    # Only guard the scalar case; a non-scalar range (e.g. when a whole
    # DataFrame is passed) keeps the original element-wise division.
    if np.ndim(amplitude) == 0 and amplitude == 0:
        # Constant input: avoid 0/0 and report "no deviation from the mean".
        return centrado * 0.0
    return centrado / amplitude

In [19]:
# Apply the normalisation function defined above. axis=0 (the default) hands
# the function one column at a time, so the scaling is done per movie title.
matriz_user_movie_std = matriz_user_movie.apply(padronizacao, axis=0)

In [20]:
from sklearn.metrics.pairwise import cosine_similarity

## Filtro colaborativo: primeiro item-item, depois user-user

- primeiro item item

In [21]:
# cosine_similarity compares the rows of its input, so the normalised matrix is
# transposed to put one movie per row; the result is a movie x movie array.
item_similarity_cos_array = cosine_similarity(matriz_user_movie_std.T) # transposed normalised matrix

In [22]:
# Label the item-item cosine similarities with the movie titles on both axes
# (rows and columns must match to compare movie with movie). The labels come
# from the normalised matrix the similarities were computed from, and the frame
# is kept in a variable, mirroring how the user-user similarities are stored.
item_similarity_cos = pd.DataFrame(
    item_similarity_cos_array,
    index=matriz_user_movie_std.columns,
    columns=matriz_user_movie_std.columns,
)
item_similarity_cos

titulo,101 Dalmatians (1996),12 Angry Men (1957),187 (1997),2 Days in the Valley (1996),"20,000 Leagues Under the Sea (1954)",2001: A Space Odyssey (1968),"39 Steps, The (1935)",8 1/2 (1963),Absolute Power (1997),"Abyss, The (1989)",...,Wishmaster (1997),With Honors (1994),"Wizard of Oz, The (1939)",Wolf (1994),"Wrong Trousers, The (1993)",Wyatt Earp (1994),Young Frankenstein (1974),Young Guns (1988),Young Guns II (1990),"Young Poisoner's Handbook, The (1995)"
titulo,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
101 Dalmatians (1996),1.000000,0.059246,-0.001097,0.052877,0.128745,0.078260,0.015504,0.005750,0.221025,0.121153,...,0.013164,0.112220,0.124633,0.057741,0.074635,0.109054,0.155450,0.115195,0.039172,-0.005917
12 Angry Men (1957),0.059246,1.000000,-0.014343,0.066339,0.230274,0.298716,0.339195,0.174500,0.019790,0.156721,...,-0.037797,0.113281,0.287779,0.048741,0.170189,0.160140,0.290188,0.164959,0.079341,0.038111
187 (1997),-0.001097,-0.014343,1.000000,0.078770,-0.010333,-0.039939,-0.021414,-0.006246,0.127531,0.017270,...,0.229842,-0.021639,0.030288,0.084986,-0.025833,-0.000838,-0.021873,0.006811,0.053843,0.063789
2 Days in the Valley (1996),0.052877,0.066339,0.078770,1.000000,0.056283,0.090983,-0.019962,-0.008210,0.245191,0.129204,...,-0.029181,0.113570,0.069162,0.087568,0.028208,0.116497,0.061328,0.197620,0.176032,0.146777
"20,000 Leagues Under the Sea (1954)",0.128745,0.230274,-0.010333,0.056283,1.000000,0.384535,0.274525,0.118108,0.117512,0.231247,...,-0.008650,0.111092,0.338880,0.244088,0.101581,0.286848,0.309511,0.243306,0.057977,0.071111
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Wyatt Earp (1994),0.109054,0.160140,-0.000838,0.116497,0.286848,0.183051,0.137189,0.041004,0.121793,0.166022,...,-0.018134,0.193267,0.246124,0.185584,0.048645,1.000000,0.242159,0.336002,0.202880,0.018951
Young Frankenstein (1974),0.155450,0.290188,-0.021873,0.061328,0.309511,0.429298,0.205543,0.170677,0.100104,0.204965,...,-0.062768,0.129930,0.444625,0.190200,0.313386,0.242159,1.000000,0.294564,0.188973,0.051973
Young Guns (1988),0.115195,0.164959,0.006811,0.197620,0.243306,0.168440,0.057132,0.035488,0.136609,0.310994,...,-0.043065,0.342480,0.244091,0.331909,0.090600,0.336002,0.294564,1.000000,0.599633,0.037991
Young Guns II (1990),0.039172,0.079341,0.053843,0.176032,0.057977,0.071705,0.025560,0.022169,0.135415,0.210445,...,-0.030984,0.310381,0.107492,0.229470,0.033696,0.202880,0.188973,0.599633,1.000000,0.012340


- user user

In [23]:
# Rows of the normalised matrix are users, so no transpose is needed here:
# the result is a user x user array of cosine similarities.
user_similarity_cos_array = cosine_similarity(matriz_user_movie_std)

In [25]:
# Wrap the user-user cosine similarities in a DataFrame labelled with the
# userIDs on both axes.
ids_usuarios = matriz_user_movie_std.index
user_similarity_cos = pd.DataFrame(
    user_similarity_cos_array,
    index=ids_usuarios,
    columns=ids_usuarios,
)

In [26]:
# Display the user x user cosine similarity table.
user_similarity_cos

userID,1,10,100,101,102,103,104,105,106,107,...,94,940,941,942,943,95,96,97,98,99
userID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1.000000,0.178006,-0.318752,-0.226223,0.132059,-0.214724,-0.137771,-0.339268,-0.011707,-0.365434,...,0.314323,0.012126,-0.273305,-0.116517,0.179387,0.286619,0.073220,0.060490,-0.238304,-0.012014
10,0.178006,1.000000,-0.229760,-0.300916,0.008036,-0.221145,-0.169165,-0.268118,0.024535,-0.249104,...,0.226584,0.094162,-0.248245,-0.029100,-0.035952,0.163921,0.095661,0.054333,-0.150090,-0.093750
100,-0.318752,-0.229760,1.000000,0.036681,-0.077775,0.112515,0.313202,0.600075,-0.000114,0.472718,...,-0.320118,0.121722,0.206765,0.147925,-0.256714,-0.339362,-0.138929,-0.131855,0.167461,-0.031098
101,-0.226223,-0.300916,0.036681,1.000000,-0.142673,0.312585,0.136985,0.126928,-0.054859,0.164268,...,-0.257524,-0.171581,0.271561,-0.068324,-0.018433,-0.193674,-0.084367,-0.092007,0.103540,0.132460
102,0.132059,0.008036,-0.077775,-0.142673,1.000000,-0.091156,-0.109980,-0.114632,-0.174891,-0.164364,...,0.178645,0.026833,-0.133853,-0.064358,0.140316,0.148880,-0.007165,0.036193,-0.074618,-0.003681
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,0.286619,0.163921,-0.339362,-0.193674,0.148880,-0.208851,-0.247538,-0.417163,-0.029568,-0.423099,...,0.372986,0.027153,-0.310877,-0.025913,0.173934,1.000000,0.043117,0.059863,-0.236262,-0.017196
96,0.073220,0.095661,-0.138929,-0.084367,-0.007165,0.062305,-0.162895,-0.112220,-0.018528,-0.080525,...,0.017562,0.023318,-0.021039,0.048056,0.039918,0.043117,1.000000,0.126644,-0.005288,-0.068642
97,0.060490,0.054333,-0.131855,-0.092007,0.036193,-0.021983,-0.199783,-0.107597,-0.014802,-0.078239,...,-0.012960,0.056900,0.024477,0.041863,0.011986,0.059863,0.126644,1.000000,0.040461,-0.127295
98,-0.238304,-0.150090,0.167461,0.103540,-0.074618,0.180513,-0.036146,0.268451,0.189775,0.343039,...,-0.284980,-0.026047,0.213050,0.045268,-0.162336,-0.236262,-0.005288,0.040461,1.000000,-0.147770


- Correlação de Pearson

In [27]:
# DataFrame.corr correlates columns pairwise, so this yields a movie x movie
# Pearson correlation table. It is computed on the zero-filled rating matrix,
# not on the normalised one.
item_similarity_pearson = matriz_user_movie.corr(method='pearson')

In [28]:
# Transpose so that users become the columns, then correlate the columns
# pairwise: the result is a user x user Pearson correlation table.
user_similarity_pearson = matriz_user_movie.T.corr(method='pearson')

In [29]:
def get_similar_user(user_ID, rating, similaridade=None, n=5, incluir_proprio=True):
    """Return the users with the highest similarity to ``user_ID``, scaled by ``rating``.

    Parameters
    ----------
    user_ID : hashable
        Column label of the user in the similarity table.
    rating : number
        Factor every similarity value is multiplied by before sorting.
    similaridade : pandas.DataFrame, optional
        User x user similarity table to read from. Defaults to the
        notebook-level ``user_similarity_pearson`` when omitted.
    n : int, optional
        Number of entries to return (default 5).
    incluir_proprio : bool, optional
        When True (default, the original behaviour) the queried user is kept in
        the result; its self-similarity normally puts it in first place. Pass
        False to leave the queried user out.

    Returns
    -------
    pandas.Series
        The ``n`` largest scaled similarities, in descending order, indexed by
        the labels of the similarity table.

    Raises
    ------
    KeyError
        If ``user_ID`` is not a column of the similarity table.
    """
    if similaridade is None:
        # Fall back to the Pearson table built earlier in the notebook.
        similaridade = user_similarity_pearson

    pontuacao = similaridade[user_ID] * rating
    if not incluir_proprio:
        # errors='ignore' keeps this a no-op if the label is not in the index.
        pontuacao = pontuacao.drop(labels=user_ID, errors='ignore')

    return pontuacao.sort_values(ascending=False).head(n)


In [30]:
# The userIDs were read from text and never converted to numbers, so the user
# is addressed with the string '1'. The second argument scales the similarities.
user_1 = get_similar_user('1', 2)
user_1

userID
1      2.000000
738    0.877237
916    0.865305
823    0.823124
864    0.820653
Name: 1, dtype: float64