In [1]:
import gc
import itertools
import math
import pickle
import re
import time

import numpy as np
import pandas as pd
import seaborn as sns

In [2]:
# Time the raw CSV load.
start_time = time.time()

rating_files = [
    'NetflixPrizeData/combined_data_1_v2.csv',
    'NetflixPrizeData/combined_data_2_v2.csv',
    'NetflixPrizeData/combined_data_3_v2.csv',
    'NetflixPrizeData/combined_data_4_v2.csv',
]


def _read_rating_part(csv_path):
    """Read one headerless ratings CSV and return it with float ratings.

    Source columns 0, 1 and 3 are kept and labelled CustomerID, Rating and
    MovieID respectively.
    """
    part = pd.read_csv(
        csv_path,
        header=None,
        names=['CustomerID', 'Rating', 'MovieID'],
        usecols=[0, 1, 3],
    )
    part['Rating'] = part['Rating'].astype(float)
    return part


# The four parts keep their individual names because the next cell combines them.
df1, df2, df3, df4 = (_read_rating_part(csv_path) for csv_path in rating_files)

print('Dataset 1 shape: {}'.format(df1.shape))
print('-Dataset examples-')
# Spot check: every 5,000,000th row of the first part.
print(df1.iloc[::5000000, :])

print('Time taken:'+str((time.time() - start_time))+' seconds')

Dataset 1 shape: (24053764, 3)
-Dataset examples-
          CustomerID  Rating  MovieID
0            1488844     3.0        1
5000000       501954     2.0      996
10000000      404654     5.0     1962
15000000      886608     2.0     2876
20000000     1193835     2.0     3825
Time taken:28.31346893310547 seconds


In [3]:
# Combine the four loaded parts into a single ratings frame. This is the full
# dataset; nothing is dropped at this stage.
start_time = time.time()

# One concat copies the data once (chained DataFrame.append recopied the
# growing frame at every step and no longer exists in pandas 2.x).
# ignore_index=True renumbers the rows 0..len(df)-1.
df = pd.concat([df1, df2, df3, df4], ignore_index=True)

print('Full dataset shape: {}'.format(df.shape))
print('-Dataset examples-')
# Spot check: every 5,000,000th row of the combined frame.
print(df.iloc[::5000000, :])

print('Time taken:'+str((time.time() - start_time))+' seconds')

Full dataset shape: (100480507, 3)
-Dataset examples-
           CustomerID  Rating  MovieID
0             1488844     3.0        1
5000000        501954     2.0      996
10000000       404654     5.0     1962
15000000       886608     2.0     2876
20000000      1193835     2.0     3825
25000000      1899206     3.0     4661
30000000       154804     4.0     5496
35000000      2078749     5.0     6274
40000000       450763     5.0     7057
45000000       102092     3.0     7991
50000000       220298     5.0     9023
55000000       550530     5.0    10042
60000000       222570     3.0    11038
65000000      1273080     5.0    11875
70000000      2026970     5.0    12676
75000000       506044     4.0    13582
80000000       353605     2.0    14453
85000000       664606     3.0    15116
90000000      2213715     3.0    16008
95000000      1589401     5.0    16879
100000000     2314006     4.0    17627
Time taken:27.63409996032715 seconds


In [4]:
start_time = time.time()

# Aggregations computed for each movie and each customer.
f = ['count', 'mean']

# Movies: rating count/mean per MovieID; movies whose count is below the
# 70th-percentile count are collected for removal.
df_movie_summary = df.groupby('MovieID')['Rating'].agg(f)
df_movie_summary.index = df_movie_summary.index.map(int)
movie_counts = df_movie_summary['count']
movie_benchmark = round(movie_counts.quantile(0.7), 0)
drop_movie_list = df_movie_summary.loc[movie_counts < movie_benchmark].index

print('Movie minimum times of review: {}'.format(movie_benchmark))

# Customers: same procedure keyed by CustomerID.
df_cust_summary = df.groupby('CustomerID')['Rating'].agg(f)
df_cust_summary.index = df_cust_summary.index.map(int)
cust_counts = df_cust_summary['count']
cust_benchmark = round(cust_counts.quantile(0.7), 0)
drop_cust_list = df_cust_summary.loc[cust_counts < cust_benchmark].index

print('Customer minimum times of review: {}'.format(cust_benchmark))

print('Time taken:'+str((time.time() - start_time))+' seconds')

Movie minimum times of review: 1948.0
Customer minimum times of review: 211.0
Time taken:28.476064920425415 seconds


In [5]:
start_time = time.time()

print('Original Shape: {}'.format(df.shape))

# A row is removed when its movie OR its customer is on a drop list; the
# survivors keep their original order and index labels.
in_dropped_movie = df['MovieID'].isin(drop_movie_list)
in_dropped_cust = df['CustomerID'].isin(drop_cust_list)
df = df[~(in_dropped_movie | in_dropped_cust)]

print('After Trim Shape: {}'.format(df.shape))
print('-Data Examples-')
# Spot check: every 5,000,000th remaining row.
print(df.iloc[::5000000, :])

print('Time taken:'+str((time.time() - start_time))+' seconds')

Original Shape: (100480507, 3)
After Trim Shape: (71833509, 3)
-Data Examples-
          CustomerID  Rating  MovieID
693           712664     5.0        3
6957956      1973032     4.0     1395
13899167      412139     5.0     2660
20822622     1503396     4.0     3925
27783299     2417320     2.0     5121
34824490     2551271     5.0     6240
41865304     2406150     4.0     7399
48683880     1305391     2.0     8782
55541104      528496     3.0    10158
62639089      599678     2.0    11376
69642938      964493     5.0    12612
76727431      829466     5.0    13923
83750446     2255251     4.0    14953
90844412     1097827     4.0    16169
97921470     1463885     5.0    17321
Time taken:68.24150514602661 seconds


In [6]:
start_time = time.time()

# Draw 1,000,000 rating rows from the trimmed data. The fixed random_state
# makes the draw, and everything derived from it, repeatable after a kernel
# restart; change the seed to obtain a different sample.
sample_50_df = df.sample(n=1000000, random_state=42)
# Constant marker column (value 1 on every sampled row), inserted as the
# fourth column.
sample_50_df.insert(3, 'fill', 1)
sample_50_df.sort_values(by=['CustomerID', 'MovieID'], inplace=True)

print('Time taken:'+str((time.time() - start_time))+' seconds')

Time taken:7.100744009017944 seconds


In [7]:
# Display the sampled ratings (sorted by CustomerID, then MovieID).
sample_50_df

Unnamed: 0,CustomerID,Rating,MovieID,fill
649632,6,5.0,175,1
5365839,6,3.0,1096,1
7029960,6,4.0,1406,1
14843182,6,4.0,2862,1
16309033,6,1.0,3151,1
...,...,...,...,...
13213316,2649429,3.0,2499,1
37780743,2649429,5.0,6720,1
57603045,2649429,4.0,10550,1
68012972,2649429,5.0,12343,1


In [8]:
# Distinct customers in the trimmed data, then in the sample.
for ratings_frame in (df, sample_50_df):
    print(len(ratings_frame['CustomerID'].unique()))

144380
142458


In [9]:
# Distinct movies in the trimmed data, then in the sample.
for ratings_frame in (df, sample_50_df):
    print(len(ratings_frame['MovieID'].unique()))

5332
5332


In [10]:
# One row per customer, holding the array of distinct MovieIDs that customer
# has in the sample.
gb = sample_50_df.groupby(['CustomerID'])
result = gb['MovieID'].unique().reset_index()
result

Unnamed: 0,CustomerID,MovieID
0,6,"[175, 1096, 1406, 2862, 3151, 7158, 7643, 1073..."
1,7,"[3905, 6350, 6971, 7230, 7586, 8105, 8254, 842..."
2,10,[11152]
3,79,"[1615, 1719, 2735, 3422, 3573, 3624, 8301, 868..."
4,97,"[3807, 11198, 12195, 12672]"
...,...,...
142453,2649370,"[3917, 7635, 11283, 16075]"
142454,2649378,"[3925, 11279, 11521]"
142455,2649388,"[357, 8954]"
142456,2649426,"[273, 4745, 5327, 5807, 9232, 14898, 15968]"


In [11]:
# Column dtypes of the per-customer table.
result.dtypes

CustomerID     int64
MovieID       object
dtype: object

In [12]:
# CustomerID -> array of that customer's MovieIDs, built row by row from `result`.
feature_dict = {
    cust_id: movie_ids
    for cust_id, movie_ids in zip(result['CustomerID'], result['MovieID'])
}
# Look up one customer as a sanity check.
feature_dict[6]

array([  175,  1096,  1406,  2862,  3151,  7158,  7643, 10730, 12513,
       13471, 16997])

In [13]:
# Peek at the first five entries of feature_dict; islice stops after five
# items instead of materialising the whole dictionary. `itertools` comes from
# the notebook's import cell rather than being imported mid-notebook.
out = dict(itertools.islice(feature_dict.items(), 5))
print("First 5 entries in dict : " + str(out))


First 5 entries in dict : {6: array([  175,  1096,  1406,  2862,  3151,  7158,  7643, 10730, 12513,
       13471, 16997]), 7: array([ 3905,  6350,  6971,  7230,  7586,  8105,  8254,  8428, 12293,
       16233, 16922]), 10: array([11152]), 79: array([ 1615,  1719,  2735,  3422,  3573,  3624,  8301,  8687, 12329,
       12902, 14648, 15818, 15871, 16930, 17302, 17697]), 97: array([ 3807, 11198, 12195, 12672])}


In [14]:
# Number of customers that have an entry in feature_dict.
len(feature_dict)

142458

In [15]:
# Rating distribution within the sample: row count per rating value and its
# share of all sampled rows, in percent.
x = sample_50_df.groupby('Rating')['Rating'].agg(['count'])
sample_total = x['count'].sum()
x['percentage'] = x['count'].div(sample_total).mul(100)

print(x)
# Column totals.
print(x.sum(axis='index', skipna=True))

         count  percentage
Rating                    
1.0      40284      4.0284
2.0     100000     10.0000
3.0     299420     29.9420
4.0     341334     34.1334
5.0     218962     21.8962
count         1000000.0
percentage        100.0
dtype: float64


In [16]:
# Same distribution table for the whole trimmed dataset, for comparison with
# the sample.
x = df.groupby('Rating')['Rating'].agg(['count'])

all_ratings_total = x['count'].sum()
x['percentage'] = x['count'].div(all_ratings_total).mul(100)

print(x)
# Column totals.
print(x.sum(axis='index', skipna=True))

           count  percentage
Rating                      
1.0      2908297    4.048663
2.0      7214331   10.043128
3.0     21468990   29.887152
4.0     24516837   34.130084
5.0     15725054   21.890973
count         71833509.0
percentage         100.0
dtype: float64


In [22]:
start_time = time.time()

# Customer x Movie table of the `fill` marker: cells with a sampled rating
# hold the marker value, all other cells are filled with 0.
df_p_for_nparray = sample_50_df.pivot_table(
    values='fill',
    index='CustomerID',
    columns='MovieID',
    fill_value=0.0,
)

print('Time taken:'+str((time.time() - start_time))+' seconds')
df_p_for_nparray

MovieID,3,8,16,17,18,26,28,30,33,44,...,17741,17743,17751,17756,17758,17761,17762,17763,17764,17769
CustomerID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
6,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
10,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
79,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
97,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2649370,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2649378,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2649388,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
2649426,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [24]:
# Convert the Customer x Movie indicator table to a plain NumPy array
# (row and column labels are not carried over).
np_array = df_p_for_nparray.to_numpy()
np_array

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [None]:
# (row indices, column indices) of every cell equal to 1.
np.nonzero(np_array == 1)

In [18]:
# Movie x Customer table of ratings. No fill_value is given, so cells without
# a sampled rating stay NaN instead of becoming 0.
start_time = time.time()

df_p_for_dict = sample_50_df.pivot_table(
    values='Rating',
    index='MovieID',
    columns='CustomerID',
)

print('Time taken:'+str((time.time() - start_time))+' seconds')
df_p_for_dict

CustomerID,6,7,10,79,97,134,169,188,195,199,...,2649299,2649308,2649328,2649331,2649336,2649370,2649378,2649388,2649426,2649429
MovieID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
3,,,,,,,,,,,...,,,,,,,,,,
8,,,,,,,,,,,...,,,,,,,,,,
16,,,,,,,,,,,...,,,,,,,,,,
17,,,,,,,,,,,...,,,,,,,,,,
18,,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17761,,,,,,,,,,,...,,,,,,,,,,
17762,,,,,,,,,,,...,,,,,,,,,,
17763,,,,,,,,,,,...,,,,,,,,,,
17764,,,,,,,,,,,...,,,,,,,,,,


In [27]:
# Row-wise (per-MovieID) mean of the rating pivot.
# NOTE(review): the original remark "old values with 0" suggests this was last
# run against a zero-filled version of the pivot; with NaN for missing
# ratings the means would differ — re-run to confirm.
df_p_for_dict.mean(axis=1)

MovieID
3        0.000611
8        0.002793
16       0.000856
17       0.001095
18       0.003615
           ...   
17761    0.001109
17762    0.010198
17763    0.000625
17764    0.019933
17769    0.001228
Length: 5332, dtype: float64

In [19]:
start_time = time.time()

# Work on a copy so the original pivot stays untouched.
df_p_copy = df_p_for_dict.copy()

# The pivot has MovieID rows and CustomerID columns, so the column-wise mean
# is per customer and the row-wise mean is per movie.
customer_means = df_p_copy.mean(axis=0)
movie_means = df_p_copy.mean(axis=1)
for mean_series in (customer_means, movie_means):
    print(mean_series)

# Plain-dict form of the per-movie means (MovieID -> mean rating).
movie_means_dict = movie_means.to_dict()
print(movie_means_dict)
print('Time taken:'+str((time.time() - start_time))+' seconds')

CustomerID
6          3.454545
7          3.818182
10         4.000000
79         3.125000
97         4.000000
             ...   
2649370    3.250000
2649378    4.000000
2649388    3.000000
2649426    4.285714
2649429    4.500000
Length: 142458, dtype: float64
MovieID
3        3.600000
8        2.874016
16       3.066667
17       2.870370
18       3.762821
           ...   
17761    2.833333
17762    3.572519
17763    3.550000
17764    3.842801
17769    2.717647
Length: 5332, dtype: float64


In [37]:
# Convert the per-movie mean Series to a dict keyed by MovieID and print all
# of it (one entry per movie, so the output is long).
movie_means_dict = movie_means.to_dict()
print(movie_means_dict)

2.875, 15155: 4.0, 15156: 3.5912322274881516, 15158: 2.977272727272727, 15160: 3.4878048780487805, 15163: 3.659217877094972, 15164: 4.030769230769231, 15170: 2.8333333333333335, 15171: 3.0275229357798166, 15177: 3.2950819672131146, 15181: 3.265432098765432, 15182: 3.2298850574712645, 15183: 4.204819277108434, 15186: 3.0294117647058822, 15200: 3.308411214953271, 15205: 3.3853503184713376, 15209: 3.8312958435207825, 15211: 3.3846153846153846, 15223: 3.048, 15224: 2.9615384615384617, 15233: 3.1710296684118675, 15234: 2.96875, 15237: 2.875, 15243: 3.142857142857143, 15244: 2.672727272727273, 15246: 3.663316582914573, 15256: 3.0256410256410255, 15257: 3.5555555555555554, 15260: 2.230769230769231, 15267: 4.090909090909091, 15272: 2.8035714285714284, 15276: 3.769230769230769, 15282: 3.6666666666666665, 15285: 3.033333333333333, 15289: 2.738095238095238, 15294: 4.222222222222222, 15296: 4.557951482479784, 15300: 3.189189189189189, 15305: 3.838709677419355, 15307: 4.3578947368421055, 15309: 3.1

In [30]:
# Subtract each movie's mean from its row (movie_means is aligned on the
# MovieID index); cells that were NaN stay NaN.
sub_df = df_p_copy.subtract(movie_means, axis='index')
sub_df

CustomerID,6,7,10,79,97,134,169,188,195,199,...,2649299,2649308,2649328,2649331,2649336,2649370,2649378,2649388,2649426,2649429
MovieID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
3,,,,,,,,,,,...,,,,,,,,,,
8,,,,,,,,,,,...,,,,,,,,,,
16,,,,,,,,,,,...,,,,,,,,,,
17,,,,,,,,,,,...,,,,,,,,,,
18,,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17761,,,,,,,,,,,...,,,,,,,,,,
17762,,,,,,,,,,,...,,,,,,,,,,
17763,,,,,,,,,,,...,,,,,,,,,,
17764,,,,,,,,,,,...,,,,,,,,,,


In [31]:
# Swap rows and columns of sub_df. The name is rebound to the transposed
# frame, so running this cell a second time swaps the axes back.
sub_df = sub_df.T
sub_df

MovieID,3,8,16,17,18,26,28,30,33,44,...,17741,17743,17751,17756,17758,17761,17762,17763,17764,17769
CustomerID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
6,,,,,,,,,,,...,,,,,,,,,,
7,,,,,,,,,,,...,,,,,,,,,,
10,,,,,,,,,,,...,,,,,,,,,,
79,,,,,,,,,,,...,,,,,,,,,,
97,,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2649370,,,,,,,,,,,...,,,,,,,,,,
2649378,,,,,,,,,,,...,,,,,,,,,,
2649388,,,,,,,,,,,...,,,,,,,,,,
2649426,,,,,,,,,,,...,,,,,,,,,,


In [34]:
# Fill every remaining NaN in sub_df, in place, using movie_means (a Series
# keyed by MovieID, so each column receives the value for its own label).
# NOTE(review): the non-missing cells of sub_df are rating-minus-movie-mean
# differences, while the filled cells receive the raw movie mean, so the two
# are on different scales — presumably 0 was intended for a centred matrix;
# confirm before using this table.
sub_df.fillna(value=movie_means, inplace= True)

In [36]:
# Display sub_df after the fill step.
sub_df

MovieID,3,8,16,17,18,26,28,30,33,44,...,17741,17743,17751,17756,17758,17761,17762,17763,17764,17769
CustomerID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
6,3.6,2.874016,3.066667,2.87037,3.762821,2.787234,3.85618,3.73494,4.189873,3.689655,...,3.15,3.014925,3.655172,3.77381,2.428571,2.833333,3.572519,3.55,3.842801,2.717647
7,3.6,2.874016,3.066667,2.87037,3.762821,2.787234,3.85618,3.73494,4.189873,3.689655,...,3.15,3.014925,3.655172,3.77381,2.428571,2.833333,3.572519,3.55,3.842801,2.717647
10,3.6,2.874016,3.066667,2.87037,3.762821,2.787234,3.85618,3.73494,4.189873,3.689655,...,3.15,3.014925,3.655172,3.77381,2.428571,2.833333,3.572519,3.55,3.842801,2.717647
79,3.6,2.874016,3.066667,2.87037,3.762821,2.787234,3.85618,3.73494,4.189873,3.689655,...,3.15,3.014925,3.655172,3.77381,2.428571,2.833333,3.572519,3.55,3.842801,2.717647
97,3.6,2.874016,3.066667,2.87037,3.762821,2.787234,3.85618,3.73494,4.189873,3.689655,...,3.15,3.014925,3.655172,3.77381,2.428571,2.833333,3.572519,3.55,3.842801,2.717647
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2649370,3.6,2.874016,3.066667,2.87037,3.762821,2.787234,3.85618,3.73494,4.189873,3.689655,...,3.15,3.014925,3.655172,3.77381,2.428571,2.833333,3.572519,3.55,3.842801,2.717647
2649378,3.6,2.874016,3.066667,2.87037,3.762821,2.787234,3.85618,3.73494,4.189873,3.689655,...,3.15,3.014925,3.655172,3.77381,2.428571,2.833333,3.572519,3.55,3.842801,2.717647
2649388,3.6,2.874016,3.066667,2.87037,3.762821,2.787234,3.85618,3.73494,4.189873,3.689655,...,3.15,3.014925,3.655172,3.77381,2.428571,2.833333,3.572519,3.55,3.842801,2.717647
2649426,3.6,2.874016,3.066667,2.87037,3.762821,2.787234,3.85618,3.73494,4.189873,3.689655,...,3.15,3.014925,3.655172,3.77381,2.428571,2.833333,3.572519,3.55,3.842801,2.717647


In [None]:
# Random 50-row by 50-column excerpt of sub_df: rows are drawn first, then
# columns from those rows.
sample = sub_df.sample(n=50).sample(n=50, axis='columns')
sample

In [99]:
# Scratch experiment with filling missing entries from movie_means.
# NOTE(review): `f` rebinds the name that the summary cell uses for its
# ['count', 'mean'] aggregation list.
# NOTE(review): a Series is passed as the replacement value here — confirm
# DataFrame.replace handles that the way this cell expects.
f = sub_df.replace(np.nan, movie_means)
# This bare expression is not the last statement of the cell, so it displays nothing.
f
# For each row label of sub_df, write movie_means[label] into column 10.
# NOTE(review): this looks written for a frame whose row labels are MovieIDs;
# after the transpose cell sub_df's row labels are CustomerIDs, so the lookup
# may fail or pick unrelated values — confirm the intended execution order.
for index in sub_df.index:
    sub_df.loc[index,10] = movie_means[index]

CustomerID,6,7,10,79,97,116,134,169,188,195,...,2649308,2649328,2649331,2649335,2649336,2649370,2649378,2649388,2649426,2649429
MovieID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
3,,,,3.75,3.854167,,,,2.590909,,...,,,,,,,,,,
8,,,,3.75,3.854167,,,,2.590909,,...,,,,,,,,,,
16,,,,3.75,3.854167,,,,2.590909,,...,,,,,,,,,,
17,,,,3.75,3.854167,,,,2.590909,,...,,,,,,,,,,
18,,,,3.75,3.854167,,,,2.590909,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17761,,,,3.75,3.854167,,,,2.590909,,...,,,,,,,,,,
17762,,,,3.75,3.854167,,,,2.590909,,...,,,,,,,,,,
17763,,,,3.75,3.854167,,,,2.590909,,...,,,,,,,,,,
17764,,,,3.75,3.854167,,,,2.590909,,...,,,,,,,,,,


In [None]:
# Convert the rating pivot to a nested dict (outer keys are the column
# labels, inner keys the row labels) and display it.
# NOTE(review): the pivot has one cell per movie/customer pair, so both the
# dict and its printed form will be very large — consider inspecting a slice.
df_dict = df_p_for_dict.to_dict()
df_dict 