In [1]:
import pandas as pd
import numpy as np
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from time import time
import matplotlib.pyplot as plt
import random
import datetime

In [2]:
"""

This cell reads the ml-100k data files and puts them into dataframes
which are later fed into the Naive Bayes model

"""


# --- Users table: pipe-separated, one row per user --------------------------
u_cols = ['user_id', 'age', 'sex', 'occupation', 'zip_code']
users = pd.read_csv(
    'ml-100k/u.user',
    sep='|',
    names=u_cols,
    encoding='latin-1',
)

# --- Ratings table: tab-separated, one row per (user, movie) rating ---------
r_cols = ['user_id', 'movie_id', 'rating', 'unix_timestamp']
rating = pd.read_csv(
    'ml-100k/u.data',
    sep='\t',
    names=r_cols,
    encoding='latin-1',
)

# --- Movies table: pipe-separated, metadata columns then genre-named columns
# NOTE: 'Romance ' carries a trailing space; it is kept verbatim because it is
# the column name the rest of the notebook would have to use.
item_cols = [
    'movie_id', 'movie title', 'release date', 'video release date',
    'IMDb URL', 'unknown', 'Action', 'Adventure', 'Animation', 'Childrens',
    'Comedy', 'Crime', 'Documentary', 'Drama', 'Fantasy', 'Film-Noir',
    'Horror', 'Musical', 'Mystery', 'Romance ', 'Sci-Fi', 'Thriller',
    'War', 'Western',
]
movies = pd.read_csv(
    'ml-100k/u.item',
    sep='|',
    names=item_cols,
    encoding='latin-1',
)

# Movie "profile": only the id, the title and the release date are kept.
movies_prof = movies.loc[:, ['movie_id', 'movie title', 'release date']].copy()

# Same profile with the release date parsed into datetimes and the rows
# ordered by that date.
movies_prof1 = movies_prof.copy()
movies_prof1['release date'] = pd.to_datetime(movies_prof1['release date'])
movies_prof1 = movies_prof1.sort_values(by='release date')

In [3]:
"""

This joins each user's profile with the movies that they watched and rated.
The result is stored in the dataframe df_users, which only keeps the movies that a user
rated 3 or higher

"""

# Working copy of the ratings; a later cell groups `df2` by user again.
df2 = rating.copy()

# Users with an id below this value form the training set; everyone else is
# held out for the prediction cells.
first_test_user_id = 564

# Attach the user profile to every rating in a single merge.  The previous
# version merged one user at a time and grew the result with
# DataFrame.append, which was removed in pandas 2.0 and re-copied the
# accumulated frame on every iteration.
user_movie = pd.merge(users, df2, on='user_id')

# Keep only ratings of 3 or higher that belong to training users.  The filter
# is applied on user_id directly instead of slicing the first N rows, so it no
# longer depends on the rows being ordered by user.
is_training_row = (
    (user_movie['rating'] >= 3)
    & (user_movie['user_id'] < first_test_user_id)
)
df_users = (
    user_movie.loc[is_training_row, ['user_id', 'age', 'sex', 'movie_id']]
    .sort_values('user_id', kind='stable')  # same user-by-user order as before
    .reset_index(drop=True)
)

# Encode sex as 1 for 'M' and 0 otherwise, the same rule the prediction cells
# apply to a single user.  The column is assigned back explicitly: the old
# chained `df_users.sex.replace(..., inplace=True)` acts on a temporary object
# and does not update df_users when pandas copy-on-write is enabled.
df_users['sex'] = (df_users['sex'] == 'M').astype(int)

In [4]:
'''

This section turns df_users into the training data: the user attributes (df_user) are the features x,
while the movie ids (labels_train) are the labels y

'''

# Sort the training rows by movie once and derive both the features and the
# labels from that single ordering, so row i of the features always belongs to
# label i.  (Previously the labels and the features were sorted separately,
# and two assignments to labels_train were dead code.)
train_sorted = df_users.sort_values(by='movie_id')

# y: the movie id of every training row, indexed by the user who rated it.
# pd.to_numeric is applied to the whole column at once; values that cannot be
# parsed become NaN (errors='coerce').
labels_train = pd.Series(
    pd.to_numeric(train_sorted['movie_id'], errors='coerce').values,
    index=train_sorted['user_id'],
    name='movie_id',
)

# X: every remaining column of df_users, indexed by user_id.
df_user = train_sorted.drop(columns=['movie_id']).set_index('user_id')

# Distinct movie ids present in the training labels, in order of first
# appearance (ascending, because the rows are sorted by movie id).
movies1 = labels_train.unique().tolist()

In [5]:
'''

This creates the Multinomial Naive Bayes model and trains it against the 563 user training set and the 
movies that they watched where they rated the movie with a 3 or higher.

'''
# Multinomial Naive Bayes classifier with default settings.
# (The variable is named GNB although the estimator is MultinomialNB.)
GNB = MultinomialNB()

# Timestamp taken immediately before training starts.
t0 = time()

# Features: df_user, labels: labels_train.  The call is left as the final
# expression so the fitted estimator is shown as the cell's output.
GNB.fit(X=df_user, y=labels_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

# Predictions
This section predicts the movies that users should watch using the Multinomial Naive Bayes model.

In [6]:
# Every movie-table column name except 'movie_id' and 'movie title': three
# metadata fields followed by the genre names.  'Romance ' keeps its trailing
# space so it matches the column name used when the movie file is read.
# This list is not referenced by the other cells shown in this notebook.
_movie_metadata_fields = ['release date', 'video release date', 'IMDb URL']
_movie_genre_names = [
    'unknown', 'Action', 'Adventure', 'Animation', 'Childrens', 'Comedy',
    'Crime', 'Documentary', 'Drama', 'Fantasy', 'Film-Noir', 'Horror',
    'Musical', 'Mystery', 'Romance ', 'Sci-Fi', 'Thriller', 'War', 'Western',
]
genre = _movie_metadata_fields + _movie_genre_names
#valid_users = [564, 566, 567, 568, 569, 570, 571, 572, 573, 575, 576, 577, 578, 579, 580, 581, 582, 583, 584, 586, 588, 589, 590, 591, 593, 594, 595, 596, 597, 598, 599, 600, 601, 602, 603, 604, 605, 606, 607, 608, 610, 611, 612, 613, 614, 615, 616, 617, 618, 619, 620, 621, 623, 624, 625, 626, 627, 628, 629, 630, 631, 632, 633, 634, 635, 636, 638, 639, 640, 641, 643, 645, 646, 647, 649, 651, 652, 653, 654, 656, 657, 658, 659, 660, 661, 664, 665, 666, 667, 668, 669, 670, 671, 672, 673, 674, 677, 678, 679, 680, 683, 684, 685, 686, 687, 688, 689, 690, 691, 692, 693, 695, 696, 697, 698, 700, 701, 702, 703, 704, 706, 709, 710, 711, 712, 714, 715, 716, 717, 718, 719, 720, 721, 722, 723, 725, 726, 728, 729, 730, 731, 732, 734, 735, 736, 737, 738, 740, 741, 742, 743, 744, 745, 746, 748, 749, 750, 752, 753, 754, 755, 757, 759, 760, 763, 764, 765, 766, 767, 768, 769, 770, 771, 772, 773, 774, 775, 777, 778, 779, 780, 781, 783, 784, 785, 786, 788, 789, 790, 791, 792, 794, 795, 796, 797, 800, 801, 802, 803, 806, 807, 808, 809, 810, 811, 812, 813, 814, 815, 816, 817, 818, 819, 820, 821, 822, 823, 824, 825, 826, 827, 829, 830, 831, 832, 834, 836, 837, 838, 841, 843, 844, 848, 849, 850, 852, 853, 855, 856, 857, 858, 859, 860, 861, 862, 865, 866, 872, 873, 874, 875, 876, 877, 878, 879, 881, 884, 885, 886, 888, 889, 890, 891, 892, 893, 895, 897, 898, 899, 900, 902, 903, 904, 905, 906, 907, 908, 909, 910, 911, 912, 913, 914, 915, 917, 919, 920, 922, 923, 924, 926, 927, 928, 929, 930, 931, 933, 934, 935, 937, 938, 939, 940, 941, 942, 943]

In [7]:
'''

This part creates a list (valid_users) from the remaining users (564 - 943),
keeping a user only if every movie that user rated is among the movies that the
Multinomial Naive Bayes model was trained on (the movies rated 3 or higher by users 1 - 563)
'''

# Movie ids the model was trained on, converted to a set once so the subset
# test below does not have to re-hash the whole list for every user.
trained_movies = set(movies1)

# Users from this id onward were held out of training (the training set was
# built from user ids below 564).
first_test_user_id = 564

groups = df2.groupby("user_id", as_index=False)
valid_users = []
for user_id, user_ratings in groups:
    # Decide by the user's actual id rather than by counting groups, so the
    # split stays correct even if the ids are not the contiguous range 1..N.
    if user_id < first_test_user_id:
        continue
    # A held-out user is usable only when every movie they rated is a class
    # the model knows about.
    if set(user_ratings['movie_id'].tolist()).issubset(trained_movies):
        valid_users.append(user_id)

In [8]:
'''

This section of the program looks at calculating the short term and long term profitability
of the movies recommended to a single user in the valid_users list. It will print the movies based 
upon Probability, Short Term profitability and then Long Term profitability. It will store the sum
for the top ten movies based on probability then short term profitability and long term profitability.
It then prints the averages; since there is only 1 user, each average equals that user's sum.

'''

# The three score columns.  They are used both as the value that is summed
# and as the key a top-10 list is ranked by.
metrics = ['Probability','P(i)*R(i)','(P(i)*R(i)/(1-P(i))']
length_users = 1
shown_cols = ['movie title','Probability','R']

# R(i) = k + (current year - release year) / 2.  The current calendar year is
# read from the clock, so R (and every score built on it) changes when the
# notebook is re-run in a later year.
k = 1
current_year = datetime.datetime.now().year

for user_id in [601]:
    # Accumulators, reset per user.  Slot 0/1/2 = top-10 list ranked by
    # Probability / P(i)*R(i) / (P(i)*R(i)/(1-P(i)).
    #   prob    sums Probability over each list,
    #   st_prof sums P(i)*R(i),
    #   lt_prof sums (P(i)*R(i)/(1-P(i)).
    prob = [0,0,0]
    st_prof = [0,0,0]
    lt_prof = [0,0,0]

    # One-row feature frame shaped like the training matrix: columns age and
    # sex, indexed by user_id.  The user is looked up by id (not by row
    # position) and the frame is built directly, because DataFrame.append was
    # removed in pandas 2.0.  A separate name is used so the training matrix
    # `df_user` is not overwritten by this cell.
    df_sugg = users.loc[users['user_id'] == user_id, ['user_id', 'age', 'sex']].copy()
    df_sugg['sex'] = (df_sugg['sex'] == 'M').astype(int)  # 1 for 'M', else 0
    df_sugg = df_sugg.set_index('user_id')

    # predict_proba returns one probability per class, ordered like
    # GNB.classes_, so the class labels are attached directly instead of being
    # copied row by row from movies1.
    proba = GNB.predict_proba(df_sugg)[0]
    mov = pd.DataFrame({'Probability': proba, 'movie_id': GNB.classes_})
    mov = mov.sort_values(['Probability'], ascending=False)

    # Add title and release date; rows missing either side of the merge (or a
    # release date) are dropped.
    combine = pd.merge(mov, movies_prof1, how='outer')
    combine = combine.dropna().reset_index(drop=True)

    combine['year'] = combine['release date'].dt.year
    combine['R'] = k + ((current_year - combine['year']) / 2)
    combine['P(i)*R(i)'] = combine['Probability'] * combine['R']
    combine['(P(i)*R(i)/(1-P(i))'] = combine['P(i)*R(i)'] / (1 - combine['Probability'])

    # Rank the candidates once per score; each ranking is reused for printing
    # and for the sums below.
    by_prob = combine.sort_values('Probability', ascending=False)
    by_short = combine.sort_values('P(i)*R(i)', ascending=False)
    by_long = combine.sort_values('(P(i)*R(i)/(1-P(i))', ascending=False)

    print(df_sugg, "\n")
    print("Probability:")
    print(by_prob.head(10)[shown_cols])
    print("Short term:")
    print(by_short.head(10)[shown_cols])
    print("Long term")
    print(by_long.head(10)[shown_cols])
    # True when the short-term and long-term top-10 tables are identical.
    print("\n\n", by_short.head(10)[shown_cols].equals(by_long.head(10)[shown_cols]))

    # For every score column, add its top-10 sum under each of the 3 rankings.
    for totals, metric in zip((prob, st_prof, lt_prof), metrics):
        for slot, ranked in enumerate((by_prob, by_short, by_long)):
            totals[slot] += ranked[metric].head(10).sum()

    # Report the averages (with a single user they equal the sums).  The last
    # tuple element selects the comparison printed after the section.
    len_valid_users = length_users
    for heading, totals, comparison in (
        ("For most probable movie", prob, None),
        ("For maximum short-term profit", st_prof, ">"),
        ("For maximum long-term profit", lt_prof, "<"),
    ):
        avg_prob_max_Pi = totals[0] / len_valid_users
        avg_prob_st_Rev = totals[1] / len_valid_users
        avg_prob_lt_Rev = totals[2] / len_valid_users
        print(heading)
        print("Average for most probabile movie: ",avg_prob_max_Pi)
        print("Average for movie that maximizes P(i) * R(i) :",avg_prob_st_Rev)
        print("Average for movie that maximizes (P(i)*R(i)/(1-P(i)) : ",\
              avg_prob_lt_Rev)
        if comparison is None:
            continue
        print(avg_prob_st_Rev == avg_prob_lt_Rev)
        if comparison == ">":
            print(avg_prob_st_Rev > avg_prob_lt_Rev)
        else:
            print(avg_prob_st_Rev < avg_prob_lt_Rev)
    print("\n\n\n")


        age sex
user_id        
601      19   0 

Probability:
                        movie title  Probability     R
0                  Star Wars (1977)     0.006522  22.0
1                      Fargo (1996)     0.006152  12.0
2         Return of the Jedi (1983)     0.005559  12.0
3                    Contact (1997)     0.005400  12.0
4       English Patient, The (1996)     0.004967  12.5
5  Silence of the Lambs, The (1991)     0.004778  15.0
6                  Toy Story (1995)     0.004678  13.0
7    Raiders of the Lost Ark (1981)     0.004621  20.0
8                     Scream (1996)     0.004609  12.5
9             Godfather, The (1972)     0.004547  24.5
Short term:
                        movie title  Probability     R
0                  Star Wars (1977)     0.006522  22.0
49         Wizard of Oz, The (1939)     0.003037  41.0
56                Casablanca (1942)     0.002946  39.5
9             Godfather, The (1972)     0.004547  24.5
75     It's a Wonderful Life (1946)     0.002

In [9]:
'''

This section of the program looks at calculating the short term and long term profitability
of the movies recommended to all the users in the valid_users list. It will store a running total for the top
ten movies based upon probability, then short term profitability and then long term profitability.


'''

# Running totals over all valid users.  Slot 0/1/2 = top-10 list ranked by
# Probability / P(i)*R(i) / (P(i)*R(i)/(1-P(i)).
#   prob    sums Probability over each list,
#   st_prof sums P(i)*R(i),
#   lt_prof sums (P(i)*R(i)/(1-P(i)).
prob = [0,0,0]
st_prof = [0,0,0]
lt_prof = [0,0,0]
metrics = ['Probability','P(i)*R(i)','(P(i)*R(i)/(1-P(i))']

# Visit the users in a shuffled order.  A dedicated, seeded generator keeps
# the order (and therefore which users get printed below) identical on every
# run; the totals themselves do not depend on the order.
temp = valid_users.copy()
random.Random(0).shuffle(temp)

# R(i) = k + (current year - release year) / 2.  The current calendar year is
# read from the clock, so the scores change when re-run in a later year.
k = 1
current_year = datetime.datetime.now().year

for xy, user_id in enumerate(temp, start=1):
    # One-row feature frame shaped like the training matrix: columns age and
    # sex, indexed by user_id.  The user is looked up by id (not by row
    # position) and the frame is built directly, because DataFrame.append was
    # removed in pandas 2.0.  A separate name is used so the training matrix
    # `df_user` is not overwritten by this cell.
    df_sugg = users.loc[users['user_id'] == user_id, ['user_id', 'age', 'sex']].copy()
    df_sugg['sex'] = (df_sugg['sex'] == 'M').astype(int)  # 1 for 'M', else 0
    df_sugg = df_sugg.set_index('user_id')

    # predict_proba returns one probability per class, ordered like
    # GNB.classes_, so the class labels are attached in one step instead of
    # one .loc assignment per movie per user.
    proba = GNB.predict_proba(df_sugg)[0]
    mov = pd.DataFrame({'Probability': proba, 'movie_id': GNB.classes_})
    mov = mov.sort_values(['Probability'], ascending=False)

    # Add title and release date; rows missing either side of the merge (or a
    # release date) are dropped.
    combine = pd.merge(mov, movies_prof1, how='outer')
    combine = combine.dropna().reset_index(drop=True)

    combine['year'] = combine['release date'].dt.year
    combine['R'] = k + ((current_year - combine['year']) / 2)
    combine['P(i)*R(i)'] = combine['Probability'] * combine['R']
    combine['(P(i)*R(i)/(1-P(i))'] = combine['P(i)*R(i)'] / (1 - combine['Probability'])

    # Progress sample: show every 50th user that is processed.
    if xy % 50 == 0:
        print(df_sugg, "\n")

    # Rank once per score, then add each score's top-10 sum under each ranking.
    rankings = [combine.sort_values(key, ascending=False) for key in metrics]
    for totals, metric in zip((prob, st_prof, lt_prof), metrics):
        for slot, ranked in enumerate(rankings):
            totals[slot] += ranked[metric].head(10).sum()
            

        age sex
user_id        
618      15   0 

        age sex
user_id        
859      18   0 

        age sex
user_id        
667      35   1 

        age sex
user_id        
611      46   1 

        age sex
user_id        
630      26   0 

        age sex
user_id        
596      20   1 



In [10]:
'''

This section only looks at average values for probability,short term profitability and long term profitability 
based upon probability. It then does this based upon short term profitability and prints the average 
for probability,short term profitability and long term profitability. Finally it does it based upon long term
profitability and prints the averages for probability,short term profitability and long term profitability.

'''

# Number of users the running totals were accumulated over.
len_valid_users = len(valid_users)

# One entry per report section: the heading, the list of three totals to
# average, and which comparison (if any) is printed after the averages.
summaries = [
    ("For most probable movie", prob, None),
    ("For maximum short-term profit", st_prof, ">"),
    ("For maximum long-term profit", lt_prof, "<"),
]

for heading, totals, comparison in summaries:
    # Averages for the lists ranked by probability, by P(i)*R(i) and by
    # (P(i)*R(i)/(1-P(i)), in that order.
    avg_prob_max_Pi, avg_prob_st_Rev, avg_prob_lt_Rev = (
        total / len_valid_users for total in totals
    )

    print(heading)
    print("Average for most probabile movie: ", avg_prob_max_Pi)
    print("Average for movie that maximizes P(i) * R(i) :", avg_prob_st_Rev)
    print("Average for movie that maximizes (P(i)*R(i)/(1-P(i)) : ",
          avg_prob_lt_Rev)

    # The first section prints no comparison.
    if comparison is None:
        continue

    # Compare the short-term-ranked average with the long-term-ranked one.
    print(avg_prob_st_Rev == avg_prob_lt_Rev)
    if comparison == ">":
        print(avg_prob_st_Rev > avg_prob_lt_Rev)
    else:
        print(avg_prob_st_Rev < avg_prob_lt_Rev)

print("\n\n\n")

For most probable movie
Average for most probabile movie:  0.05169425308334059
Average for movie that maximizes P(i) * R(i) : 0.034408702213431036
Average for movie that maximizes (P(i)*R(i)/(1-P(i)) :  0.034466838877281865
For maximum short-term profit
Average for most probabile movie:  0.7778267156981936
Average for movie that maximizes P(i) * R(i) : 1.0275433098760196
Average for movie that maximizes (P(i)*R(i)/(1-P(i)) :  1.0275421472428978
False
True
For maximum long-term profit
Average for most probabile movie:  0.7819641689774485
Average for movie that maximizes P(i) * R(i) : 1.0312547145704118
Average for movie that maximizes (P(i)*R(i)/(1-P(i)) :  1.0312584660429076
False
True




