In [0]:
# Mount Google Drive at /content/gdrive so files stored there can be read and written.
from google.colab import drive
drive.mount('/content/gdrive')
# Folder inside the mounted drive; no other cell visible in this notebook reads this variable.
input_path = "/content/gdrive/My Drive/Colab Notebooks/"

In [0]:
# scikit-surprise is the PyPI distribution that provides the `surprise` module.
# %pip (instead of !pip) installs into the environment of the running kernel.
# NOTE(review): pin the version (scikit-surprise==X.Y.Z) once the intended one is known.
%pip install -q scikit-surprise

In [0]:
from surprise import *
import pandas as pd
from surprise import accuracy
from sklearn.model_selection import train_test_split

In [0]:
def create_train_test_data(train_data_split,
                           train_path="/content/gdrive/My Drive/training.csv",
                           test_path="gdrive/My Drive/test_with_asin_reviewerID.csv"):
    """Load the review ratings and prepare the data for training and prediction.

    Only the ``reviewerID``, ``asin`` and ``overall`` columns of the training
    CSV are used; every other column is ignored.

    Args:
        train_data_split: ``1`` holds out 20% of the training rows for
            evaluation. Any other value trains on all rows and loads the
            separate test CSV instead.
        train_path: Location of the training CSV.
        test_path: Location of the test CSV (only read when no hold-out split
            is requested). A relative path is resolved against the current
            working directory.

    Returns:
        Hold-out mode: ``(train_dataset, testset)`` where ``train_dataset`` is
        the object returned by ``Dataset.load_from_df`` for the remaining 80%
        (callers turn it into a trainset with ``build_full_trainset()``) and
        ``testset`` is a list of ``(reviewerID, asin, overall)`` tuples, the
        format expected by a model's ``test()`` method.

        Otherwise: ``(train_set, test_df)`` where ``train_set`` is the full
        trainset built from every training row and ``test_df`` is the test CSV
        as a DataFrame.
    """
    rating_columns = ['reviewerID', 'asin', 'overall']
    # Dataset.load_from_df needs exactly (user, item, rating) in this order, so the
    # columns are selected after reading (usecols would keep the file's own order).
    ratings_df = pd.read_csv(train_path, delimiter=",")[rating_columns]
    reader = Reader(rating_scale=(1, 5))
    print("Done with reading training data")

    if train_data_split == 1:
        train_part_df, holdout_df = train_test_split(ratings_df, test_size=0.20)
        train_dataset = Dataset.load_from_df(train_part_df, reader)
        # Keep the true ratings: they are required to compute RMSE on the hold-out rows.
        holdout_testset = list(holdout_df.itertuples(index=False, name=None))
        return train_dataset, holdout_testset

    train_set = Dataset.load_from_df(ratings_df, reader).build_full_trainset()
    test_df = pd.read_csv(test_path, delimiter=",")
    print("Done with reading test data")
    return train_set, test_df

In [0]:
def train_model_3(train_set, train_data_split):
    """Fit a ``KNNBaseline`` model with its default settings and return it.

    When ``train_data_split`` equals 1, ``train_set`` is converted with
    ``build_full_trainset()`` before fitting; otherwise it is handed to
    ``fit`` unchanged.
    """
    print("Started training KNNBaseline..")
    # A disabled earlier experiment ran a 5-fold GridSearchCV (n_jobs=5) over KNNBasic
    # with k in [15, 20, 25, 30, 40, 50, 60]; the default configuration is used instead.
    model = KNNBaseline()
    fit_input = train_set.build_full_trainset() if train_data_split == 1 else train_set
    model.fit(fit_input)
    print("Done with KNNBaseline training")
    return model

In [0]:
def train_model_2(train_set, train_data_split):
    """Fit an ``SVD`` model with a fixed, hand-picked configuration and return it.

    ``train_set`` is converted via ``build_full_trainset()`` before fitting when
    ``train_data_split`` equals 1, and is passed to ``fit`` as-is otherwise.
    """
    print("Started training SVD..")
    # Disabled earlier experiments: a 3-fold GridSearchCV over n_epochs [10, 20, 30],
    # lr_all [0.002, 0.005] and reg_all [0.4, 0.6]; and the global settings
    # lr_all=0.0085 / reg_all=0.55 in place of the per-parameter values below.
    svd_settings = {
        'n_factors': 4,
        'n_epochs': 80,
        'biased': False,
        'init_mean': 0.0,
        'init_std_dev': 0.2,
        # per-parameter learning rates
        'lr_bu': 0.39,
        'lr_bi': 0.25,
        'lr_pu': 0.055,
        'lr_qi': 0.055,
        # per-parameter regularisation terms
        'reg_bu': 0.045,
        'reg_bi': 0.040,
        'reg_pu': 0.0002,
        'reg_qi': 0.0003,
        'random_state': 4,
        'verbose': False,
    }
    model = SVD(**svd_settings)
    fit_input = train_set.build_full_trainset() if train_data_split == 1 else train_set
    model.fit(fit_input)
    print("Done with SVD training")
    return model

In [0]:
def train_model_1(train_set, train_data_split):
    """Fit a ``BaselineOnly`` model using a fixed ALS configuration and return it.

    ``train_set`` is converted via ``build_full_trainset()`` before fitting when
    ``train_data_split`` equals 1, and is passed to ``fit`` as-is otherwise.
    """
    print("Started training BaselineOnly..")
    # A disabled earlier experiment ran a 3-fold GridSearchCV over method 'als' with
    # n_epochs [40, 50, 60], reg_i [2, 5, 8] and reg_u [1, 2, 3].
    baseline_config = dict(
        method='als',   # 'sgd' is the alternative
        n_epochs=60,    # number of iterations
        reg_u=2,        # user regularisation
        reg_i=8,        # item regularisation
    )
    model = BaselineOnly(bsl_options=baseline_config)
    fit_input = train_set.build_full_trainset() if train_data_split == 1 else train_set
    model.fit(fit_input)
    print("Done with BaselineOnly training")
    return model

In [0]:
def predict_rating(test_df, model_1, model_2, model_3, train_data_split,
                   output_path='gdrive/My Drive/amazon_rating.csv'):
    """Evaluate the given models on a hold-out set, or write their predictions to CSV.

    Args:
        test_df: In hold-out mode, either a list of ``(uid, iid, rating)``
            tuples or a DataFrame with ``reviewerID``, ``asin`` and ``overall``
            columns. Otherwise a DataFrame with ``reviewerID`` and ``asin``
            columns identifying the pairs to predict.
        model_1, model_2, model_3: Fitted models, or ``None`` to leave a slot
            unused.
        train_data_split: ``1`` selects hold-out evaluation (RMSE is reported
            for each model); any other value writes the prediction file.
        output_path: Destination of the prediction CSV.

    Raises:
        ValueError: If predictions are to be written but no model was given.

    The CSV has the header ``key,overall`` followed by one
    ``<reviewerID>-<asin>,<rating>`` line per test row, where the rating is
    the mean of the estimates of all supplied models.
    """
    models = [m for m in (model_1, model_2, model_3) if m is not None]

    if train_data_split == 1:
        testset = test_df
        if isinstance(test_df, pd.DataFrame):
            # A model's test() expects (uid, iid, rating) tuples, not a DataFrame.
            testset = list(
                test_df[['reviewerID', 'asin', 'overall']].itertuples(index=False, name=None)
            )
        for model in models:
            accuracy.rmse(model.test(testset), verbose=True)
        return

    if not models:
        # Fail before touching the output file instead of crashing on the first row.
        raise ValueError("predict_rating needs at least one model to write predictions")

    with open(output_path, 'w') as output_predictions_file:
        output_predictions_file.write("key" + "," + "overall" + "\n")
        print("Predicting ratings..")
        # Iterate the two id columns directly; per-row iloc lookups are much slower.
        id_pairs = zip(test_df['reviewerID'], test_df['asin'])
        for i, (uid, iid) in enumerate(id_pairs):
            if i % 50000 == 0:
                print(i)
            total = sum(float(model.predict(uid=uid, iid=iid).est) for model in models)
            # Average, so combining several models still yields a value on the rating scale.
            predicted_rating = total / len(models)
            output_predictions_file.write(
                str(uid) + "-" + str(iid) + "," + str(predicted_rating) + "\n"
            )

In [0]:
def main():
    """Run the pipeline end to end: load the data, train SVD, write predictions."""
    split_flag = 0  # 0: train on every row and predict the separate test CSV
    train_data, test_data = create_train_test_data(split_flag)
    # Only the SVD model (train_model_2) is active; the BaselineOnly (train_model_1)
    # and KNNBaseline (train_model_3) slots are left empty.
    svd_model = train_model_2(train_data, split_flag)
    predict_rating(test_data, None, svd_model, None, split_flag)

In [0]:
# Run the whole pipeline: load data, train the model, write the prediction file.
main()