In [None]:
import numpy as np
import pandas as pd

In [None]:

def _ratings_to_matrix(ratings, n_users, n_movies):
    """Pivot long-format ratings into a dense ``(n_users, n_movies)`` float array.

    Row ``u - 1`` / column ``m - 1`` holds the rating that user ``u`` gave to
    movie ``m``; every (user, movie) pair absent from ``ratings`` is 0.
    Ids are assumed to be 1-based positive integers.
    """
    wide = ratings.pivot(index="UserId", columns="MovieId", values="Rating")
    # One reindex over the full 1..n id ranges adds every missing user row and
    # movie column at once, instead of joining one zero column per movie id.
    wide = wide.reindex(index=range(1, n_users + 1), columns=range(1, n_movies + 1))
    return np.asarray(wide.fillna(0), dtype=float)


def train_test_split(train_fraction, ratings_path="ml-1m/ratings.dat", max_rows=20000, random_state=201):
    """Split a ``::``-delimited ratings file into dense train/test rating matrices.

    The first ``max_rows`` lines of ``ratings_path`` are loaded, a random
    ``train_fraction`` of them becomes the training set and the remaining rows
    become the test set. Each set is returned as a user x movie matrix in which
    0 means "no rating in this split".

    Both matrices are sized from the largest UserId / MovieId found in the
    loaded rows, so they always share one shape, even when a user or movie
    happens to appear in only one of the two splits.

    Parameters
    ----------
    train_fraction : float
        Fraction of the loaded ratings sampled into the training set.
    ratings_path : str, optional
        File with one ``UserId::MovieId::Rating::TimeStamp`` record per line.
    max_rows : int or None, optional
        Number of leading rows to load; ``None`` loads the whole file.
    random_state : int, optional
        Seed for the row sampling, so the split is reproducible.

    Returns
    -------
    (numpy.ndarray, numpy.ndarray)
        ``(train_rating_matrix, test_rating_matrix)``, both float arrays of
        shape ``(max UserId, max MovieId)``.

    Raises
    ------
    ValueError
        If no rows were loaded, or (raised by ``DataFrame.pivot``) if the same
        (UserId, MovieId) pair occurs twice within one split.
    """
    rating_df_columns = ["UserId", "MovieId", "Rating", "TimeStamp"]
    # A multi-character separator is only supported by the python parser engine;
    # naming it explicitly avoids the ParserWarning fallback.
    rating_df = pd.read_table(
        ratings_path,
        sep="::",
        names=rating_df_columns,
        engine="python",
        nrows=max_rows,
    )
    if rating_df.empty:
        raise ValueError("no ratings were loaded from {!r}".format(ratings_path))

    train_rating_df = rating_df.sample(frac=train_fraction, random_state=random_state)
    test_rating_df = rating_df.drop(train_rating_df.index)

    # Size the matrices from the whole loaded subset rather than from each split.
    n_users = int(rating_df["UserId"].max())
    n_movies = int(rating_df["MovieId"].max())

    train_rating_matrix = _ratings_to_matrix(train_rating_df, n_users, n_movies)
    test_rating_matrix = _ratings_to_matrix(test_rating_df, n_users, n_movies)

    assert train_rating_matrix.shape == test_rating_matrix.shape

    return train_rating_matrix, test_rating_matrix

In [None]:
# Call the split function defined in this notebook directly: the visible cells
# never import or define a `Train_Test_Split` module, so the qualified call
# raised NameError on a fresh kernel.
train_rating_matrix, test_rating_matrix = train_test_split(0.8)

In [None]:
def mean_center(train_rating_matrix, test_rating_matrix):
    """Subtract each user's mean rating from that user's train and test ratings.

    The two matrices are added to rebuild the combined rating matrix (this
    assumes a rating is non-zero in at most one of the two inputs). For every
    row the mean is taken over the combined entries that are > 0, and is then
    subtracted from the combined value at each position where the respective
    input is non-zero. All other positions stay 0.

    Note that a rating exactly equal to the user's mean is centred to 0 and is
    therefore indistinguishable from an unrated entry in the output.

    Parameters
    ----------
    train_rating_matrix, test_rating_matrix : array-like, 2-D, same shape
        User x item rating matrices in which 0 marks "not rated".

    Returns
    -------
    (numpy.ndarray, numpy.ndarray)
        ``(train_rating_matrix_centered, test_rating_matrix_centered)`` as new
        float arrays; the inputs are not modified.

    Raises
    ------
    AssertionError
        If the two matrices do not have the same shape.
    """
    train_rating_matrix = np.asarray(train_rating_matrix)
    test_rating_matrix = np.asarray(test_rating_matrix)
    assert train_rating_matrix.shape == test_rating_matrix.shape

    A = train_rating_matrix + test_rating_matrix

    # Per-user totals, kept as column vectors so they broadcast across each row.
    ratings_sum = A.sum(axis=1, keepdims=True)
    movies_rated = (A > 0).sum(axis=1, keepdims=True)

    # Users without any rating keep a mean of 0 instead of triggering a 0/0
    # division (RuntimeWarning + NaN); they have nothing to centre anyway.
    user_mean = np.divide(
        ratings_sum,
        movies_rated,
        out=np.zeros(ratings_sum.shape, dtype=float),
        where=movies_rated > 0,
    )

    centered = A - user_mean
    train_rating_matrix_centered = np.where(train_rating_matrix != 0, centered, 0.0)
    test_rating_matrix_centered = np.where(test_rating_matrix != 0, centered, 0.0)
    return train_rating_matrix_centered, test_rating_matrix_centered