# Data Loading and Matrix Generation
### load raw .csv files
### create URM sparse matrix
### create ICM sparse matrix

## Import needed libraries and set datapath

In [5]:
import os
from glob import glob
import scipy.sparse as sps
import numpy as np

# Competition year: names the raw-data sub-folder and is embedded in the
# file name of every matrix saved by this notebook.
comp_year = '2020'
# Raw CSV folder for that year (evaluates to '../Data/2020').
data_folder_path = '../Data/' + comp_year
# Output directory for the generated .npz matrices.
matrix_dir = '../Processed Matrices'
# User-rating interactions file and its full path.
URM_file_name = 'data_train.csv'
URM_file_path = os.path.join(data_folder_path, URM_file_name)
# Substring that identifies item-content (ICM) files inside the data folder.
ICM_file_prefix = 'data_ICM_'

## Row Split Function

In [2]:
def rowSplit(line):
    """Parse one comma-separated record into a tuple.

    Every field except the last is converted to ``int``; the last field is
    converted to ``float`` after newline characters are removed from it.
    A ``ValueError`` from ``int``/``float`` propagates for malformed fields.
    """
    *id_fields, value_field = line.split(',')
    # Convert the trailing value first, then the leading integer fields.
    value = float(value_field.replace("\n", ""))
    ids = [int(field) for field in id_fields]
    return (*ids, value)

## Load & Create sparse ICM matrix

In [3]:
# List of separate ICM files: regular files in the data folder whose name
# contains the ICM prefix. Names are kept exactly as they appear on disk so
# the paths opened below exist, and they are sorted because os.listdir()
# returns entries in arbitrary order, which would otherwise make the column
# order of the stacked ICM differ between runs/machines.
icm_files = sorted(
    name for name in os.listdir(data_folder_path)
    if os.path.isfile(os.path.join(data_folder_path, name)) and ICM_file_prefix in name
)
if not icm_files:
    raise ValueError("no file containing '" + ICM_file_prefix + "' found in " + data_folder_path)

sum_col = 0  # not used in this cell; kept so the name stays defined

# Parse every file into (items, attributes, values) triplets first, so a
# common row count can be computed before any matrix is built.
icm_triplets = []
for file_name in icm_files:
    # The context manager closes the file even if a row fails to parse.
    with open(os.path.join(data_folder_path, file_name)) as icm_file:
        # Discard the first line (presumably the column-name header, as in
        # the URM file).
        _ = icm_file.readline()
        # Blank lines (e.g. a trailing newline at end of file) are skipped;
        # rowSplit would raise ValueError on them.
        file_tuples = [rowSplit(line) for line in icm_file if line.strip()]
    itemList, attrList, valueList = zip(*file_tuples)
    icm_triplets.append((itemList, attrList, valueList))

# hstack needs every block to have the same number of rows. Inferring each
# shape from its own file fails when files differ in their highest item id,
# so all blocks share the largest item id seen across the files.
n_items = max(max(items) for items, _attrs, _values in icm_triplets) + 1

# Create a sparse COO matrix from each file.
ICM_list = [
    sps.coo_matrix((values, (items, attrs)), shape=(n_items, max(attrs) + 1))
    for items, attrs, values in icm_triplets
]

# Stack matrices side by side to create the full ICM and save it.
ICM_all = sps.hstack(ICM_list)
sps.save_npz(matrix_dir+"/ICM_simple_coo_"+comp_year+".npz", ICM_all)

In [6]:
# Binary (non-weighted) copy of the ICM: every non-zero entry becomes 1.
nonzero = ICM_all.nonzero()
# Size the data vector from the coordinates actually returned: nonzero()
# leaves out explicitly stored zeros, so it can yield fewer entries than
# ICM_all.nnz and the two arrays would then disagree in length.
# The shape is passed explicitly so trailing all-zero rows/columns are kept
# and the result always matches ICM_all.shape.
ICM_all_nonweighted = sps.coo_matrix(
    (np.ones(len(nonzero[0])), (nonzero[0], nonzero[1])),
    shape=ICM_all.shape,
)
sps.save_npz(matrix_dir+"/ICM_nonweighted_simple_coo_"+comp_year+".npz", ICM_all_nonweighted)

## Load & Create sparse URM matrix

In [15]:
# Kept inside the cell so it can be run on its own.
import scipy.sparse as sps

# Read the raw interactions; the context manager closes the file even if a
# row fails to parse.
with open(URM_file_path, 'r') as URM_file:
    # skip first line containing column names
    _ = URM_file.readline()
    # Blank lines (e.g. a trailing newline at end of file) are skipped;
    # rowSplit would raise ValueError on them.
    URM_tuples = [rowSplit(line) for line in URM_file if line.strip()]

# create user, item & rating sequences (one tuple per column)
userList, itemList, ratingList = zip(*URM_tuples)

# construct sparse URM matrix and save
URM_all = sps.coo_matrix((ratingList, (userList, itemList)))
sps.save_npz(matrix_dir+"/URM_simple_coo_"+comp_year+".npz", URM_all)

## Train & Validation Split

In [17]:
# import pandas and train_test_split to create train and validation sets
import pandas as pd
from sklearn.model_selection import train_test_split

# create dataframe to be split
URM_dict = {'user': userList, 'item': itemList, 'rating': ratingList}
URM_all_df = pd.DataFrame(URM_dict)

# split URM data into the 2 sets: 70% train / 30% validation, shuffled with
# a fixed seed so the split is reproducible
URM_train_df, URM_valid_df = train_test_split(URM_all_df, test_size=0.3, shuffle=True, random_state=41148)

# Both matrices are given the shape of the full interaction data. Letting
# coo_matrix infer the shape from each subset would shrink a matrix whenever
# the highest user or item id lands in the other subset, leaving train and
# validation matrices with different dimensions.
URM_shape = (int(URM_all_df['user'].max()) + 1, int(URM_all_df['item'].max()) + 1)

# create sparse matrices and save
train_userList = list(URM_train_df['user'])
train_itemList = list(URM_train_df['item'])
train_ratingList = list(URM_train_df['rating'])
URM_train = sps.coo_matrix((train_ratingList, (train_userList, train_itemList)), shape=URM_shape)
sps.save_npz(matrix_dir+"/URM_train_simple_coo_"+comp_year+".npz", URM_train)

valid_userList = list(URM_valid_df['user'])
valid_itemList = list(URM_valid_df['item'])
valid_ratingList = list(URM_valid_df['rating'])
URM_valid = sps.coo_matrix((valid_ratingList, (valid_userList, valid_itemList)), shape=URM_shape)
sps.save_npz(matrix_dir+"/URM_valid_simple_coo_"+comp_year+".npz", URM_valid)
