In [58]:
import numpy as np
import pandas as pd
from scipy.sparse import csr_matrix
from pathlib import Path
from sklearn.feature_extraction import DictVectorizer


class MovielensLoader:
    """Load MovieLens 100k rating logs and encode them as sparse feature matrices.

    The loader reads the tab-separated rating logs (``ua.base`` / ``ua.test``)
    and the pipe-separated user file from ``data_dir``, optionally joins the
    user attributes onto each log row, and one-hot encodes the result with a
    ``DictVectorizer``. The fitted vectorizer is kept on ``self.vectorizer``.
    """

    def __init__(self, data_dir=Path("./ml-100k"), user_filename="u.user"):
        """
        :param data_dir: directory containing the MovieLens files (``Path`` or ``str``)
        :param user_filename: name of the user attribute file inside ``data_dir``
        """
        # Accept plain strings too; Path(Path(...)) is a no-op for existing callers.
        self.data_dir = Path(data_dir)
        self.user_path = self.data_dir / user_filename
        # Set by create_dataset(); None until a dataset has been built.
        self.vectorizer = None

    def create_dataset(self, include_context_features=True):
        """Build the train/test feature matrices and rating vectors.

        :param include_context_features: whether to include the user context
            features (gender, occupation) in X
        :return: ``(X_train, y_train, X_test, y_test)`` where the X values are
            float64 ``csr_matrix`` instances and the y values are float64 arrays
        :raises ValueError: if joining the user attributes changes the number
            of log rows, which would misalign X and y
        """
        df_train, y_train = self.load_log_and_ratings(log_filename="ua.base")
        df_test, y_test = self.load_log_and_ratings(log_filename="ua.test")
        if include_context_features:
            users = self.load_users()
            df_train = self.merge(df_train, users)
            df_test = self.merge(df_test, users)
            # merge() is an inner join and the ratings were extracted before it,
            # so a dropped or duplicated row would silently shift X against y.
            if len(df_train) != len(y_train) or len(df_test) != len(y_test):
                raise ValueError(
                    "merging user attributes changed the number of log rows; "
                    "X and y would be misaligned"
                )

        train_data = self.to_dict(self._ids_as_categories(df_train))
        test_data = self.to_dict(self._ids_as_categories(df_test))

        # DictVectorizer one-hot encodes string values and learns the feature
        # set from the training data only, so train and test always share the
        # same columns (features unseen during fit are ignored by transform).
        self.vectorizer = DictVectorizer(dtype=np.float64)
        X_train = csr_matrix(self.vectorizer.fit_transform(train_data), dtype=np.float64)
        X_test = csr_matrix(self.vectorizer.transform(test_data), dtype=np.float64)
        return X_train, y_train, X_test, y_test

    @staticmethod
    def _ids_as_categories(df, id_columns=("uid", "mid")):
        """Return a copy of ``df`` with the id columns cast to ``str``.

        Integer ids would be passed through by DictVectorizer as numeric
        magnitudes; as strings they become one indicator column per id.
        """
        present = [column for column in id_columns if column in df.columns]
        return df.astype({column: str for column in present})

    def to_dict(self, df):
        """Convert a DataFrame into a list of ``{column: value}`` dicts, one per row."""
        return df.to_dict(orient="records")

    def load_log_and_ratings(self, log_filename, drop_columns=("timestamp",)):
        """Read a rating log and split it into feature columns and ratings.

        :param log_filename: file name inside ``data_dir`` (tab-separated,
            columns uid, mid, rating, timestamp)
        :param drop_columns: columns to remove from the returned log frame;
            ``"rating"`` is always removed as well
        :return: ``(logs, ratings)`` - the remaining log columns as a DataFrame
            and the ratings as a float64 array
        """
        logs = pd.read_csv(
            self.data_dir / log_filename,
            names=["uid", "mid", "rating", "timestamp"],
            sep="\t",
        )
        ratings = logs["rating"].to_numpy(dtype=np.float64)
        # Work on a copy so neither the default nor a caller-supplied list is mutated.
        columns_to_drop = list(drop_columns)
        if "rating" not in columns_to_drop:
            columns_to_drop.append("rating")
        logs = logs.drop(columns=columns_to_drop)
        return logs, ratings

    def load_users(self, drop_columns=("age", "zip_code")):
        """Read the user attribute file.

        :param drop_columns: columns to remove from the returned frame
        :return: DataFrame with the columns uid, age, gender, occupation,
            zip_code minus ``drop_columns``
        """
        users = pd.read_csv(
            self.user_path,
            names=["uid", "age", "gender", "occupation", "zip_code"],
            sep="|",
        )
        return users.drop(columns=list(drop_columns))

    def merge(self, logs, users):
        """Join the user attributes onto the log rows by ``uid`` (inner join)."""
        return pd.merge(logs, users, on="uid")


In [59]:
# Create the loader with its default data directory and user file name.
loader = MovielensLoader()

In [53]:
# The inspection cells below look at the merged log/user DataFrames, so build
# them explicitly from the loader's own steps.
users = loader.load_users()
df_train = loader.merge(loader.load_log_and_ratings(log_filename="ua.base")[0], users)
df_test = loader.merge(loader.load_log_and_ratings(log_filename="ua.test")[0], users)

# create_dataset() returns four values; unpacking them into two names raised
# "ValueError: too many values to unpack".
X_train, y_train, X_test, y_test = loader.create_dataset()

In [54]:
# Show the last ten rows of the training frame.
df_train.tail(n=10)

Unnamed: 0,uid,mid,gender,occupation
90560,943,941,M,student
90561,943,943,M,student
90562,943,1011,M,student
90563,943,1028,M,student
90564,943,1044,M,student
90565,943,1047,M,student
90566,943,1074,M,student
90567,943,1188,M,student
90568,943,1228,M,student
90569,943,1330,M,student


[{'gender': 'M', 'mid': 20, 'occupation': 'technician', 'uid': 1},
 {'gender': 'M', 'mid': 33, 'occupation': 'technician', 'uid': 1},
 {'gender': 'M', 'mid': 61, 'occupation': 'technician', 'uid': 1},
 {'gender': 'M', 'mid': 117, 'occupation': 'technician', 'uid': 1},
 {'gender': 'M', 'mid': 155, 'occupation': 'technician', 'uid': 1},
 {'gender': 'M', 'mid': 160, 'occupation': 'technician', 'uid': 1},
 {'gender': 'M', 'mid': 171, 'occupation': 'technician', 'uid': 1},
 {'gender': 'M', 'mid': 189, 'occupation': 'technician', 'uid': 1},
 {'gender': 'M', 'mid': 202, 'occupation': 'technician', 'uid': 1},
 {'gender': 'M', 'mid': 265, 'occupation': 'technician', 'uid': 1},
 {'gender': 'F', 'mid': 13, 'occupation': 'other', 'uid': 2},
 {'gender': 'F', 'mid': 50, 'occupation': 'other', 'uid': 2},
 {'gender': 'F', 'mid': 251, 'occupation': 'other', 'uid': 2},
 {'gender': 'F', 'mid': 280, 'occupation': 'other', 'uid': 2},
 {'gender': 'F', 'mid': 281, 'occupation': 'other', 'uid': 2},
 {'gender'

In [None]:
# Display the column indexes of both frames side by side.
(df_train.columns, df_test.columns)

In [None]:
# Print every column that exists in the test frame but not in the training frame.
for col_name in df_test.columns:
    if col_name in df_train.columns:
        continue
    print(col_name)