-
Notifications
You must be signed in to change notification settings - Fork 0
/
data preprocessing.py
89 lines (72 loc) · 3.58 KB
/
data preprocessing.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
# Data preprocessing: the downloaded data is in .dat format, so convert it to tab-separated CSV files
# Import packages
import os
import pandas as pd
# Directory that holds the raw .dat files, and the names of those files
relative_path = "./data/"
user_data_file = "users.dat"
movie_data_file = "movies.dat"
rating_data_file = "ratings.dat"

# Names of the CSV files the converted data is written to
user_csv_file = "users.csv"
movie_csv_file = "movies.csv"
rating_csv_file = "ratings.csv"

# Labels for the numeric codes stored in the users' age column
AGES = {
    1: "Under 18",
    18: "18-24",
    25: "25-34",
    35: "35-44",
    45: "45-49",
    50: "50-55",
    56: "56+",
}

# Labels for the numeric codes stored in the users' occupation column
OCCUPATIONS = {
    0: "other or not specified",
    1: "academic/educator",
    2: "artist",
    3: "clerical/admin",
    4: "college/grad student",
    5: "customer service",
    6: "doctor/health care",
    7: "executive/managerial",
    8: "farmer",
    9: "homemaker",
    10: "K-12 student",
    11: "lawyer",
    12: "programmer",
    13: "retired",
    14: "sales/marketing",
    15: "scientist",
    16: "self-employed",
    17: "technician/engineer",
    18: "tradesman/craftsman",
    19: "unemployed",
    20: "writer",
}
# Read the ratings file; fields are separated by '::' and the column names
# are supplied explicitly
ratings = pd.read_csv(
    relative_path + rating_data_file,
    sep='::',
    engine='python',  # multi-character separators require the Python parser engine
    encoding='latin-1',  # encoding to use while reading the file
    names=['user_id', 'movie_id', 'rating', 'timestamp'],
)

# Largest user ID and largest movie ID that occur in the ratings. These are
# reported later when the users and movies files are loaded. int() keeps the
# values plain Python ints rather than NumPy scalars.
max_userid = int(ratings['user_id'].max())
max_movieid = int(ratings['movie_id'].max())

# Process the ratings dataframe for the Keras deep learning model:
# user_emb_id == user_id - 1 and movie_emb_id == movie_id - 1
ratings['user_emb_id'] = ratings['user_id'] - 1
ratings['movie_emb_id'] = ratings['movie_id'] - 1
print(len(ratings), 'ratings loaded')

# Directory that receives every converted CSV file. Create it up front so
# the to_csv calls do not fail when the directory does not exist yet.
save_path = "./parsed data/"
os.makedirs(save_path, exist_ok=True)

# Save into ratings.csv (tab-separated, with a header row)
ratings.to_csv(
    save_path + rating_csv_file,
    sep='\t',
    header=True,
    encoding='latin-1',
    columns=['user_id', 'movie_id', 'rating', 'timestamp', 'user_emb_id', 'movie_emb_id'],
)
print('Saved to', rating_csv_file)
# Load the users file; fields are separated by '::' and the column names
# are supplied explicitly
users = pd.read_csv(
    f"{relative_path}{user_data_file}",
    names=['user_id', 'gender', 'age', 'occupation', 'zipcode'],
    sep='::',
    engine='python',
    encoding='latin-1',
)

# Add readable descriptions for the age and occupation codes; a code that is
# missing from AGES / OCCUPATIONS raises KeyError
users['age_desc'] = users['age'].apply(AGES.__getitem__)
users['occ_desc'] = users['occupation'].apply(OCCUPATIONS.__getitem__)
print(f"{len(users)} descriptions of {max_userid} users loaded.")

# Write the enriched table to users.csv (tab-separated, with a header row)
users.to_csv(
    f"{save_path}{user_csv_file}",
    columns=['user_id', 'gender', 'age', 'occupation', 'zipcode', 'age_desc', 'occ_desc'],
    header=True,
    sep='\t',
    encoding='latin-1',
)
print(f"Saved to {user_csv_file}")
# Load the movies file; fields are separated by '::' and the column names
# are supplied explicitly
movies = pd.read_csv(
    f"{relative_path}{movie_data_file}",
    names=['movie_id', 'title', 'genres'],
    sep='::',
    engine='python',
    encoding='latin-1',
)
print(f"{len(movies)} descriptions of {max_movieid} movies loaded.")

# Write the table to movies.csv (tab-separated, with a header row).
# NOTE(review): unlike the ratings and users exports, no encoding is passed
# here, so pandas' default output encoding is used even though the file was
# read as latin-1 - confirm this is what downstream readers expect.
movies.to_csv(
    f"{save_path}{movie_csv_file}",
    columns=['movie_id', 'title', 'genres'],
    header=True,
    sep='\t',
)
print(f"Saved to {movie_csv_file}")