In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

from scipy.sparse import csr_matrix


In [2]:
#notify me when a long running cell is complete
%load_ext jupyternotify
import time

<IPython.core.display.Javascript object>

In [3]:
data = pd.read_csv('data/final_df_500k.csv')
data = data.drop(['Unnamed: 0'], axis=1)
data.head()

Unnamed: 0,mid,cust_id,rating,r_date,r_year,r_fiscQ,m_decade,m_avg_rating,m_quality
0,3113,510180,3.0,1999-12-12,1999,4,4,3.152381,2
1,3113,1259176,4.0,2000-11-10,2000,4,4,3.152381,2
2,3113,739622,4.0,2001-07-15,2001,3,4,3.152381,2
3,3113,1709239,3.0,2001-09-10,2001,4,4,3.152381,2
4,3113,2618750,3.0,2001-10-11,2001,4,4,3.152381,2


In [4]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 500000 entries, 0 to 499999
Data columns (total 9 columns):
 #   Column        Non-Null Count   Dtype  
---  ------        --------------   -----  
 0   mid           500000 non-null  int64  
 1   cust_id       500000 non-null  int64  
 2   rating        500000 non-null  float64
 3   r_date        500000 non-null  object 
 4   r_year        500000 non-null  int64  
 5   r_fiscQ       500000 non-null  int64  
 6   m_decade      500000 non-null  int64  
 7   m_avg_rating  500000 non-null  float64
 8   m_quality     500000 non-null  int64  
dtypes: float64(2), int64(6), object(1)
memory usage: 34.3+ MB


In [5]:
data['r_date'] = data['r_date'].astype('datetime64[ns]')
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 500000 entries, 0 to 499999
Data columns (total 9 columns):
 #   Column        Non-Null Count   Dtype         
---  ------        --------------   -----         
 0   mid           500000 non-null  int64         
 1   cust_id       500000 non-null  int64         
 2   rating        500000 non-null  float64       
 3   r_date        500000 non-null  datetime64[ns]
 4   r_year        500000 non-null  int64         
 5   r_fiscQ       500000 non-null  int64         
 6   m_decade      500000 non-null  int64         
 7   m_avg_rating  500000 non-null  float64       
 8   m_quality     500000 non-null  int64         
dtypes: datetime64[ns](1), float64(2), int64(6)
memory usage: 34.3 MB


In [6]:
data = data.sort_values(by=['r_date'])
data

Unnamed: 0,mid,cust_id,rating,r_date,r_year,r_fiscQ,m_decade,m_avg_rating,m_quality
0,3113,510180,3.0,1999-12-12,1999,4,4,3.152381,2
105,11242,830363,3.0,1999-12-21,1999,4,3,3.821138,3
316,14574,882798,3.0,1999-12-30,1999,4,4,3.444874,3
833,17563,2114455,3.0,1999-12-30,1999,4,1,3.711111,3
878,953,1572303,3.0,1999-12-30,1999,4,1,3.651163,3
...,...,...,...,...,...,...,...,...,...
102795,2965,1271554,4.0,2005-12-31,2005,4,0,3.940000,4
481441,8393,1236402,4.0,2005-12-31,2005,4,5,3.773723,3
227426,5402,162834,3.0,2005-12-31,2005,4,2,3.666667,3
226787,6237,1271554,3.0,2005-12-31,2005,4,3,3.396226,2


In [7]:
scaler = MinMaxScaler()
data[['r_scaled']] = scaler.fit_transform(data[['rating']])
data.head()

Unnamed: 0,mid,cust_id,rating,r_date,r_year,r_fiscQ,m_decade,m_avg_rating,m_quality,r_scaled
0,3113,510180,3.0,1999-12-12,1999,4,4,3.152381,2,0.5
105,11242,830363,3.0,1999-12-21,1999,4,3,3.821138,3,0.5
316,14574,882798,3.0,1999-12-30,1999,4,4,3.444874,3,0.5
833,17563,2114455,3.0,1999-12-30,1999,4,1,3.711111,3,0.5
878,953,1572303,3.0,1999-12-30,1999,4,1,3.651163,3,0.5


In [8]:
data['r_scaled'].mean()

0.651175

In [9]:
data_pivot = data.pivot(index='cust_id', columns='mid', values='rating')
data_pivot.head()

  data_pivot = data.pivot(index='cust_id', columns='mid', values='rating')


mid,1,2,3,5,6,8,9,13,15,16,...,17761,17762,17763,17764,17765,17766,17767,17768,17769,17770
cust_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
6,,,,,,,,,,,...,,,,,,,,,,
7,,,,,,,,,,,...,,,,,,,,,,
8,,,,,,,,,,,...,,,,,,,,,,
42,,,,,,,,,,,...,,,,,,,,,,
79,,,,,,,,,,,...,,,,,,,,,,


In [None]:
data_pivot = data_pivot.fillna(.65)
data_pivot.head()

In [None]:
data_pivot[79].info()

In [None]:

data_scaled

In [None]:
np.nan_to_num(data_scaled, nan=.5)

In [None]:
%notify

# Train  Test Split

temporal split

In [None]:
testsize = round(len(data) * 0.2)
testsize

In [None]:
#split based on time
df_train = data[:-(2*testsize)]
df_val = data[-(2*testsize):-testsize]
df_test = data[-testsize:]

In [None]:
df_train

In [None]:
df_val

In [None]:
df_test

# Train

In [None]:
f,ax = plt.subplots()

sns.scatterplot(x=X[:,0], y=X[:,1], ax=ax, hue=y, palette='colorblind')
ax.get_legend().remove()

In [None]:
train_table = df_train.pivot(index='cust_id', columns='mid', values='rating')
train_table.head()

In [None]:
train_table = scaler.transform(train_table)

In [None]:
np.nan_to_num(train_table, nan=.5)

In [None]:
%notify

# Val

In [None]:
val_table = df_val.pivot(index='cust_id', columns='mid', values='rating')
val_table.head()

# Test

In [None]:
test_table = df_test.pivot(index='cust_id', columns='mid', values='rating')
test_table.head()

In [None]:
train_scaled = scaler.fit_transform(train_table)


In [None]:
train_scaled = train_scaled.fillna(.5)
train_scaled

In [None]:
%notify

In [None]:
%notify