In [21]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

from sklearn.neighbors import NearestNeighbors

from scipy.sparse import csr_matrix


In [2]:
#notify me when a long running cell is complete
%load_ext jupyternotify
import time

<IPython.core.display.Javascript object>

In [3]:
data = pd.read_csv('data/final_df.csv')
data = data.drop(['Unnamed: 0'], axis=1)
data.head()

Unnamed: 0,mid,cust_id,rating,r_date,r_year,r_fiscQ,m_decade,m_avg_rating,m_quality
0,3113,510180,3.0,1999-12-12,1999,4,4,3.0,2
1,3113,739622,4.0,2001-07-15,2001,3,4,3.0,2
2,3113,200684,1.0,2001-10-15,2001,4,4,3.0,2
3,3113,2559624,3.0,2002-08-27,2002,3,4,3.0,2
4,3113,1557262,3.0,2003-11-06,2003,4,4,3.0,2


In [4]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 9 columns):
 #   Column        Non-Null Count   Dtype  
---  ------        --------------   -----  
 0   mid           100000 non-null  int64  
 1   cust_id       100000 non-null  int64  
 2   rating        100000 non-null  float64
 3   r_date        100000 non-null  object 
 4   r_year        100000 non-null  int64  
 5   r_fiscQ       100000 non-null  int64  
 6   m_decade      100000 non-null  int64  
 7   m_avg_rating  100000 non-null  float64
 8   m_quality     100000 non-null  int64  
dtypes: float64(2), int64(6), object(1)
memory usage: 6.9+ MB


In [5]:
data['r_date'] = data['r_date'].astype('datetime64[ns]')
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 9 columns):
 #   Column        Non-Null Count   Dtype         
---  ------        --------------   -----         
 0   mid           100000 non-null  int64         
 1   cust_id       100000 non-null  int64         
 2   rating        100000 non-null  float64       
 3   r_date        100000 non-null  datetime64[ns]
 4   r_year        100000 non-null  int64         
 5   r_fiscQ       100000 non-null  int64         
 6   m_decade      100000 non-null  int64         
 7   m_avg_rating  100000 non-null  float64       
 8   m_quality     100000 non-null  int64         
dtypes: datetime64[ns](1), float64(2), int64(6)
memory usage: 6.9 MB


In [6]:
data = data.sort_values(by=['r_date'])
data

Unnamed: 0,mid,cust_id,rating,r_date,r_year,r_fiscQ,m_decade,m_avg_rating,m_quality
0,3113,510180,3.0,1999-12-12,1999,4,4,3.000000,2
19,11242,830363,3.0,1999-12-21,1999,4,3,3.843750,3
133,1642,872963,4.0,1999-12-30,1999,4,4,4.044444,4
178,14574,882798,3.0,1999-12-30,1999,4,4,3.545455,3
59,16438,882798,4.0,1999-12-30,1999,4,4,3.824324,3
...,...,...,...,...,...,...,...,...,...
56222,11443,1472304,4.0,2005-12-31,2005,4,5,4.186441,4
84549,12596,1472871,3.0,2005-12-31,2005,4,2,2.600000,1
29199,5317,1835727,1.0,2005-12-31,2005,4,5,3.403361,2
30200,8376,1466372,4.0,2005-12-31,2005,4,5,3.545455,3


In [7]:
scaler = MinMaxScaler()
data[['r_scaled']] = scaler.fit_transform(data[['rating']])
data.head()

Unnamed: 0,mid,cust_id,rating,r_date,r_year,r_fiscQ,m_decade,m_avg_rating,m_quality,r_scaled
0,3113,510180,3.0,1999-12-12,1999,4,4,3.0,2,0.5
19,11242,830363,3.0,1999-12-21,1999,4,3,3.84375,3,0.5
133,1642,872963,4.0,1999-12-30,1999,4,4,4.044444,4,0.75
178,14574,882798,3.0,1999-12-30,1999,4,4,3.545455,3,0.5
59,16438,882798,4.0,1999-12-30,1999,4,4,3.824324,3,0.75


In [8]:
data['r_scaled'].mean()

0.6508525

In [12]:
data_pivot = data.pivot(index='cust_id', columns='mid', values='r_scaled')
data_pivot.head()

mid,1,5,6,8,15,16,17,18,19,21,...,17756,17758,17760,17761,17762,17763,17764,17767,17768,17769
cust_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
6,,,,,,,,,,,...,,,,,,,,,,
7,,,,,,,,,,,...,,,,,,,,,,
134,,,,,,,,,,,...,,,,,,,,,,
195,,,,,,,,,,,...,,,,,,,,,,
199,,,,,,,,,,,...,,,,,,,,,,


In [13]:
data_pivot = data_pivot.fillna(.65)
data_pivot.head()

mid,1,5,6,8,15,16,17,18,19,21,...,17756,17758,17760,17761,17762,17763,17764,17767,17768,17769
cust_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
6,0.65,0.65,0.65,0.65,0.65,0.65,0.65,0.65,0.65,0.65,...,0.65,0.65,0.65,0.65,0.65,0.65,0.65,0.65,0.65,0.65
7,0.65,0.65,0.65,0.65,0.65,0.65,0.65,0.65,0.65,0.65,...,0.65,0.65,0.65,0.65,0.65,0.65,0.65,0.65,0.65,0.65
134,0.65,0.65,0.65,0.65,0.65,0.65,0.65,0.65,0.65,0.65,...,0.65,0.65,0.65,0.65,0.65,0.65,0.65,0.65,0.65,0.65
195,0.65,0.65,0.65,0.65,0.65,0.65,0.65,0.65,0.65,0.65,...,0.65,0.65,0.65,0.65,0.65,0.65,0.65,0.65,0.65,0.65
199,0.65,0.65,0.65,0.65,0.65,0.65,0.65,0.65,0.65,0.65,...,0.65,0.65,0.65,0.65,0.65,0.65,0.65,0.65,0.65,0.65


In [11]:
data_pivot[79].info()

<class 'pandas.core.series.Series'>
Index: 77372 entries, 6 to 2649378
Series name: 79
Non-Null Count  Dtype  
--------------  -----  
77372 non-null  float64
dtypes: float64(1)
memory usage: 1.2 MB


In [15]:
data_csr = csr_matrix(data_pivot.values)

In [22]:
knn = NearestNeighbors(metric='euclidean', algorithm='brute')

In [23]:
knn.fit(data_csr)

NearestNeighbors(algorithm='brute', metric='euclidean')

In [24]:
knn.n_features_in_

9227

In [35]:
knn.kneighbors([[6,1]], n_neighbors=5, return_distance=True)

ValueError: X has 2 features, but NearestNeighbors is expecting 9227 features as input.

In [None]:

data_scaled

In [None]:
np.nan_to_num(data_scaled, nan=.5)

In [None]:
%notify

# Train  Test Split

temporal split

In [None]:
testsize = round(len(data) * 0.2)
testsize

In [None]:
#split based on time
df_train = data[:-(2*testsize)]
df_val = data[-(2*testsize):-testsize]
df_test = data[-testsize:]

In [None]:
df_train

In [None]:
df_val

In [None]:
df_test

# Train

In [None]:
f,ax = plt.subplots()

sns.scatterplot(x=X[:,0], y=X[:,1], ax=ax, hue=y, palette='colorblind')
ax.get_legend().remove()

In [None]:
train_table = df_train.pivot(index='cust_id', columns='mid', values='rating')
train_table.head()

In [None]:
train_table = scaler.transform(train_table)

In [None]:
np.nan_to_num(train_table, nan=.5)

In [None]:
%notify

# Val

In [None]:
val_table = df_val.pivot(index='cust_id', columns='mid', values='rating')
val_table.head()

# Test

In [None]:
test_table = df_test.pivot(index='cust_id', columns='mid', values='rating')
test_table.head()

In [None]:
train_scaled = scaler.fit_transform(train_table)


In [None]:
train_scaled = train_scaled.fillna(.5)
train_scaled

In [None]:
%notify

In [None]:
%notify