# Transform (Feature Creation)

How do we create the user-item matrix for the HackerNews Dataset
- OneHot Encoding
- Sparse Matrix (for scalability)

## Creating the User-Item Matrix

In [41]:
import numpy as np
import pandas as pd
import scipy.sparse as sparse

In [29]:
story_user_comment = pd.read_csv("data/story_user_comment.csv", low_memory=True)

In [30]:
story_user_comment.head()

Unnamed: 0,user,story,comment
0,21,14356377,1
1,21,15131370,1
2,21,15196309,1
3,47,15601729,1
4,47,14023198,1


### Using One Hot Transaction

In [31]:
# ! pip install mlxtend

In [32]:
from mlxtend.preprocessing import OnehotTransactions

In [33]:
def OHE_Matrix( df ) :

    g2 = df.groupby(["user"], as_index = False)
    
    Itemset = []
    user = []
    for item in list(g2.groups.keys()) :
        Itemset.append( list(g2.get_group(item)["story"]))
        user.append(item) 
        
    oht = OnehotTransactions()
    u = oht.fit(Itemset).transform(Itemset)
    
    Matrix = pd.DataFrame(u, columns = oht.columns_)
    Matrix["user"] = user
    Matrix = Matrix.set_index("user")
    
    return Matrix

In [34]:
matrix1 = OHE_Matrix(story_user_comment)

In [35]:
matrix1.head()

Unnamed: 0_level_0,13296502,13297792,13301832,13309025,13309610,13312629,13317902,13319904,13326535,13326792,...,15897809,15900551,15902054,15904265,15905048,15908812,15909395,15913250,15916121,15919115
user,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0003,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
01000001,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
010001001010,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
01096232042,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
010a,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


### Using Sparse Matrix

Typically, these matrices are sparse matrix and require us to use sparse data structures to stores them

In [36]:
n_users = story_user_comment.user.unique().shape[0]
n_stories = story_user_comment.story.unique().shape[0]

In [37]:
print('Number of users: {}'.format(n_users))
print('Number of stories: {}'.format(n_stories))
print('Sparsity: {:4.3f}%'.format(float(story_user_comment.shape[0]) / float(n_users*n_stories)))

Number of users: 23230
Number of stories: 969
Sparsity: 0.002%


In [44]:
def Sparse_Matrix(df):
    
    # Create mappings
    user_to_index = {}
    index_to_user = {}
    for (index, user) in enumerate(df.user.unique().tolist()):
        user_to_index[user] = index
        index_to_user[index] = user

    story_to_index = {}
    index_to_story = {}
    for (index, story) in enumerate(df.story.unique().tolist()):
        story_to_index[story] = index
        index_to_story[index] = story
        
    # Create a map id function
    def map_ids(row, mapper):
        return mapper[row]
    
    # Apply the map id function 
    I = df.user.apply(map_ids, args=[user_to_index]).as_matrix()
    J = df.story.apply(map_ids, args=[story_to_index]).as_matrix()
    V = np.ones(I.shape[0])
    
    # Create the Matrix
    story_user = sparse.coo_matrix((V, (I, J)), dtype=np.float64)
    story_user_matrix = story_user.tocsr()
    
    return story_user_matrix

In [45]:
matrix2 = Sparse_Matrix(story_user_comment)

In [46]:
matrix2

<23230x969 sparse matrix of type '<class 'numpy.float64'>'
	with 50975 stored elements in Compressed Sparse Row format>