In [1]:
import nltk
import pandas as pd
import os

# Sample Dataset

In [2]:
rawdf = pd.read_csv('../data/boardgame/boardgame-comments-sample.csv')
rawdf.columns = ['userID','gameID','rating','comment']
rawdf.head(10)

Unnamed: 0,userID,gameID,rating,comment
0,25308,37111,8.5,++++ Thematic +++ Bluff - Many randomness I ...
1,54313,12,8.0,"""well, that ugly artwork certainly helped imme..."
2,38165,50,8.0,LOVE this game! If only the GF would play it ...
3,56936,1198,5.0,"SET is a good game, in small doses (in my case..."
4,47675,164153,6.0,"So much better than Descent. Better theme, imp..."
5,94529,31260,8.0,Very tense and fun and I love pondering my str...
6,122420,320,6.0,A classic.
7,55342,161970,4.5,rating based on 2 player game.
8,147851,13,8.0,"A good game, but sometimes the dice are agains..."
9,171066,13291,7.0,Same rating as base game. New characters are g...


In [3]:
import numpy as np
from nltk.tokenize import sent_tokenize, word_tokenize

# Tokenize each comment and save in dict
def get_tokens(col):
    tokens = {}
    for i in col.index:
        token = word_tokenize(col[i])
        # make all lowercase and only take words that are longer than 2 characters
        token = [word.lower() for word in token if word.isalpha() and len(word)>1]
        tokens[i]  = token
    return tokens

In [4]:
%%time
tokens = get_tokens(rawdf['comment'])

CPU times: user 358 ms, sys: 7.03 ms, total: 365 ms
Wall time: 373 ms


In [5]:
rawdf['tokens'] = tokens.values()

In [6]:
%%time
df = pd.DataFrame()
m=0
for i in rawdf.index:
    for token in tokens[i]:
        row = dict(rawdf.iloc[i, :3]) 
        row['token'] = token
        df = df.append(row, ignore_index=True)
        m+=1

CPU times: user 1min 49s, sys: 847 ms, total: 1min 50s
Wall time: 1min 51s


In [7]:
word_set = list(set([i for l in list(tokens.values()) for i in l]))

In [8]:
dummies = pd.get_dummies([i for l in list(tokens.values()) for i in l],prefix=None)

In [9]:
rawdf.head()

Unnamed: 0,userID,gameID,rating,comment,tokens
0,25308,37111,8.5,++++ Thematic +++ Bluff - Many randomness I ...,"[thematic, bluff, many, randomness, really, li..."
1,54313,12,8.0,"""well, that ugly artwork certainly helped imme...","[well, that, ugly, artwork, certainly, helped,..."
2,38165,50,8.0,LOVE this game! If only the GF would play it ...,"[love, this, game, if, only, the, gf, would, p..."
3,56936,1198,5.0,"SET is a good game, in small doses (in my case...","[set, is, good, game, in, small, doses, in, my..."
4,47675,164153,6.0,"So much better than Descent. Better theme, imp...","[so, much, better, than, descent, better, them..."


In [10]:
id_cols = rawdf[['gameID', 'userID', 'rating']]

In [11]:
id_cols.columns = ['gameID', 'userID', 'Rating']
onehot = pd.concat([id_cols, dummies], axis=1)
one = onehot.groupby(['gameID', 'userID', 'Rating']).sum().reset_index();

In [14]:
one

Unnamed: 0,gameID,userID,Rating,aaahhh,aarg,abbey,abilities,ability,able,aboms,...,youre,yourself,yr,yspahan,yup,zertz,zman,zombie,zombies,zooleretto
0,3.0,24732.0,7.00,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,3.0,27836.0,9.00,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,3.0,69476.0,6.00,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,3.0,100693.0,6.00,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,3.0,130258.0,7.00,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,5.0,27003.0,6.00,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6,5.0,154001.0,7.50,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7,5.0,192801.0,8.00,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8,10.0,52258.0,7.20,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9,10.0,181797.0,6.00,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [12]:
df.to_csv('token-long-sample.csv', index=False)
one.to_csv('onehot-sample.csv', index=False)

# Full Dataset

In [13]:
full = pd.read_csv('C:/Users/Kenny/projects/pds/NLP-boardgames/data/boardgame-comments-english.csv')

FileNotFoundError: File b'C:/Users/Kenny/projects/pds/NLP-boardgames/data/boardgame-comments-english.csv' does not exist

In [None]:
full.info()

In [None]:
full.isnull().any()

In [None]:
full.columns = ['userID', 'gameID', 'rating', 'comment']

In [None]:
%%time
tokens = get_tokens(full['comment'])

In [None]:
%%time
df = pd.DataFrame()
m=0
for i in rawdf.index:
    for token in tokens[i]:
        row = dict(rawdf.iloc[i, :3]) 
        row['token'] = token
        df = df.append(row, ignore_index=True)
        m+=1

In [None]:
dummies = pd.get_dummies(df['token'], prefix=None)
id_cols = df[['gameID', 'userID', 'rating']]
id_cols.columns = ['gameID', 'userID', 'Rating']
onehot = pd.concat([id_cols, dummies], axis=1)
one = onehot.groupby(['gameID', 'userID', 'Rating']).sum().reset_index()
one

In [None]:
df.to_csv('token-long-full.csv', index=False)
one.to_csv('onehot-full.csv', index=False)