# Assignment 1

## Overhead

In [74]:
# Imports
import gzip
import math
import torch
import spacy
import warnings
import pandas as pd
import numpy as np

from datetime import date
from rake_nltk import Rake
from collections import defaultdict
from tqdm.notebook import tqdm
from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from transformers import BertTokenizer, BertModel

In [75]:
# Notebook setup
# Hook tqdm into pandas.
tqdm.pandas()

# Use the GPU when torch reports one, otherwise stay on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Blanket-silence warnings for the rest of the session.
warnings.simplefilter("ignore")

In [76]:
# Helper model load
# spaCy pipeline used by later cells.
nlp = spacy.load("en_core_web_sm")

# Pretrained uncased BERT: tokenizer plus encoder, with the encoder moved onto
# the `device` selected in the setup cell.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased').to(device)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [6]:
# Data load
def readJSON(path, total=175000):
    """Stream records from a gzip-compressed text file, one record per line.

    Parameters
    ----------
    path : str
        Location of the gzip file; it is opened in text mode as UTF-8.
    total : int, optional
        Expected number of lines. Only used to size the progress bar.
        Defaults to 175000.

    Yields
    ------
    tuple
        ``(userID, gameID, record)`` where ``record`` is the evaluated line
        and ``gameID`` is ``None`` when the record has no ``'gameID'`` key.

    Raises
    ------
    KeyError
        If a record has no ``'userID'`` key.
    """
    # The context manager closes the handle once iteration finishes or the
    # generator is closed, instead of leaving the file open.
    with gzip.open(path, 'rt', encoding='utf-8') as f:
        for l in tqdm(f, total=total):
            # SECURITY: eval() executes whatever the line contains, so this is
            # only acceptable for trusted data files.
            # TODO(review): if every line is a plain Python literal,
            # ast.literal_eval would be the safe replacement -- confirm first.
            d = eval(l)
            u = d['userID']
            # A missing gameID is tolerated and reported as None.
            g = d.get('gameID')
            yield u, g, d
        
# allHours keeps every record in file order; userHours groups the same
# records by the user who wrote them.
allHours, userHours = [], defaultdict(list)
for user, game, d in readJSON("./../data/train.json.gz"):
    userHours[user].append(d)
    allHours.append(d)

  0%|          | 0/175000 [00:00<?, ?it/s]

## Processing

In [7]:
# ETL
# One row per loaded record; preview the first three rows as a sanity check.
df = pd.DataFrame(data=allHours)
df.head(n=3)

Unnamed: 0,hours,gameID,hours_transformed,early_access,date,text,userID,found_funny,user_id,compensation
0,0.3,g35322304,0.378512,False,2015-04-08,+1,u55351001,,,
1,63.5,g49368897,6.011227,False,2017-05-20,If you want to sit in queue for 10-20min and h...,u70666506,1.0,7.656119803040877e+16,
2,0.2,g73495588,0.263034,False,2017-01-27,I was really not a fan of the gameplay. Games ...,u18612571,,,


In [35]:
# getting played
# Positive examples: every (userID, gameID) pair present in `df`, labelled 1.
# `.assign` returns a new frame, so the label is never written onto a
# selection of `df` (the pattern behind pandas' SettingWithCopyWarning), and
# the cell gives the same result no matter how often it is re-run.
df_played = df[['userID', 'gameID']].assign(played=1)

In [30]:
# Create non existing
# For every played (user, game) row, draw one game that user has NOT played,
# so the negative set has exactly as many rows as the positive set.
played_users = df_played['userID'].to_numpy()
# Candidates come from the played-game column itself (duplicates kept), so a
# game is drawn in proportion to how often it appears there. Converting once,
# outside the loop, avoids re-converting the Series on every draw.
game_pool = df_played['gameID'].to_numpy()
existing_pairs = set(zip(played_users, game_pool))

# Rows are collected in a plain list and the frame is built once at the end:
# DataFrame.append copied the whole frame on every iteration (quadratic in the
# number of rows) and was removed in pandas 2.0.
not_played_rows = []
for user in tqdm(played_users):
    not_played_game = None
    # Re-draw until the candidate is non-empty and not already played by this
    # user. NOTE: this never terminates for a user who has played every game
    # in the pool.
    while not not_played_game or (user, not_played_game) in existing_pairs:
        not_played_game = np.random.choice(game_pool)
    not_played_rows.append({'userID': user, 'gameID': not_played_game, 'played': 0})

df_not_played = pd.DataFrame(not_played_rows, columns=['userID', 'gameID', 'played'])

  0%|          | 0/175000 [00:00<?, ?it/s]

In [36]:
# Combining
# Stack positives and negatives, shuffle all rows, and renumber the index.
df_combined = (
    pd.concat([df_played, df_not_played])
    .sample(frac=1)
    .reset_index(drop=True)
)
df_combined.head(3)

Unnamed: 0,userID,gameID,played
0,u66610671,g10773791,0
1,u08543832,g31190276,1
2,u78094706,g02141122,1


In [8]:
# Helpers
# Parse every record's ISO-formatted 'date' field once, then derive the
# earliest review date and the span between the earliest and latest one.
review_dates = [date.fromisoformat(review['date']) for review in allHours]
MIN_REVIEW_DATE = min(review_dates)
TIME_DIFF = max(review_dates) - MIN_REVIEW_DATE

def linearTransform(timestamp):
    """Rescale an ISO date string relative to the observed review-date span.

    Parameters
    ----------
    timestamp : str
        A date in ISO format, parsed with ``date.fromisoformat``.

    Returns
    -------
    float
        ``(parsed date - MIN_REVIEW_DATE) / TIME_DIFF``: 0.0 at the earliest
        review date and 1.0 at the latest.
    """
    elapsed = date.fromisoformat(timestamp) - MIN_REVIEW_DATE
    return elapsed / TIME_DIFF

def reviewExp(timestamp, rate=0.0001):
    """Exponentially scale a numeric value: ``exp(timestamp * rate)``.

    Parameters
    ----------
    timestamp : float
        The value to scale.
    rate : float, optional
        Multiplier applied before exponentiation. Defaults to 0.0001, the
        constant that was previously hard-coded.

    Returns
    -------
    float
        ``math.exp(timestamp * rate)``.
    """
    # NOTE(review): the feature-engineering cell feeds this the output of
    # linearTransform, which lies in [0, 1] for the loaded reviews; with the
    # default rate the result then only spans roughly [1.0, 1.0001]. Confirm
    # the intended input scale / rate.
    return math.exp(timestamp * rate)

In [13]:
# Feature Engineering
# Length of each review text.
df['text_length'] = df['text'].str.len()

# Pass each review date through linearTransform, then pass that result
# through reviewExp; the intermediate series is computed once and reused.
_review_date_linear = df['date'].transform(linearTransform)
df['review_date_linear'] = _review_date_linear
df['review_date_exp'] = _review_date_linear.transform(reviewExp)

## Testing