## Behaviors
Convert the `behaviors.tsv` files into CSV files that we will use for training, validation, and testing.

### Import Data & Libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from datetime import datetime

In [2]:
# Move into the folder that holds the train/ and val/ data directories.
# Set the BEHAVIORS_DATA_DIR environment variable to point at your own copy
# of the data; when it is unset, the original author's local path is used.
import os

DATA_DIR = os.environ.get(
    "BEHAVIORS_DATA_DIR",
    "C:/Users/tanmb/Downloads/SJSU/CMPE_256/project/project_data",
)
os.chdir(DATA_DIR)

In [3]:
# Load the training behaviors log; the same read is repeated for every
# split (train, test, validation). Column names are supplied explicitly
# rather than taken from the file.
behaviors_train = pd.read_csv(
    "./train/behaviors.tsv",
    sep='\t',
    header=None,
    names=['impression_id', 'user_id', 'timestamp', 'history', 'impressions'],
)
behaviors_train.head()

Unnamed: 0,impression_id,user_id,timestamp,history,impressions
0,1,U87243,11/10/2019 11:30:54 AM,N8668 N39081 N65259 N79529 N73408 N43615 N2937...,N78206-0 N26368-0 N7578-0 N58592-0 N19858-0 N5...
1,2,U598644,11/12/2019 1:45:29 PM,N56056 N8726 N70353 N67998 N83823 N111108 N107...,N47996-0 N82719-0 N117066-0 N8491-0 N123784-0 ...
2,3,U532401,11/13/2019 11:23:03 AM,N128643 N87446 N122948 N9375 N82348 N129412 N5...,N103852-0 N53474-0 N127836-0 N47925-1
3,4,U593596,11/12/2019 12:24:09 PM,N31043 N39592 N4104 N8223 N114581 N92747 N1207...,N38902-0 N76434-0 N71593-0 N100073-0 N108736-0...
4,5,U239687,11/14/2019 8:03:01 PM,N65250 N122359 N71723 N53796 N41663 N41484 N11...,N76209-0 N48841-0 N67937-0 N62235-0 N6307-0 N3...


In [4]:
# Load the validation behaviors log. Column names are supplied explicitly
# rather than taken from the file.
behaviors_val = pd.read_csv(
    "./val/behaviors.tsv",
    sep='\t',
    header=None,
    names=['impression_id', 'user_id', 'timestamp', 'history', 'impressions'],
)
behaviors_val.head()

Unnamed: 0,impression_id,user_id,timestamp,history,impressions
0,1,U134050,11/15/2019 8:55:22 AM,N12246 N128820 N119226 N4065 N67770 N33446 N10...,N91737-0 N30206-0 N54368-0 N117802-0 N18190-0 ...
1,2,U254959,11/15/2019 11:42:35 AM,N34011 N9375 N67397 N7936 N118985 N109453 N103...,N119999-0 N24958-0 N104054-0 N33901-0 N9250-0 ...
2,3,U499841,11/15/2019 9:08:21 AM,N63858 N26834 N6379 N85484 N15229 N65119 N1047...,N18190-0 N89764-0 N91737-0 N54368-0 N49978-1 N...
3,4,U107107,11/15/2019 5:50:31 AM,N12959 N8085 N18389 N3758 N9740 N90543 N129790...,N122944-1 N18190-0 N55801-0 N59297-0 N128045-0...
4,5,U492344,11/15/2019 5:02:25 AM,N109183 N48453 N85005 N45706 N98923 N46069 N35...,N64785-0 N82503-0 N32993-0 N122944-0 N29160-0 ...


In [8]:
def clean_behaviors_to_csv(df, path, test_set) :
    """Flatten a behaviors frame to one row per impression and write it to CSV.

    Every input row carries a whitespace-separated list of impressions. The
    list is exploded so each impression gets its own row, ids are converted
    to integers, and the timestamp becomes whole seconds elapsed since
    2019-11-09 00:00:00.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with the columns 'impression_id', 'user_id', 'timestamp',
        'history' and 'impressions'. It is not modified.
    path : str
        Output path prefix (no extension).
    test_set : bool
        False: impressions look like 'N123-1' (news id, click label). Rows are
        written to ``path + '_clicked.csv'`` (label 1) and
        ``path + '_not_clicked.csv'`` (label 0) with the columns
        user_id, timestamp, news_id, impression.
        True: no click label is read, so no 'impression' column is produced
        and all rows go to a single ``path + '.csv'`` with the columns
        user_id, timestamp, news_id.

    Returns
    -------
    None
        The result is only written to disk.
    """
    # reference instant for the "seconds since first data collection" value
    min_time = datetime(2019, 11, 9, 0, 0, 0)

    # drop() returns a new frame, so the caller's DataFrame is left untouched
    rows = df.drop(columns = ['impression_id', 'history'])

    # split impressions into individual news-impression tokens ...
    rows['impressions'] = rows['impressions'].str.split()
    # ... and give each news-impression its own row
    rows = rows.explode('impressions', ignore_index = True)

    # The news id is the part before '-'. It is extracted for every split:
    # previously it was only created for labelled data, so test_set=True
    # always failed with a KeyError further down.
    parts = rows['impressions'].str.split('-')
    rows['news_id'] = parts.str[0]
    if not test_set :
        # the click label (0/1) only exists for labelled splits
        rows['impression'] = parts.str[1].astype(int)

    # convert timestamp into seconds since first data collection
    rows['timestamp'] = pd.to_datetime(rows['timestamp'], format='%m/%d/%Y %I:%M:%S %p')
    rows['timestamp'] = (rows['timestamp'] - min_time).dt.total_seconds().astype('int')

    # convert ids to integers, remove the letter prefix (literal match, not a regex)
    rows['user_id'] = rows['user_id'].str.replace('U', '', regex = False).astype(int)
    rows['news_id'] = rows['news_id'].str.replace('N', '', regex = False).astype(int)

    # the raw token column is no longer needed
    rows = rows.drop(columns = 'impressions')

    if test_set :
        # without labels there is nothing to split on: write a single file
        rows.to_csv(path + '.csv', index = False)
        return

    clicked = (rows['impression'] == 1)
    not_clicked = (rows['impression'] == 0)

    # write to csv based on clicked or not
    rows[clicked].to_csv(path + '_clicked.csv', index = False)
    rows[not_clicked].to_csv(path + '_not_clicked.csv', index = False)

In [10]:
# Export the labelled splits: each call writes <prefix>_clicked.csv and
# <prefix>_not_clicked.csv.
clean_behaviors_to_csv(df=behaviors_train, path='full_train', test_set=False)
clean_behaviors_to_csv(df=behaviors_val, path='full_val', test_set=False)