In [1]:
# library import
import pandas as pd
import polars as pl
import numpy as np
import matplotlib.pyplot as plt
import torch
%matplotlib inline
import seaborn as sns
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import KFold
from sklearn.utils import shuffle
import os
import re
import glob
import shutil
import gc
from pathlib import Path

# showing module
from IPython.display import display

# output display option adjustment
# precision of floating point in numpy
np.set_printoptions(suppress=True, precision=4)

# precision of floating point in pandas
pd.options.display.float_format = '{:.4f}'.format

# display all columns in dataframe
pd.set_option("display.max_columns", None)

# seaborn theme + default figure size
# NOTE: this must run BEFORE the matplotlib font-size override below.
# sns.set() re-applies seaborn's plotting context, which writes its own
# "font.size" into rcParams and would silently undo an earlier override.
sns.set(rc={'figure.figsize': (12, 5)});

# default font size in graph (set after sns.set() so that it actually sticks)
plt.rcParams["font.size"] = 14

# graph display
# NOTE(review): this opens an empty 12x5 figure that nothing in this cell
# draws on (the cell output reports a figure with 0 Axes); the default figure
# size is already configured through sns.set() above. Confirm whether it is
# still needed.
plt.figure(figsize=(12,5));

# random seed
random_seed = 45

<Figure size 1200x500 with 0 Axes>

In [4]:
# create the input_dir (input directory)
# A notebook has no real `__file__`; the previous expression
# os.path.dirname(os.path.abspath('__file__')) used the *string* '__file__'
# and therefore always resolved to the kernel's working directory.
# Ask for that directory explicitly instead.
current_note_path = os.getcwd()
INPUT_DIR = os.path.join(current_note_path, "data")

# create INPUT_DIR if it does not exist yet (no-op when it already exists)
os.makedirs(INPUT_DIR, exist_ok=True)

# output_dir (output directory) creation
OUTPUT_DIR = os.path.join(current_note_path, 'outputs')

# create OUTPUT_DIR if it does not exist yet (no-op when it already exists)
os.makedirs(OUTPUT_DIR, exist_ok=True)

In [5]:
# move csv files found under the notebook directory to the `data` directory (=folder)
# This cell is safe to re-run: csv files that already live under `data`
# (or under `outputs`) are left where they are, so it no longer needs to be
# commented out after the first run.
_excluded_dirs = (Path(INPUT_DIR), Path(OUTPUT_DIR))

# rglob() is recursive, so it also finds files inside `data` and `outputs`;
# filter those out and sort for a deterministic processing order.
unique_dir_names = sorted(
    f for f in Path(current_note_path).rglob('*.csv')
    if not any(excluded in f.parents for excluded in _excluded_dirs)
)

for file in unique_dir_names:
    # shutil.move() raises if a file with the same name is already present in
    # the destination directory, so skip such files instead of crashing.
    if (Path(INPUT_DIR) / file.name).exists():
        print(f'skipped file (already exists in data directory): {file}')
        continue
    print(f'moved file: {file}')
    shutil.move(f'{file}', f'{INPUT_DIR}')

moved file: /Users/satoshiido/Documents/programming/kaggle/student_performance_from_game_play/predict-student-performance-from-game-play/train_labels.csv
moved file: /Users/satoshiido/Documents/programming/kaggle/student_performance_from_game_play/predict-student-performance-from-game-play/train.csv
moved file: /Users/satoshiido/Documents/programming/kaggle/student_performance_from_game_play/predict-student-performance-from-game-play/sample_submission.csv
moved file: /Users/satoshiido/Documents/programming/kaggle/student_performance_from_game_play/predict-student-performance-from-game-play/test.csv


In [2]:
# Polars helper so csv files can be read by bare name, without spelling out the directory
def read_csv(name, **kwargs):
    """Read ``<INPUT_DIR>/<name>.csv`` with polars and return the result.

    Parameters
    ----------
    name : str
        File name without the ``.csv`` extension.
    **kwargs
        Forwarded unchanged to ``pl.read_csv``.

    The resolved path is printed before the file is read.
    """
    csv_path = os.path.join(INPUT_DIR, name + '.csv')
    print(f'Load: {csv_path}')
    frame = pl.read_csv(csv_path, **kwargs)
    return frame

In [5]:
# Load the three csv files from the data directory, in this order:
# train.csv, test.csv, sample_submission.csv
train, test, sample = (
    read_csv(csv_name) for csv_name in ('train', 'test', 'sample_submission')
)

Load: /Users/satoshiido/Documents/programming/kaggle/student-performance-from-game-play/predict-student-performance-from-game-play/data/train.csv
Load: /Users/satoshiido/Documents/programming/kaggle/student-performance-from-game-play/predict-student-performance-from-game-play/data/test.csv
Load: /Users/satoshiido/Documents/programming/kaggle/student-performance-from-game-play/predict-student-performance-from-game-play/data/sample_submission.csv


In [15]:
# preview the first three rows of the training frame
train.head(n=3)

session_id,index,elapsed_time,event_name,name,level,page,room_coor_x,room_coor_y,screen_coor_x,screen_coor_y,hover_duration,text,fqid,room_fqid,text_fqid,fullscreen,hq,music,level_group
i64,i64,i64,str,str,i64,str,f64,f64,f64,f64,f64,str,str,str,str,i64,i64,i64,str
20090312431273200,0,0,"""cutscene_click…","""basic""",0,,-413.991405,-159.314686,380.0,494.0,,"""undefined""","""intro""","""tunic.historic…","""tunic.historic…",0,0,1,"""0-4"""
20090312431273200,1,1323,"""person_click""","""basic""",0,,-413.991405,-159.314686,380.0,494.0,,"""Whatcha doing …","""gramps""","""tunic.historic…","""tunic.historic…",0,0,1,"""0-4"""
20090312431273200,2,831,"""person_click""","""basic""",0,,-413.991405,-159.314686,380.0,494.0,,"""Just talking t…","""gramps""","""tunic.historic…","""tunic.historic…",0,0,1,"""0-4"""


In [None]:
# Shorthand constants for frequently used strings, so they can be
# referenced by name instead of being retyped.
TARGET = 'target'
# NOTE(review): the `train.head(3)` preview lists `session_id` but no
# `customer_ID` column. Confirm this constant matches the data in use.
customer_ID = 'customer_ID'