In [1]:
import pandas as pd
import os
import os.path as osp
import math
import numpy as np

In [2]:
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.metrics import explained_variance_score

from sklearn.linear_model import LinearRegression

In [3]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer

In [4]:
def do_RMSE(y_pred, y_gt):
    """Compute, print and return the root-mean-squared error of a prediction.

    Both inputs are converted to flat float arrays before being subtracted,
    so a prediction of shape (n,) and a ground truth of shape (n, 1) are
    compared element by element instead of being broadcast into an (n, n)
    matrix (which would silently yield a wrong error value).

    Args:
        y_pred: Predicted values (array-like, any shape).
        y_gt: Ground-truth values (array-like with the same number of elements).

    Returns:
        The RMSE as a Python float. The value is also printed with two decimals.

    Raises:
        ValueError: Raised by NumPy if the flattened inputs cannot be
            broadcast against each other (e.g. lengths 3 and 2).
    """
    pred = np.asarray(y_pred, dtype=float).ravel()
    truth = np.asarray(y_gt, dtype=float).ravel()
    rmse_value = math.sqrt(np.mean((pred - truth) ** 2))
    print("RMSE: {0:.2f}".format(rmse_value))
    return rmse_value

In [5]:
def process_df(train_df):
    """Split the raw frame into whitespace-separated documents and targets.

    Each '식사내용' entry is a comma-separated string. Trailing commas are
    stripped and the remaining commas are replaced by single spaces, giving
    one space-separated document per row. Only trailing commas are removed,
    so an entry that does not end with a comma keeps its last character.

    Args:
        train_df: DataFrame with a string column '식사내용' and a numeric
            column '수량'.

    Returns:
        A tuple (doc_list, Y): doc_list is a list of str, one per row, and
        Y is the '수량' column as an array of shape (n_rows, 1).
    """
    doc_list = [
        text.rstrip(',').replace(',', ' ')
        for text in train_df['식사내용']
    ]
    Y = train_df['수량'].values.reshape(-1, 1)
    return doc_list, Y

In [6]:
def get_data(data_path):
    """Read a tab-separated file and return its documents and targets.

    The file is loaded into a DataFrame, a summary of the frame is printed
    via ``DataFrame.info()``, and the frame is handed to ``process_df``.

    Args:
        data_path: Path of the tab-separated file to read.

    Returns:
        The (doc_list, Y) pair produced by ``process_df``.
    """
    frame = pd.read_csv(data_path, sep='\t')
    frame.info()  # prints the column / dtype / non-null summary
    documents, targets = process_df(frame)
    return documents, targets

In [7]:
# Location of the training data, given relative to the working directory.
train_data_path = './train_data.tsv'

In [8]:
# Load the documents and their targets from the training file.
contents, data_y = get_data(train_data_path)
# Sanity check: number of documents, then the first document.
print(len(contents))
print(contents[0])

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20606 entries, 0 to 20605
Data columns (total 5 columns):
일자      20606 non-null int64
time    20606 non-null object
식사내용    20606 non-null object
매출일자    20606 non-null float64
수량      20606 non-null float64
dtypes: float64(2), int64(1), object(2)
memory usage: 805.0+ KB
20606
과일샐러드 닭죽 돈육마늘장조림 떡만두국 부추김무침 쌀밥 딸기잼(중) 비엔나구이 스크램블에그(경양식) 야채샐러드 크림스프(경양식) 토스트&모닝빵


In [9]:
def do_vectorizer(vectorizer, contents):
    """Fit ``vectorizer`` on ``contents`` and return the transformed result.

    Args:
        vectorizer: Object exposing ``fit_transform``.
        contents: The documents passed to ``fit_transform``.

    Returns:
        Whatever ``vectorizer.fit_transform(contents)`` returns.
    """
    features = vectorizer.fit_transform(contents)
    return features

In [10]:
def split_data(data_X, data_y, test_size=0.2, random_state=None):
    """Split features and targets into train and test partitions.

    Args:
        data_X: Feature matrix (one row per sample).
        data_y: Targets aligned with ``data_X``.
        test_size: Share of the samples held out for testing; defaults to
            0.2, the value that was previously hard-coded.
        random_state: Seed forwarded to ``train_test_split``. The default
            ``None`` keeps the previous unseeded behaviour; pass an int to
            make the split reproducible across runs.

    Returns:
        A tuple (X_train, X_test, y_train, y_test).
    """
    X_train, X_test, y_train, y_test = train_test_split(
        data_X, data_y, test_size=test_size, random_state=random_state
    )
    return X_train, X_test, y_train, y_test

In [15]:
def do_train(vectorizer_objs, model_objs, contents, data_y):
    """Fit and evaluate every model on the features of every vectorizer.

    For each vectorizer the documents are vectorized and split into train
    and test partitions; each model is then fitted on the train partition,
    and its ``score`` on the test partition and its RMSE are printed.

    Args:
        vectorizer_objs: Iterable of vectorizers accepted by ``do_vectorizer``.
        model_objs: Iterable of estimators exposing ``fit``, ``score`` and
            ``predict``. The same estimator objects are re-fitted for each
            vectorizer.
        contents: The documents to vectorize.
        data_y: Targets aligned with ``contents``.

    Returns:
        None. Results are reported on stdout only.
    """
    for each_vectorizer in vectorizer_objs:
        # NOTE(review): the vectorizer is fitted on all documents before the
        # split, so test documents influence the learned vocabulary/weights.
        data_X = do_vectorizer(each_vectorizer, contents)
        X_train, X_test, y_train, y_test = split_data(data_X, data_y)
        for each_model in model_objs:
            each_model.fit(X_train, y_train)
            score = each_model.score(X_test, y_test)
            # Use automatic numbering for both fields: mixing "{}" with
            # "{0:.2f}" raises ValueError in str.format.
            print("model name {}: \tscore : {:.2f}".format(str(each_model), score))
            do_RMSE(y_pred=each_model.predict(X_test), y_gt=y_test)

In [12]:
# Both vectorizers keep at most 2000 features and use unigrams plus bigrams.
# ngram_range is documented as a (min_n, max_n) tuple; range(1, 3) unpacked to
# the same (1, 2) bounds but is rejected by scikit-learn versions that
# validate constructor parameters.
tfidf_vectorizer = TfidfVectorizer(max_features=2000, ngram_range=(1, 2))
cnt_vectorizer = CountVectorizer(max_features=2000, ngram_range=(1, 2))
vectorizer_objs = [tfidf_vectorizer, cnt_vectorizer]

In [13]:
# Estimators to fit and evaluate; currently a single linear regression.
model_objs = [LinearRegression()]

In [16]:
# Run every model against every vectorizer's features and report the results.
do_train(vectorizer_objs, model_objs, contents, data_y)

model name LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False): 	score : 0.47255860958466595
RMSE: 8.99
model name LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False): 	score : 0.45350918598380113
RMSE: 9.27
