In [1]:
import pandas as pd
import os
import os.path as osp
import math
import numpy as np

In [2]:
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.metrics import explained_variance_score

from sklearn.linear_model import LinearRegression

In [3]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer

In [4]:
def do_RMSE(y_pred, y_gt):
    rmse_value = math.sqrt(np.mean((y_pred - y_gt)**2))
    print("RMSE: {0:.2f}".format(rmse_value))
    return rmse_value

In [5]:
def process_df(train_df):
    doc_list = train_df['식사내용'].apply(lambda x: ' '.join(x[:-1].split(','))).tolist()
    time_list = train_df['time'].tolist()
    for i in range(len(doc_list)):
        doc_list[i] = "{} {}".format(time_list[i], doc_list[i])
    Y = train_df['수량'].values.reshape(-1,1)
    return doc_list, Y

In [6]:
def get_data(data_path):
    train_df = pd.read_csv(data_path, sep='\t')
    train_df.info()
    doc_list, Y = process_df(train_df)
    return doc_list, Y

In [7]:
train_data_path = './train_data.tsv'
train_df = pd.read_csv(train_data_path, sep='\t')

In [8]:
train_df.head(5)

Unnamed: 0,일자,time,식사내용,매출일자,수량
0,20030301,아침,"과일샐러드,닭죽,돈육마늘장조림,떡만두국,부추김무침,쌀밥,딸기잼(중),비엔나구이,스크...",20030301.0,37.472924
1,20030301,저녁,"감자으깸샐러드,비프까스,스위트피클,쌀밥,옥수수스프,",20030301.0,19.566787
2,20030301,점심,"골뱅이야채무침,새우맛살튀김,쌀밥(사무직),열무겉절이,칼국수,",20030301.0,31.191336
3,20030302,아침,"계란죽,곤약멸치조림,김치국,마카로니샐러드,쌀밥,오징어회무침,딸기잼(중),삶은계란,야...",20030302.0,36.101083
4,20030302,저녁,"계란탕,단무지잔파무침,자장소스,잡채밥,탕수만두,",20030302.0,21.949458


In [9]:
train_df.groupby('time')['수량'].sum()

time
아침     172889.386282
저녁      99269.891697
점심     141973.574007
점심2     57353.140794
Name: 수량, dtype: float64

In [10]:
contents, data_y = get_data(train_data_path)
print(len(contents))
print(contents[0])

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20606 entries, 0 to 20605
Data columns (total 5 columns):
일자      20606 non-null int64
time    20606 non-null object
식사내용    20606 non-null object
매출일자    20606 non-null float64
수량      20606 non-null float64
dtypes: float64(2), int64(1), object(2)
memory usage: 805.0+ KB
20606
아침 과일샐러드 닭죽 돈육마늘장조림 떡만두국 부추김무침 쌀밥 딸기잼(중) 비엔나구이 스크램블에그(경양식) 야채샐러드 크림스프(경양식) 토스트&모닝빵


In [11]:
def do_vectorizer(vectorizer, contents):
    return vectorizer.fit_transform(contents)

In [12]:
def split_data(data_X, data_y):
    X_train, X_test, y_train, y_test = train_test_split(data_X, data_y, test_size=0.2, random_state=818)
    return X_train, X_test, y_train, y_test

In [23]:
def do_train(vectorizer_objs, model_objs, contents, data_y):
    for each_vectorizer in vectorizer_objs:
        data_X = do_vectorizer(each_vectorizer, contents)
        X_train, X_test, y_train, y_test = split_data(data_X, data_y)
        for each_model in model_objs:
            each_model.fit(X_train, y_train)
            print("vectorizer name : {}".format(str(each_vectorizer)))
            print("model name : {}".format(str(each_model)))
            do_RMSE(y_pred=each_model.predict(X_test), y_gt=y_test)
            do_RMSE(y_pred=each_model.predict(X_train), y_gt=y_train)
            print("="*50)

In [14]:
vectorizer_objs = [TfidfVectorizer(max_features=30,ngram_range=range(1, 3)),
                   TfidfVectorizer(max_features=100,ngram_range=range(1, 3)),
                   TfidfVectorizer(max_features=500,ngram_range=range(1, 3)),
                   TfidfVectorizer(max_features=1000,ngram_range=range(1, 3)),
                   TfidfVectorizer(max_features=2000,ngram_range=range(1, 3)),
                   CountVectorizer(max_features=30, ngram_range=range(1, 3)),
                   CountVectorizer(max_features=100, ngram_range=range(1, 3)),
                   CountVectorizer(max_features=500, ngram_range=range(1, 3)),
                   CountVectorizer(max_features=1000, ngram_range=range(1, 3)),
                   CountVectorizer(max_features=2000, ngram_range=range(1, 3))
                  ]

In [19]:
model_objs = [LinearRegression(normalize=False), LinearRegression(normalize=True)]

In [24]:
do_train(vectorizer_objs, model_objs, contents, data_y)

vectorizer name : TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=30, min_df=1,
        ngram_range=range(1, 3), norm='l2', preprocessor=None,
        smooth_idf=True, stop_words=None, strip_accents=None,
        sublinear_tf=False, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, use_idf=True, vocabulary=None)
model name : LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)
RMSE: 8.91
RMSE: 8.93
vectorizer name : TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=30, min_df=1,
        ngram_range=range(1, 3), norm='l2', preprocessor=None,
        smooth_idf=True, stop_words=None, strip_accents=None,
        sublinear_tf=False, token_pattern='(?u)\\b\\w\\w+\\b',
        tok

vectorizer name : CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=500, min_df=1,
        ngram_range=range(1, 3), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)
model name : LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)
RMSE: 8.52
RMSE: 8.36
vectorizer name : CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=500, min_df=1,
        ngram_range=range(1, 3), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)
model name : LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=True)
RMSE: