In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression

In [3]:
# Read the training and test CSVs from the working directory into two frames.
df, test_df = (pd.read_csv(csv_path) for csv_path in ('./train.csv', './test.csv'))

In [5]:
# Preview the first rows of the freshly loaded training frame.
df.head()

Unnamed: 0,id,date,store_nbr,family,sales,onpromotion
0,0,2013-01-01,1,AUTOMOTIVE,0.0,0
1,1,2013-01-01,1,BABY CARE,0.0,0
2,2,2013-01-01,1,BEAUTY,0.0,0
3,3,2013-01-01,1,BEVERAGES,0.0,0
4,4,2013-01-01,1,BOOKS,0.0,0


In [7]:
# Expand the raw date string into numeric calendar features on both frames.
for frame in (df, test_df):
    # errors='coerce' turns unparseable dates into NaT instead of raising.
    frame['date'] = pd.to_datetime(frame['date'], errors='coerce')

    calendar = frame['date'].dt
    frame['year'] = calendar.year
    frame['month'] = calendar.month
    frame['day'] = calendar.day
    frame['dayofweek'] = calendar.dayofweek

    # Drop the date column (not needed for modeling once the parts are extracted).
    frame.drop(columns='date', inplace=True)

In [9]:
# Preview the training frame after the date column was replaced by year/month/day/dayofweek.
df.head()

Unnamed: 0,id,store_nbr,family,sales,onpromotion,year,month,day,dayofweek
0,0,1,AUTOMOTIVE,0.0,0,2013,1,1,1
1,1,1,BABY CARE,0.0,0,2013,1,1,1
2,2,1,BEAUTY,0.0,0,2013,1,1,1
3,3,1,BEVERAGES,0.0,0,2013,1,1,1
4,4,1,BOOKS,0.0,0,2013,1,1,1


In [11]:
# Convert the product family to a categorical and store its integer codes
# as a separate numeric feature column.
family_categorical = df['family'].astype('category')
df['family'] = family_categorical
df['family_encoded'] = family_categorical.cat.codes

# Keep a handle on the categorical column; it is inspected in a later cell
# and outlives the column being dropped from `df`.
a = df['family']

df

Unnamed: 0,id,store_nbr,family,sales,onpromotion,year,month,day,dayofweek,family_encoded
0,0,1,AUTOMOTIVE,0.000,0,2013,1,1,1,0
1,1,1,BABY CARE,0.000,0,2013,1,1,1,1
2,2,1,BEAUTY,0.000,0,2013,1,1,1,2
3,3,1,BEVERAGES,0.000,0,2013,1,1,1,3
4,4,1,BOOKS,0.000,0,2013,1,1,1,4
...,...,...,...,...,...,...,...,...,...,...
3000883,3000883,9,POULTRY,438.133,0,2017,8,15,1,28
3000884,3000884,9,PREPARED FOODS,154.553,1,2017,8,15,1,29
3000885,3000885,9,PRODUCE,2419.729,148,2017,8,15,1,30
3000886,3000886,9,SCHOOL AND OFFICE SUPPLIES,121.000,8,2017,8,15,1,31


In [13]:
# Display the training `family` categorical (values, dtype and category list).
a

0                          AUTOMOTIVE
1                           BABY CARE
2                              BEAUTY
3                           BEVERAGES
4                               BOOKS
                      ...            
3000883                       POULTRY
3000884                PREPARED FOODS
3000885                       PRODUCE
3000886    SCHOOL AND OFFICE SUPPLIES
3000887                       SEAFOOD
Name: family, Length: 3000888, dtype: category
Categories (33, object): ['AUTOMOTIVE', 'BABY CARE', 'BEAUTY', 'BEVERAGES', ..., 'PREPARED FOODS', 'PRODUCE', 'SCHOOL AND OFFICE SUPPLIES', 'SEAFOOD']

In [15]:
# The text column is no longer needed now that `family_encoded` carries it numerically.
df.drop(columns='family', inplace=True)
df

Unnamed: 0,id,store_nbr,sales,onpromotion,year,month,day,dayofweek,family_encoded
0,0,1,0.000,0,2013,1,1,1,0
1,1,1,0.000,0,2013,1,1,1,1
2,2,1,0.000,0,2013,1,1,1,2
3,3,1,0.000,0,2013,1,1,1,3
4,4,1,0.000,0,2013,1,1,1,4
...,...,...,...,...,...,...,...,...,...
3000883,3000883,9,438.133,0,2017,8,15,1,28
3000884,3000884,9,154.553,1,2017,8,15,1,29
3000885,3000885,9,2419.729,148,2017,8,15,1,30
3000886,3000886,9,121.000,8,2017,8,15,1,31


In [17]:
# Pull the target out of the frame: pop() returns the `sales` column and removes
# it from `df` in place, leaving only feature columns behind.
y = df.pop('sales')

# X is the same object as df (no copy); every remaining column is used as a feature.
X = df

In [19]:
# Display the target series (`sales`) that was split off from the training frame.
y

0             0.000
1             0.000
2             0.000
3             0.000
4             0.000
             ...   
3000883     438.133
3000884     154.553
3000885    2419.729
3000886     121.000
3000887      16.000
Name: sales, Length: 3000888, dtype: float64

In [21]:
# Baseline model: ordinary least-squares linear regression with default settings.
model = LinearRegression()

In [23]:
# Fit on every training row; no hold-out split is made here.
# NOTE(review): X still contains the `id` column, which looks like a row
# identifier rather than a real feature -- confirm it is meant to be used.
model.fit(X, y)

In [25]:
# Preview the test frame (date parts already extracted, `family` still raw text).
test_df.head()

Unnamed: 0,id,store_nbr,family,onpromotion,year,month,day,dayofweek
0,3000888,1,AUTOMOTIVE,0,2017,8,16,2
1,3000889,1,BABY CARE,0,2017,8,16,2
2,3000890,1,BEAUTY,2,2017,8,16,2
3,3000891,1,BEVERAGES,20,2017,8,16,2
4,3000892,1,BOOKS,0,2017,8,16,2


In [27]:
# Encode the test families with the category list learned from the training data
# (`a` holds the training `family` categorical). Calling astype('category') on the
# test frame alone derives the codes from whichever families happen to appear in
# test, so the same family could receive a different integer than in training.
train_families = a.cat.categories

# Fail loudly instead of silently feeding the model a code it never saw.
unseen_families = set(test_df['family'].dropna().unique()) - set(train_families)
if unseen_families:
    raise ValueError(
        f"test_df contains families not present in training data: {sorted(unseen_families)}"
    )

test_df['family'] = pd.Categorical(test_df['family'], categories=train_families)
test_df['family_encoded'] = test_df['family'].cat.codes

test_df.head()

Unnamed: 0,id,store_nbr,family,onpromotion,year,month,day,dayofweek,family_encoded
0,3000888,1,AUTOMOTIVE,0,2017,8,16,2,0
1,3000889,1,BABY CARE,0,2017,8,16,2,1
2,3000890,1,BEAUTY,2,2017,8,16,2,2
3,3000891,1,BEVERAGES,20,2017,8,16,2,3
4,3000892,1,BOOKS,0,2017,8,16,2,4


In [29]:
# Remove the text column from the test frame; `family_encoded` replaces it.
test_df.drop(columns='family', inplace=True)

In [31]:
# Select the test columns in exactly the order the model was fitted on, so the
# prediction does not depend on how test_df's columns happen to be arranged.
test_features = test_df[X.columns]

# A linear model can extrapolate below zero, but sales cannot be negative
# (and negative values break log-based error metrics), so clip at 0.
y_pred = np.clip(model.predict(test_features), 0, None)

In [33]:
# Build the two-column submission table (id, predicted sales) and write it
# to disk without the DataFrame index.
test_df_with_id = test_df[['id']].assign(sales=y_pred)
test_df_with_id.to_csv('storesales2.csv', index=False)

In [35]:
# Display the array of predicted sales for the test rows.
y_pred

array([339.3282126 , 328.36246348, 393.74328925, ...,  83.51044227,
       377.93099269,  23.40565661])

=========================================================================================================
### SVR

In [None]:
from sklearn.svm import SVR

# Kernel SVR training cost grows faster than quadratically with the number of
# rows, so fitting on the full training set (about 3 million rows) is not
# practical. Train on a reproducible random subsample instead.
SVR_MAX_ROWS = 20_000
svr_rows = X.sample(n=min(SVR_MAX_ROWS, len(X)), random_state=42).index

# NOTE(review): SVR is sensitive to feature scale and the features are used
# unscaled here -- consider standardizing them before fitting.
# NOTE: this rebinds `model`, replacing the linear regression fitted earlier.
model = SVR(kernel='poly', C=5)
model.fit(X.loc[svr_rows], y.loc[svr_rows])