In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
%matplotlib inline

In [None]:
# Load the tab-separated training file and preview its first rows.
df = pd.read_csv('../input/train.tsv', sep='\t')
df.head()

In [None]:
# Turn every missing value into the literal string 'NaN', so later string
# operations (e.g. splitting category_name) never meet a float NaN.
df = df.replace(to_replace=np.nan, value='NaN')

In [None]:
# Show the column labels of the frame.
df.columns

In [None]:
# Number of distinct brand names (a 'NaN' placeholder counts as one value).
df['brand_name'].nunique(dropna=False)

In [None]:
# Number of distinct full category paths.
df['category_name'].nunique(dropna=False)

In [None]:
# Distinct item-condition ids present in the data.
df['item_condition_id'].unique()

In [None]:
import matplotlib

# Set the font size BEFORE drawing. Text picks up rcParams when it is
# created, so updating them after the plot call left this figure at the
# previous size and only changed figures drawn later.
matplotlib.rcParams.update({'font.size': 30})

# Row count per category, restricted to the 25 most frequent categories.
hist = (
    df.groupby(['category_name'], as_index=False)
      .count()
      .sort_values(by='train_id', ascending=False)[0:25]
)

f, ax = plt.subplots(1, 1, figsize=(15, 20))
sns.barplot(y=hist['category_name'], x=hist['train_id'], orient='h', ax=ax)
ax.set_xlabel('number of rows')
ax.set_ylabel('category_name')
ax.set_title('25 most frequent categories')
plt.show()

In [None]:
# Split every category path on '/' into its list of level names,
# producing one list per row.
category_name_list = [
    category_path.split('/') for category_path in df['category_name']
]

len(category_name_list)

In [None]:
# Peek at the first few split categories. Echoing the whole list would
# print one entry for every row of the data set.
category_name_list[:5]

In [None]:
# Expand the per-row level lists into a table with one column per level,
# then attach the first three levels to the main frame.
df_category_list = pd.DataFrame(category_name_list)
for level in range(3):
    df['category_%d' % level] = df_category_list[level]

In [None]:
# Work on a real copy. A plain `dfe = df` only binds a second name to the
# same frame, so the encoding and the column drop below would also rewrite
# `df` and make this cell fail with KeyError when run a second time.
dfe = df.copy()

# Replace each text column by integer codes. pd.factorize numbers the
# distinct values in order of first appearance and uses -1 for missing ones.
for column in ['name', 'brand_name', 'category_0', 'category_1', 'category_2']:
    dfe[column], _ = pd.factorize(df[column])

# Drop the identifier and the raw text columns that are not used as features.
dfe = dfe.drop(['train_id', 'category_name', 'item_description'], axis=1)

In [None]:
# Display the encoded frame.
dfe

## 解析

In [None]:
# Pairwise Pearson correlation between the columns of the encoded frame.
# NOTE(review): columns produced by pd.factorize hold arbitrary integer
# codes, so their correlations may not be meaningful -- confirm before
# drawing conclusions from them.
corrmat = dfe.corr(method='pearson')
corrmat

In [None]:
# Heat map of the correlation matrix; the colour scale is capped at 0.8.
fig, ax = plt.subplots(figsize=(12, 9))
sns.heatmap(data=corrmat, vmax=0.8, square=True, ax=ax)

## 推測

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn import svm

In [None]:
# Separate the regression target from the features. drop() returns a new
# frame, so `dfe` keeps its 'price' column and this cell can be re-run.
# The earlier `data = dfe` plus in-place drop removed 'price' from `dfe`
# itself, and a second run then raised KeyError.
target = dfe['price']
data = dfe.drop('price', axis=1)
data_names = data.columns

# Features as a plain integer array.
# NOTE(review): assumes every remaining column is integer-valued (factorize
# codes or raw ids such as item_condition_id) -- confirm.
data_list = data.values.astype(np.int64)

# Keep the target as float: casting prices to int64 silently truncated any
# fractional part.
target_list = target.values.astype(np.float64)

In [None]:
from sklearn import linear_model

# Explanatory variables and target variable.
X = data_list
Y = target_list

# Build the prediction model: a linear regression fitted on X and Y.
clf = linear_model.LinearRegression()
clf.fit(X, Y)

# Partial regression coefficients, one per feature, sorted ascending.
coefficient_table = pd.DataFrame({"Name": data_names,
                                  "Coefficients": clf.coef_})
print(coefficient_table.sort_values(by='Coefficients'))

# Intercept of the fitted model.
print(clf.intercept_)

# Raw coefficient vector, in feature order.
print(clf.coef_)

# 結果確認

In [None]:
# Check the result: root-mean-squared error of the predictions for X.
# NOTE(review): X looks like the same data the model was fitted on, which
# would make this a training error rather than a validation score -- confirm.
from sklearn.metrics import mean_squared_error

y_true = target_list
y_pred = clf.predict(X)
rmse = np.sqrt(mean_squared_error(y_true, y_pred))
rmse

# テスト

In [None]:
# Text columns that are turned into integer codes before prediction.
ENCODED_COLUMNS = ['name', 'brand_name', 'category_0', 'category_1', 'category_2']


def _load_with_category_levels(path):
    """Read a tab-separated file and add category_0..category_2 columns.

    Missing values are replaced by the string 'NaN', `category_name` is
    split on '/', and the first three levels become the new columns.
    """
    frame = pd.read_csv(path, delimiter='\t')
    frame = frame.replace(np.nan, 'NaN')
    levels = pd.DataFrame(
        [name.split('/') for name in frame['category_name']],
        index=frame.index,
    )
    for level in range(3):
        frame['category_%d' % level] = levels[level]
    return frame


df = _load_with_category_levels('../input/test.tsv')

# Keep test_id: it is needed later when the output file is written.
result = df['test_id']

# Encode the test rows with the TRAINING vocabulary. pd.factorize numbers
# values in order of first appearance, so factorizing the test file on its
# own gives codes unrelated to the ones the model was fitted on.
# The training file is re-read here so this cell does not depend on any
# intermediate state of the training cells.
train_reference = _load_with_category_levels('../input/train.tsv')

# drop() returns a new frame, so `df` keeps the raw test columns.
dft = df.drop(['test_id', 'category_name', 'item_description'], axis=1)
for column in ENCODED_COLUMNS:
    _, train_values = pd.factorize(train_reference[column])
    # get_indexer returns the training code of each value, or -1 when the
    # value never occurred in training. -1 is also the code factorize gives
    # to missing values, so unseen and missing share one code.
    dft[column] = pd.Index(train_values).get_indexer(df[column])

# Free the training copy; only the encoded test frame is needed below.
del train_reference



In [None]:
# Multiple regression: apply the fitted model to the encoded test frame.
from sklearn import linear_model

test_predictions = clf.predict(dft)
test_predictions

In [None]:
# Shape the output: the kept test_id column plus the model's prediction.
# NOTE(review): a linear model can output negative values; consider whether
# they need clipping before this file is used.
df_result = pd.DataFrame(result)
df_result['price'] = clf.predict(dft)

df_result.to_csv("dataset.csv", index=False)