In [1]:
import pandas as pd
import numpy as np

from matplotlib import pyplot as plt
import seaborn as sns
%matplotlib inline

from sklearn.linear_model import LinearRegression as LR
from sklearn.ensemble import RandomForestClassifier

In [2]:
# Load the tab-separated training and test sets into two DataFrames.
df_train, df_test = (
    pd.read_csv(csv_path, sep='\t') for csv_path in ('train.csv', 'test.csv')
)

In [3]:
# Preview the first rows of the training data.
df_train.head()

Unnamed: 0,id,sepal length in cm,sepal width in cm,petal length in cm,petal width in cm,class
0,0,5.3,3.7,1.5,0.2,Iris-setosa
1,1,6.8,2.8,4.8,1.4,Iris-versicolor
2,3,6.1,3.0,4.9,1.8,Iris-virginica
3,4,6.4,3.2,5.3,2.3,Iris-virginica
4,5,6.3,3.3,4.7,1.6,Iris-versicolor


In [4]:
# Preview the first rows of the test data.
df_test.head()

Unnamed: 0,id,sepal length in cm,sepal width in cm,petal length in cm,petal width in cm
0,2,6.1,2.8,4.7,1.2
1,7,6.3,2.5,4.9,1.5
2,8,6.2,3.4,5.4,2.3
3,10,6.7,3.1,4.7,1.5
4,13,5.0,3.4,1.6,0.4


In [5]:
# Report dtypes / non-null counts and the per-column number of missing values
# for both data sets.
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() only added a stray "None" line.
df_train.info()
print(df_train.isnull().sum())
df_test.info()
print(df_test.isnull().sum())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 75 entries, 0 to 74
Data columns (total 6 columns):
id                    75 non-null int64
sepal length in cm    75 non-null float64
sepal width in cm     75 non-null float64
petal length in cm    75 non-null float64
petal width in cm     75 non-null float64
class                 75 non-null object
dtypes: float64(4), int64(1), object(1)
memory usage: 3.6+ KB
None
id                    0
sepal length in cm    0
sepal width in cm     0
petal length in cm    0
petal width in cm     0
class                 0
dtype: int64
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 75 entries, 0 to 74
Data columns (total 5 columns):
id                    75 non-null int64
sepal length in cm    75 non-null float64
sepal width in cm     75 non-null float64
petal length in cm    75 non-null float64
petal width in cm     75 non-null float64
dtypes: float64(4), int64(1)
memory usage: 3.0 KB
None
id                    0
sepal length in cm    0
sepal width 

In [6]:
# In both train and test, every column except train's [class] column is numeric, and there are no missing values.

In [7]:
# We want to convert the class column to a numeric type.
# First, check how many iris species appear in the class column.
df_train['class'].value_counts()

Iris-virginica     29
Iris-setosa        25
Iris-versicolor    21
Name: class, dtype: int64

In [8]:
# There are three species, so encode them as the integers 0, 1 and 2.
# Only the `class` column is mapped, through one explicit dictionary, instead
# of chaining DataFrame.replace() over every column: this cannot touch other
# columns and does not depend on replace() implicitly downcasting an object
# column to integers (behaviour that newer pandas releases deprecate).
class_to_code = {'Iris-virginica': 0, 'Iris-setosa': 1, 'Iris-versicolor': 2}
df_train2 = df_train.copy()
df_train2['class'] = df_train['class'].map(class_to_code)
# map() yields NaN for any label missing from the dictionary; fail loudly
# rather than train on partially encoded labels.
assert df_train2['class'].notna().all(), 'unexpected label in class column'

In [9]:
# Preview the training data after the class column was encoded.
df_train2.head()

Unnamed: 0,id,sepal length in cm,sepal width in cm,petal length in cm,petal width in cm,class
0,0,5.3,3.7,1.5,0.2,1
1,1,6.8,2.8,4.8,1.4,2
2,3,6.1,3.0,4.9,1.8,0
3,4,6.4,3.2,5.3,2.3,0
4,5,6.3,3.3,4.7,1.6,2


In [10]:
# Pairwise correlation between all columns of the encoded training frame
# (this includes the id column and the integer-coded class column).
df_train2.corr()

Unnamed: 0,id,sepal length in cm,sepal width in cm,petal length in cm,petal width in cm,class
id,1.0,-0.153779,0.018168,-0.146934,-0.117532,0.11136
sepal length in cm,-0.153779,1.0,-0.15056,0.885132,0.815522,-0.399178
sepal width in cm,0.018168,-0.15056,1.0,-0.453029,-0.400215,-0.079303
petal length in cm,-0.146934,0.885132,-0.453029,1.0,0.955735,-0.385409
petal width in cm,-0.117532,0.815522,-0.400215,0.955735,1.0,-0.426145
class,0.11136,-0.399178,-0.079303,-0.385409,-0.426145,1.0


In [11]:
# The columns that look correlated with class are petal length in cm, sepal length in cm, petal width in cm
# NOTE(review): the class codes 0/1/2 are arbitrary labels for nominal species,
# so these correlation values depend on which number each species was given;
# treat them as a rough hint only.

In [27]:
# This time, also include sepal width in cm as a feature.
feature_columns = [
    'sepal length in cm',
    'petal length in cm',
    'petal width in cm',
    'sepal width in cm',
]
trainX = df_train2[feature_columns]

In [28]:
test_y = df_train2['class']

In [29]:
# モデルの宣言
model = RandomForestClassifier(n_estimators = 80, random_state=0)

In [30]:
# モデルの学習
model.fit(trainX, test_y)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=80, n_jobs=1,
            oob_score=False, random_state=0, verbose=0, warm_start=False)

In [31]:
# Select the test-set feature columns, listed in the same order as for trainX.
test_feature_columns = [
    'sepal length in cm',
    'petal length in cm',
    'petal width in cm',
    'sepal width in cm',
]
testX = df_test[test_feature_columns]

In [32]:
# Predict the class code of every test row with the fitted model.
test_y = model.predict(testX)

In [33]:
# Build the output frame from the test data that is already loaded instead of
# reading test.csv from disk a second time. The predictions were computed from
# df_test, so copying it keeps the rows aligned with them, and the copy leaves
# df_test itself unmodified when columns are added to `output`.
output = df_test.copy()

In [34]:
# Attach the predicted class codes to the output frame.
output['class'] = test_y

In [35]:
# The numeric codes in test_y need to be converted back to iris species names.
# First, store the values to be converted in a temporary variable.
output_temp = output['class']

In [36]:
# Check the values
output_temp

0     2
1     2
2     0
3     2
4     1
5     1
6     0
7     2
8     1
9     0
10    2
11    1
12    1
13    0
14    2
15    0
16    0
17    0
18    0
19    2
20    0
21    1
22    2
23    1
24    1
25    2
26    2
27    0
28    1
29    1
     ..
45    2
46    1
47    1
48    2
49    2
50    0
51    2
52    0
53    0
54    2
55    2
56    0
57    2
58    0
59    0
60    2
61    2
62    1
63    1
64    1
65    0
66    0
67    1
68    2
69    1
70    2
71    1
72    2
73    0
74    1
Name: class, Length: 75, dtype: int64

In [37]:
# Convert the numeric codes back to the original iris species names.
# One dictionary lookup (the inverse of the encoding used for training)
# replaces three chained replace() passes; a code missing from the dictionary
# becomes NaN instead of slipping through as a bare integer.
code_to_class = {0: 'Iris-virginica', 1: 'Iris-setosa', 2: 'Iris-versicolor'}
output_class = output_temp.map(code_to_class)

In [38]:
# Check that the codes were converted to species names
output_class

0     Iris-versicolor
1     Iris-versicolor
2      Iris-virginica
3     Iris-versicolor
4         Iris-setosa
5         Iris-setosa
6      Iris-virginica
7     Iris-versicolor
8         Iris-setosa
9      Iris-virginica
10    Iris-versicolor
11        Iris-setosa
12        Iris-setosa
13     Iris-virginica
14    Iris-versicolor
15     Iris-virginica
16     Iris-virginica
17     Iris-virginica
18     Iris-virginica
19    Iris-versicolor
20     Iris-virginica
21        Iris-setosa
22    Iris-versicolor
23        Iris-setosa
24        Iris-setosa
25    Iris-versicolor
26    Iris-versicolor
27     Iris-virginica
28        Iris-setosa
29        Iris-setosa
           ...       
45    Iris-versicolor
46        Iris-setosa
47        Iris-setosa
48    Iris-versicolor
49    Iris-versicolor
50     Iris-virginica
51    Iris-versicolor
52     Iris-virginica
53     Iris-virginica
54    Iris-versicolor
55    Iris-versicolor
56     Iris-virginica
57    Iris-versicolor
58     Iris-virginica
59     Iri

In [39]:
# Store the converted values in the [class] column
output['class'] = output_class

In [40]:
# Check that the values were stored in the [class] column
output

Unnamed: 0,id,sepal length in cm,sepal width in cm,petal length in cm,petal width in cm,class
0,2,6.1,2.8,4.7,1.2,Iris-versicolor
1,7,6.3,2.5,4.9,1.5,Iris-versicolor
2,8,6.2,3.4,5.4,2.3,Iris-virginica
3,10,6.7,3.1,4.7,1.5,Iris-versicolor
4,13,5.0,3.4,1.6,0.4,Iris-setosa
5,15,4.4,3.0,1.3,0.2,Iris-setosa
6,16,6.3,2.9,5.6,1.8,Iris-virginica
7,18,7.0,3.2,4.7,1.4,Iris-versicolor
8,20,4.9,3.0,1.4,0.2,Iris-setosa
9,22,6.7,3.3,5.7,2.5,Iris-virginica


In [41]:
# Write the id/class pairs to CSV without a header row or index column.
submission = output[['id', 'class']]
submission.to_csv('submission1.csv', header=False, index=False)

In [None]:
# Accuracy = 0.9733333333333334, an improvement over the first attempt